#!/bin/bash
#
# check_k8s_pods
#
# Author        : Nohaj
# Contact       : johan@slashroot.fr
# Date          : 19/02/19
# Version       : 1.0
# Description   : Basic script to check the pods status of a k8s cluster 
# Requires      : jq and kubectl, with kubectl configured to access a k8s cluster
#

#
# Variables and checks
#

# This script needs jq and kubectl
type jq &> /dev/null || { echo >&2 "UNKNOWN: This script need the jq utility to run."; exit 3; }
type kubectl &> /dev/null || { echo >&2 "UNKNOWN: This script need the kubectl utility to run."; exit 3; }

# Check kubectl command
if ( ! kubectl get pods &> /dev/null ) ; then
    echo "UNKNOWN: Kubectl command doesn't seems to be working"
    exit 3
fi

# Pods checks and expected results are hardcoded =/
checks="PodScheduled;True Initialized;True ContainersReady;True Ready;True"

# Defaults thresolds
pods_warn_thresold=1
pods_crit_thresold=5
rs_warn_thresold=5
rs_crit_thresold=10

usage (){
cat <<EOF

Usage : check_k8s_pods [-n NAMESPACE[,NAMESPACE2,NAMESPACE3]] [-w PODS_WARN_THRESOLD] [-c PODS_CRIT_THRESOLD] [-W RESTARTS_WARN_THRESOLD] [-C RESTARTS_CRIT_THRESOLD]

Options:
    -h                          Print help
    -n NAMESPACE                Namespace(s) inside which we'll check the pods (if not precised, all pods will be checked)
    -w PODS_WARN_THRESOLD       Warning threshold for problematics pods (default : $pods_warn_thresold)
    -c PODS_CRIT_THRESOLD       Critical threshold for problematics pods (default : $pods_crit_thresold)
    -W RESTARTS_WARN_THRESOLD   Warning thresold for number of containers restarts (default : $rs_warn_thresold)
    -W RESTARTS_CRIT_THRESOLD   Critical thresold for number of containers restarts (default : $rs_warn_thresold)

EOF
exit 3
}

#
# Let's go
#

while getopts ":n:h:w:c:W:C:" opt; do
    case "$opt" in
        n)
            namespaces="$OPTARG"
            ;;
        w)
            pods_warn_thresold="$OPTARG"
            ;;
        c)
            pods_crit_thresold="$OPTARG"
            ;;
        W)
            rs_warn_thresold="$OPTARG"
            ;;
        C)
            rs_crit_thresold="$OPTARG"
            ;;
        h) 
            usage 
            ;;
        *) 
            usage
            ;;
    esac
done

# Retrive all the states
if [[ -n $namespaces ]] ; then
    for namespace in $(echo $namespaces | tr ',' ' ') ; do
        states=""$states"$(kubectl get pods -n $namespace -o json)"
    done
else
    states=$(kubectl get pods --all-namespaces -o json)
fi

# Retrieve all the pods names
pods=$(echo $states | jq -r '.items[].metadata.name')

pods_warn=0
pods_crit=0
pods_ok=0
rs_warn=0
rs_crit=0
rs_ok=0
ok_message=""
warn_message=""
crit_message=""

# Retrieve all the pods status
for pod in $pods ; do
    pod_err=0
    pod_status=$(echo $states | jq -r '.items[] | select(.metadata.name | contains("'$pod'"))')
    phase=$(echo $pod_status | jq -r '.status.phase')
    if [[ $phase == "Running" ]] ; then
        for i in $checks ; do
            check=$(echo $i | cut -d ";" -f 1)
            expected_res=$(echo $i | cut -d ";" -f 2)
            res=$(echo $pod_status | jq -r '.status.conditions[] | select(.type=="'$check'") | .status')
            if [[ $res != $expected_res ]] ; then
                warn_message=""$warn_message"- Pod $pod is in WARNING state\n"
                pods_warn=$((pods_warn+1))
                pod_err=1
                break
            fi
        done
    else
        crit_message=""$crit_message"- Pod $pod\n"
        pods_crit=$((pods_crit+1))
        pod_err=2
    fi
    if [[ $pod_err -eq 0 ]] ; then
        ok_message=""$ok_message"- Pod $pod\n"
        pods_ok=$((pods_ok+1))
    fi
    if [[ $pod_err -ne 2 ]] ; then
        containers=$(echo $pod_status | jq -r '.status.containerStatuses[].name')
        for container in $containers ; do
            container_status=$(echo $pod_status | jq -r '.status.containerStatuses[] | select(.name=="'$container'") | .ready')
            container_restart=$(echo "$pod_status" | jq -r '.status.containerStatuses[] | select(.name=="'$container'") | .restartCount')
            if [[ $container_status == "true" ]] ; then
                if [[ $container_restart -ge $rs_warn_thresold ]] ; then
                    message="-- Container $container is ready with $container_restart restart(s)\n"
                    rs_warn=$((rs_warn+1))
                else
                    message="-- Container $container is ready\n"
                    rs_ok=$((rs_ok+1))
                fi
            else
                message="-- Container $container is not ready\n"
                rs_crit=$((rs_crit+1))
            fi
        done
        if [[ $pod_err -eq 1 ]] ; then
            warn_message="$warn_message$message"
        else
            ok_message="$ok_message$message"
        fi
    else
        if [[ $phase == "Succeeded" ]] ; then
            containers=$(echo $pod_status | jq -r '.status.containerStatuses[].name')
        else
            containers=$(echo $pod_status | jq -r '.spec.initContainers[].name')
        fi
        for container in $containers ; do
            crit_message=""$crit_message"-- Container $container is not ready\n"
            rs_crit=$((rs_crit+1))
        done
    fi
done

total_pods=$((pods_warn+pods_crit+pods_ok))
total_containers=$((rs_warn+rs_crit+rs_ok))
pods_errors=$((pods_warn+pods_crit))

if [[ $rs_crit -gt $rs_crit_thresold ]] ; then
    rs_thresold=$rs_crit_thresold
else
    rs_thresold=$rs_warn_thresold
fi

if [[ $pods_errors -gt $pods_crit_thresold || $rs_crit -gt $rs_crit_thresold ]] ; then 
    exit_code=2
    echo -e "CRITICAL: There is some serious issues inside the $namespaces namespace(s)\n"
elif [[ $pods_errors -gt $pods_warn_thresold || $rs_warn -gt $rs_warn_thresold ]] ; then
    exit_code=1
    echo -e "WARNING: There is some issues inside the $namespaces namespace(s)\n"
else
    echo -e "OK: The pods inside the $namespaces namespace(s) are OK\n"
fi

echo -e "State : $pods_ok/$total_pods pods are OK and $rs_ok/$total_containers containers are ready with less than $rs_thresold restart(s)\n"

if [[ $pods_crit -gt 0 || $rs_crit -gt 0 ]] ; then
    echo "List of pods in CRITICAL state (not ready and unschedulable) :"
    echo -e $crit_message
fi

if [[ $pods_warn -gt 0 || $rs_warn -gt 0 ]] ; then
    echo "List of pods in WARNING state (not ready but schedulable) :"
    echo -e $warn_message
fi

if [[ $pods_ok -gt 0 || $rs_ok -gt 0 ]] ; then
    echo "List of pods in OK state :"
    echo -e $ok_message
fi

exit $exit_code

