#!/bin/bash
# 
# check_linux_raid
#
# Author        : Nohaj
# Contact       : johan@slashroot.fr
# Date          : 24/05/19
# Version       : 1.1 (31/05/19)
# Description   : Basic script to check the consistency of software RAID on Linux boxes
# Requires      : mdadm utility
#

#
# Changelog
#
# v1.1 : add the proper information when a raid is degraded with a removed device and use the proper numbers for raid_devices and total_devices
#

#
# Variables and checks
#

# Check that the mdadm utility can be found in PATH.
# The UNKNOWN line is printed on stdout, like every other status line of this
# script (it used to go to stderr, where a monitoring system that only reads
# the plugin's stdout would not see it). Exit code 3 is kept.
if ! command -v mdadm > /dev/null 2>&1 ; then
    echo "UNKNOWN: This script need the mdadm utility to run."
    exit 3
fi

# Initialise the counters (number of arrays found in each state)
ok=0
warn=0
crit=0

# Initialise the text accumulators as well. The per-array loop only ever
# appends to them, so without this a variable of the same name exported in the
# caller's environment would leak into the plugin output.
ok_raid=""
warn_raid=""
crit_raid=""
message=""

# Make sure there is at least one software RAID array on this machine:
# count the lines of /proc/mdstat that start with "md" and give up with
# UNKNOWN (exit code 3) when there are none.
raid_count=$(grep -c '^md' /proc/mdstat)
if (( raid_count == 0 )) ; then
    echo "UNKNOWN: No RAID device found"
    exit 3
fi

#
# Main work
#

# Collect the name(s) of the RAID arrays: for every /proc/mdstat line that
# starts with "md", keep the text before the first single space.
raids=$(awk -F'[ ]' '/^md/ { print $1 }' /proc/mdstat)

# Print the trimmed value of the first "<label> : <value>" line found in the
# `mdadm --detail` text given as $1 ($2 is the label, e.g. "Raid Level").
# The label must match the whole key, so the "State" column header of the
# device table is never mistaken for the array's "State" field.
# Prints nothing when no line carries that label.
mdadm_field() {
    awk -F' : ' -v label="$2" '
        {
            key = $1
            gsub(/^[ \t]+|[ \t]+$/, "", key)
        }
        key == label {
            value = $2
            gsub(/^[ \t]+|[ \t]+$/, "", value)
            print value
            exit
        }
    ' <<<"$1"
}

# Map the "State" value of `mdadm --detail` ($1, whitespace is ignored) to one
# of: ok, warn, crit, unknown (printed on stdout).
#   ok      : "clean" or "active", possibly with a check (scrub) running
#   warn    : degraded but recovering, or a resync/recovery/reshape in progress
#   crit    : degraded without recovery, or FAILED
#   unknown : anything else
classify_raid_state() {
    local state=${1//[[:space:]]/}
    case $state in
        clean|active|clean,checking*|active,checking*)
            echo ok ;;
        *FAILED*)
            echo crit ;;
        clean,degraded,recovering*|active,degraded,recovering*)
            echo warn ;;
        clean,degraded*|active,degraded*)
            echo crit ;;
        clean,resyncing*|active,resyncing*|clean,recovering*|active,recovering*|clean,reshaping*|active,reshaping*)
            echo warn ;;
        *)
            echo unknown ;;
    esac
}

# Check one array ($1 is its name, e.g. "md0") and record the result in the
# global counters (ok, warn, crit), name lists (ok_raid, warn_raid, crit_raid)
# and in the multi-line report (message).
# mdadm is called only once per array; every field is read from that output.
check_raid() {
    local raid=$1
    local detail level state raid_devices total_devices action label progress

    detail=$(mdadm --detail "/dev/$raid")
    level=$(mdadm_field "$detail" "Raid Level")
    state=$(mdadm_field "$detail" "State")
    raid_devices=$(mdadm_field "$detail" "Raid Devices")
    total_devices=$(mdadm_field "$detail" "Total Devices")

    case $(classify_raid_state "$state") in
        ok)
            ok=$((ok+1))
            ok_raid="$ok_raid & $raid"
            message="$message$raid ($level) is OK with $total_devices/$raid_devices devices\n"
            ;;
        warn)
            # Pick the progress line that goes with the running operation.
            case $state in
                *resyncing*) action="resyncing" ; label="Resync Status" ;;
                *reshaping*) action="reshaping" ; label="Reshape Status" ;;
                *)           action="recovering" ; label="Rebuild Status" ;;
            esac
            progress=$(mdadm_field "$detail" "$label")
            progress=${progress%% *}    # keep "42%" out of "42% complete"
            warn=$((warn+1))
            warn_raid="$warn_raid & $raid"
            message="$message$raid ($level) is $action (rebuild : $progress) with $total_devices/$raid_devices devices\n"
            ;;
        crit)
            crit=$((crit+1))
            crit_raid="$crit_raid & $raid"
            message="$message$raid ($level) is failed with $total_devices/$raid_devices devices\n"
            ;;
        *)
            # Unrecognised or unreadable state: the array is deliberately left
            # uncounted, so ok+warn+crit no longer equals the number of arrays.
            ;;
    esac
}

# Check that every RAID array is healthy
# ($raids is left unquoted on purpose: one word per array name)
for raid in $raids ; do
    check_raid "$raid"
done

# Sanity check: every array must have been counted in exactly one of the
# ok / warn / crit buckets, otherwise report UNKNOWN (exit code 3).
res=$(( ok + warn + crit ))
if (( res != raid_count )) ; then
    echo "UNKNOWN: The result is inconsistent. Please check the script."
    exit 3
fi

# Print the result and exit with the matching status code
# (2 with the CRITICAL line, 1 with the WARNING line, 0 with the OK line).
#
# The name lists are built as " & md0 & md1": "${var# & }" removes the leading
# separator. The report ends with a literal "\n": "${message%\\n}" removes it.
# Both replace constructs that relied on unquoted word splitting and on a
# negative substring length, which needs bash >= 4.2. printf '%b' expands the
# remaining "\n" escapes of the report into real newlines.
details=${message%\\n}

if [[ $crit -gt 0 ]] ; then
    crit_raid=${crit_raid# & }
    if [[ $crit -eq 1 ]] ; then
        echo "CRITICAL: $crit_raid is in bad shape"
    else
        echo "CRITICAL: $crit_raid are in bad shapes"
    fi
    printf '%b\n' "$details"
    exit 2
elif [[ $warn -gt 0 ]] ; then
    warn_raid=${warn_raid# & }
    echo "WARNING: $warn_raid needs attention"
    printf '%b\n' "$details"
    exit 1
else
    ok_raid=${ok_raid# & }
    if [[ $ok -eq 1 ]] ; then
        echo "OK: $ok_raid is clean"
    else
        echo "OK: $ok_raid are clean"
    fi
    printf '%b\n' "$details"
    exit 0
fi

