#!/bin/bash
#
# Warewulf Node Health Check -- Node Offlining Helper
#
# Michael Jennings <mej@lbl.gov>
# 16 September 2011
#
# $Id$
#

# This script is a simple resource manager wrapper (pbsnodes for
# PBS/TORQUE, scontrol for SLURM, badmin for LSF; selected via $NHC_RM)
# that the node health check can run in the background to mark nodes
# offline.  It will first obtain the current node state information to
# avoid overwriting notes which were not placed by NHC.  If these
# checks pass, the node is marked offline with the note supplied.

# Defaults.  IGNORE_EMPTY_NOTE keeps any non-empty value inherited from the
# environment and otherwise becomes 0; it is consulted by the resource
# manager branches further down.  LEADER is the tag that identifies a note
# as having been written by NHC.
: "${IGNORE_EMPTY_NOTE:=0}"
LEADER='NHC:'

# Emit a timestamped record of this invocation (script name + all arguments).
printf '%s %s %s\n' "$(date '+%Y%m%d %H:%M:%S')" "$0" "$*"

# Usage:  <script> HOSTNAME [NOTE WORDS...]
# Everything after the first argument, joined with single spaces, is the
# note text; the first argument is the node to act on.  The hostname is
# still shifted off so the positional parameters end up holding only the
# note words.
NOTE="${*:2}"
HOSTNAME="$1"
shift

### Resource manager dispatch
#
# Selects the offlining procedure from $NHC_RM ("pbs", "slurm", "lsf" or
# "sge").  Each supported branch follows the same outline:
#   1. Query the resource manager for the node's current state and note.
#   2. If the node is already administratively offline with no note at all
#      (and IGNORE_EMPTY_NOTE is not "1"), leave it alone and exit 0.
#   3. If a note exists whose first word is not $LEADER, keep that note
#      (leader and text) instead of the one supplied on the command line.
#   4. exec the resource manager's "offline" command with the chosen note.
#
# Inputs (set earlier in this script or in the environment):
#   NHC_RM, HOSTNAME, NOTE, LEADER, IGNORE_EMPTY_NOTE, plus the per-RM
#   command/argument overrides defaulted at the top of each branch.
# Exit status:  0 when nothing needed doing, 255 for an unsupported
#   $NHC_RM, otherwise whatever the exec'd command returns.

# The code below word-splits command output and option strings on purpose,
# but the resulting words must never be treated as filename patterns:  an
# existing note containing "*", "?" or "[" has to be carried over verbatim,
# not replaced by whatever files happen to match in the current directory.
set -f

### PBS (TORQUE)
if [[ "$NHC_RM" == "pbs" ]]; then
    PBSNODES="${PBSNODES:-pbsnodes}"
    PBSNODES_LIST_ARGS="${PBSNODES_LIST_ARGS:--n -l all}"
    PBSNODES_OFFLINE_ARGS="${PBSNODES_OFFLINE_ARGS:--o -N}"

    # Whitespace-separated words of the listing:  word 1 is taken as the
    # node state, word 2 as the note's leader, words 3+ as the note text.
    # $PBSNODES and its argument strings are left unquoted so that they
    # can carry several words.
    LINE=( $($PBSNODES $PBSNODES_LIST_ARGS "$HOSTNAME") )
    STATUS="${LINE[1]}"
    OLD_NOTE_LEADER="${LINE[2]}"
    OLD_NOTE="${LINE[*]:3}"
    case "$STATUS" in
        *down*|*offline*|*unknown*)
            # If the node is already offline, and there is no old note, and
            # we've not been told to ignore that, do not touch the node.
            if [[ "$STATUS" == *offline* && -z "$OLD_NOTE_LEADER" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                echo "$0:  Not offlining $HOSTNAME:  Already offline with no note set."
                exit 0
            fi
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ -n "$OLD_NOTE_LEADER" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            ;;
    esac

    # Unlike the other branches, PBS nodes in any other state (including
    # an empty/unparseable one) are still marked offline here.
    echo "$0:  Marking $STATUS $HOSTNAME offline:  $LEADER $NOTE"
    exec $PBSNODES $PBSNODES_OFFLINE_ARGS "$LEADER $NOTE" "$HOSTNAME"

### SLURM
elif [[ "$NHC_RM" == "slurm" ]]; then
    # Reduce the hostname to its first dot-separated label.
    HOSTNAME="${HOSTNAME%%.*}"
    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
    SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-update State=DRAIN}"

    # Parse only the FIRST line of output ("<state> <reason...>").  Should
    # sinfo report the node on more than one line (presumably one per
    # partition -- confirm against your Slurm version), folding every line
    # into one array would append "<state> <reason>" repeats to the
    # preserved reason text.
    LINE=()
    read -r -a LINE < <($SLURM_SINFO -o '%t %E' -hn "$HOSTNAME")
    STATUS="${LINE[0]}"
    OLD_NOTE_LEADER="${LINE[1]}"
    OLD_NOTE="${LINE[*]:2}"
    # NOTE(review): states not listed below (e.g. "mix", "maint") fall
    # through and the script exits 0 without touching the node.
    case "$STATUS" in
        alloc*|comp*|drain*|drng*|fail*|idle*)
            # If the node is already offline, and there is no old note, and
            # we've not been told to ignore that, do not touch the node.
            # The code treats a reason of "none" as "no note".
            if [[ ( "$STATUS" == drain* || "$STATUS" == drng* ) \
                  && "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                echo "$0:  Not offlining $HOSTNAME:  Already offline with no note set."
                exit 0
            fi
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            echo "$0:  Marking $STATUS $HOSTNAME offline:  $LEADER $NOTE"
            exec $SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS "NodeName=$HOSTNAME" "Reason=$LEADER $NOTE"
            ;;
    esac

### IBM Platform LSF
elif [[ "$NHC_RM" == "lsf" ]]; then
    LSF_BHOSTS="${LSF_BHOSTS:-bhosts}"
    LSF_BADMIN="${LSF_BADMIN:-badmin}"
    LSF_OFFLINE_ARGS="${LSF_OFFLINE_ARGS:-hclose -C}"

    STATUS=""
    OLD_NOTE_LEADER=""
    OLD_NOTE=""
    # Split the long listing into one array element per (non-empty) line.
    # The array is deliberately not called LINES:  bash maintains a
    # variable of that name itself (terminal height).
    IFS=$'\n'
    BHOSTS_LINES=( $($LSF_BHOSTS -l "$HOSTNAME") )
    IFS=$' \t\n'
    for ((i=0; i < ${#BHOSTS_LINES[@]}; i++)) ; do
        LINE=( ${BHOSTS_LINES[$i]} )
        if [[ "${LINE[0]}" == "STATUS" ]]; then
            # The line after the "STATUS ..." header row holds the values;
            # its first word is the host status.
            ((i++))
            LINE=( ${BHOSTS_LINES[$i]} )
            STATUS="${LINE[0]}"
        elif [[ "${LINE[0]}" == "ADMIN" && "${LINE[2]}" == "COMMENT:" ]]; then
            # Words 3+ are the comment, expected to be wrapped in double
            # quotes.  Strip the quotes from the comment as a whole BEFORE
            # splitting off the leader, so that a one-word comment such as
            # "NHC:" does not keep its closing quote in the leader and an
            # empty "" comment is seen as empty.
            COMMENT="${LINE[*]:3}"
            COMMENT="${COMMENT#\"}"
            COMMENT="${COMMENT%\"}"
            read -r OLD_NOTE_LEADER OLD_NOTE <<< "$COMMENT"
            break
        fi
    done
    # Any status other than "ok" or "closed*" falls through to exit 0.
    case "$STATUS" in
        ok|closed*)
            # If the node is already offline, and there is no old note, and
            # we've not been told to ignore that, do not touch the node.
            if [[ "$STATUS" == "closed_Adm" && -z "$OLD_NOTE_LEADER" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                echo "$0:  Not offlining $HOSTNAME:  Already offline with no note set."
                exit 0
            fi
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ -n "$OLD_NOTE_LEADER" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            echo "$0:  Marking $STATUS $HOSTNAME offline:  $LEADER $NOTE"
            exec $LSF_BADMIN $LSF_OFFLINE_ARGS "$LEADER $NOTE" "$HOSTNAME"
            ;;
    esac

### Sun Grid Engine (and variants)
elif [[ "$NHC_RM" == "sge" ]]; then
    echo "$0:  No additional node marking necessary for SGE and variants."

### Everything else is unsupported.
else
    echo "$0:  Unsupported RM detected in $0:  \"$NHC_RM\""
    # Same status the former "exit -1" produced, spelled as a valid 0-255 value.
    exit 255
fi
exit 0
