#!/bin/bash
#
# Warewulf Node Health Check Script
#
# Michael Jennings <mej@lbl.gov>
# 13 December 2010
#
# $Id$
#

# This is the driver program for the node health check script
# subsystem.  The include directory (/etc/nhc/scripts by default)
# contains a series of bash scripts which, when sourced, should define
# bash functions which will later be invoked to check node health.
#
# The configuration file (/etc/nhc/nhc.conf by default) is then read a
# line at a time.  Any lines beginning with a mask that matches the
# current hostname will invoke the specified check (usually one of the
# bash functions loaded above, but could also be an external command
# or script).  Failure of any check will result in the node being
# flagged as "unhealthy" and the termination of further checks.

### Library functions

# Report a failed health check and stop the run.
#
# Arguments:
#   $1   - exit status to report
#   $2.. - failure message words (joined with single spaces)
# Globals:
#   Reads MARK_OFFLINE, OFFLINE_NODE, HOSTNAME, LOGFILE, NHC_DETACHED,
#   NHC_RM and RESULTFILE; resets IFS; sets CHECK_DIED=1 when NHC_RM is "sge".
# Returns:
#   77 when NHC_RM is "sge" (and NHC_DETACHED is empty); in every other case
#   it does not return -- the shell exits with status $1.
function die() {
    # Restore default word splitting first:  die may be entered (e.g. from a
    # signal trap) while IFS holds something else, which would change how
    # "$*" joins the message words below.
    IFS=$' \t\n'
    local RET="$1"
    shift
    log "Health check failed:  $*"
    syslog "Health check failed:  $*"
    syslog_flush
    if [[ "$MARK_OFFLINE" = "1" ]]; then
        # $LOGFILE is spliced in unquoted and re-parsed by eval so that a
        # redirection stored in it takes effect.  The helper runs in the
        # background; its result is not waited for.
        eval '$OFFLINE_NODE "$HOSTNAME" "$*" </dev/null >/dev/null' $LOGFILE '2>&1 &'
    fi
    if [[ -n "$NHC_DETACHED" ]]; then
        # Detached copy:  leave "<status> <message>" behind in the result file.
        # The path is quoted so that a location containing whitespace does
        # not turn into an "ambiguous redirect" and silently lose the result.
        echo "$RET $*" > "$RESULTFILE"
    elif [[ "$NHC_RM" == "sge" ]]; then
        # SGE mode:  print a begin/end status block on stdout and hand
        # control back to the caller instead of exiting.
        echo "begin"
        echo "$HOSTNAME:healthy:false"
        echo "$HOSTNAME:diagnosis:NHC: $*"
        echo "end"
        CHECK_DIED=1
        return 77
    else
        echo "ERROR Health check failed:  $*"
    fi
    kill_watchdog
    exit $RET
}

# Emit a debugging message, prefixed with "DEBUG:  ".
#
# Arguments:  the message words (joined with spaces).
# Globals:    reads DEBUG and LOGFILE.
#
# Debugging is considered off when $DEBUG is "0", empty or unset.  Treating
# the unset case as "off" matters because dbg is called while the command
# line is being parsed, before $DEBUG has received its default value;
# without this, debug lines would be printed even though -d was never given.
function dbg() {
    if [[ "${DEBUG:-0}" != "0" ]]; then
        # $LOGFILE is spliced in unquoted and re-parsed by eval so that a
        # redirection stored in it takes effect; when it is empty the
        # message simply goes to stdout.
        eval echo '"DEBUG:  $*"' $LOGFILE
    fi
}

# Write a message to the NHC log.
#
# Arguments:  the message words.
# Globals:    reads SILENT and LOGFILE.
#
# Nothing is written unless $SILENT is exactly "0".
function log() {
    [[ "$SILENT" = "0" ]] || return 0
    # $LOGFILE is spliced in unquoted and re-parsed by eval so that a
    # redirection stored in it takes effect.
    eval echo '"$@"' $LOGFILE
}

# Queue a message for syslog.  Messages accumulate in $LOGGER_TEXT, one per
# line, until syslog_flush sends them.
#
# Arguments:  the message words (joined with spaces).
function syslog() {
    local NEWLINE=$'\n'

    # Only insert a separator when something is already queued.
    LOGGER_TEXT="${LOGGER_TEXT:+$LOGGER_TEXT$NEWLINE}$*"
}

# Send everything queued by syslog() to logger(1) in a single invocation
# (priority daemon.err, tag "$NAME[<pid>]"), then empty the queue.
# Does nothing but reset the queue when no messages are pending.
function syslog_flush() {
    # The pipeline binds tighter than ||, so logger only runs when there is
    # text to send.
    [[ -z "$LOGGER_TEXT" ]] || echo "$LOGGER_TEXT" | logger -p daemon.err -t "$NAME[$$]"
    LOGGER_TEXT=""
}

# Stop the watchdog timer process, if one was started.
#
# Globals:  reads WATCHDOG_PID; a value that is not greater than zero means
#           there is nothing to kill.
# Returns:  always 0, whether or not the kill succeeded.
function kill_watchdog() {
    dbg "$FUNCNAME:  Watchdog PID is $WATCHDOG_PID."
    if [[ $WATCHDOG_PID -gt 0 ]]; then
        kill -9 $WATCHDOG_PID >/dev/null 2>&1
    fi
    return 0
}

#########################

# Establish the baseline environment for a run:  fixed paths, host name,
# bookkeeping variables, and a clean slate for everything that is only
# supposed to come from the sysconfig file or the command line.
function nhcmain_init_env() {
    ### Fixed locations
    export PATH="/sbin:/usr/sbin:/bin:/usr/bin"
    export SYSCONFIGDIR="/etc/sysconfig"
    export LIBEXECDIR="/usr/libexec"
    export RESULTFILE="/var/run/nhc.status"

    ### Host name:  prefer the kernel's idea of it, then whatever was
    ### inherited, and finally fall back to "localhost".
    if [[ ! -r /proc/sys/kernel/hostname ]]; then
        HOSTNAME="${HOSTNAME:-localhost}"
    else
        read HOSTNAME < /proc/sys/kernel/hostname
    fi
    # Short name:  everything before the first dot.
    HOSTNAME_S="${HOSTNAME%%.*}"
    export HOSTNAME HOSTNAME_S

    ### Run-time bookkeeping
    export RET=0
    export LOGGER_TEXT=""
    export NHC_PID=$$
    export WATCHDOG_PID=0

    # Program name defaults to the basename of the script.  Users may
    # override this in /etc/sysconfig/nhc.
    NAME="${0##*/}"

    # Don't allow previous environment to leak in.  Must be done from /etc/sysconfig/nhc only.
    unset CONFDIR CONFFILE INCDIR HELPERDIR
    unset ONLINE_NODE OFFLINE_NODE LOGFILE
    unset DEBUG SILENT TIMEOUT MAX_SYS_UID NHC_RM

    if [[ -n "$NHC_DETACHED" ]]; then
        # We are the detached copy; pass the marker on and note the mode.
        export NHC_DETACHED
        DETACHED_MODE=1
    fi
}

# Print the command line usage summary on stdout.  The program name shown
# is the basename of $0, and the title is underlined with one dash per
# character.
function nhcmain_help() {
    local SELF="${0##*/}"
    local TITLE="$SELF Usage"
    local UNDERLINE

    # Build a run of blanks as wide as the title, then turn it into dashes.
    printf -v UNDERLINE '%*s' "${#TITLE}" ''
    UNDERLINE="${UNDERLINE// /-}"

    cat <<EOF

$TITLE
$UNDERLINE

  Syntax:  $SELF [<options>] [<var>=<value> [...]]

 OPTION            DESCRIPTION
-------------------------------------------------------------------------------
 -h                Show command line help (this info)
 -D <confdir>      Use config directory <confdir> (default: /etc/<name>)
 -c <conffile>     Load config from <conffile> (default: <confdir>/<name>.conf)
 -d                Activate debugging output (i.e., DEBUG=1)
 -n <name>         Set program name to <name> (default: nhc); see -D & -c above
 -q                Run quietly (i.e., SILENT=1)
 -t <timeout>      Use timeout of <timeout> seconds (default: 10)

 All other command line parameters, if any, must be environment variable
 settings in the form VARNAME=value.

EXAMPLES:
---------
 To run in debug mode with a timeout of 60 seconds:
    # $SELF -d -t 60
  OR
    # $SELF DEBUG=1 TIMEOUT=60

 To run with the name "nhc-cron" which will alter default config paths:
    # $SELF -n nhc-cron
  OR
    # $SELF NAME=nhc-cron

EOF
}

# Process the command line:  option switches first, then any remaining
# arguments, each of which is evaluated as a shell assignment.
#
# Arguments:  the script's own arguments ("$@").
# Returns:    0 on success, 8 if an option is missing its argument, 9 for
#             an unknown option.  -h prints the help text and exits 0.
function nhcmain_parse_cmdline() {
    local OPTION

    OPTIND=1
    while getopts ":D:c:dhn:qt:" OPTION ; do
        case "$OPTION" in
            D)
                CONFDIR="$OPTARG"
                dbg "\$CONFDIR set to $CONFDIR."
                ;;
            c)
                CONFFILE="$OPTARG"
                dbg "\$CONFFILE set to $CONFFILE."
                ;;
            d)
                DEBUG=1
                dbg "Debugging activated via -d option."
                ;;
            h)
                nhcmain_help
                exit 0
                ;;
            n)
                NAME="$OPTARG"
                dbg "\$NAME set to $NAME."
                ;;
            q)
                SILENT=1
                dbg "Silent mode activated via -q option."
                ;;
            t)
                TIMEOUT="$OPTARG"
                dbg "Timeout set to $TIMEOUT."
                ;;
            :)
                nhcmain_help
                echo "$NAME:  ERROR:  Option -$OPTARG requires an argument."
                return 8
                ;;
            \?)
                nhcmain_help
                echo "$NAME:  ERROR:  Invalid option:  -$OPTARG"
                return 9
                ;;
        esac
    done
    shift $((OPTIND-1))

    # Whatever is left is expected to look like VARNAME=value.  Each word is
    # handed to eval as-is; processing stops at the first empty argument.
    until [[ -z "$1" ]]; do
        eval "$1"
        shift
    done
    return 0
}

# Source the system-wide settings file $SYSCONFIGDIR/$NAME if it exists.
#
# Globals:  reads SYSCONFIGDIR and NAME; the sourced file may set anything.
#
# NOTE:  To change value of $NAME here, the driver script must be renamed
# to something other than "nhc."
function nhcmain_load_sysconfig() {
    # The path is quoted in both places so that a directory or program name
    # containing whitespace is sourced as one file rather than being split
    # into a (non-existent) file name plus stray arguments.
    if [[ -f "$SYSCONFIGDIR/$NAME" ]]; then
        . "$SYSCONFIGDIR/$NAME"
    fi
}

# Give every setting that is still empty its default value, adapt the
# settings to the resource manager in use, and start detached mode if it
# was requested.  Defaults are derived from $NAME and from each other, so
# the order of the assignments matters.
function nhcmain_finalize_env() {
    # Paths and helpers (users may have overridden some of these).
    : "${CONFDIR:=/etc/$NAME}"
    : "${CONFFILE:=$CONFDIR/$NAME.conf}"
    : "${INCDIR:=$CONFDIR/scripts}"
    : "${HELPERDIR:=$LIBEXECDIR/$NAME}"
    : "${ONLINE_NODE:=$HELPERDIR/node-mark-online}"
    : "${OFFLINE_NODE:=$HELPERDIR/node-mark-offline}"
    # Note that the default log "file" is a complete append redirection.
    : "${LOGFILE:=>>/var/log/$NAME.log}"

    # Behavioural switches and limits.
    : "${DEBUG:=0}"
    : "${SILENT:=0}"
    : "${MARK_OFFLINE:=1}"
    : "${DETACHED_MODE:=0}"
    : "${DETACHED_MODE_FAIL_NODATA:=0}"
    : "${TIMEOUT:=10}"
    : "${MAX_SYS_UID:=99}"

    # No resource manager given:  try to detect one.  If that fails there
    # is nobody to report node state to, so marking becomes a no-op.
    if [[ -z "$NHC_RM" ]] && ! nhcmain_find_rm ; then
        ONLINE_NODE=:
        OFFLINE_NODE=:
        MARK_OFFLINE=0
    fi

    case "$NHC_RM" in
        sge)
            # With SGE, we return the status and note directly from NHC.
            ONLINE_NODE=:
            OFFLINE_NODE=:
            MARK_OFFLINE=0
            # SGE's looping model is incompatible with detached mode and the watchdog timer.
            DETACHED_MODE=0
            TIMEOUT=0
            ;;
    esac

    if [[ -n "$NHC_DETACHED" ]]; then
        dbg "This session is running detached from $NHC_DETACHED."
    elif [[ $DETACHED_MODE -eq 1 ]]; then
        dbg "Activating detached mode."
        nhcmain_detach
        return
    fi

    export NAME CONFDIR CONFFILE INCDIR HELPERDIR
    export ONLINE_NODE OFFLINE_NODE LOGFILE
    export DEBUG SILENT TIMEOUT MAX_SYS_UID NHC_RM
}

# Guess which resource manager is installed and store the answer in
# $NHC_RM ("pbs", "slurm", "lsf" or "sge").
#
# A /var/spool/torque directory means "pbs" outright.  Otherwise each
# directory of $PATH is inspected in order and the first one containing a
# known client program decides.
#
# Returns:  0 if $NHC_RM is set on exit, 1 (after logging a message) if not.
function nhcmain_find_rm() {
    local DIR
    local -a DIRLIST

    [[ -d /var/spool/torque ]] && { NHC_RM="pbs"; return 0; }

    # Split $PATH on colons, then put IFS back to its default value.
    IFS=':'
    DIRLIST=( $PATH )
    IFS=$' \t\n'

    for DIR in "${DIRLIST[@]}" ; do
        if [[ -x "$DIR/pbsnodes" ]]; then
            NHC_RM="pbs"
        elif [[ -x "$DIR/scontrol" ]]; then
            NHC_RM="slurm"
        elif [[ -x "$DIR/badmin" ]]; then
            NHC_RM="lsf"
        elif [[ -x "$DIR/qselect" ]]; then
            NHC_RM="sge"
        else
            # Nothing recognisable here; look in the next directory.
            continue
        fi
        return 0
    done

    if [[ -n "$NHC_RM" ]]; then
        return 0
    fi
    log "Unable to detect resource manager."
    return 1
}

# Check for the config file before we do too much work.
#
# Returns:  0 if $CONFFILE is an existing regular file, 1 otherwise.
#           Missing config means no checks.  No checks means no failures.
function nhcmain_check_conffile() {
    [[ -f "$CONFFILE" ]] && return 0
    return 1
}

# Announce the start of the run and source every file found directly in
# $INCDIR, in glob order.
#
# Globals:  reads INCDIR; sets SCRIPT to each path in turn (left global on
#           purpose so the behaviour seen by sourced files is unchanged).
function nhcmain_load_scripts() {
    log "Node Health Check starting."

    # Load all include scripts.
    dbg "Loading scripts from $INCDIR..."
    # Quoting the directory keeps a path with whitespace in one piece.
    for SCRIPT in "$INCDIR"/* ; do
        # When the directory is empty or missing the glob stays literal;
        # skip that, and anything else that is not a regular file (such as
        # a subdirectory), rather than trying to source it.
        [[ -f "$SCRIPT" ]] || continue
        dbg "Loading ${SCRIPT##*/}"
        . "$SCRIPT"
    done
}

# Body of the watchdog:  wait for the timeout, then signal the given
# process with increasing force -- ALRM, TERM one second later, KILL three
# seconds after that.  Escalation stops as soon as a signal can no longer
# be delivered.
#
# Arguments:  $1 - seconds to wait before the first signal
#             $2 - PID of the process to signal
# Returns:    always 0.
function nhcmain_watchdog_timer() {
    local TIMEOUT="$1" NHC_PID="$2"

    sleep $TIMEOUT
    if kill -s ALRM -- $NHC_PID ; then
        sleep 1
        if kill -s TERM -- $NHC_PID 2>/dev/null ; then
            sleep 3
            kill -s KILL -- $NHC_PID 2>/dev/null
        fi
    fi
    return 0
}

# Start the watchdog timer in the background when a positive $TIMEOUT is
# configured, and remember its PID in the exported $WATCHDOG_PID.
# With a timeout of zero (or less) no watchdog is started.
function nhcmain_set_watchdog() {
    if ! [[ $TIMEOUT -gt 0 ]]; then
        dbg "No watchdog, NHC PID is $NHC_PID"
        return
    fi

    # $LOGFILE is spliced in unquoted and re-parsed by eval so that a
    # redirection stored in it applies to the timer's output.
    eval nhcmain_watchdog_timer $TIMEOUT $NHC_PID $LOGFILE &
    WATCHDOG_PID=$!
    export WATCHDOG_PID
    dbg "Watchdog PID is $WATCHDOG_PID, NHC PID is $NHC_PID"
}

# Replace the current (sub)shell with a fresh copy of this script running
# as "nhc-detached", with stdin, stdout and stderr tied to /dev/null.
#
# Any previous result file is removed first, and NHC_DETACHED is exported
# with this shell's PID so the new copy can tell that it is the detached
# one.  On success this function does not return.
function nhcmain_spawn_detached() {
    rm -f "$RESULTFILE" >/dev/null 2>&1
    export NHC_DETACHED=$$
    # "$0" is quoted so that a script path containing whitespace is
    # executed as a single word instead of being split into a bogus
    # command name plus arguments.
    exec -a nhc-detached "$0" </dev/null >/dev/null 2>&1
}

# Detached-mode front end:  report the outcome recorded by the previous
# detached run (if any), and launch a new detached run in the background
# to refresh it.
#
# Globals:  reads RESULTFILE and DETACHED_MODE_FAIL_NODATA; sets
#           MARK_OFFLINE=0.
# Returns:  1 after die has handled a non-zero recorded status; otherwise
#           whatever nhcmain_finish returns.
function nhcmain_detach() {
    local RC MSG

    # If the results file exists but the system has rebooted since its
    # creation, assume it's stale and remove it.
    if [[ -e "$RESULTFILE" ]] && [[ -d "/proc/1" ]] && [[ "$RESULTFILE" -ot "/proc" ]]; then
        rm -f "$RESULTFILE"
    fi

    # Start from "healthy, nothing to say" and override as needed.
    RC=0
    MSG=""
    if [[ -r "$RESULTFILE" ]]; then
        # First word is the status, the rest of the line is the message.
        read RC MSG < "$RESULTFILE"
    elif [[ "$DETACHED_MODE_FAIL_NODATA" == "1" ]]; then
        RC=1
        MSG="Detached mode -- pending checks (no data found)"
    fi

    # Launch detached process
    nhcmain_spawn_detached &

    # Only mark offline/online in detached copy.
    MARK_OFFLINE=0

    if [[ $RC == 0 ]]; then
        nhcmain_finish
        return
    fi
    die $RC "$MSG"
    return 1
}

# Load the configuration and run every check it yields, in order, stopping
# at the first one that fails.
#
# Globals:  reads CONFFILE; resets CHECK_DIED and CHECKS; sets CNUM, CHECK
#           and RET as it goes (left global on purpose, since checks are
#           evaluated in this shell and can see them).
#           CHECKS is expected to be filled in by nhc_load_conf.
# Returns:  0 if all checks passed, otherwise the status of the failing
#           check (when die returns at all rather than exiting).
function nhcmain_run_checks() {
    CHECK_DIED=0
    CHECKS=( )
    nhc_load_conf "$CONFFILE"
    for ((CNUM=0; CNUM<${#CHECKS[@]}; CNUM++)); do
        CHECK="${CHECKS[$CNUM]}"

        # Run the check.  The line is passed to eval as one quoted string so
        # that it is parsed exactly once:  unquoted, it would first be
        # word-split and glob-expanded, which collapses runs of whitespace
        # inside quoted arguments and expands wildcards a second time.
        log "Running check:  \"$CHECK\""
        eval "$CHECK"
        RET=$?

        # Check for failure.
        if [[ $RET != 0 ]]; then
            # A check that already went through die itself has set
            # CHECK_DIED; don't report the same failure twice.
            if [[ $CHECK_DIED == 0 ]]; then
                log "Node Health Check failed.  Check $CHECK returned $RET"
                die $RET "Check $CHECK returned $RET"
            fi
            return $RET
        fi
    done
    return 0
}

# Run the $ONLINE_NODE helper for this host in the background, but only
# when $MARK_OFFLINE is exactly "1".
function nhcmain_mark_online() {
    [[ "$MARK_OFFLINE" = "1" ]] || return 0
    # $LOGFILE is spliced in unquoted and re-parsed by eval so that a
    # redirection stored in it takes effect.
    eval '$ONLINE_NODE "$HOSTNAME" </dev/null >/dev/null' $LOGFILE '2>&1 &'
}

# Wrap up a successful run:  flush queued syslog text and log the
# completion message.
#
# Returns:  0 when NHC_RM is "sge", after printing a "healthy" begin/end
#           status block on stdout.  In every other case it stops the
#           watchdog and exits the shell with status 0.
function nhcmain_finish() {
    syslog_flush
    log "Node Health Check completed successfully (${SECONDS}s${BASH_SUBSHELL:+, $BASH_SUBSHELL subshells})."

    if [[ "$NHC_RM" != "sge" ]]; then
        kill_watchdog
        exit 0
    fi

    printf '%s\n' \
        "begin" \
        "$HOSTNAME:healthy:true" \
        "$HOSTNAME:diagnosis:HEALTHY" \
        "end"
    return 0
}

### Script guts begin here.
if [[ -n "$NHC_LOAD_ONLY" ]]; then
    # We're only supposed to define functions, not actually run anything.
    # "return" succeeds when this file is being sourced; "exit" is the
    # fallback for when it is executed directly.
    return 0 || exit 0
fi

# Turn fatal signals into a reported failure.  The trailing "exit" covers
# the case where die returns instead of exiting.
trap 'die 129 "Terminated by signal SIGHUP." ; exit 129' HUP
trap 'die 130 "Terminated by signal SIGINT." ; exit 130' INT
trap 'die 143 "Terminated by signal SIGTERM." ; exit 143' TERM
# ALRM is what the watchdog timer sends first when the timeout expires.
trap 'die 127 "Script timed out." ; exit 127' ALRM

# Set up the environment.  The order matters:  the command line is parsed
# before the sysconfig file is read, and defaults are filled in last.
nhcmain_init_env
nhcmain_parse_cmdline "$@" || exit 10
nhcmain_load_sysconfig
nhcmain_finalize_env
nhcmain_check_conffile || exit 0
nhcmain_load_scripts

if [[ "$NHC_RM" != "sge" ]]; then
    # One-shot mode:  run everything once under the watchdog.
    nhcmain_set_watchdog
    nhcmain_run_checks
    nhcmain_mark_online
    nhcmain_finish
else
    # SGE mode:  each line read from stdin triggers one round of checks,
    # until end of input or a literal "quit".
    while true ; do
        if ! read INPUT || [[ "$INPUT" == "quit" ]]; then
            exit 0
        fi
        nhcmain_run_checks && nhcmain_finish
    done
fi
