#!/bin/bash
#
# Wrapper utility for NHC
#
# Michael Jennings <mej@lbl.gov>
# 19 August 2014
#
# Copyright (c) 2011-2015, Michael Jennings <mej@eterm.org>
#
# LBNL Node Health Check (NHC), Copyright (c) 2015, The Regents of the
# University of California, through Lawrence Berkeley National
# Laboratory (subject to receipt of any required approvals from the
# U.S. Dept. of Energy).  All rights reserved.
#
# If you have questions about your rights to use or distribute this
# software, please contact Berkeley Lab's Innovation & Partnerships
# Office at IPO@lbl.gov.
#
# NOTICE.  This Software was developed under funding from the
# U.S. Department of Energy and the U.S. Government consequently
# retains certain rights. As such, the U.S. Government has been
# granted for itself and others acting on its behalf a paid-up,
# nonexclusive, irrevocable, worldwide license in the Software to
# reproduce, distribute copies to the public, prepare derivative
# works, and perform publicly and display publicly, and to permit
# others to do so.
#

# This script is intended to provide a convenient wrapper for
# executing the LBNL Node Health Check (NHC) driver program in an
# out-of-band (i.e., outside a scheduler/resource manager) context.
# It provides facilities for passing parameters to NHC, launching
# specific NHC contexts, and sending notification e-mails when issues
# are found (and includes some anti-annoyance features too).

# On a fatal signal, announce which one arrived and exit with the
# conventional 128+<signal number> status (HUP=1, INT=2, TERM=15).
trap 'echo "Terminated by signal SIGHUP." ; exit 129' HUP
trap 'echo "Terminated by signal SIGINT." ; exit 130' INT
trap 'echo "Terminated by signal SIGTERM." ; exit 143' TERM

# Print a diagnostic message, but only when verbose mode is active.
#
# Globals:   VERBOSE (read) -- the message is printed only when it is "1"
# Arguments: the words of the message (handed to echo unchanged)
# Outputs:   the message on stdout when verbose, otherwise nothing
# Returns:   always 0.  A suppressed message is not a failure, so the
#            status must not leak into callers that end with a vecho.
function vecho() {
    if [[ "$VERBOSE" == "1" ]]; then
        echo "$@"
    fi
    return 0
}

# Convert a human-friendly timespec (e.g. "1d12h", "90", "6h10f") into a
# number of seconds and store the result in a caller-named variable.
#
# Arguments: $1 - NAME of the variable that receives the result
#            $2 - (optional) timespec to parse; when empty, the current
#                 value of the variable named by $1 is parsed instead
# Globals:   FUDGETIME (written) when the timespec has an "f" component
# Outputs:   "ERROR:" lines on stdout if the timespec is malformed;
#            parsing stops there and whatever was accumulated is kept
# Returns:   0
#
# Recognized suffixes (either case): w, d, h, m, s, and f (fudge factor,
# in seconds).  A trailing number with no suffix is taken as seconds.
#
# NOTE: $1 must not be the name of one of this function's own locals
# (VARNAME, i, TIMESPEC, CHR, SECS, NUMSTR) or the result will be lost.
function nhc_parse_timespec() {
    local VARNAME="$1"
    local i TIMESPEC CHR SECS=0 NUMSTR=''

    if [[ -n "$2" ]]; then
        TIMESPEC="$2"
    else
        # Indirect expansion: read the variable whose name is in VARNAME.
        TIMESPEC="${!VARNAME}"
    fi

    for ((i=0; i<${#TIMESPEC}; i++)); do
        CHR="${TIMESPEC:$i:1}"
        if [[ "$CHR" == [0-9] ]]; then
            # Accumulate digits until we reach the unit suffix.
            NUMSTR="$NUMSTR$CHR"
        elif [[ -z "$NUMSTR" ]]; then
            echo "ERROR:  Missing number before character '$CHR' in timespec \"$TIMESPEC\" at position $i."
            echo "ERROR:  Parsing terminated.  Using $SECS seconds for expiration of old results."
            break
        else
            # The "10#" prefix forces base 10.  Without it a number with a
            # leading zero (e.g. "08") is parsed as (invalid) octal.
            case "$CHR" in
                [Ww])  ((SECS += 10#$NUMSTR * 604800)) ;;
                [Dd])  ((SECS += 10#$NUMSTR * 86400))  ;;
                [Hh])  ((SECS += 10#$NUMSTR * 3600))   ;;
                [Mm])  ((SECS += 10#$NUMSTR * 60))     ;;
                [Ss])  ((SECS += 10#$NUMSTR))          ;;
                [Ff])  FUDGETIME=$((10#$NUMSTR))       ;;
                *)
                    echo "ERROR:  Invalid character '$CHR' encountered in timespec \"$TIMESPEC\" at position $i."
                    echo "ERROR:  Parsing terminated.  Using $SECS seconds for expiration of old results."
                    NUMSTR=""
                    break
                    ;;
            esac
            NUMSTR=""
        fi
    done
    # Digits left over without a suffix are plain seconds.
    if [[ -n "$NUMSTR" ]]; then
        ((SECS += 10#$NUMSTR))
    fi
    # Hand the result back through the caller-named variable.
    printf -v "$VARNAME" '%s' "$SECS"
}

# Establish a known-good baseline environment for the wrapper.
#
# Sets a restrictive umask and a fixed PATH, chooses the distro-specific
# config/libexec directories, determines the host name (long and short
# forms), derives the wrapped program's name from how this script was
# invoked, and clears every variable the command line parser may set.
#
# Globals:   PATH, SYSCONFIGDIR, LIBEXECDIR, HOSTNAME, HOSTNAME_S, RET,
#            SUBPROG (written and exported); ARGS, STATEDIR, MAILTO,
#            SUBJECT, EXPIRE, VERBOSE, LOOP_TS, PREV_RESULTS (unset)
# Arguments: none
function nhcwrap_init_env() {
    umask 0077
    PATH="/sbin:/usr/sbin:/bin:/usr/bin"

    # Default to the non-Debian layout; switch if the Debian marker
    # file is present.
    SYSCONFIGDIR="/etc/sysconfig"
    LIBEXECDIR="/usr/libexec"
    if [[ -f /etc/debian_version ]]; then
        SYSCONFIGDIR="/etc/default"
        LIBEXECDIR="/usr/lib"
    fi

    # Prefer reading the host name from /proc (no fork needed); fall
    # back on the hostname command when that file is not readable.
    if [[ ! -r /proc/sys/kernel/hostname ]]; then
        HOSTNAME=$(/bin/hostname)
    else
        read HOSTNAME < /proc/sys/kernel/hostname
    fi
    # Short host name:  everything before the first dot.
    HOSTNAME_S="${HOSTNAME%%.*}"
    RET=0
    export PATH SYSCONFIGDIR LIBEXECDIR HOSTNAME HOSTNAME_S RET

    # The wrapped program is named after this script:  strip the leading
    # directory and a trailing "-wrapper" from $0.
    SUBPROG="${0##*/}"
    SUBPROG="${SUBPROG%-wrapper}"
    export SUBPROG

    unset ARGS STATEDIR MAILTO SUBJECT EXPIRE VERBOSE LOOP_TS PREV_RESULTS
}

# Print the command line usage summary.
#
# Globals:   SUBPROG (read) -- name of the wrapped program, as shown in
#            the option descriptions and examples
# Arguments: none
# Outputs:   usage text on stdout; the program name shown is the
#            basename of $0
function nhcwrap_help() {
    local PROGNAME="${0##*/}"
    local TITLE UNDERLINE

    TITLE="$PROGNAME Usage"
    # One dash per character of the title.
    UNDERLINE="${TITLE//?/-}"

    cat <<EOF

$TITLE
$UNDERLINE

  Syntax:  $PROGNAME [<options>] [<var>=<value> [...]]

 OPTION            DESCRIPTION
-------------------------------------------------------------------------------
 -h                Show command line help (this info)
 -A <args>         Pass <args> to $SUBPROG
 -D <directory>    Use <directory> for cache/bookkeeping
 -L <timespec>     Executes $SUBPROG in a loop with interval of <timespec>
 -M <mailto>       Send output of $SUBPROG via e-mail to <mailto>
 -P <program>      Run <program> instead of the default wrapped program
 -S <subject>      Set e-mail subject header (for -M) to <subject>
 -V                Activate verbose mode (extra diagnostic output)
 -X <timespec>     Expire cached output after <timespec> (e.g., "6h" or "1d")
 --                Stop parsing command line; pass remaining args to $SUBPROG

 All other command line parameters, if any, are passed directly to $SUBPROG.

 Timespecs may specify weeks (w), days (d), hours (h), minutes (m), and/or
 seconds (s).  If no suffix is given, seconds are assumed.  Result expiration
 timespec may have trailing fudge factor (f) in seconds (e.g., 6h10f).  Loop
 timespec may contain trailing flags:  c (clear the screen before each run),
 t (print the invocation time), and/or r (print a ruler line), e.g., 5mtr.

EXAMPLES:
---------
 To run $SUBPROG, e-mail output to root, and pass parameters "-d -t 60":
    # $PROGNAME -M root -- -d -t 60
  OR
    # $PROGNAME -M root -A "-d -t 60"

 To only e-mail results if they change or every 12 hours:
    # $PROGNAME -M root -X 12h

EOF
}

# Parse the wrapper's own command line options and collect everything
# else for the wrapped program.
#
# Arguments: the script's command line ("$@")
# Globals:   ARGS, STATEDIR, LOOP_TS, MAILTO, SUBPROG, SUBJECT, VERBOSE,
#            EXPIRE (written, per option); ARGLIST (written) -- array of
#            arguments for the wrapped program:  the words of -A first,
#            then all unparsed command line arguments
# Outputs:   error message plus usage on stdout for a missing argument
# Returns:   0 on success, 1 if an option is missing its argument.
#            Exits 0 directly after showing help for -h.
#
# Parsing stops at "--", at the first non-option word, at any "--long"
# style word, and at the first option the wrapper does not recognize;
# that word and everything after it go to the wrapped program.
function nhcwrap_parse_cmdline() {
    local OPTION PREV_OPTIND
    local -a EXTRA_ARGS=()

    OPTIND=1
    # Remember where each getopts call started so that an unrecognized
    # option can be handed, whole word, to the wrapped program.
    while PREV_OPTIND=$OPTIND ; getopts ":A:D:L:M:P:S:VX:h-" OPTION ; do
        case "$OPTION" in
            A) ARGS="$OPTARG" ;;
            D) STATEDIR="$OPTARG" ;;
            L) LOOP_TS="$OPTARG" ;;
            M) MAILTO="$OPTARG" ;;
            P) SUBPROG="$OPTARG" ;;
            S) SUBJECT="$OPTARG" ;;
            V) VERBOSE=1 ;;
            X) EXPIRE="$OPTARG" ;;
            h) nhcwrap_help ; exit 0 ;;
            :) echo "ERROR:  Option -$OPTARG requires an argument." ; nhcwrap_help ; return 1 ;;
            -) break ;;
            # Rewind to the word holding the unknown option.  Blindly
            # decrementing OPTIND is wrong when that option is not the
            # last letter of its word (e.g. "-dV"), where OPTIND has not
            # been advanced yet.
            \?) OPTIND=$PREV_OPTIND ; break ;;
        esac
    done
    shift $((OPTIND-1))

    # Split -A's value into words on whitespace WITHOUT pathname
    # expansion, so an argument such as "*" reaches the wrapped program
    # literally.  (read returns non-zero at end of input; that is fine.)
    if [[ -n "$ARGS" ]]; then
        IFS=$' \t\n' read -r -d '' -a EXTRA_ARGS <<< "$ARGS"
    fi
    ARGLIST=( "${EXTRA_ARGS[@]}" "$@" )
    return 0
}

# Fill in defaults for everything the command line did not set, then
# create and lock down the state directory.
#
# Globals:   SUBPROG, PATH (rewritten when SUBPROG contains a "/");
#            NW_DATEFMT, STATEDIR, SUBJECT, VERBOSE, FUDGETIME (defaulted);
#            OUTFILE, SAVEFILE (written); EXPIRE (converted to seconds);
#            HOSTNAME_S, EUID (read)
# Arguments: none
# Outputs:   error message on stdout if the state directory is unusable
# Exits:     255 if the state directory cannot be created/secured
function nhcwrap_finalize_env() {
    local TMPDIR SUBPROG_DIR

    # If the wrapped program was given with a path, move its directory
    # onto the front of PATH and keep only the bare name.  This has to
    # happen BEFORE the state directory and file names are derived from
    # SUBPROG; otherwise they would have the path embedded in them.
    if [[ "${SUBPROG#*/}" != "${SUBPROG}" ]]; then
        SUBPROG_DIR="${SUBPROG%/*}"
        # A program directly under "/" leaves an empty directory part.
        PATH="${SUBPROG_DIR:-/}:$PATH"
        SUBPROG="${SUBPROG##*/}"
    fi

    NW_DATEFMT="${NW_DATEFMT:-(%F %T)}"
    STATEDIR="${STATEDIR:-/tmp/${SUBPROG}-${EUID}}"
    SUBJECT="${SUBJECT:-NHC Report for ${HOSTNAME_S}}"
    VERBOSE="${VERBOSE:-0}"
    OUTFILE="$STATEDIR/$SUBPROG.out"
    SAVEFILE="$STATEDIR/$SUBPROG.save"
    FUDGETIME="${FUDGETIME:-5}"

    # We have to be very careful about how we create and manage the
    # state directory since it lives in /tmp by default (in order to
    # have a good chance of being on a tmpfs in RAM, and so that
    # normal users can run the wrapper too).
    if [[ ! -d "$STATEDIR" ]]; then
        # Securely create state directory:  make a uniquely-named
        # directory first, then rename it into place.  Skip the rename
        # if mktemp failed; the checks below will then abort.
        TMPDIR=$(mktemp -d "${STATEDIR}.XXXXXXXXXXX") && mv "$TMPDIR" "$STATEDIR"
    fi
    # Close the directory to everyone, take ownership, then reopen it
    # for ourselves only.  Failures are deliberately silent here; the
    # test that follows is what decides whether the result is usable.
    chmod 0000 "$STATEDIR" 2>/dev/null
    chown "$EUID" "$STATEDIR" 2>/dev/null
    chmod 0700 "$STATEDIR" 2>/dev/null
    # Refuse symlinks, non-directories, and anything we do not own or
    # cannot write to.
    if [[ -h "$STATEDIR" || ! -d "$STATEDIR" || ! -O "$STATEDIR" || ! -w "$STATEDIR" ]]; then
        echo "ERROR:  State directory \"$STATEDIR\" create/chown/chmod failed for uid $EUID.  Aborting."
        exit 255
    fi

    if [[ -n "$EXPIRE" ]]; then
        nhc_parse_timespec EXPIRE
        vecho "Note:  Results will expire after $EXPIRE seconds."
    fi
}

# Run the wrapped program once, capturing its stdout and stderr in
# OUTFILE, then discard the saved results file if it has grown too old.
#
# Globals:   SUBPROG, ARGLIST, OUTFILE, SAVEFILE, EXPIRE, FUDGETIME,
#            NW_DATEFMT (read)
# Arguments: none
# Outputs:   progress notes via vecho
# Returns:   the exit status of the wrapped program
function nhcwrap_spawn_command() {
    local STATUS STAMPS MODIFIED CHANGED OLDEST CURRENT AGE

    vecho "Executing \"$SUBPROG ${ARGLIST[*]}\"" $(date "+$NW_DATEFMT")
    # Delete the previous output first so that the shell creates a brand
    # new file (and thus a fresh ctime) for this run.
    rm -f "$OUTFILE"
    "$SUBPROG" "${ARGLIST[@]}" > "$OUTFILE" 2>&1
    STATUS=$?
    vecho "$SUBPROG returned $STATUS" $(date "+$NW_DATEFMT")

    # Nothing more to do unless expiration is enabled and there are
    # saved results to consider.
    if ! [[ $EXPIRE -gt 0 && -f "$SAVEFILE" ]]; then
        return $STATUS
    fi

    # The age check is done *after* running the subprogram on purpose.
    # The saved file's timestamp dates from the end of the previous run,
    # so measuring from the end of this run keeps the interval from
    # being skewed by however long the subprogram takes to execute.
    #
    # stat prints "<mtime> <ctime>"; use whichever of the two is older.
    STAMPS=$(stat -c '%Y %Z' "$SAVEFILE")
    MODIFIED="${STAMPS%% *}"
    CHANGED="${STAMPS##* }"
    OLDEST=$(( MODIFIED < CHANGED ? MODIFIED : CHANGED ))
    CURRENT=$(date '+%s')
    AGE=$(( CURRENT - OLDEST ))
    # FUDGETIME gives a little slack so results expire slightly early
    # rather than surviving one extra cycle.
    if (( AGE + FUDGETIME >= EXPIRE )); then
        vecho "Results file expired after $AGE seconds (+/- ${FUDGETIME}s)."
        rm -f "$SAVEFILE"
    else
        vecho "Results file found with age $AGE seconds (+/- ${FUDGETIME}s)."
    fi

    return $STATUS
}

# Decide whether the latest output is worth reporting by comparing it
# with the saved results of the previous run.
#
# Globals:   OUTFILE, SAVEFILE (read; files are moved/removed);
#            PREV_RESULTS (written with the old saved contents when the
#            new output differs from them)
# Arguments: none
# Outputs:   progress notes via vecho
# Returns:   0 if there is nothing new to report, 1 if there is.  In
#            every case the file left behind is SAVEFILE, holding the
#            most recent results; OUTFILE is gone afterwards.
function nhcwrap_check_output() {
    local STATUS MATCH_NOTE=""

    # No previous results at all:  the new output becomes the saved
    # results, and is reportable only if it actually says something.
    if [[ ! -f "$SAVEFILE" ]]; then
        if [[ -s "$OUTFILE" ]]; then
            vecho "Output file is non-empty (no saved results file found)."
            STATUS=1
        else
            vecho "Output file is empty (no saved results file found)."
            STATUS=0
        fi
        mv -f "$OUTFILE" "$SAVEFILE"
        return $STATUS
    fi

    # Previous results exist.  First see whether the new output matches
    # them; the both-empty case is checked first because it is the
    # common one and avoids running diff.
    if ! [[ -s "$OUTFILE" || -s "$SAVEFILE" ]]; then
        MATCH_NOTE="Output file matches saved results file (both empty)."
    elif diff -q "$OUTFILE" "$SAVEFILE" >&/dev/null; then
        MATCH_NOTE="Output file matches saved results file."
    fi
    if [[ -n "$MATCH_NOTE" ]]; then
        vecho "$MATCH_NOTE"
        rm -f "$OUTFILE"
        return 0
    fi

    # The results changed:  remember the old ones (they are needed to
    # report a cleared error) and promote the new output.
    vecho "Output file does not match saved results file."
    PREV_RESULTS=$(< "$SAVEFILE")
    rm -f "$SAVEFILE"
    mv -f "$OUTFILE" "$SAVEFILE"
    return 1
}

# Report the current results, by e-mail when MAILTO is set and on
# stdout otherwise.  Called only when the results have changed.
#
# Globals:   SAVEFILE, PREV_RESULTS, MAILTO, SUBJECT, SUBPROG,
#            HOSTNAME_S (read)
# Arguments: none
# Outputs:   the report on stdout (no MAILTO), or a note via vecho
# Returns:   0
function nhcwrap_report_results() {
    local MESSAGE

    if [[ ! -s "$SAVEFILE" ]]; then
        # We have empty results this time but non-empty previous
        # results.  This means a prior error is now cleared.
        #
        # The text is printed with printf '%s' rather than "echo -e" so
        # that backslash sequences appearing in the saved output are
        # shown verbatim instead of being interpreted as escapes.
        MESSAGE="The following $SUBPROG errors on $HOSTNAME_S have now been CLEARED:"$'\n\n'"${PREV_RESULTS}"
        if [[ -n "$MAILTO" ]]; then
            # MAILTO is intentionally unquoted:  it may list several
            # whitespace-separated recipients.
            # shellcheck disable=SC2086
            printf '%s\n' "$MESSAGE" | /bin/mail -s "$SUBJECT" $MAILTO
            vecho "E-mail notification of cleared error sent to $MAILTO."
        else
            printf '%s\n' "$MESSAGE"
        fi
    else
        # Non-empty results:  report the saved file's contents.
        if [[ -n "$MAILTO" ]]; then
            # shellcheck disable=SC2086
            /bin/mail -s "$SUBJECT" $MAILTO < "$SAVEFILE"
            vecho "Results e-mailed to $MAILTO"
        else
            # SAVEFILE is quoted so a state directory containing
            # whitespace still works.
            printf '%s\n' "$(< "$SAVEFILE")"
        fi
    fi
    return 0
}

# Re-run the wrapped program forever at a fixed interval.
#
# LOOP_TS holds the loop timespec plus optional flag letters:  every
# character that is not a digit or a w/d/h/m/s unit is treated as a
# flag.  Recognized flags (either case):  c = clear the screen before
# each run (only when stdin is a terminal), t = print the time of each
# invocation, r = print a ruler line before each run.
#
# Runs are scheduled on multiples of the interval counted from local
# midnight of the day the loop started, so e.g. a 5 minute interval
# fires at :00, :05, :10, and so on.
#
# Globals:   LOOP_TS (read; rewritten with the flag letters removed),
#            SUBPROG, NW_DATEFMT (read)
# Arguments: none
# Returns:   does not return under normal operation
function nhcwrap_loop() {
    local FLAGS INTERVAL DELAY DAY_START CURRENT
    local CLEAR_SCREEN=0 SHOW_TIME=0 SHOW_RULER=0

    # Separate the flag letters from the timespec proper.
    FLAGS="${LOOP_TS//[0-9WwDdHhMmSs]}"
    LOOP_TS="${LOOP_TS//[^0-9WwDdHhMmSs]}"
    if [[ "$FLAGS" == *[Cc]* ]]; then
        CLEAR_SCREEN=1
    fi
    if [[ "$FLAGS" == *[Tt]* ]]; then
        SHOW_TIME=1
    fi
    if [[ "$FLAGS" == *[Rr]* ]]; then
        SHOW_RULER=1
    fi

    nhc_parse_timespec INTERVAL "$LOOP_TS"
    if [[ $INTERVAL -eq 0 ]]; then
        vecho "WARNING:  Looping requested but no interval found.  Defaulting to 5 minutes."
        INTERVAL=300
    fi

    DAY_START=$(date '+%s' -d '00:00')
    while true; do
        # Sleep until the next multiple of the interval since midnight.
        CURRENT=$(date '+%s')
        DELAY=$(( INTERVAL - (CURRENT - DAY_START) % INTERVAL ))
        vecho "Looping every $INTERVAL seconds.  Next loop in $DELAY seconds."
        sleep "$DELAY"

        # Optional decorations ahead of each run.
        if (( CLEAR_SCREEN == 1 )) && [[ -t 0 ]]; then
            clear
        fi
        if (( SHOW_RULER == 1 )); then
            echo "----------------------------------------------------------------------------"
        fi
        if (( SHOW_TIME == 1 )); then
            echo "Invoking $SUBPROG at" $(date "+$NW_DATEFMT")
        fi

        nhcwrap_spawn_command
        if ! nhcwrap_check_output; then
            nhcwrap_report_results
        fi
    done
    return 0
}

# Script entry point.  Sets up the environment, parses the command
# line (exiting with status 99 if that fails), performs one
# run-and-report cycle right away, and then enters the periodic loop
# if a loop timespec was given with -L.
function nhcwrap_main() {
    nhcwrap_init_env
    if ! nhcwrap_parse_cmdline "$@"; then
        exit 99
    fi
    nhcwrap_finalize_env

    # One immediate pass:  results are reported only when the output
    # check signals that there is something new.
    nhcwrap_spawn_command
    if ! nhcwrap_check_output; then
        nhcwrap_report_results
    fi

    if [[ -n "$LOOP_TS" ]]; then
        nhcwrap_loop
    fi
}

nhcwrap_main "$@"
