#! /bin/sh

# This is a simple nagios plug-in to monitor a DCC client or server.

# Copyright (c) 2010 by Rhyolite Software, LLC
#
# This agreement is not applicable to any entity which sells anti-spam
# solutions to others or provides an anti-spam solution as part of a
# security solution sold to other entities, or to a private network
# which employs the DCC or uses data provided by operation of the DCC
# but does not provide corresponding data to other users.
#
# Permission to use, copy, modify, and distribute this software without
# changes for any purpose with or without fee is hereby granted, provided
# that the above copyright notice and this permission notice appear in all
# copies and any distributed versions or copies are either unchanged
# or not called anything similar to "DCC" or "Distributed Checksum
# Clearinghouse".
#
# Parties not eligible to receive a license under this agreement can
# obtain a commercial license to use DCC by contacting Rhyolite Software
# at sales@rhyolite.com.
#
# A commercial license would be for Distributed Checksum and Reputation
# Clearinghouse software.  That software includes additional features.  This
# free license for Distributed ChecksumClearinghouse Software does not in any
# way grant permision to use Distributed Checksum and Reputation Clearinghouse
# software
#
# THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
# BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
# OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#	Rhyolite Software DCC 1.3.139-1.3 $Revision$
#	Generated automatically from dcc-nagios.in by configure.


# Exit codes understood by nagios.
EXIT_OK=0
EXIT_WARN=1
EXIT_CRIT=2
EXIT_UNK=3

# Name of this script for messages.  "$0" is quoted so that a path that
# contains white space is not split into several arguments to basename.
ME=`basename "$0"`
USAGE="$ME: [-xhv] [-s server[,port]] [-T /tmp] [-t msec] [-C cdcc] [-m map-file] 
	[-i ID] [-p passwd] [-G on|off]"


# Defaults that can be overridden from the command line.
CDCC=/usr/bin/cdcc		# cdcc program to run (-C)
VERBOSE=0			# number of -v flags seen
SRVR_PARMS=			# cdcc commands built from -i and -p
CLNT_PARMS=			# extra cdcc commands for client mode
MODE=client			# client, srvr (-s), or help (-h)
TMPDIR=/tmp			# directory for the flooding state file (-T)
MAP=				# alternate map file (-m)
GREY=				# not empty to check greylist servers (-G on)
SRVR=				# server to check (-s)
OK_DELAY=400			# delay in ms at which to start warning (-t)
# Parse the command line.  Bad arguments to -T, -t, and -G are reported on
# stderr and otherwise ignored, which leaves the current settings in force.
# An option that getopts rejects prints the usage message and ends the run
# with the "unknown" status.
while getopts "xhvs:T:t:C:m:i:p:G:" opt; do
    case $opt in
	h) MODE=help;;
	s) MODE=srvr; SRVR="$OPTARG";;
	x) set -x;;
	v) VERBOSE=`expr $VERBOSE + 1`;;
	C) CDCC="$OPTARG";;
	m) MAP="$OPTARG";;
	# -i and -p accumulate as commands for cdcc
	i) SRVR_PARMS="$SRVR_PARMS id $OPTARG;";;
	p) SRVR_PARMS="$SRVR_PARMS password $OPTARG;";;
	T) # accept only an existing, writable directory
	    if [ -d "$OPTARG" ] && [ -w "$OPTARG" ]; then
		TMPDIR=$OPTARG
	    else
		echo "invalid temporary directory \"$OPTARG\"" >&2
	    fi
	    ;;
	t) # accept only a non-empty string of decimal digits
	    case "$OPTARG" in
	    ''|*[!0-9]*) echo "invalid delay in -t $OPTARG" >&2;;
	    *) OK_DELAY=$OPTARG;;
	    esac
	    ;;
	G) case "$OPTARG" in
	    [oO][fF][fF]) GREY= ;;
	    [oO][nN]) GREY='1,/^#.* greylist /d';;
	    *) echo "$USAGE" >&2;;
	    esac
	    ;;
	*) echo "$USAGE" >&2; exit $EXIT_UNK;;
    esac
done
shift `expr $OPTIND - 1 || true`

# operands after the options are not accepted
[ "$#" -eq 0 ] || {
    echo "$USAGE" >&2
    exit $EXIT_UNK
}

# Three or more -v flags turn on shell tracing instead of chattier output.
if [ $VERBOSE -ge 3 ]; then
    set -x
    VERBOSE=0
fi


# sed pattern to find server delay from `cdcc info` output
#	It is meant for `sed -n`.  Only lines from one that starts with "# *"
#	through the next one that contains "requests ok" are examined.
#	A line there that contains "ok", spaces, a number, and " ms" is
#	replaced by the leading digits of that number and printed, so the
#	result is the delay as a whole number of milliseconds.
#	(The "# *" line presumably marks the server currently preferred by
#	the client; confirm against the cdcc documentation.)
DELAY_PAT='/^# \*/,/requests ok/s/.*ok  *\([0-9]\{1,\}\)[-+.0-9]* ms.*/\1/p'


# flood_count LINE UNIT
#	Print the number that precedes the word UNIT ("streams", "out", or
#	"in") in LINE, the flood summary line of the server's statistics.
#	Print an empty line if LINE contains no such number.
flood_count () {
    expr "$1" : ".* \([0-9][0-9]*\) $2 .*"
}

# flood_problem WORKING TOTAL PASSIVE AUTONAT
#	Print a one line description of a flooding problem given the number
#	of WORKING peers, the TOTAL number of flood streams, and the numbers
#	of output lines that mention "forced passive" (PASSIVE) and
#	"auto-NAT" (AUTONAT).  It is meant to be called only after the caller
#	has decided that flooding is not entirely healthy.
flood_problem () {
    if test "$1" -eq 0; then
	echo "flooding not working"
    elif test "$1" -lt "$2"; then
	echo "only $1 of $2 flood peers working"
    elif test "$3" -ne 0; then
	if test "$3" -ne 1; then
	    PLURAL=s
	else
	    PLURAL=
	fi
	echo "$3 peer$PLURAL forcing passive flooding"
    else
	if test "$4" -ne 1; then
	    PLURAL=s
	else
	    PLURAL=
	fi
	echo "using auto-NAT flooding with $4 peer$PLURAL"
    fi
}


# Every arm ends with an explicit exit so that the exit status is always
# one of the nagios codes.
case $MODE in
help) cat <<EOF
    $USAGE

    This is a simple nagios plug-in to monitor a DCC client or server.
    Use "$ME" to check whether local DCC clients can reach
    a DCC server by parsing the output of "cdcc info".
    Use "$ME -s server" to check the health of a DCC server.

    The health of a DCC client is determined by the delay a client sees to
    the current fastest server.  The health of a single server is also
    determined the speed of its answers.

    -t msec can be used to change the default healthy delay threshold.
    A DCC server has no working DCC flooding peers delays its answers
    by an extra 400 milliseconds.  A DCC client that should be using
    a local DCC server might see delays increase from less than 100 ms.
    to several 100 ms.

    -x		debug script
    -h		this message
    -v		increase verbosity
    -s srvr	name or IP address of of DCC server
    -T /tmp	directory to keep file for this script
    -t msec	tolerable DCC server queing delay in milliseconds
    -C cdcc	alternative to /usr/bin/cdcc
    -m map	alternative to /var/dcc/map
    -i ID	client- or server-ID
    -p passwd	password for client- or server-ID
    -G on	check greylist server
EOF
    exit $EXIT_OK
    ;;



client)
    # Things are OK for a DCC client if there is at least one working server
    # and its average delay is less than the 400 ms that results from
    # having no working flood peers.
    # Things are critical if there is no working server.
    # Only warn if the best working server has long delays.

    # Choose the sed command that discards the half of the `cdcc info`
    # output that is not wanted:  everything from the greylist heading on
    # for ordinary servers, or everything through it for greylist servers.
    if test -z "$GREY"; then
	GREY='/^# [0-9/]* [0-9:]* .* greylist /,$d'
	GREYLABEL="servers"
    else
	GREY='1,/^# [0-9/]* [0-9:]* .* greylist /d'
	GREYLABEL="greylist servers"
	CLNT_PARMS="$CLNT_PARMS grey on;"
    fi
    if test $VERBOSE -gt 0 -a -n "$SRVR_PARMS"; then
	echo "$ME: client mode does not use -i or -p"
    fi
    INFO=`$CDCC -q "$CLNT_PARMS quiet off; file ${MAP:=map}; info" 2>&1`
    SRVRS=`echo "$INFO"							\
	| sed -n -e "$GREY"						\
	    -e 's/.* total, \([0-9][0-9]*\) working servers.*$/\1/p'`
    # without a count of working servers, the output cannot be trusted
    if test -z "$SRVRS"; then
	echo "$ME: 'cdcc$CLNT_PARMS info' failed"
	if test $VERBOSE -ge 1; then
	    echo "$INFO"
	fi
	exit $EXIT_UNK
    fi
    if test $VERBOSE -ge 2; then
	echo "$INFO"
    fi
    if test "$SRVRS" -eq 0; then
	echo "DCC client CRITICAL: $SRVRS working $GREYLABEL"
	exit $EXIT_CRIT
    fi
    DELAY=`echo "$INFO" | sed -n -e "$GREY" -e "$DELAY_PAT"`
    if test -z "$DELAY"; then
	echo "$ME: failed to compute delay"
	exit $EXIT_UNK
    fi
    if test "$DELAY" -ge "$OK_DELAY"; then
	echo "DCC client WARNING: $SRVRS working $GREYLABEL; $DELAY ms delay"
	exit $EXIT_WARN
    fi
    echo "DCC client OK: $SRVRS working $GREYLABEL; $DELAY ms delay"
    exit $EXIT_OK
    ;;



srvr)
    # A DCC server is OK if it answers and its announced delay is less than
    # the 400 ms that results from having no working flood peers.
    # Warn about its status if it answers but with long delays.
    # Its status is critical if it does not answer.

    # This file exists while flooding is in trouble.  Its modification time
    # records when the trouble was first noticed.
    FFILE="$TMPDIR/.dcc-nagios-$SRVR-flood"

    if test $VERBOSE -gt 0 -a -n "$MAP$CLNT_PARMS"; then
	echo "$ME: -s or server mode does not use -m"
    fi
    if test -z "$GREY"; then
	GREYLABEL="DCC server $SRVR"
    else
	GREYLABEL="DCC greylist server $SRVR"
	SRVR_PARMS="grey on; $SRVR_PARMS"
    fi

    # see what the server says
    SOUT=`$CDCC -q "$SRVR_PARMS quiet off; host $SRVR; stats; info; flood list" 2>&1`
    if test $VERBOSE -ge 2; then
	echo "$SOUT"
    fi
    DELAY=`echo "$SOUT" | sed -n -e "$DELAY_PAT"`

    # critical problem if the server did not answer
    if test -z "$DELAY"; then
	/bin/rm -f "$FFILE"
	echo "$GREYLABEL CRITICAL: not answering"
	exit $EXIT_CRIT
    fi

    STATE="$DELAY ms delay"
    WARN=
    if test "$DELAY" -ge "$OK_DELAY"; then
	# possible problem if the server is slow
	WARN=yes
    fi

    # check flooding
    FLINE=`echo "$SOUT" | sed -n -e '/^ *flood/p'`
    FTOTAL=`flood_count "$FLINE" streams`
    FOUT=`flood_count "$FLINE" out`
    FIN=`flood_count "$FLINE" in`
    FPASSIVE=`echo "$SOUT" | sed -n -e '/forced passive/p' | wc -l | tr -d ' '`
    ANAT=`echo "$SOUT" | sed -n -e '/auto-NAT/p' | wc -l | tr -d ' '`

    # FMSG is left empty if flooding is healthy
    FMSG=
    if test -z "$FTOTAL" || test -z "$FOUT" || test -z "$FIN"; then
	# The server answered but its flood counters could not be found.
	# Say so instead of comparing empty strings as numbers.
	FMSG="flood status not available"
    else
	# use the smaller of the in and out counts as the number of
	# working peers
	if test "$FIN" -le "$FOUT"; then
	    F="$FIN"
	else
	    F="$FOUT"
	fi
	# "$FPASSIVE$ANAT" is the two counts glued together, which is
	# numerically zero only if both counts are zero
	if test "$F" -lt "$FTOTAL" -o "$FPASSIVE$ANAT" -ne 0; then
	    FMSG=`flood_problem "$F" "$FTOTAL" "$FPASSIVE" "$ANAT"`
	fi
    fi

    if test -z "$FMSG"; then
	# do not mention missing peers of an isolated greylist server
	if test "$FTOTAL" -ne 0 -o -z "$GREY"; then
	    STATE="$STATE and $FTOTAL working flood peers"
	fi
	/bin/rm -f "$FFILE"
    else
	# start the clock only when the trouble is first seen
	if test ! -s "$FFILE"; then
	    echo "$FMSG" >"$FFILE"
	fi
	STATE="$STATE and $FMSG"
    fi


    # problem if flooding has been broken for at least 2 hours,
    # Try the BSD `find` time unit first.
    OLDFILE=`find "$FFILE" -mtime +2h 2>/dev/null`
    if test -z "$OLDFILE"; then
	# versions of `find` without time units often have -mmin
	OLDFILE=`find "$FFILE" -mmin +120 2>/dev/null`
    fi
    if test -z "$OLDFILE"; then
	# deal with old version of `find` by waiting a day or perhaps 2
	OLDFILE=`find "$FFILE" -mtime +1 2>/dev/null`
    fi
    if test -n "$OLDFILE"; then
	WARN=yes
    fi

    # announce a problem
    if test -n "$WARN"; then
	echo "$GREYLABEL WARNING: $STATE"
	exit $EXIT_WARN
    fi

    echo "$GREYLABEL OK: $STATE"
    exit $EXIT_OK
    ;;
esac
