#!/bin/bash
#
# Copyright (C) 2003, 2007 Red Hat, Inc.
#
# This program is Free Software.  You may modify and/or redistribute it under
# the terms of the GNU General Public License version 2, or (at your option)
# any later version.
#
# description:  Starts and stops Red Hat Cluster Manager
# chkconfig: 2345 99 01
#
# NOTE: this script uses bash-only features (declare, &>, (( )), $"..."),
# so it must run under bash rather than a strict POSIX /bin/sh.
#

# Source function library (daemon, killproc, status, echo_success, action, ...)
. /etc/init.d/functions

# Grab the network config file (provides $NETWORKING)
. /etc/sysconfig/network

SIMPLE_OPERATION=0
ID="Red Hat Cluster Manager"
CFG_DIR="/etc"
CFG_FILE="$CFG_DIR/cluster.xml"
FW_CHAIN=RH-Firewall-1-INPUT

# Cluster daemon names
SVCMGRD="clusvcmgrd"
QUORUMD="cluquorumd"
MEMBD="clumembd"
LOCKD="clulockd"
RMTABD="clurmtabd"

# Syslog-style severities passed to clulog
LOG_ERR=3
LOG_WARNING=4
LOG_NOTICE=5
LOG_INFO=6

#
# Set if you want the clumanager script to give up
# during shutdown. (runlevels 6 or 0)
# If this is unset, we will wait forever.
#
#declare -i SHUTDOWN_TIMEOUT=180
#

#
# Only root wants to run this...
#
[ "`id -u`" = 0 ] || exit 0

#
# If we're not configured, then don't start anything.
#
[ "${NETWORKING}" = "yes" ] || exit 0
[ -f "$CFG_FILE" ] || exit 0

#
# Check the cluster configuration file and load the watchdog timer if 
# necessary.
# XXX This loads the watchdog timer on ALL machines, even if they do not
# have 'watchdog=yes' specified.
#
watchdog_init()
{
	#
	# Check the cluster configuration for watchdog support.
	# Using "if ! grep" means a grep error (e.g. unreadable config)
	# is treated the same as "not configured", instead of falling
	# through and loading the watchdog module anyway.
	#
	# XXX This creates and loads the watchdog module regardless of
	# whether the local member uses it.
	#
	if ! grep -q -i "watchdog=\"yes\"" "$CFG_FILE"; then
		return 0
	fi

	# Check to ensure we have /dev/watchdog as a character device;
	# remove a bogus regular file with the same name if present.
	if ! [ -c /dev/watchdog ]; then
		if [ -f /dev/watchdog ]; then
			action "Removing invalid /dev/watchdog:" \
				rm -f /dev/watchdog
		fi
		action "Creating /dev/watchdog:" /dev/MAKEDEV watchdog
	fi

	# Check /etc/modules.conf for an "alias wdt xxxxxx" line; xxxxxx =
	# the specific hardware watchdog driver to load.
	# If there is no alias, default to softdog (software watchdog).
	_WDT=`grep "alias wdt" /etc/modules.conf | awk '{print $3}'`
	if [ -z "$_WDT" ]; then
		_PROBE=softdog
		_WDT=softdog
	else
		_PROBE=wdt
	fi

	# Don't try to load the module a second time.
	if ! lsmod | grep -q "$_WDT"; then
		action "Loading Watchdog Timer ($_WDT): " modprobe $_PROBE
	fi

	unset _WDT _PROBE
	return 0
}


#
# open a port in our firewall
#
# $1 - protocol (tcp or udp)
# $2 - port number
#
# Returns 1 on missing arguments, otherwise iptables' exit status.
#
open_port()
{
	# Validate arguments before the numeric coercion below; with the
	# old order, "declare -i PORT=$2" evaluated $2 before the check.
	if [ -z "$1" -o -z "$2" ]; then
		echo "usage: $0 <protocol> <port>"
		return 1
	fi

	declare PROTO=$1
	declare -i PORT=$2

	# Accept new inbound connections to this port.
	/sbin/iptables -I $FW_CHAIN -m state \
		--state NEW -m $PROTO -p $PROTO --dport $PORT -j ACCEPT
}


#
# Close a port in the firewall.
#
# $1 - port number.  Deletes every rule in $FW_CHAIN mentioning it.
# Always returns 0.
#
close_port()
{
	declare -i PORT=$1
	declare rule

	# Rules are deleted by rule number, which shifts after every
	# deletion, so re-list the chain on each pass until no match
	# remains.
	while :; do

		#
		# Grab the rule number so we can delete it.
		# - List our table w/ line numbers for each rule.
		# - Grab the rule number from column 1 of the first line.
		# grep -w avoids matching the port as a substring of an
		# unrelated number in the listing.
		#
		rule=`iptables -L $FW_CHAIN -n --line-numbers | grep -w $PORT | head -1 | awk '{print $1}'`
		[ -z "$rule" ] && break
		iptables -D $FW_CHAIN $rule
	done

	return 0
}


#
# Read the cluster configuration and open ($1 = start) or close
# ($1 = stop) the default and/or configured ports.
#
cluster_firewall()
{
	declare -i clumembd_addr=34001
	declare -i clusvcmgrd_addr=34002
	declare -i cluquorumd_addr=34003
	declare -i clulockd_addr=34004
	declare -i hb_bcast=1228
	declare -i hb_mcast=1229
	declare cfgval

	# No firewall chain -> nothing to open or close.
	if ! iptables -L $FW_CHAIN &> /dev/null; then
		return 0
	fi

	# Let values from the cluster database override the defaults;
	# cludb prints "not found" when a key is absent.
	cfgval=`cludb -g clumembd%addr`
	[ "$cfgval" != "not found" ] && clumembd_addr=$cfgval

	cfgval=`cludb -g clusvcmgrd%addr`
	[ "$cfgval" != "not found" ] && clusvcmgrd_addr=$cfgval

	cfgval=`cludb -g cluquorumd%addr`
	[ "$cfgval" != "not found" ] && cluquorumd_addr=$cfgval

	cfgval=`cludb -g clulockd%addr`
	[ "$cfgval" != "not found" ] && clulockd_addr=$cfgval

	# The heartbeat multicast port is always broadcast port + 1.
	cfgval=`cludb -g clumembd%port`
	if [ "$cfgval" != "not found" ]; then
		hb_bcast=$cfgval
		hb_mcast=$((cfgval+1))
	fi

	case $1 in 
	start)
		open_port udp $hb_bcast || return 1
		open_port udp $hb_mcast || return 1
		open_port tcp $clumembd_addr || return 1
		open_port tcp $clusvcmgrd_addr || return 1
		open_port tcp $cluquorumd_addr || return 1
		open_port tcp $clulockd_addr || return 1
		;;
	stop)
		close_port $hb_bcast || return 1
		close_port $hb_mcast || return 1
		close_port $clumembd_addr || return 1
		close_port $clusvcmgrd_addr || return 1
		close_port $cluquorumd_addr || return 1
		close_port $clulockd_addr || return 1
		;;
	*)
		echo "usage: $0 {start|stop}"
		return 1
		;;
	esac
	return 0
}


#
# Open firewall ports
#
# Opens the cluster's ports in $FW_CHAIN.  A missing chain is not an
# error -- there is simply no firewall to adjust.  Returns 0 on
# success, 1 if any port could not be opened.
#
open_firewall() {
	if ! iptables -L $FW_CHAIN &> /dev/null; then
		return 0
	fi

	echo -n "Opening Firewall Ports:"
	if ! cluster_firewall start; then
		echo_failure
		echo
		return 1
	fi

	echo_success
	echo
	# Explicit success status, for consistency with close_firewall;
	# previously this returned whatever the trailing "echo" returned.
	return 0
}


#
# Close firewall ports
#
# Removes the cluster's rules from $FW_CHAIN.  A missing chain is not
# an error -- there is nothing to close.
#
close_firewall() {
	# No firewall chain -> nothing to do.
	iptables -L $FW_CHAIN &> /dev/null || return 0

	echo -n "Closing Firewall Ports:"
	if cluster_firewall stop; then
		echo_success
		echo
		return 0
	fi

	echo_failure
	echo
	return 1
}


#
# log_and_print <level> <message>
#
# Sends <message> to the cluster log at severity <level> and echoes it
# to stdout.  Returns 1 if either argument is missing.
#
log_and_print()
{
	if [ -z "$1" -o -z "$2" ]; then
		return 1;
	fi

	/usr/sbin/clulog -p $$ -n "clumanager" -s $1 "$2"
	# Quote the message so whitespace is preserved and glob
	# characters in it are not expanded by the shell.
	echo "$2"

	return 0;
}


#
# Bring down the cluster on a node.
#
# Stops the service manager, quorum, and membership daemons in order,
# waiting for each to exit.  At runlevels 0/6 an optional
# $SHUTDOWN_TIMEOUT (seconds, set at the top of this script) bounds
# the total wait; otherwise we wait forever.  Returns 0 on success,
# 1 on timeout.
#
stop_cluster()
{
	#
	# Sometimes, people stop the cluster before the service manager
	# is running - this causes the cluster stop script to hang; since
	# the service manager never actually receives the shutdown signal.
	# In this case, we need to resend the TERM signal to the quorum
	# daemon - so that it can notify everyone properly.
	#
	# Current runlevel is column 2 of "who -r" output.
	declare rlevel=$(who -r | awk '{print $2}')
	declare -i timeout_enabled=0

	# Only enforce a timeout during halt (0) or reboot (6), and only
	# when the admin has set SHUTDOWN_TIMEOUT.
	if [ "$rlevel" = "6" ] || [ "$rlevel" = "0" ]; then
		if [ -n "$SHUTDOWN_TIMEOUT" ]; then
			timeout_enabled=1
			echo "Timeout in $SHUTDOWN_TIMEOUT seconds."
		fi
	fi

	# Retry loop: "[ 0 ]" is always true; we leave via "return".
	# The quorum-wait loop below re-enters this loop ("continue 2")
	# if the service manager reappears mid-shutdown.
	while [ 0 ]; do

		echo -n $"Initiating shutdown of Quorum Services: "
		killproc $QUORUMD -TERM
		echo

		# Wait for the service manager (user services) to exit.
		# SHUTDOWN_TIMEOUT is shared across all wait loops in this
		# function, so it bounds the *total* shutdown time.
		if [ -n "`pidof $SVCMGRD`" ]; then
			echo -n $"Waiting for User Services to stop: "
			while [ -n "`pidof $SVCMGRD`" ]; do
				sleep 1
				if [ $timeout_enabled -eq 1 ]; then
					((SHUTDOWN_TIMEOUT--))
					if [ $SHUTDOWN_TIMEOUT -le 0 ]; then
						echo_failure
						echo
						return 1
					fi
				fi
			done
			echo_success
			echo
		else
			echo $"User Services are stopped."
		fi

		# Ensure all NFS rmtab daemons are dead.
		killall $RMTABD &> /dev/null

		# Just in case the service manager blew up during shutdown...
		killproc $QUORUMD -TERM &> /dev/null

		# Wait for the quorum daemon to exit.
		if [ -n "`pidof $QUORUMD`" ]; then
			echo -n $"Waiting for Quorum Services to stop: "
			while [ -n "`pidof $QUORUMD`" ]; do
				sleep 1
				if [ $timeout_enabled -eq 1 ]; then
					((SHUTDOWN_TIMEOUT--))
					if [ $SHUTDOWN_TIMEOUT -le 0 ]; then
						echo_failure
						echo
						return 1
					fi
				fi
				# If the service manager came back while we
				# waited, restart the whole shutdown sequence
				# from the top of the outer retry loop.
				if [ -n "`pidof $SVCMGRD`" ]; then
					echo_failure
					echo
					echo "Retrying..."
					continue 2
				fi
			done
			echo_success
			echo
		else
			echo $"Quorum Services are stopped."
		fi

		# Wait for the membership daemon to exit.
		if [ -n "`pidof $MEMBD`" ]; then
			echo -n $"Waiting for Membership Services to stop: "
			while [ -n "`pidof $MEMBD`" ]; do
				sleep 1
				if [ $timeout_enabled -eq 1 ]; then
					((SHUTDOWN_TIMEOUT--))
					if [ $SHUTDOWN_TIMEOUT -le 0 ]; then
						echo_failure
						echo
						return 1
					fi
				fi
			done
			echo_success
			echo
		else
			echo $"Membership Services are stopped."
		fi

		rm -f /var/run/$SVCMGRD.pid
		rm -f /var/run/$QUORUMD.pid

		# Belt and braces: make sure every daemon is gone and no
		# stale pid files remain.
		for dmn in $MEMBD $LOCKD $SVCMGRD $QUORUMD; do
			killall -9 $dmn &> /dev/null
			rm -f /var/run/$dmn.pid
		done

		return 0
	done
}


#
# update_status daemon last_return
#
# Run "status" (from /etc/init.d/functions) on the given daemon and
# fold the result into the status of a previously checked daemon.
# Returns 5 when the two statuses disagree; otherwise returns the
# daemon's own status.  With no second argument, just returns the
# daemon's status.
#
update_status()
{
	local current
	local previous

	status $1
	current=$?

	# No prior status supplied -> compare against ourselves.
	previous=${2:-$current}

	if [ "$previous" -ne "$current" ]; then
		# Return 5 to signal nonuniform statuses
		return 5
	fi

	return $current
}


case $1 in
	start)
		# "Simple operation" mode runs only quorum services, not
		# the full cluster manager; clugetconfig exits 2 when the
		# key is absent.
		SIMPLE_OPERATION=$(/usr/sbin/clugetconfig cluster%simple_operation)
		if [ $? -eq 2 ]; then
			SIMPLE_OPERATION=0
		fi

		if [ $SIMPLE_OPERATION -ne 0 ]; then
			ID="Red Hat Quorum Services"
		fi

		log_and_print $LOG_NOTICE "Starting $ID..."
		watchdog_init

		open_firewall || exit 1

		# The quorum daemon spawns the other cluster daemons.
		echo -n $"Starting Quorum Daemon: "
		daemon $QUORUMD $FLAGS
		echo

		# To be consistent...
		touch /var/lock/subsys/clumanager
		;;

	restart)
		# Stop first unless status reports everything already down.
		$0 status &> /dev/null
		if [ $? -ne 1 ]; then
			$0 stop
		fi
		$0 start
		;;

	condrestart)
		# Restart only if currently running.  (Fixed: this used
		# "$> /dev/null", a typo that passed a literal "$" argument
		# and left the status output unredirected.)
		$0 status &> /dev/null
		if [ $? -eq 0 ]; then
			$0 stop
			$0 start
		fi
		;;

	reload)
		# SIGHUP makes the service manager re-read the cluster
		# configuration.
		clulog -p $LOG_NOTICE "Reloading Cluster Configuration."
		echo -n $"Reloading Cluster Configuration: "
		killproc $SVCMGRD -HUP
		rv=$?
		echo

		exit $rv
		;;

	status)
		# Aggregate statuses of all cluster daemons; update_status
		# returns 5 when any two daemons disagree.
		# NOTE(review): SIMPLE_OPERATION is only re-read from the
		# config in "start", so here it is always the default 0 --
		# verify whether that is intended.
		update_status $MEMBD
		update_status $QUORUMD $?
		rv=$?

		if [ $SIMPLE_OPERATION -eq 0 ]; then
			update_status $LOCKD $rv
			rv=$?
			update_status $SVCMGRD $rv
			svcmgr=$?

			#
			# Check to see if, when the service manager is dead
			# but everything else is running
			#
			if [ $rv -ne 5 -a $svcmgr -eq 5 ]; then
				clustat -Q
				# No Quorum + No Service Manager -> OK!
				if [ $? -eq 1 ]; then
					echo "Note: Service manager is not running because this member"
					echo "      is not participating in the cluster quorum."
					exit 0;
				fi
			fi
			rv=$svcmgr
		fi

		# 5 (mixed daemon states) is not a valid status exit code;
		# report a generic failure instead.
		[ $rv -eq 5 ] && exit 1;

		exit $rv
		;;

	stop)
		if [ -n "`pidof $QUORUMD`" ]; then
			log_and_print $LOG_NOTICE "Shutting down $ID..."
			stop_cluster
			if [ $? -ne 0 ]; then
				log_and_print $LOG_WARNING "$ID is NOT fully stopped (timed out)"
				exit 1
			fi
		elif [ -n "`pidof $MEMBD`" ]; then
			# Quorum daemon already gone; just kill the
			# membership daemon.
			log_and_print $LOG_NOTICE "Shutting down $ID..."
			echo $"Stopping Cluster Membership Daemon: "
			killproc $MEMBD -KILL
			echo
		fi

		close_firewall

		rm -f /var/lock/subsys/clumanager
		log_and_print $LOG_NOTICE "$ID is stopped."
		;;

	*)
		echo $"Usage: $0 {start|stop|status|restart|condrestart|reload}"
		exit 1
		;;
esac
