#!/bin/bash
#
# diskdump	This starts, stops, and reloads the diskdump
#		and crashdump facility
#
# chkconfig: - 04 94
# description: Save dump file if previous system crashed and initialize diskdump module.
# config: /etc/sysconfig/diskdump
#
# $Id: diskdump.sh,v 1.101 2007/06/28 19:10:37 tachino Exp $

# Source function library.
. /etc/rc.d/init.d/functions

# VERSION is given when "make install" is executed.
VERSION="1.4.1"
# Name used in messages and as the syslog tag: this script's file name with
# a trailing ".sh" removed.
SERVICE_NAME=$(basename $0 .sh)
# Site configuration file; sourced further down when it exists.
CONF_DISKDUMP="/etc/sysconfig/diskdump"
# Helper executables invoked by the functions in this script.
DISKDUMPFMT="/sbin/diskdumpfmt"
SAVECORE="/sbin/savecore"
DISKDUMPCTL="/sbin/diskdumpctl"
DISKDUMPMSG="/sbin/diskdumpmsg"
# Kernel interface file: status() tests for its existence and
# get_registered_device() reads the registered devices from it.
PROC_DISKDUMP="/proc/diskdump"
MODINFO="/sbin/modinfo"
MODPROBE="/sbin/modprobe"
RMMOD="/sbin/rmmod"
# Second field of the first /proc/mounts line containing the word "sysfs",
# i.e. where sysfs is mounted. Empty when no such line exists.
SYSFSROOT=$(grep -w -m1 sysfs /proc/mounts | cut -d\  -f2)
# First three characters of the kernel release; the script only accepts
# "2.4" and "2.6" and branches on this value.
KERNEL=$(uname -r | awk '{print substr($0,1,3);}')
# Module tree of the running kernel (note the trailing slash).
MODDIR=/lib/modules/$(uname -r)/kernel/
# Mail notification settings used by notify().
SENDMAIL="/usr/sbin/sendmail"
MAIL_TEMPLATE="/etc/diskdump/mail_template.us"
# logging() writes to syslog when this file exists, to stderr otherwise.
LOGGER="/usr/bin/logger"

# diskdump version
# Handled before the configuration is read, so "version" works even when
# no dump device has been configured.
case "$1" in
version)
	echo "$SERVICE_NAME version $VERSION" >&2
	exit 0 ;;
esac

# Source Dump Device
# Pull in the site configuration (DEVICE, EXPIRATION, ...) when it exists.
if [ -f "$CONF_DISKDUMP" ]; then
	. "$CONF_DISKDUMP"
fi

# DEVICE is mandatory for every action. The swapsavecore and checkswapdump
# actions fail quietly; all other actions explain why they exit.
# DEVICE is quoted so that a value containing whitespace cannot turn the
# test itself into a syntax error.
if [ -z "$DEVICE" ]; then
	if [ "$1" != "swapsavecore" ] && [ "$1" != "checkswapdump" ]; then
		echo "Device not specified in $CONF_DISKDUMP" >&2
	fi
	exit 1
fi

# Deferred Savecore
# A negative EXPIRATION is rejected; an unset or empty value evaluates to 0
# inside [[ ]] arithmetic and is accepted.
if [[ $EXPIRATION -lt 0 ]]; then
	if [ "$1" != "swapsavecore" ] && [ "$1" != "checkswapdump" ]; then
		echo "EXPIRATION(=$EXPIRATION) invalid in $CONF_DISKDUMP" >&2
	fi
	exit 1
fi

# Swap Partition Support and Message Complement
# SALVAGEMESSAGE is forced to "no" on 2.4 kernels. On 2.6 kernels it becomes
# "yes" unless the configuration set it to exactly "no". Any other kernel
# series aborts the script with exit status 1.
case "$KERNEL" in
"2.4")	SALVAGEMESSAGE="no" ;;
"2.6")	if [ "$SALVAGEMESSAGE" != no ]; then
		SALVAGEMESSAGE=yes
	fi ;;
*)	echo "kernel version '$KERNEL' incorrect" >&2
	exit 1 ;;
esac

# Exit status of the whole script; several actions below overwrite it and
# the final "exit $RETVAL" reports it.
RETVAL=0

# Record a message in syslog, or on stderr when the logger binary is absent.
# Arguments: $1 - message text
# Globals:   LOGGER (read) - path of the logger executable
#            SERVICE_NAME (read) - used as the syslog tag
logging() {
	local cmnt="$1"

	if [ -f "$LOGGER" ]; then
		# Run the very file that was just tested instead of whatever
		# "logger" happens to resolve to through PATH.
		"$LOGGER" -p info -t "$SERVICE_NAME" "$cmnt"
	else
		echo "$cmnt" >&2
	fi
}

# Register one dump device through diskdumpctl.
# Arguments: $1 - device node
# Globals:   ret (written) - 0 when the device was registered or skipped,
#            otherwise the diskdumpctl exit status
#            errmsg (written) - combined output of diskdumpctl
# Nothing is registered when an already-registered device yields the same
# get_device_type string as the requested one.
start_device() {
	local target=$1
	local target_type=$(get_device_type $target)

	ret=0
	for reg in $(get_registered_device); do
		if [ "$(get_device_type $reg)" = "$target_type" ]; then
			return
		fi
	done

	errmsg=$($DISKDUMPCTL $target 2>&1)
	ret=$?
	if [ $ret -ne 0 ]; then
		logging "$errmsg"
	fi
}

# Unregister one dump device (diskdumpctl -u) and log its output on failure.
# Arguments: $1 - device node
# Globals:   errmsg (written) - combined output of diskdumpctl
stop_device() {
	local target=$1

	if ! errmsg=$($DISKDUMPCTL -u $target 2>&1); then
		logging "$errmsg"
	fi
}

# Print a type key for a device node: the 5th and 6th columns of its
# "ls -l" line, concatenated without a separator.
# Arguments: $1 - device node
get_device_type() {
	local node=$1

	ls -l $node | gawk '{ printf("%s%s\n", $5, $6) }'
}

# Print the dump devices currently listed in $PROC_DISKDUMP.
# The first space-separated field of every non-comment line is taken. On
# 2.6 kernels each name is prefixed with "/dev/" and the names are joined
# with (and preceded by) a single space. On other kernels the names are
# printed as read. Prints nothing when no device is listed.
get_registered_device() {
	local names=""
	local paths=""

	names=$(grep -v '^#' $PROC_DISKDUMP | cut -d' ' -f1)
	[ -z "$names" ] && return

	if [ "$KERNEL" = "2.6" ]; then
		for dev in $names; do
			paths="$paths /dev/$dev"
		done
		names=$paths
	fi

	echo "$names"
}

# Load a kernel module with modprobe and log modprobe's message on failure.
# Arguments: $1 - module name
#            $2 - optional directory below $MODDIR; when given, the module
#                 is only loaded if its object file (.o on 2.4, .ko on 2.6)
#                 exists in that directory
# Globals:   errmsg (written) - combined output of modprobe
load_module() {
	local mod=$1
	local dir=$2

	if [ "$dir" ]; then
		case "$KERNEL" in
		"2.4")	if [ ! -f "$MODDIR/$dir/$mod.o" ]; then
				return
			fi ;;
		"2.6")	if [ ! -f "$MODDIR/$dir/$mod.ko" ]; then
				return
			fi ;;
		esac
	fi

	# modprobe reports problems on stderr; capture it as well so that the
	# logged message is not empty (same pattern as start_device).
	errmsg=$($MODPROBE "$mod" 2>&1)
	if [ $? -ne 0 ]; then
		logging "$errmsg"
	fi
}

# Print the init-script status marker for one phase, followed by a newline
# on stderr.
# Arguments: $1 - phase label handed to the marker function
#            $2 - number of successes
#            $3 - number of failures
# No failures -> success; failures but no successes -> failure; otherwise
# warning.
show_result() {
	local label=$1
	local ok=$2
	local ng=$3
	local marker=warning

	if [ $ng -eq 0 ]; then
		marker=success
	elif [ $ok -eq 0 ]; then
		marker=failure
	fi

	$marker $label
	echo >&2
}

# Print the given words in reverse order. Every word is followed by a
# single space, so non-empty output ends with a space.
reverse() {
	local words=$*
	local flipped=""

	for item in $words
	do
		flipped="$item $flipped"
	done

	echo "$flipped"
}

# Print one value from the "diskdumpfmt -cv" report of a device.
# Arguments: $1 - device node
#            $2 - grep pattern selecting the report line(s)
# Everything up to and including the last ": " of a selected line is
# removed before printing.
get_dump_info()
{
	local device=$1
	local label="$2"

	$DISKDUMPFMT -cv $device 2>&1 \
		| grep "$label" \
		| sed 's/^.*: //g'
}

# Mail the administrator about a dump that was left on a device.
# Arguments: $1 - dump device
#            $2 - cause keyword substituted for %CAUSE% in the template
# Globals:   SENDMAIL, MAIL_TEMPLATE, MAILTO, CONF_DISKDUMP (read)
#            FROM (read; set to root@<node name> when empty)
# The message is the template without its "#" lines, with the %...%
# placeholders replaced. Delivery runs in the background; the outcome is
# logged from the background job once sendmail has finished.
notify() {
	local dev=$1
	local cause=$2
	local node_name machine release dumped_date passed_days
	local content

	# Check the preconditions first so that no diskdumpfmt query is run
	# when the mail cannot be sent anyway.
	if [ ! -e "$SENDMAIL" ]; then
		logging "unable to email ($SENDMAIL not exist)"
		return
	fi

	if [ ! -e "$MAIL_TEMPLATE" ]; then
		logging "unable to email ($MAIL_TEMPLATE not exist)"
		return
	fi

	if [ -z "$MAILTO" ]; then
		logging "MAILTO not specified in $CONF_DISKDUMP"
		return
	fi

	node_name=$(get_dump_info "$dev" "node name")
	machine=$(get_dump_info "$dev" "machine")
	release=$(get_dump_info "$dev" "release")
	dumped_date=$(get_dump_info "$dev" "dumped date")
	passed_days=$(get_dump_info "$dev" "passed days")

	if [ -z "$FROM" ]; then
		FROM="root@$node_name"
	fi

	# The values are inserted verbatim into sed replacements delimited
	# by "|"; a value containing "|", "&" or "\" would be misread.
	content=$(grep -v '^#' "$MAIL_TEMPLATE" | sed \
		-e "s|%MAILTO%|$MAILTO|g" \
		-e "s|%FROM%|$FROM|g" \
		-e "s|%CURRENT%|$(date +'%D %T')|g" \
		-e "s|%NODENAME%|$node_name|g" \
		-e "s|%RELEASE%|$release|g" \
		-e "s|%MACHINE%|$machine|g" \
		-e "s|%DEVICE%|$dev|g" \
		-e "s|%CAUSE%|$cause|g" \
		-e "s|%DAYS%|$passed_days|g" \
		-e "s|%DUMPED_DATE%|$dumped_date|g")

	# Keep the delivery asynchronous, but test sendmail's own exit status
	# inside the background job: "$?" taken right after "cmd &" is always
	# 0, which made the failure message unreachable.
	(
		if $SENDMAIL -t <<< "$content"; then
			logging "emailed from $FROM to $MAILTO"
		else
			logging "failed to email (an error occurred)"
		fi
	) &
}

# Run diskdumpmsg on every vmcore under /var/crash/127.0.0.1-* that is not
# part of the list given as arguments (the vmcores that existed before
# savecore ran). A diskdumpmsg failure is logged.
complement_messages()
{
	local known=$*
	local current=$(shopt -s nullglob; echo /var/crash/127.0.0.1-*/vmcore)

	for vmcore in $current; do
		# Deleting $vmcore from the old list changes it exactly when
		# the vmcore was already known; such entries are skipped.
		if [[ $known != ${known//$vmcore/} ]]; then
			continue
		fi

		# new vmcore
		if ! $DISKDUMPMSG $vmcore; then
			logging "$DISKDUMPMSG failed"
		fi
	done
}

# Activate diskdump on the configured devices.
# Arguments: $1 - optional "-f" (used by the "enabledevice" action): skips
#                 the already-running check, disables PRESERVEDUMP and
#                 SKIPSAVECORE for this run, and is passed on to savecore.
# Globals:   DEVICE (read, ":"-separated list), PRESERVEDUMP, SKIPSAVECORE
#            (read; overwritten with -f), EXPIRATION, SALVAGEMESSAGE, KERNEL
#            (read), RETVAL (written: 0 when at least one device was
#            activated, otherwise 1).
#            dev, ret, passed_days and old_vmcores are assigned without
#            being declared local.
# Phases: classify every device by "diskdumpfmt -c" exit status, handle
# skipped dumps, save pending dumps, reformat, then register good devices.
start() {
	local opt=$1
	local good_devices
	local bad_devices
	local savecore_devices
	local reformat_devices
	local skipped_devices
	local specified_devices
	local dump_size
	local incomplete_devices
	local container
	local good_dev
	local savecore_date
	local dumped_date
	# NOTE(review): "declare -i local NAME" also declares an integer
	# variable literally named "local"; "local -i NAME" is presumably
	# what was meant. The counters still end up function-local.
	declare -i local good_count=0
	declare -i local bad_count=0

	if [ "$opt" = "-f" ]; then
		# Disable PRESERVEDUMP and SKIPSAVECORE temporarily
		PRESERVEDUMP="no"
		SKIPSAVECORE="no"
	else
		# Check if diskdump is already running
		status 2> /dev/null
		if [ $? -eq 0 ]; then
			echo "$SERVICE_NAME is already running" >&2
			return
		fi
	fi

	load_module scsi_dump

	# CCISS kludge
	# The block/cciss dump modules are only tried when DEVICE mentions
	# "cciss"; load_module skips them if the module file is absent.
	if [ "${DEVICE/cciss/}" != "$DEVICE" ];then
		load_module block_dump drivers/block
		load_module cciss_dump drivers/block
	fi
	load_module ide-dump drivers/ide

	# Phase 1: sort every configured device into one of the work lists
	# according to the exit status of "diskdumpfmt -c".
	for dev in $(echo $DEVICE | sed 's/:/ /g'); do
		$DISKDUMPFMT -c $dev 2> /dev/null
		ret=$?
		case $ret in
		0) # device is formatted
		   good_devices="$good_devices $dev" ;;
		1) # device has panic dump
		   # Remember the dump date so that an incomplete copy of the
		   # same dump can be recognised below.
		   dumped_date=$(get_dump_info $dev "dumped date")
		   savecore_date="$savecore_date\n$dumped_date"
		   if [ "$SKIPSAVECORE" = "yes" ]; then
			skipped_devices="$skipped_devices $dev"
		   else
			savecore_devices="$savecore_devices $dev"
		   fi ;;
		3) # device requires reformatting
		   reformat_devices="$reformat_devices $dev" ;;
		5) # device is swap space
		   good_devices="$good_devices $dev" ;;
		6) # device is swap space and has panic dump
		   if [ "$PRESERVEDUMP" = "yes" ]; then
			bad_devices="$bad_devices $dev"
			logging "$dev is a preserved dump device"
			notify $dev "PRESERVEDUMP"
		   else
			good_devices="$good_devices $dev"
		   fi ;;
		8) # device has incomplete panic dump
		   # Only non-empty incomplete dumps are collected; "container"
		   # accumulates the records produced by classify.
		   dump_size=$(get_dump_info $dev "dump size")
		   if [ $dump_size -gt 0 ]; then
		   	container=$(classify "$container" $dev)
		   fi;;
		*) bad_devices="$bad_devices $dev" ;;
					# delay error message later
		esac
	done

	# Incomplete dumps, largest first. One whose date was already seen
	# (a dump of the same date is queued) is just reformatted; otherwise it
	# is treated like a complete dump (skipped or saved).
	if [ ! -z "$container" ]; then
		for dev in $(echo -e "$container" | sort -fnrk1 | cut -f3); do
			dumped_date=$(get_dump_info $dev "dumped date")
			echo -e "$savecore_date" | grep "$dumped_date" > /dev/null
			if [ $? -eq 0 ]; then
				reformat_devices="$reformat_devices $dev"
			elif [ "$SKIPSAVECORE" = "yes" ]; then
				savecore_date="$savecore_date\n$dumped_date"
				skipped_devices="$skipped_devices $dev"
			else
				savecore_date="$savecore_date\n$dumped_date"
				savecore_devices="$savecore_devices $dev"
			fi
		done
	fi

	# Phase 2 (SKIPSAVECORE=yes): a skipped dump keeps its device out of
	# service until it is at least EXPIRATION days old, at which point the
	# device is reformatted. EXPIRATION equal to 0 means it never expires.
	if [ ! -z "$skipped_devices" ]; then
		for dev in $skipped_devices; do
			if [[ $EXPIRATION -eq 0 ]]; then
				logging "$dev has no expiration"
				bad_devices="$bad_devices $dev"
				notify $dev "SKIPSAVECORE"
				continue
			fi

			passed_days=$(get_dump_info $dev "passed days")
			logging "$passed_days days passed since dumped to $dev"
			if [[ $passed_days -ge $EXPIRATION ]]; then
				reformat_devices="$reformat_devices $dev"
				logging "$dev expired"
			else
				bad_devices="$bad_devices $dev"
				logging "skipped \"savecore $dev\""
				notify $dev "SKIPSAVECORE"
			fi
		done
	fi

	# Phase 3: save pending dumps. A device is reformatted afterwards
	# unless savecore failed and PRESERVEDUMP=yes.
	if [ ! -z "$savecore_devices" ]; then
		echo "Saving panic dump: " >&2

		# Snapshot of the existing vmcores, so that complement_messages
		# can tell which ones savecore created.
		old_vmcores=$(shopt -s nullglob; echo /var/crash/127.0.0.1-*/vmcore)

		for dev in $savecore_devices; do
			$SAVECORE -p $opt $dev
			if [ $? -eq 0 ]; then
				good_count=good_count+1
				reformat_devices="$reformat_devices $dev"
				continue
			fi

			bad_count=bad_count+1
			if [ "$PRESERVEDUMP" = "yes" ]; then
				bad_devices="$bad_devices $dev"
				logging "$dev is a preserved dump device"
				notify $dev "PRESERVEDUMP"
			else
				reformat_devices="$reformat_devices $dev"
			fi
		done

		if [ "$SALVAGEMESSAGE" = yes ]; then
			complement_messages $old_vmcores
		fi

		rm -f /var/crash/127.0.0.1-*/vmcore-uncompressedrawdata

		show_result "saving" $good_count $bad_count
	fi

	good_count=0; bad_count=0

	# Phase 4: reformat; successfully formatted devices become good ones.
	if [ ! -z "$reformat_devices" ]; then
		echo "Formatting dump device: " >&2

		# Forcibly write the dirty pages out.
		sync

		load_module diskdump
		for dev in $reformat_devices; do
			$DISKDUMPFMT -p $dev
			if [ $? -eq 0 ]; then
				good_devices="$good_devices $dev"
				good_count=good_count+1
			else
				bad_count=bad_count+1
			fi
		done

		show_result "formatting" $good_count $bad_count
	fi

	# Only good_count is reset here: formatting failures stay in bad_count
	# and are included in the "activating" result below.
	good_count=0

	# Phase 5: report unusable devices and register the good ones.
	echo -n "Starting diskdump: " >&2
	for dev in $bad_devices; do
		logging "$dev is not suited for dump device"
		bad_count=bad_count+1
	done

	# Devices are registered in the order given in DEVICE (reversed on
	# 2.4 kernels), not in the order in which they became "good".
	specified_devices=$(echo $DEVICE | sed 's/:/ /g')
	if [ "$KERNEL" = "2.4" ]; then
		specified_devices=$(reverse $specified_devices)
	fi
	for dev in $specified_devices; do
		for good_dev in $good_devices; do
			if [ "$good_dev" = "$dev" ]; then
				# start_device reports its outcome in $ret
				start_device $dev
				if [ $ret -eq 0 ]; then
					good_count=good_count+1
				else
					bad_count=bad_count+1
				fi
				break
			fi
		done
	done

	show_result "activating" $good_count $bad_count

	# With no active device the service is torn down again.
	if [ $good_count -eq 0 ]; then
		stop
		RETVAL=1
	else
		RETVAL=0
	fi
}

# Remove every diskdump-related module that /proc/modules lists as loaded,
# in the order scsi_dump, ide-dump, block_dump, cciss_dump, diskdump.
unload_module() {
	local entry

	for entry in scsi_dump ide_dump block_dump cciss_dump diskdump; do
		if grep "^$entry " /proc/modules > /dev/null; then
			case $entry in
			# listed with an underscore, removed with a dash
			ide_dump)	$RMMOD ide-dump ;;
			*)		$RMMOD $entry ;;
			esac
		fi
	done
}

# Deactivate diskdump: unregister every registered dump device, then
# unload the dump modules.
# On 2.4 kernels the devices are read from $PROC_DISKDUMP (first field of
# each non-comment line). On 2.6 kernels every file named "dump" below
# $SYSFSROOT is read and each line is taken as a device name under /dev.
# Any other kernel series exits the script with status 1.
stop() {
	case "$KERNEL" in
	"2.4")	if [ -f $PROC_DISKDUMP ]; then
			grep -v '^#' $PROC_DISKDUMP | while read dev sector; do
				stop_device $dev
			done
		fi ;;
	# NOTE(review): if SYSFSROOT is empty, find is run without a start
	# directory; GNU find then presumably searches the current directory.
	"2.6")	find $SYSFSROOT -name dump | while read f; do
			# Park the pipe from find on fd 3 and read the dump
			# file through stdin instead.
			exec 3<&0 <$f
			while read dev; do
				stop_device "/dev/$dev"
			done
			# Put find's pipe back on stdin for the outer read.
			exec 0<&3 3<&-
		done ;;
	*)	echo "kernel version '$KERNEL' incorrect" >&2
		exit 1
	esac
	unload_module
}

# Append one record describing the dump on a device to a record list.
# Arguments: $1 - existing record list (may be empty)
#            $2 - device node
# Output:    "$1" followed by "<dump size>\t<dumped date>\t<device>\n".
#            The \t and \n are emitted as literal backslash sequences;
#            consumers expand them with "echo -e".
classify() {
	local devices=$1
	local dev=$2
	local report
	local dumped_date
	local dump_size

	# Query the device once and pick both values from the same report.
	report=$($DISKDUMPFMT -civ "$dev" 2>&1)
	dumped_date=$(printf '%s\n' "$report" \
		| grep "dumped date" | sed 's/^.*: //')
	dump_size=$(printf '%s\n' "$report" \
		| grep "dump size" | sed 's/^.*: //')

	echo "$devices$dump_size\t$dumped_date\t$dev\n"
}

# Print the dump date (2nd tab-separated field) of the record that sorts
# first when the classify record list in $1 is ordered numerically,
# largest first.
get_last_dumped_date()
{
	local records=$1
	local newest

	newest=$(echo -e "$records" \
		| sort -fnrk1 \
		| head -n1 \
		| cut -f2)
	echo "$newest"
}

# Print the device names (3rd tab-separated field) of the classify record
# list in $1, one per line, ordered numerically by record, largest first.
find_incomplete_devices()
{
	local records=$1
	local ordered

	ordered=$(echo -e "$records" \
		| sort -fnrk1 \
		| cut -f3)
	echo "$ordered"
}

# Save the panic dump stored on a swap device ("savecore -D -p").
# Arguments: $1 - swap device; nothing is done when it is empty
# Globals:   RETVAL (written) - exit status of savecore
#            old_vmcores (written) - vmcores present before savecore ran
#            SALVAGEMESSAGE (read) - "yes" runs complement_messages
save_dump_from_swap()
{
	local swap_dev=$1

	[ -z "$swap_dev" ] && return

	echo -e $"Saving panic dump from swap partition:\r" >&2

	old_vmcores=$(shopt -s nullglob; echo /var/crash/127.0.0.1-*/vmcore)

	$SAVECORE -D -p $swap_dev
	RETVAL=$?

	case "$SALVAGEMESSAGE" in
	yes)	complement_messages $old_vmcores ;;
	esac

	echo -ne "\r" >&2
}

# Print the first line of the device list given in $1.
get_last_incomplete_device()
{
	local listing=$1
	local first

	first=$(echo -e "$listing" | head -n1)
	echo "$first"
}

# Run "savecore -X" on every device of the whitespace-separated list in $1.
delete_dump()
{
	local targets=$1

	for dev in $targets
	do
		$SAVECORE -X $dev
	done
}

# Save (or, with -c, only detect) panic dumps that were written to swap
# devices, and clean up incomplete copies of the same dump.
# Arguments: $1 - optional "-c" (the "checkswapdump" action): set RETVAL=1
#                 and stop scanning at the first swap device holding a
#                 complete dump, without saving anything
# Globals:   DEVICE (read, ":"-separated list), KERNEL (read),
#            RETVAL (written by the -c branch and by save_dump_from_swap)
swapsavecore() {
	local opt=$1
	local last_dumped_date
	local tmp_devices
	local complete_devices
	local last_incomp_dump_dev
	local last_incomp_swap_dev
	local delete_incomp_devices
	local incomp_swap_devices
	local contnr_dump_dev
	local contnr_swap_dev
	local container

	# Classify every configured device by "diskdumpfmt -ci" exit status.
	# The classify record is passed to get_last_dumped_date as ONE quoted
	# argument: unquoted, a date containing spaces was split into several
	# words and only the part before the first space was looked at.
	for dev in $(echo $DEVICE | sed 's/:/ /g'); do
		$DISKDUMPFMT -ci $dev 2> /dev/null
		case $? in
		1) # device has panic dump
		   complete_devices=$(classify "$complete_devices" $dev)
		   continue ;;
		6) # device is swap space and has panic dump
		   if [ "$opt" = "-c" ]; then
			   RETVAL=1
			   break
		   fi
		   tmp_devices=$(classify "$complete_devices" $dev)
		   last_dumped_date=$(get_last_dumped_date "$(classify "" $dev)")
		   save_dump_from_swap $dev
		   # Count the dump as complete only when it could be saved;
		   # otherwise remove it from the swap device.
		   if [ $RETVAL -eq 0 ]; then
			complete_devices=$tmp_devices
		   else
			delete_dump $dev
		   fi
		   continue ;;
		8) # device has incomplete panic dump
		   contnr_dump_dev=$(classify "$contnr_dump_dev" $dev)
		   continue ;;
		9) # device is swap space and has incomplete panic dump
		   last_dumped_date=$(get_last_dumped_date "$(classify "" $dev)")
		   contnr_swap_dev=$(classify "$contnr_swap_dev" $dev)
		   continue ;;
		esac
	done

	if [ "$KERNEL" = "2.4" ]; then
		# The later process is to handle multiple dump partition.
		# No more process required for kernel-2.4.x.
		return
	fi

	# No swap device held a dump (complete or incomplete): nothing to do.
	if [ -z "$last_dumped_date" ]; then
		return
	fi

	# All incomplete dumps, on dump devices and on swap devices.
	container="$contnr_dump_dev$contnr_swap_dev"

	# Swap devices with incomplete dumps (largest first) and the first of
	# them; the largest incomplete dump carrying the last swap dump date;
	# and every incomplete dump carrying that date.
	incomp_swap_devices=$(find_incomplete_devices "$contnr_swap_dev")
	last_incomp_swap_dev=$(get_last_incomplete_device "$incomp_swap_devices")
	last_incomp_dump_dev=$(echo -e "$container" | sort -fnrk1\
		| grep "$last_dumped_date" | head -n1 | cut -f3)
	delete_incomp_devices=$(echo -e "$container"\
		| grep "$last_dumped_date" | cut -f3)

	echo -e "$complete_devices" | grep "$last_dumped_date" > /dev/null
	if [ $? -ne 0 ]; then
		# No complete dump of that date exists. Save the incomplete
		# one from swap if it is the largest copy, otherwise just
		# clear the incomplete dumps from the swap devices.
		if [ "$last_incomp_dump_dev" = "$last_incomp_swap_dev" ]; then
			save_dump_from_swap $last_incomp_swap_dev
			delete_dump "$delete_incomp_devices"
		else
			delete_dump "$incomp_swap_devices"
		fi
	else
		# The following are prevented:
		# If 1.the vmcore from swap exists at the file system
		# 2.the same date incomplete dump at the non swap device exists
		# 3.PRESERVEDUMP=yes,
		# the directory overlapping error makes the dump remain.
		delete_dump "$delete_incomp_devices"
	fi
}

# Format every configured dump device with diskdumpfmt.
# Arguments: $1 - diskdumpfmt options: "-p", "-fp" or "-ap"
# Globals:   DEVICE (read, ":"-separated list), INITFMTSILENT (read),
#            RETVAL (written: 0 when at least one device was formatted or
#            deliberately skipped as swap/mounted, otherwise 1)
# Swap and mounted devices are skipped. With "-fp" each device is confirmed
# interactively unless INITFMTSILENT=yes.
format() {
	local opts=$1
	local ret=1
	local RM_MOD=0

	# Except for "-ap", the diskdump module is loaded for the duration
	# of the run when $PROC_DISKDUMP is not present.
	if [ $opts != "-ap" -a ! -f $PROC_DISKDUMP ]; then
		$MODPROBE diskdump
		RM_MOD=1
	fi

	echo "Formatting dump device: " >&2
	for dev in $(echo $DEVICE | sed 's/:/ /g'); do
		$DISKDUMPFMT -c $dev 2> /dev/null
		case $? in
		5|6) # device is swap space
			echo "$dev: skipped (swap device)" >&2
			ret=0
			continue ;;
		7) # device is mounted
			echo "$dev: skipped (mounted device)" >&2
			ret=0
			continue ;;
		esac

		if [ "$opts" = "-fp" ] && [ "$INITFMTSILENT" != yes ]; then
			echo -n "Do you want to format $dev (yes/NO)? " >&2
			read ans
			case $ans in
			y|Y|[yY][eE][sS])
				;;
			*)
				echo "$dev: skipped" >&2
				continue ;;
			esac
		fi

		if $DISKDUMPFMT $opts $dev; then
			ret=0
		fi
	done

	# Unload the module again if this function loaded it.
	if [ $RM_MOD -eq 1 ]; then
		$RMMOD diskdump
	fi

	if [ $ret -ne 0 ]; then
		echo "Each format processing failed" >&2
	fi

	RETVAL=$ret
}

# Show the "diskdumpfmt -cv" report of every configured device and warn on
# stderr when at least one of them is neither formatted, swap space nor
# mounted.
# Globals: DEVICE (read, ":"-separated list); ret (written)
devicestatus() {
	local -i bad_count=0

	for dev in $(echo $DEVICE | sed 's/:/ /g'); do
		$DISKDUMPFMT -cv $dev
		ret=$?
		case $ret in
		0|5|6|7)
			# formatted device, swap space, or mounted
			# device does not require reformatting
			;;
		*)
			bad_count=$((bad_count + 1)) ;;
		esac
	done

	if (( bad_count > 0 )); then
		echo "Each device has not been formatted" >&2
	fi
}

# Report on stderr whether diskdump is enabled and which options are on.
# Returns 3 when $PROC_DISKDUMP does not exist, 0 otherwise.
# SALVAGEMESSAGE is only reported on 2.6 kernels.
status() {
	if [ ! -f $PROC_DISKDUMP ]; then
		echo "$SERVICE_NAME not enabled" >&2
		return 3
	fi

	echo "$SERVICE_NAME enabled" >&2

	case "$PRESERVEDUMP" in
	yes)	echo "PRESERVEDUMP enabled" >&2 ;;
	*)	echo "PRESERVEDUMP not enabled" >&2 ;;
	esac

	case "$SKIPSAVECORE" in
	yes)	echo "SKIPSAVECORE enabled" >&2 ;;
	*)	echo "SKIPSAVECORE not enabled" >&2 ;;
	esac

	[ "$KERNEL" != "2.6" ] && return 0

	case "$SALVAGEMESSAGE" in
	yes)	echo "SALVAGEMESSAGE enabled" >&2 ;;
	*)	echo "SALVAGEMESSAGE not enabled" >&2 ;;
	esac
	return 0
}

# Deactivate diskdump, then run a normal start.
restart() {
	stop; start
}

# Action dispatch. "version" was already handled before the configuration
# was read. "swapsavecore" and "checkswapdump" are accepted but not listed
# in the usage text. Only "status" copies its return value into RETVAL
# here; the other actions set RETVAL themselves or leave it at 0.
case "$1" in
start)		start ;;
stop)		stop ;;
swapsavecore)	swapsavecore ;;
checkswapdump)	swapsavecore -c ;;
# The three format actions differ only in the option given to diskdumpfmt.
format)		format -p ;;
initialformat)	format -fp ;;
regularformat)	format -ap ;;
devicestatus)	devicestatus ;;
status)		status
		RETVAL=$? ;;
restart|reload)	restart ;;
# start -f skips the "already running" check and ignores PRESERVEDUMP and
# SKIPSAVECORE for this run.
enabledevice)	start -f ;;
*) echo "\
Usage: service $SERVICE_NAME {subcommand}
Subcommands:
	start		start $SERVICE_NAME
	stop		stop $SERVICE_NAME
	format		do quick format all dump devices
	initialformat	do initial format all dump devices
	regularformat	do full format all dump devices
	status		show if $SERVICE_NAME is activated
	restart|reload	stop and start $SERVICE_NAME
	enabledevice	enable all dump devices and restart $SERVICE_NAME
	devicestatus	show status of all dump devices
	version		show $SERVICE_NAME version information\
" >&2
	exit 1
esac

exit $RETVAL
