#!/bin/sh
#
#pragma ident	"@(#)initucmm	1.8	03/04/17 SMI"
#
# Copyright 1999-2003 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

# Start/stop user cluster membership monitor daemon.

# Installation paths and daemon names.
BINDIR="/usr/cluster/lib/ucmm"		# ucmmd and its helper programs
USRBIN="/usr/bin"			# where pgrep is run from
SERVER="ucmmd"				# daemon started/stopped by this script
RGMD="rgmd"				# daemon that must be running first
UCMMSTATE="${BINDIR}/ucmmstate"		# membership query command

# Return values of check_ok_to_join().
OK_TO_JOIN=0
NOT_OK_TO_JOIN=1

# Lock file examined (and possibly removed) by check_ok_to_join().
STEP_VERSION_FILE="/var/cluster/ucmm/step_version_lock"

#
# The RETRY_COUNT * RETRY_INTERVAL is set to 2 * UCMMD global step timeout
# plus a small margin to allow any current reconfiguration to complete
#
RETRY_COUNT=80			# maximum number of retries
RETRY_INTERVAL=10		# seconds to sleep between retries

#
# The following list represents the exit codes from the ucmmstate command.
# check_ok_to_join() matches the value reported for each queried node
# against these names.
#
# NOTE(review): UCMMSTATE_ERPC_PVERS and UCMMSTATE_ERPC_PROC are both 34
# while every other code is unique; one of them (and possibly the codes
# after it) may be off by one -- confirm against the ucmmstate sources.
#
UCMMSTATE_RET_NOTMEMBER=2       # Node is not a ucmm member
UCMMSTATE_RET_MEMBER=3          # Node is a ucmm member
UCMMSTATE_EPERM=10              # Not super-user
UCMMSTATE_ENOMEM=11             # Not enough memory
UCMMSTATE_EACCES=12             # Permission denied
UCMMSTATE_EUSAGE=13             # Usage error
UCMMSTATE_EUCMEMBER=14          # is_ucmm_member() returned -1
UCMMSTATE_ESCMEMBER=15          # Client not a cluster member
UCMMSTATE_ERROR=16              # Other error
UCMMSTATE_ERPC_TIMEOUT=30       # RPC call timed out
UCMMSTATE_ERPC_VERS=31          # RPC versions not compatible
UCMMSTATE_ERPC_AUTH=32          # RPC authentication error
UCMMSTATE_ERPC_UNAVAIL=33       # RPC program not available
UCMMSTATE_ERPC_PVERS=34         # RPC program version mismatch
UCMMSTATE_ERPC_PROC=34          # RPC procedure unavailable (same value as above)
UCMMSTATE_ERPC_SYSTEM=35        # RPC_SYSTEMERROR
UCMMSTATE_ERPC_HOST=36          # RPC unknown host name
UCMMSTATE_ERPC_ADDR=37          # RPC remote address unknown
UCMMSTATE_ERPC_BIND=38          # RPC pmapper call failed
UCMMSTATE_ERPC_REG=39           # RPC remote program not registered
UCMMSTATE_ERPC_INTR=40          # RPC call interrupted
UCMMSTATE_ERPC=41               # Unexpected RPC error

#
# Reconfiguration program handed to the daemon through its -r option.
# The 'start' action only launches the daemon when this variable is a
# non-zero length string.
#
RECONF_PROG="${BINDIR}/ucmm_reconf"

#
# Build the list of cluster members that have to be queried.
#
# Reads:  cmm_members  whitespace-separated node ids
#         my_nodeid    id of the local node
# Writes: nodeids      every entry of cmm_members that is numerically
#                      different from my_nodeid, each preceded by a space
#
get_nodeids() {
	nodeids=""
	for member in ${cmm_members}; do
		# Leave the local node out of the list.
		[ ${member} -ne ${my_nodeid} ] || continue
		nodeids="${nodeids} ${member}"
	done
}

#
# Ask ucmmstate for the current membership list.
#
# Reads:  UCMMSTATE
# Writes: cmm_members  stdout of "${UCMMSTATE} -c printmembers"
#                      (its stderr is discarded)
#
# A non-zero exit status from ucmmstate is logged and otherwise ignored.
#
get_cmm_members() {
	cmm_members=`${UCMMSTATE} -c printmembers 2> /dev/null`
	printmembers_rc=$?
	[ ${printmembers_rc} -eq 0 ] && return 0

	/bin/logger -p local0.err -t INITUCMM \
		"Warning: ucmmstate printmembers returned: ${printmembers_rc}"
}

#
# Check, in parallel, if the local node is in the UCMM membership of the
# other CMM members.
#
# Reads:   nodeids    node ids to query (built by get_nodeids)
#          UCMMSTATE  membership query command
# Outputs: one line on stdout per queried node
#
# UCMMSTATE will return 0 on success with the indication of ucmm_membership
# as a single integer on the stdout. In this case that integer is printed.
#
# If UCMMSTATE returns non-zero, an error has occurred and the stdout is
# ignored. The actual error code returned by UCMMSTATE is printed instead.
#
# One background subshell is started per node and the function returns
# only after all of them have finished, so the order of the output lines
# follows completion order, not the order of ${nodeids}.
#
# The get_ucmm_info() return values are captured by the caller via the
# VARIABLE=`get_ucmm_info` shell construct.
#
get_ucmm_info() {
	for nodeid in ${nodeids}; do
		(
			v=`${UCMMSTATE} -c ucmm_membership \
				-N ${nodeid} 2>/dev/null`
			rv=$?
			if [ ${rv} -ne 0 ]; then
				echo ${rv}
			else
				echo ${v}
			fi
		) &
	done

	# Barrier: wait once, after every query has been started. Waiting
	# inside the loop would serialize the queries.
	wait
}

#
# Determine if it is OK to join the cluster
#
# Refreshes cmm_members and nodeids, queries every other member through
# get_ucmm_info and classifies the codes that come back.
#
# Reads:   STEP_VERSION_FILE, OK_TO_JOIN, NOT_OK_TO_JOIN, UCMMSTATE_* codes
# Writes:  exitcodes (plus whatever get_cmm_members/get_nodeids set)
# Returns: NOT_OK_TO_JOIN if any node reports UCMMSTATE_RET_MEMBER or a
#          code that is not in the UCMMSTATE_* list (the latter is also
#          logged); OK_TO_JOIN otherwise, including when no node was
#          queried at all.
#
check_ok_to_join() {
	get_cmm_members

	get_nodeids

	exitcodes=`get_ucmm_info`

	# If STEP_VERSION file exists, perform checks that
	# can remove STEP_VERSION file. This file will be created
	# only when upgrading a node that was running step version A.
	# The step version lock file will be removed after
	# all nodes are capable of running step version B
	# after upgrade
	#
	if [ -f "${STEP_VERSION_FILE}" ] && [ -n "${exitcodes}" ]; then
		num_nodes_contacted=`echo "$exitcodes" | /bin/wc -w`

		# ${num_nodes_contacted} is left unquoted on purpose: wc may
		# pad its output with blanks.
		if [ ${num_nodes_contacted} -gt 1 ]; then
			# More than one answer: remove the file
			/bin/rm -f "${STEP_VERSION_FILE}"
		else
			# Exactly one answer: remove the file only for
			# these codes.
			case "${exitcodes}" in
				${UCMMSTATE_RET_MEMBER}|\
				${UCMMSTATE_RET_NOTMEMBER}|\
				${UCMMSTATE_EUSAGE}|\
				${UCMMSTATE_EUCMEMBER} )
					/bin/rm -f "${STEP_VERSION_FILE}"
					;;
				*)
					;;
			esac
		fi

		if [ ! -f "${STEP_VERSION_FILE}" ]; then
			/bin/logger -p local0.notice -t INITUCMM  \
			    "Removed step version file ${STEP_VERSION_FILE}."
		fi
	fi

	# Every code either ends the function with NOT_OK_TO_JOIN or lets
	# the scan continue, so reaching the end of the loop means that all
	# codes (possibly none) allow the join.
	for exitcode in ${exitcodes}; do
		case ${exitcode} in

			#
			# Exit codes that result in immediate retry
			#
			${UCMMSTATE_RET_MEMBER})
				return ${NOT_OK_TO_JOIN};;

			#
			# Exit codes that allow the join to proceed
			#
			${UCMMSTATE_RET_NOTMEMBER}|\
			${UCMMSTATE_EPERM}|\
			${UCMMSTATE_ENOMEM}|\
			${UCMMSTATE_EACCES}|\
			${UCMMSTATE_EUSAGE}|\
			${UCMMSTATE_EUCMEMBER}|\
			${UCMMSTATE_ESCMEMBER}|\
			${UCMMSTATE_ERROR}|\
			${UCMMSTATE_ERPC_TIMEOUT}|\
			${UCMMSTATE_ERPC_VERS}|\
			${UCMMSTATE_ERPC_AUTH}|\
			${UCMMSTATE_ERPC_UNAVAIL}|\
			${UCMMSTATE_ERPC_PVERS}|\
			${UCMMSTATE_ERPC_PROC}|\
			${UCMMSTATE_ERPC_SYSTEM}|\
			${UCMMSTATE_ERPC_HOST}|\
			${UCMMSTATE_ERPC_ADDR}|\
			${UCMMSTATE_ERPC_BIND}|\
			${UCMMSTATE_ERPC_REG}|\
			${UCMMSTATE_ERPC_INTR}|\
			${UCMMSTATE_ERPC})
				;;

			#
			# Exit codes that are not defined
			#
			*)
				/bin/logger -p local0.err -t INITUCMM \
					"Error: ucmmstate ucmm_membership returned: ${exitcode}"
				return ${NOT_OK_TO_JOIN};;
		esac
	done

	return ${OK_TO_JOIN}
}

case "$1" in
'start')
	# Outside a cluster there is nothing to start; that is not an error.
	/usr/sbin/clinfo > /dev/null 2>&1 || exit 0

	# Id of the local node; get_nodeids() uses it to leave this node out.
	my_nodeid=`/usr/sbin/clinfo -n`

	# A successful pgrep means ucmmd is already running: do not start
	# a second instance.
	pidlist=`${USRBIN}/pgrep ${SERVER}`
	[ $? -ne 0 ] || exit 1

	# Only pgrep status 1 is treated as "rgmd is not running"; any
	# other status lets the start continue.
	pidlist=`${USRBIN}/pgrep ${RGMD}`
	case $? in
	1)
		/bin/logger -p local0.err -t INITUCMM "Error: rgmd is not running, not starting ucmmd."
		exit 1
		;;
	esac

	# The join checks below cannot work without the ucmmstate program.
	[ -x ${UCMMSTATE} ] || {
		/bin/logger -p local0.err -t INITUCMM \
			"Error: ${UCMMSTATE} not an executible."
		exit 1
	}

	# Check whether it is OK to join. After a failed check, log, sleep
	# RETRY_INTERVAL seconds and check again, for at most RETRY_COUNT
	# retries; when the last retry fails too, start the daemon anyway.
	retry_num=1
	while : ; do
		check_ok_to_join
		[ $? -eq ${OK_TO_JOIN} ] && break

		if [ ${retry_num} -gt ${RETRY_COUNT} ]; then
			/bin/logger -p local0.err -t INITUCMM \
				"Warning: all retries unsucessful, starting anyway"
			break
		fi

		/bin/logger -p local0.err -t INITUCMM \
			"Notice: not OK to join, retry in ${RETRY_INTERVAL} seconds (#${retry_num})"
		/bin/sleep ${RETRY_INTERVAL}
		retry_num=`expr ${retry_num} + 1`
	done

	# Launch the daemon, handing it the reconfiguration program.
	if [ -n "${RECONF_PROG}" ]; then
		${BINDIR}/${SERVER} -r ${RECONF_PROG}
	fi
	;;

'stop')
	# Send signal 1 to every ucmmd process that pgrep reports. There
	# shouldn't be more than one instance running (see start case);
	# with no match the loop body simply never runs.
	for ucmmd_pid in `${USRBIN}/pgrep ${SERVER}`
	do
		kill -1 ${ucmmd_pid}
	done

	;;
*)
	echo "Usage: /etc/init.d/initucmm { start | stop }"
	;;
esac
