#! /bin/sh
#
# ident	"@(#)hasap_probe.shi	1.19	99/01/29 SMI"
#
# Copyright (c) 1997-1999 by Sun Microsystems, Inc.
# All rights reserved.
#
#
# Usage: hasap_probe InstanceName
#
# Started up in the background via pmfd in sap_fm_start during reconfiguration.

usage()
{
	# Report the command synopsis through logerr under message id 4013.
	logerr "${prefix}.4013" `gettext 'Usage: hasap_probe InstanceName'`
}

# ######################################################################
#
# Probing starts here:
#
# ######################################################################

grace_probe()
{
	#
	# Grace-mode probing: gives SAP time to come up before normal probing
	# takes over.  Runs procprobe, dbprobe and msprobe in "grace" mode up
	# to CI_STARTSAP_RETRY_CNT times, CI_STARTSAP_RETRY_INTERVAL seconds
	# apart.  Leaves the loop early as soon as all three pass.  Always
	# returns 0; any failure still present is left for the normal probe.
	#
	# Exits the probe (status 1) if every check passes while
	# ha_svc_not_running reports success for the instance, which this
	# code treats as SAP running outside of the cluster's control.
	#

	#
	# All status-report counters start over.
	#
	proccnt=0
	mscnt=0
	dbcnt=0
	cicnt=0

	cur_retry=1

	while [ ${cur_retry} -le ${CI_STARTSAP_RETRY_CNT} ]
	do
		lognotice "$prefix.2031" `gettext "Probe in grace mode (retry ${cur_retry} of ${CI_STARTSAP_RETRY_CNT})"`

		#
		# The dialog probe (diaprobe) is deliberately not part of
		# the grace checks.
		#
		procprobe grace
		procprobe_rc=$?

		dbprobe grace
		dbprobe_rc=$?

		msprobe grace
		msprobe_rc=$?

		if [ ${procprobe_rc} -ne 0 ] ||
		   [ ${dbprobe_rc} -ne 0 ] ||
		   [ ${msprobe_rc} -ne 0 ] ; then
			#
			# At least one check is still failing.
			#
			lognotice "$prefix.2057" `gettext "SAP startup needs more time. Probe status: (procprobe=${procprobe_rc}) (dbprobe=${dbprobe_rc}) (msprobe=${msprobe_rc})"`
		else
			#
			# Everything passed.  Make sure SAP is not running
			# outside of the cluster before leaving grace mode.
			#
			if ha_svc_not_running ${_INST_NAME}; then
				logerr "$prefix.4077" `gettext "SAP instance ${_INST_NAME} appears to be running outside of the control of the clustering software. You must turn off the SAP data service, manually turn off SAP, and then start SAP by turning on the SAP data service. Failure to do so may prevent proper shutdown and switchover. The probe is exiting."`

				pmfadm -s ${_INST_NAME}.probe >/dev/null 2>&1
				exit 1
			fi

			lognotice "$prefix.2032" `gettext "SAP passed all probes. Exiting grace mode."`
			break
		fi

		#
		# On the first pass only, additionally wait for the time the
		# user has configured for the hasap_stop_all_instances script,
		# which runs in the foreground before the CI is started.
		#
		if [ $cur_retry -eq 1 ] &&
		   [ -x ${PATH0}/hasap_stop_all_instances ] &&
		   [ "${STOP_ALL_RUNTIME}" -gt 0 ] ; then
			lognotice "$prefix.2075" `gettext "Probe is sleeping for ${STOP_ALL_RUNTIME} seconds while hasap_stop_all_instances runs before the CI starts."`
			sleep ${STOP_ALL_RUNTIME}
		fi

		cur_retry=`expr $cur_retry + 1`
		sleep ${CI_STARTSAP_RETRY_INTERVAL}
	done

	lognotice "$prefix.2034" `gettext "Starting normal SAP probe"`
	return 0
}


procprobe()
{
	#
	# Checks the SAP processes with "sapmon <instance> proc", run under
	# hatimerun with a COMMAND_TIMEOUT limit.
	#
	# $1 - optional; the word "grace" selects grace mode, in which a
	#      failure is only reported to the caller (sapmon/hatimerun
	#      status is returned) and no recovery action is taken.
	#
	# Returns 0 when the check passes.  Outside of grace mode a failure
	# is logged, handle_failure is called and 1 is returned.
	#
	grace_mode=0
	if [ $# -eq 1 ] && [ "${1}" = "grace" ]; then
		grace_mode=1
	fi

	hatimerun -t ${COMMAND_TIMEOUT} ${PATH0}/sapmon $_INST_NAME proc > /dev/null 2>&1
	probe_status=$?

	if [ $probe_status -eq 0 ]; then
		#
		# Healthy: report it only when the counter is at zero; the
		# counter wraps back to zero once it reaches
		# ${STATUS_REPORT_FREQUENCY}.
		#
		if [ $proccnt -eq 0 ]; then
			lognotice "$prefix.2038" `gettext "SAP processes for $SAPSID are OK."`
			proccnt=`expr $proccnt + 1`
		else
			proccnt=`expr $proccnt + 1`
			[ $proccnt -ge ${STATUS_REPORT_FREQUENCY} ] && proccnt=0
		fi

		return 0
	fi

	#
	# In grace mode no failover is issued; the caller is merely told
	# what is failing.
	#
	if [ ${grace_mode} -eq 1 ]; then
		return $probe_status
	fi

	procerr=

	case $probe_status in
	99)
		logerr "$prefix.4071" `gettext "Probe timed out checking SAP processes.  Probe will assume that SAP has failed."`
		;;
	98)
		logerr "$prefix.4072" `gettext "Probe detected an error while checking SAP processes. Probe will assume that SAP has failed."`
		;;
	[1-7])
		#
		# Values 1-7 are decoded as a bit mask (4, 2, 1); each set
		# bit adds one process name to the error text.  The decoding
		# consumes probe_status down to zero.
		#
		if [ $probe_status -ge 4 ]; then
			procerr=$procerr"[dispatcher]"
			probe_status=`expr $probe_status - 4`
		fi

		if [ $probe_status -ge 2 ]; then
			procerr=$procerr"[dw.sap]"
			probe_status=`expr $probe_status - 2`
		fi

		if [ $probe_status -ge 1 ]; then
			procerr=$procerr"[ms.sap]"
			probe_status=`expr $probe_status - 1`
		fi

		logerr "$prefix.4017" `gettext "SAP process failure in instance ${SAPSID} has been detected!"`
		logerr "$prefix.4018" `gettext "${procerr} process died!"`
		;;
	*)
		logerr "$prefix.4016" `gettext "sapmon ${_INST_NAME} proc returned the value ${probe_status}, which is out of the expected range.  Probe will assume that SAP has failed."`
		;;
	esac

	handle_failure
	return 1
}

#
# Not used in 2.2
#
diaprobe() {
	#
	# Runs "sapmon <instance> dia" under hatimerun with a
	# COMMAND_TIMEOUT limit.
	#
	# $1 - optional; the word "grace" selects grace mode, in which a
	#      failure just returns 1 without logging or recovery.
	#
	# Returns 0 when the check passes, 1 otherwise (after calling
	# handle_failure when not in grace mode).
	#
	grace_mode=0
	if [ $# -eq 1 ] && [ "${1}" = "grace" ]; then
		grace_mode=1
	fi

	hatimerun -t ${COMMAND_TIMEOUT} ${PATH0}/sapmon $_INST_NAME dia > /dev/null 2>&1
	probe_status=$?

	if [ $probe_status -eq 0 ]; then
		#
		# Healthy: report it only when the counter is at zero; the
		# counter wraps back to zero once it reaches
		# ${STATUS_REPORT_FREQUENCY}.
		#
		if [ $cicnt -eq 0 ]; then
			lognotice "$prefix.2039" `gettext "SAP instance $SAPSID is OK."`
			cicnt=`expr $cicnt + 1`
		else
			cicnt=`expr $cicnt + 1`
			[ $cicnt -ge ${STATUS_REPORT_FREQUENCY} ] && cicnt=0
		fi

		return 0
	fi

	#
	# Grace mode: no failover, just tell the caller the test still fails.
	#
	[ ${grace_mode} -eq 1 ] && return 1

	#
	# The dialog check found a problem.
	#
	logerr "$prefix.4022" `gettext "SAP failure of $SAPSID has been detected!"`
	logerr "$prefix.4067" `gettext "SAP dialog died!"`

	handle_failure
	return 1
}

msprobe() {
	#
	# Checks the SAP Message Server with "sapmon <instance> ms", run
	# under hatimerun with a COMMAND_TIMEOUT limit.
	#
	# $1 - optional; the word "grace" selects grace mode, in which a
	#      failure just returns the sapmon/hatimerun status without
	#      logging or recovery.
	#
	# Returns 0 when the check passes.  Outside of grace mode a failure
	# is logged, handle_failure is called and 1 is returned.
	#
	grace_mode=0
	if [ $# -eq 1 ] && [ "${1}" = "grace" ]; then
		grace_mode=1
	fi

	hatimerun -t ${COMMAND_TIMEOUT} ${PATH0}/sapmon $_INST_NAME ms > /dev/null 2>&1
	probe_status=$?

	if [ $probe_status -eq 0 ]; then
		#
		# Healthy: report it only when the counter is at zero; the
		# counter wraps back to zero once it reaches
		# ${STATUS_REPORT_FREQUENCY}.
		#
		if [ $mscnt -eq 0 ]; then
			lognotice "$prefix.2058" `gettext "SAP Message Server for $SAPSID is OK."`
			mscnt=`expr $mscnt + 1`
		else
			mscnt=`expr $mscnt + 1`
			[ $mscnt -ge ${STATUS_REPORT_FREQUENCY} ] && mscnt=0
		fi

		return 0
	fi

	#
	# Grace mode: no failover, just tell the caller the test still fails.
	#
	if [ ${grace_mode} -eq 1 ]; then
		return $probe_status
	fi

	#
	# The message server check found a problem.
	#
	logerr "$prefix.4022" `gettext "SAP failure of $SAPSID has been detected!"`
	logerr "$prefix.4023" `gettext "Failed to connect to Message Server.  Message Server probe returned ${probe_status}."`

	handle_failure
	return 1
}

#
# The dbprobe function merely reports the status
# of the database. HA-SAP relies on the HA-DBMS
# to maintain the health of the Database. 
#
dbprobe() {
	#
	# Checks the database with "sapmon <instance> db", run under
	# hatimerun with a COMMAND_TIMEOUT limit.  This probe only reports:
	# it never calls handle_failure.
	#
	# $1 - optional; the word "grace" selects grace mode, in which a
	#      failure is returned to the caller without being logged.
	#
	# Returns the sapmon/hatimerun status, or 0 without probing when
	# neither grace mode nor LOG_DB_WARNING=y applies.
	#
	grace_mode=0
	if [ $# -eq 1 ] && [ "${1}" = "grace" ]; then
		grace_mode=1
	fi

	#
	# The DB is probed only in grace mode or when the user wants to see
	# DB warnings; otherwise this function is a no-op.
	#
	[ ${grace_mode} -eq 1 ] || [ "${LOG_DB_WARNING}" = "y" ] || return 0

	hatimerun -t ${COMMAND_TIMEOUT} ${PATH0}/sapmon $_INST_NAME db > /dev/null 2>&1
	probe_status=$?

	if [ $probe_status -eq 0 ]; then
		#
		# Healthy: report it only when the counter is at zero; the
		# counter wraps back to zero once it reaches
		# ${STATUS_REPORT_FREQUENCY}.
		#
		if [ $dbcnt -eq 0 ]; then
			lognotice "$prefix.2040" `gettext "SAP Database for $SAPSID is OK."`
			dbcnt=`expr $dbcnt + 1`
		else
			dbcnt=`expr $dbcnt + 1`
			[ $dbcnt -ge ${STATUS_REPORT_FREQUENCY} ] && dbcnt=0
		fi

		return $probe_status
	fi

	#
	# Grace mode: hand the status straight back.
	#
	if [ ${grace_mode} -eq 1 ]; then
		return $probe_status
	fi

	if [ "${LOG_DB_WARNING}" = "y" ]; then
		#
		# Only a warning is issued; HA-SAP does not fail over here
		# because it relies on the HA-DBMS to look after the
		# database.
		#
		logwarning "$prefix.3006" `gettext "SAP cannot connect to the database for instance ${SAPSID}. Database probe returned ${probe_status}. HA-SAP will take no action."`

		#
		# Zeroing the counter means the user is told right away
		# once the database is OK again.
		#
		dbcnt=0
	fi

	return $probe_status
}


#
# When a failure is detected, the handle_failure code is called to 
# determine what action to take based on this instance's configuration
# parameters.
#
handle_failure()
{
	#
	# Decides what to do about a detected failure:
	#   - while num_restarts is below MAX_NUM_LOCAL_RESTARTS, count one
	#     more restart and call do_restart;
	#   - otherwise, when ALLOW_CI_FAILOVER is "y", call do_failover;
	#   - otherwise stop pmf from restarting the probe and exit 1.
	#
	# Returns 1 (a failure has occurred) whenever it returns at all.
	#

	#
	# Local restarts still available?
	#
	if [ "${num_restarts}" -lt "${MAX_NUM_LOCAL_RESTARTS}" ]; then
		num_restarts=`expr ${num_restarts} + 1`
		do_restart
		return 1
	fi

	#
	# No more restarts on this node.  Give up the logical host if that
	# is permitted.
	#
	if [ "${ALLOW_CI_FAILOVER}" = "y" ]; then
		do_failover
		return 1
	fi

	#
	# Nothing else can be done.
	#
	logerr "$prefix.4045" `gettext "SAP has exhausted the number of local restarts (${MAX_NUM_LOCAL_RESTARTS}), but the ALLOW_CI_FAILOVER flag is not \"y\", so no action will be taken."`

	#
	# Stop restarting the probe under pmf
	#
	if pmfadm -s ${_INST_NAME}.probe >/dev/null 2>&1; then
		lognotice "$prefix.2037" `gettext "Stopped monitoring probe for SAP instance ${_INST_NAME}"`
	else
		logerr "$prefix.4021" `gettext "Failed to stop monitoring probe for SAP instance ${_INST_NAME}"`
	fi

	lognotice "$prefix.2059" `gettext "SAP probe is exiting."`
	exit 1
}


#
# do_failover will call hactl to give up control of the CI logical host
#
do_failover()
{
	#
	# Asks hactl to give up the CI logical host.
	#
	# Exits the probe with status 0 once the giveup request has been
	# issued.  Returns 1 when the request cannot be issued right now.
	# When a request is already outstanding, only an error is logged
	# and the status of that logerr call is returned.
	#

	ha_svc_not_running ${_INST_NAME}.hactl
	if [ $? -ne 0 ]; then
		#
		# Shouldn't get here because the probe exits if it issues
		# a giveup request for an instance.
		#
		logerr "$prefix.4076" `gettext "The probe has already issued a giveup request for SAP instance ${_INST_NAME}"`
		return
	fi

	#
	# Dry run first ("hactl -n").  hactl fails, for example, when there
	# is no other possible master for the logical host.  In that case
	# the probe must keep running so the giveup can be retried later,
	# when a backup node may be available.
	#
	hactl -n -g -s sap -l ${CI}
	hactl_rc=$?

	if [ ${hactl_rc} -ne 0 ]; then
		logerr "$prefix.4069" `gettext "A giveup request for SAP instance ${_INST_NAME} will fail. (hactl returned ${hactl_rc})"`
		lognotice "$prefix.2074" `gettext "The probe will try to issue a giveup request during the next probe cycle."`
		return 1
	fi

	#
	# The sanity check passed, so issue the real request under pmf.
	#
	pmfadm -c ${_INST_NAME}.hactl hactl -g -s sap -l ${CI}
	if [ $? -ne 0 ]; then
		logerr "$prefix.4019" `gettext "Failed to issue giveup request for SAP instance ${_INST_NAME}"`
		lognotice "$prefix.2074" `gettext "The probe will try to issue a giveup request during the next probe cycle."`
		return 1
	fi

	logerr "$prefix.4020" `gettext "HA-SAP has issued a giveup request for SAP instance ${_INST_NAME} running on logical host ${CI}."`

	#
	# Stop monitoring the hactl process so no message appears when
	# hactl exits.
	#
	pmfadm -s ${_INST_NAME}.hactl >/dev/null 2>&1 ||
		logerr "$prefix.4046" `gettext "Failed to stop monitoring hactl process for SAP instance ${_INST_NAME}"`

	#
	# Stop restarting the probe under pmf
	#
	pmfadm -s ${_INST_NAME}.probe >/dev/null 2>&1 ||
		logerr "$prefix.4021" `gettext "Failed to stop monitoring probe for SAP instance ${_INST_NAME}"`

	lognotice "$prefix.2060" `gettext "SAP probe is exiting."`

	#
	# The hactl request has been issued; nothing is left for the probe
	# to do.
	#
	exit 0
}


#
# do_restart will restart the CI on the same node
#
do_restart()
{
	#
	# Restarts the CI on the local node: runs the STOP_NET method, makes
	# sure pmf no longer tracks any process for the instance, runs the
	# START_NET method and finally probes in grace mode so SAP has time
	# to come up.
	#
	# Exits the probe with status 1 when the logical host lists or a
	# method timeout cannot be obtained, or when a method script is not
	# executable.  Otherwise always returns 1, which is also the value
	# used when leftover processes could not be stopped and the restart
	# was abandoned for this probe cycle.
	#

	#
	# The whole message is a single quoted string: nested double quotes
	# would end the string and leave the two expansions unquoted.
	#
	lognotice "$prefix.2061" `gettext "Preparing to restart SAP on local node (restart ${num_restarts} of ${MAX_NUM_LOCAL_RESTARTS})."`

	MASTERED_LH=`haget -f mastered`
	if [ $? -ne 0 ]; then
		logerr "$prefix.4047" `gettext "Could not get mastered logical hosts"`
		exit 1
	fi

	NOT_MASTERED_LH=`haget -f not_mastered`
	if [ $? -ne 0 ]; then
		logerr "$prefix.4048" `gettext "Could not get non-mastered logical hosts"`
		exit 1
	fi

	STOP_NET_TIMEOUT=`hareg -q sap -T STOP_NET`
	if [ $? -ne 0 ]; then
		logerr "$prefix.4049" `gettext "Could not get method timeout for STOP_NET"`
		exit 1
	fi

	if [ ! -x  ${PATH0}/sap_svc_stop_net ]; then
		logerr "$prefix.4050" `gettext "${PATH0}/sap_svc_stop_net is not executable. Exiting."`
		exit 1
	fi

	lognotice "$prefix.2062" `gettext "Calling the STOP_NET method to stop SAP and all instances with timeout ${STOP_NET_TIMEOUT}."`

	hatimerun -t $STOP_NET_TIMEOUT ${PATH0}/sap_svc_stop_net "" "$MASTERED_LH" "$STOP_NET_TIMEOUT"
	stop_net_rc=$?

	#
	# The outcome of STOP_NET is only logged; the restart carries on
	# regardless.  99 and 98 are reported as hatimerun's own timeout and
	# error statuses.
	#
	if [ "${stop_net_rc}" -eq 0 ]; then
		lognotice "$prefix.2063" `gettext "The STOP_NET method of SAP completed successfully."`
	elif [ "${stop_net_rc}" -eq 99 ]; then
		logerr "$prefix.4051" `gettext "STOP_NET method did not finish in the given timeout: ${STOP_NET_TIMEOUT}."`
	elif [ "${stop_net_rc}" -eq 98 ]; then
		logerr "$prefix.4052" `gettext "hatimerun detected errors while running the STOP_NET method."`
	else 
		logerr  "$prefix.4053" `gettext "STOP_NET encountered errors while executing."`
	fi

	#
	# Check if the instance has any processes still associated with it
	#
	ha_svc_not_running ${_INST_NAME}
	if [ $? -ne 0 ]; then
		lognotice "$prefix.2073" `gettext "Still monitoring some SAP processes. Will stop these processes."`

		#
		# Try TERM first; fall back to KILL only if that fails.
		#
		pmfadm -s ${_INST_NAME} -w ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME} -w ${STOP_TIMEOUT} KILL

		ha_svc_not_running ${_INST_NAME}
		if [ $? -ne 0 ]; then
			logerr "$prefix.4060" `gettext "Failed to stop some SAP processes for instance ${_INST_NAME} so cannot restart.  Will try again during next probe cycle."`
			#
			# The restart attempt failed because all of the
			# SAP processes could not be stopped.  During the
			# next probe cycle, we may try to do the restart 
			# again, depending on the user's configuration.
			#
			return 1
		else
			lognotice "$prefix.2080" `gettext "Stopped SAP instance ${_INST_NAME}"`
		fi

	fi

	START_NET_TIMEOUT=`hareg -q sap -T START_NET`
	if [ $? -ne 0 ]; then
		logerr "$prefix.4054" `gettext "Could not get method timeout for START_NET"`
		exit 1
	fi

	if [ ! -x  ${PATH0}/sap_svc_start_net ]; then
		logerr "$prefix.4055" `gettext "${PATH0}/sap_svc_start_net is not executable. Exiting."`
		exit 1
	fi

	lognotice "$prefix.2064" `gettext "Calling the START_NET method to start SAP and all instances with timeout ${START_NET_TIMEOUT}."`

	hatimerun -t $START_NET_TIMEOUT ${PATH0}/sap_svc_start_net "$MASTERED_LH" "$NOT_MASTERED_LH" "$START_NET_TIMEOUT"
	start_net_rc=$?

	#
	# As with STOP_NET, the outcome is only logged.
	#
	if [ "${start_net_rc}" -eq 0 ]; then
		lognotice "$prefix.2065" `gettext "The START_NET method of SAP completed successfully."`
	elif [ "${start_net_rc}" -eq 99 ]; then
		logerr "$prefix.4056" `gettext "START_NET method did not finish in the given timeout: ${START_NET_TIMEOUT}."`
	elif [ "${start_net_rc}" -eq 98 ]; then
		logerr "$prefix.4057" `gettext "hatimerun detected errors while running the START_NET method."`
	else 
		logerr  "$prefix.4058" `gettext "START_NET encountered errors while executing."`
	fi	

	#
	# Probe in grace mode so the start code has time to 
	# bring up SAP.
	#
	grace_probe
	return 1
}


# The common data service code is included here so that the environment
# of the probe will be the same as the other data service methods

#######################################################################
# The common data service code starts here

#
#pragma ident "@(#)ds_boiler	1.3	98/09/15 SMI"
#
# common boiler for HA data services
#
#


# Base name of the running script (directory part removed).
ARGV0=`basename $0`
# Logging command name and syslog tag; presumably consumed by the logging
# helpers sourced from hads_utilities below -- TODO confirm.
LOGGER=logger
# Syslog facility as reported by "haget -f syslog_facility".
HA_SLOGFACILITY=`haget -f syslog_facility`
HA_SLOGTAG=hadf
# Directory part of the path this script was invoked with.
prog_path=`dirname $0`

# source in ha-services common utilities
# (hads_utilities is located through the current PATH, i.e. before
# prog_path is prepended below)
. hads_utilities

# add the ha-service directory to the path
PATH=${prog_path}:${PATH}

#
# for use by subsequent hactl command: record the name of the local host
# as printed by "uname -n"
#
LOCALHOST=`uname -n`

#! /bin/sh
#
# ident "@(#)do_service 1.13     00/11/21 SMI"
#
#

# Common leading part of every message id logged by this data service;
# the action name and a numeric id are appended to it.
SYSLOG_PREFIX="SUNWcluster.ha.sap"

# Set variable for the HA-SAP configuration file
HASAP_CONFIG_FILE=/etc/opt/SUNWscsap/hadsconf

#
# Call the parser to handle the config file.
# (source_env_file is not defined in this part of the file; presumably
# it comes from hads_utilities -- TODO confirm.)
#
source_env_file $HASAP_CONFIG_FILE
if [ $? -ne 0 ]; then
        # source_env logs error message if it fails.
        # No need to log another; just exit.
        exit 1
fi

#
# Because the <sid>adm user has a check for
# tty=console and TERM=sun to check if openwin
# should be launched, set the TERM variable
# to something else so that openwin won't be started.
#
TERM=vt100
export TERM

#
# STOP_TIMEOUT: time to wait for SIGTERM to stop a process; it is passed
# to "pmfadm -s ... -w".  This should be in the config file.
#
# NUM_PROBE_RETRIES and PROBE_RETRY_PERIOD are passed to pmfadm as the
# -n and -t arguments when the probe is started in fm_start.
#
STOP_TIMEOUT=15
NUM_PROBE_RETRIES=3
PROBE_RETRY_PERIOD=1

#
# bundle_do_svc <action>
#
# is called for each instance
#
bundle_do_svc ()
{
	#
	# Carries out one data service method for the instance named by
	# ${_INST_NAME}.
	#
	# $1 - the action: start, start_net, stop_net, abort_net, stop,
	#      abort, fm_start, fm_stop or fm_check_this_host_ok.  Any
	#      other value falls through the case statement with no action.
	#
	# This function never returns to its caller: it exits with 0 on
	# success (or when there is nothing to do) and with 1 on failure.
	# It also sets the global "prefix" used for message ids.
	#
	action=$1
	prefix="$SYSLOG_PREFIX.$action"

	#
	# Set instance variables.  get_sap_config_param prints nothing when
	# a value is missing and has no default, so an empty result is
	# fatal.
	#

	SAPSID=`get_sap_config_param "$_INST_NAME" "PRIV_" "YOUR_SAP_SID" "NON_NULL" "" "" ""`
	[ -z "${SAPSID}" ] && exit 1

	CI_INSTANCE_ID=`get_sap_config_param "$_INST_NAME" "PRIV_" "CI_INSTANCE_ID" "NON_NULL" "" "" ""`
	[ -z "${CI_INSTANCE_ID}" ] && exit 1

	CI_STARTSAP_RETRY_CNT=`get_sap_config_param "$_INST_NAME" "PRIV_" "CI_STARTSAP_RETRY_CNT" "NUMBER" "10" "1" ""`
	[ -z "${CI_STARTSAP_RETRY_CNT}" ] && exit 1

	CI_STARTSAP_RETRY_INTERVAL=`get_sap_config_param "$_INST_NAME" "PRIV_" "CI_STARTSAP_RETRY_INTERVAL" "NUMBER" "30" "" ""`
	[ -z "${CI_STARTSAP_RETRY_INTERVAL}" ] && exit 1

	CI=`get_sap_config_param "$_INST_NAME" "" "LOGICAL_HOST" "NON_NULL" "" "" ""`
	[ -z "${CI}" ] && exit 1

	SAPADM=`get_sap_config_param "$_INST_NAME" "PRIV_" "SAP_ADMIN_LOGIN_NAME" "NON_NULL" "" "" ""`
	[ -z "${SAPADM}" ] && exit 1

	CI_SERVICES=`get_sap_config_param "$_INST_NAME" "PRIV_" "CI_SERVICES_STRING" "NON_NULL" "DVEBMGS" "" ""`
	[ -z "${CI_SERVICES}" ] && exit 1

	COMMAND_TIMEOUT=`get_sap_config_param "$_INST_NAME" "PRIV_" "COMMAND_TIMEOUT" "NUMBER" "60" "5" ""`
	[ -z "${COMMAND_TIMEOUT}" ] && exit 1

	PROBE_PROG=`get_sap_config_param "$_INST_NAME" "" "PROBE_1_PROG" "NON_NULL" "/opt/SUNWcluster/ha/sap/hasap_probe" "" ""`
	[ -z "${PROBE_PROG}" ] && exit 1

	#
	# Set path for SAP executable utilities
	#
	SAPEXE=/usr/sap/${SAPSID}/SYS/exe/run

	#
	# Directory of this script; the helper programs (sapmon,
	# hasap_restartR3, hasap_stop_all_instances) are looked up there.
	#
	PATH0=`dirname $0`


	case $action in

	'start')
		#
		# this section of code is not used in HA-SAP
		#
		exit 0
		;;

	'start_net')
		# The code path via generic_svc already checks if the CI is 
		# mastered on this node. If the CI is not mastered on this node,
		# then we will not reach this section.  Thus, from here on, we assume
		# that the CI is mastered on this node.				

		NeedToStart=

		#
		# Test for sapmon code
		#
		if [ ! -x ${PATH0}/sapmon ]; then
			logerr "$prefix.4007" `gettext "Cannot execute ${PATH0}/sapmon. Exiting."`
			exit 1
		fi

		#
		# Check if SAP processes are running
		#
		hatimerun -t ${COMMAND_TIMEOUT} ${PATH0}/sapmon $_INST_NAME proc > /dev/null 2>&1
		probe_status=$?

		if [ $probe_status -eq 0 ]; then
			#
			# Check if message server is running.  Like the
			# process check above, this runs the sapmon that was
			# just verified in ${PATH0} and is bounded by
			# COMMAND_TIMEOUT so a hanging check cannot stall the
			# start method.
			#
			hatimerun -t ${COMMAND_TIMEOUT} ${PATH0}/sapmon $_INST_NAME ms > /dev/null 2>&1
			probe_status=$?

			if [ $probe_status -eq 0 ]; then
				#
				# If sapmon proc and ms pass, then
				# it is likely that the user has
				# started SAP outside of the clustering
				# software.  In this case, log an error
				# message because SAP will have trouble
				# shutting down or switching over in this
				# state.
				#
				logerr "$prefix.4073" `gettext "SAP instance ${_INST_NAME} appears to be running outside of the control of the clustering software. You must turn off the SAP data service, manually turn off SAP, and then start SAP by turning on the SAP data service. Failure to do so may prevent proper shutdown and switchover. The start code is exiting."`
				exit 1
			else 
				#
				# Message server is not running, 
				# so we need to start SAP
				#
				NeedToStart=y
			fi
		else
			NeedToStart=y
		fi

		# ###############################################################
		# 
		# Restart CI and AS here, if needed.
		#
		# ###############################################################

		if [ "$NeedToStart" = "y" ]; then

			#
			# Call hasap_restartR3 script to bring up SAP
			#
			if [ -x "${PATH0}/hasap_restartR3" ]; then
				
				pmfadm -c $_INST_NAME -C 6 /bin/sh -c "${PATH0}/hasap_restartR3 ${_INST_NAME} >/dev/null 2>&1"
				if [ $? -ne 0 ]; then
					logerr "$prefix.4008" `gettext "Failed to start SAP instance ${_INST_NAME}"`
					exit 1
				fi
			else
				logerr "$prefix.4009" `gettext "Cannot execute ${PATH0}/hasap_restartR3. Exiting."`
				exit 1
			fi
		fi

		# The main work of starting SAP is done in the hasap_restartR3 script
		# which is called above
		;;

	'stop_net' | 'abort_net')

		# ###############################################################
		#
		# When this point is reached, there is a stopping job to do.
		# Do:
		#
		# 1. stop all app server/test/develop instances
		# 2. stopsap r3
		# 3. stop sap collector
		#
		# ###############################################################

		SAPOSCOL=/usr/sap/${SAPSID}/SYS/exe/run/saposcol
		stop_all_pid=0

		#
		# remove the instance from pmfd's queue
		#
		pmfadm -s ${_INST_NAME} >/dev/null 2>&1
		if [ $? -ne 0 ]; then
			logerr "$prefix.4002" `gettext "Failed to stop monitoring SAP instance ${_INST_NAME}."`
		fi

		#
		# We will stop all instances before we start the CI in start_net. 
		# However, if we can stop all instances earlier it will cause 
		# fewer problems later. stop_all_instances in start_net
		# is just a safety net for those cases where there is a system crash.
		#
		# NOTE(review): METHOD_TIMEOUT is not set anywhere in this
		# part of the file; presumably the caller provides it --
		# confirm.
		#
		if [ -x ${PATH0}/hasap_stop_all_instances ]; then
			lognotice "$prefix.2016" `gettext "Executing hasap_stop_all_instances in background."`
			${PATH0}/hasap_stop_all_instances "${_INST_NAME}" DURING_CI_STOP `expr $METHOD_TIMEOUT - 5` &
			stop_all_pid=$!
			lognotice "$prefix.2071" `gettext "Will wait for the stop_all_instances script (pid ${stop_all_pid}) to finish before exiting."`
		fi

		lognotice "$prefix.2006" `gettext "Stopping the SAP Central Instance."`
		lognotice "$prefix.2041" `gettext "Executing stopsap r3 as user \"${SAPADM}\""`

		su - ${SAPADM} -c 'stopsap r3' >/dev/console 2>&1
		su - ${SAPADM} -c "${SAPOSCOL} -k" >/dev/null 2>&1
	    
		#
		# If the stopsap fails, cleaning the ipc's will
		# prevent the SAP processes from doing more work.
		#
		if [ -x ${SAPEXE}/cleanipc ]; then

			numipc=`${SAPEXE}/cleanipc ${CI_INSTANCE_ID} | grep 'Number of IPC-Objects' | awk -F: '{print \$2}'`
			
			#
			# Remove when the count is unknown (empty) or
			# positive.  The two tests are separate commands so
			# the numeric comparison is never evaluated against
			# an empty string.
			#
			if [ -z "$numipc" ] || [ "$numipc" -gt 0 ]; then 

				lognotice "$prefix.2076" `gettext "Found $numipc leftover IPC objects for SAP instance, removing via cleanipc."`
				${SAPEXE}/cleanipc ${CI_INSTANCE_ID} remove >/dev/console 2>&1
			fi

		else
			logwarning "$prefix.3001" `gettext "Cannot execute ${SAPEXE}/cleanipc"`
		fi

		lognotice "$prefix.2042" `gettext "The SAP Central Instance has been stopped."`

		#
		# If the background stop_all_instances script is still
		# running, wait for it before leaving.
		#
		if [ ${stop_all_pid} -ne 0 ]; then
			ps -p  ${stop_all_pid} >/dev/null 2>&1
			if [ $? -eq 0 ]; then
				lognotice "$prefix.2070" `gettext "Waiting for the stop_all_instances script to finish (pid ${stop_all_pid})."`
				wait
				lognotice "$prefix.2072" `gettext "Done waiting for the stop_all_instances script to finish."`
			fi
		fi
			
		;;

	'stop' | 'abort')

		ha_svc_not_running ${_INST_NAME}
		if [ $? -ne 0 ]; then
			lognotice "$prefix.2008" `gettext "Still monitoring some SAP processes. Will stop these processes."`

			# The most likely reason to get here is because we are turning off
			# the data service while it is still trying to start. Because 
			# hasap_restartR3 is run in the background, we won't have another
			# opportunity to stop the script before it starts SAP.

			#
			# use pmf to stop SAP processes: TERM first, KILL
			# only if that fails
			#
			pmfadm -s ${_INST_NAME} -w ${STOP_TIMEOUT} TERM || \
				pmfadm -s ${_INST_NAME} -w ${STOP_TIMEOUT} KILL

			ha_svc_not_running ${_INST_NAME}
			if [ $? -ne 0 ]; then
				logerr "$prefix.4003" `gettext "Failed to stop SAP instance ${_INST_NAME}"`
				exit 1
			else
				lognotice "$prefix.2009" `gettext "Stopped SAP instance ${_INST_NAME}"`
			fi
		else 
			lognotice "$prefix.2007" `gettext "No SAP processes for instance ${_INST_NAME} were found. Exiting with no action."`
		fi
		;;

	'fm_start')
		ci_physhost=`haget -f master -h "$CI"`
		if [ $? -ne 0 ]; then
		    logerr "$prefix.4004" `gettext "Cannot obtain name of master for ${CI}"`
		    exit 1
		fi

		if [ -z "$ci_physhost" ]; then
		    logerr "$prefix.4059" `gettext "Cannot obtain name of master for ${CI}"`
		    exit 1
		fi

		THIS_PHYS_HOST=`uname -n`

		#
		# If this SAP instance's diskset is in maint mode, exit now.
		#
		MAINT=`haget -f is_maint -h ${CI}`
		if [ "$MAINT" = "1" ]; then
			lognotice "$prefix.2012" `gettext "The SAP Central Instance's logical host (${CI}) is in maintenance mode so the probe will not be started."`
			exit 0
		fi

		#
		# Check if the CI logical host is mastered on this node
		#
		if [ "$ci_physhost" != "$THIS_PHYS_HOST" ]; then
			#
			# This host does not master CI, so we won't start probe
			#
			exit 0
		else
			lognotice "$prefix.2011" `gettext "SAP Central Instance is mastered on this host so the probe will be started."`
		fi

		ha_svc_not_running ${_INST_NAME}.probe
		if [ $? -eq 0 ]; then

			# pmf starts "hasap_probe InstanceName"
			# hasap_probe runs until fm_stop kills it.

			pmfadm -c ${_INST_NAME}.probe -C 1 -n ${NUM_PROBE_RETRIES} -t ${PROBE_RETRY_PERIOD} /bin/sh -c "${PROBE_PROG} ${_INST_NAME} >/dev/null 2>&1"
			if [ $? -ne 0 ]; then
				logerr "$prefix.4005" `gettext "Failed to start SAP probe for instance ${_INST_NAME}"`
				exit 1
			else
				lognotice "$prefix.2013" `gettext "Started SAP probe for instance ${_INST_NAME}"`
			fi
		else
			lognotice "$prefix.2068" `gettext "SAP probe is already running for instance ${_INST_NAME}"`
		fi
		
		;;

	'fm_stop')
		#
		# If probe not running, do nothing
		#
		ha_svc_not_running ${_INST_NAME}.probe && exit 0

		#
		# stop hasap_probe: TERM first, KILL only if that fails
		#
		pmfadm -s ${_INST_NAME}.probe -w ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME}.probe KILL

		if [ $? -ne 0 ]; then
			logerr "$prefix.4006" `gettext "Failed to stop SAP probe for instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2014" `gettext "Stopped SAP probe for instance ${_INST_NAME}"`
		fi
		;;

	'fm_check_this_host_ok')
		lognotice "$prefix.2015" `gettext "This host is OK for SAP instance ${_INST_NAME}"`
		;;

	esac

	exit 0
}


#
# get_sap_config_param 
#
#  Gets the instance variables from hadsconf.
#  If the parameter is set incorrectly, then
#  it will try to use the default.  If there is
#  no default, then the parameter will be unset.  
#  If the value is OK, then it will return that value.
#
# Parameters:
#
# $1 is the instance name
# $2 is param prefix to hadsconfig name used for get_config_param
# $3 is param name used in hadsconfig
# $4 if NON_NULL -> value must be non null
#    if Y_OR_N -> value must be non null and either y or n
#    if NUMBER -> value must be non null and a number ( >= 0)
# $5 is the default to use if there is an incorrect or omitted
#    parameter in hadsconfig. If this argument is null and 
#    there is an invalid param, then do not return a value.
# $6 is the minimum numeric value (if $4 == NUMBER)
# $7 is the maximum numeric value (if $4 == NUMBER)
#

get_sap_config_param()
{
	#
	# Prints the validated value of one hadsconfig parameter on stdout.
	#
	# Returns 0 when a value (the user's or the default) was printed,
	# and 1 when the value is missing/invalid and no default exists, or
	# when $4 is not one of NON_NULL, Y_OR_N or NUMBER.  Nothing is
	# printed in the failure case.
	#
	# For Y_OR_N the accepted spellings are y/Y/yes/YES/Yes and
	# n/N/no/NO/No; the printed value is normalized to "y" or "n".
	#

	HASAP_INSTANCE_NAME=$1
	PARAM_PREFIX=$2
	HADSCONFIG_NAME=$3
	VAR_TYPE=$4
	DEFAULT=$5
	MIN=$6
	MAX=$7

	#
	# Get the value that the user has set in hadsconfig
	#
	USER_SET_VALUE=`get_config_param "${HASAP_INSTANCE_NAME}" "${PARAM_PREFIX}${HADSCONFIG_NAME}"`

	case "${VAR_TYPE}" in

	NON_NULL)
		if [ -n "${USER_SET_VALUE}" ]; then
			#
			# The parameter is OK
			#
			echo "${USER_SET_VALUE}"
			return 0
		fi

		logerr "$prefix.4078" `gettext "${HADSCONFIG_NAME} was not set for instance ${HASAP_INSTANCE_NAME}."`
		;;

	Y_OR_N)
		case "${USER_SET_VALUE}" in
		y|Y|yes|YES|Yes)
			echo "y"
			return 0
			;;
		n|N|no|NO|No)
			echo "n"
			return 0
			;;
		esac

		logerr "$prefix.4080" `gettext "The value for parameter ${HADSCONFIG_NAME} is invalid for instance ${HASAP_INSTANCE_NAME}. The value must be \"y\" or \"n\"."`
		;;

	NUMBER)
		is_numeric "${USER_SET_VALUE}"
		if [ $? -ne 0 ]; then
			#
			# The user set value is not a number.
			#
			logerr "$prefix.4082" `gettext "${HADSCONFIG_NAME} is not a valid number for instance ${HASAP_INSTANCE_NAME}."`

		elif [ -n "${MIN}" ] && [ "${USER_SET_VALUE}" -lt "${MIN}" ]; then
			#
			# Below the lower bound.
			#
			logerr "$prefix.4080" `gettext "${HADSCONFIG_NAME} must be greater than or equal to ${MIN} for instance ${HASAP_INSTANCE_NAME}."`

		elif [ -n "${MAX}" ] && [ "${USER_SET_VALUE}" -gt "${MAX}" ]; then
			#
			# Above the upper bound.
			#
			logerr "$prefix.4081" `gettext "${HADSCONFIG_NAME} must be less than or equal to ${MAX} for instance ${HASAP_INSTANCE_NAME}."`

		else
			#
			# The parameter is OK
			#
			echo "${USER_SET_VALUE}"
			return 0
		fi
		;;

	*)
		#
		# Should never get here.
		#
		return 1
		;;

	esac

	#
	# The user's value was rejected (and the reason logged above);
	# fall back to the default if there is one.
	#
	_sap_param_use_default
	return $?
}

#
# _sap_param_use_default
#
#  Helper for get_sap_config_param, used once the user's value has been
#  rejected.  Reads the globals DEFAULT and HADSCONFIG_NAME that
#  get_sap_config_param has set.
#
#  With an empty DEFAULT it logs an error and returns 1 without printing
#  anything.  Otherwise it logs a notice, prints DEFAULT (quoted, so
#  the value is emitted exactly as given) and returns 0.
#
_sap_param_use_default()
{
	if [ -z "${DEFAULT}" ]; then
		logerr "$prefix.4079" `gettext "No default value is available for parameter ${HADSCONFIG_NAME}. Run hadsconfig to set the value of this parameter. Exiting."`
		return 1
	fi

	lognotice "$prefix.2078" `gettext "${HADSCONFIG_NAME} is being temporarily set to the default value of \"${DEFAULT}\".  Run hadsconfig to set the value of this parameter."`

	echo "${DEFAULT}"
	return 0
}




#include_boiler

# The common data service code ends here
#######################################################################


#######################################################################
# The hasap_probe code starts here

prefix="${SYSLOG_PREFIX}.probe"


#
# Command line validation.  Every failure below prints the usage
# message and terminates the probe with exit status 2.
#

#
# At least one argument must have been supplied.
#
[ $# -ge 1 ] || {
	usage
	exit 2
}

#
# The first argument names the instance to probe; it may not be empty.
#
_INST_NAME=$1

case "$_INST_NAME" in
"")
	usage
	logerr "$prefix.4014" `gettext "Usage: The instance name must be non-null"`
	exit 2
	;;
esac

#
# The instance must be one of those listed in _INST_LIST.
#
is_member "$_INST_NAME" "$_INST_LIST" || {
	usage
	logerr "$prefix.4063" `gettext "Usage: The instance name \"${_INST_NAME}\" is not valid. The following are valid instances: ${_INST_LIST}"`
	exit 2
}

#
# Register the instance name for use in logged error messages
#
set_inst_name ${_INST_NAME}


#
# Load the per-instance configuration.
#
# Each value is fetched through get_sap_config_param inside a command
# substitution, so the helper cannot terminate this script on its own;
# an empty result is therefore treated as fatal here and the probe
# exits with status 1.
#
# NOTE(review): the arguments following the parameter name appear to be
# validation type, default value, minimum and maximum -- confirm against
# the definition of get_sap_config_param.
#

SAPSID=`get_sap_config_param "${_INST_NAME}" "PRIV_" "YOUR_SAP_SID" "NON_NULL" "" "" ""`
[ -n "${SAPSID}" ] || exit 1

CI_INSTANCE_ID=`get_sap_config_param "${_INST_NAME}" "PRIV_" "CI_INSTANCE_ID" "NON_NULL" "" "" ""`
[ -n "${CI_INSTANCE_ID}" ] || exit 1

CI=`get_sap_config_param "${_INST_NAME}" "" "LOGICAL_HOST" "NON_NULL" "" "" ""`
[ -n "${CI}" ] || exit 1

SAPADM=`get_sap_config_param "${_INST_NAME}" "PRIV_" "SAP_ADMIN_LOGIN_NAME" "NON_NULL" "" "" ""`
[ -n "${SAPADM}" ] || exit 1

CI_SERVICES=`get_sap_config_param "${_INST_NAME}" "PRIV_" "CI_SERVICES_STRING" "NON_NULL" "DVEBMGS" "" ""`
[ -n "${CI_SERVICES}" ] || exit 1

CI_STARTSAP_RETRY_CNT=`get_sap_config_param "${_INST_NAME}" "PRIV_" "CI_STARTSAP_RETRY_CNT" "NUMBER" "10" "1" ""`
[ -n "${CI_STARTSAP_RETRY_CNT}" ] || exit 1

CI_STARTSAP_RETRY_INTERVAL=`get_sap_config_param "${_INST_NAME}" "PRIV_" "CI_STARTSAP_RETRY_INTERVAL" "NUMBER" "30" "" ""`
[ -n "${CI_STARTSAP_RETRY_INTERVAL}" ] || exit 1

COMMAND_TIMEOUT=`get_sap_config_param "${_INST_NAME}" "PRIV_" "COMMAND_TIMEOUT" "NUMBER" "60" "5" ""`
[ -n "${COMMAND_TIMEOUT}" ] || exit 1

LOG_DB_WARNING=`get_sap_config_param "${_INST_NAME}" "PRIV_" "LOG_DB_WARNING" "Y_OR_N" "y" "" ""`
[ -n "${LOG_DB_WARNING}" ] || exit 1

STATUS_REPORT_FREQUENCY=`get_sap_config_param "${_INST_NAME}" "PRIV_" "PROBE_STATUS_REPORT_FREQUENCY" "NUMBER" "10" "" ""`
[ -n "${STATUS_REPORT_FREQUENCY}" ] || exit 1

SAP_PROBE_INTERVAL=`get_sap_config_param "${_INST_NAME}" "" "PROBE_1_INTERVAL" "NUMBER" "60" "" ""`
[ -n "${SAP_PROBE_INTERVAL}" ] || exit 1

ALLOW_CI_FAILOVER=`get_sap_config_param "${_INST_NAME}" "PRIV_" "ALLOW_CI_FAILOVER" "Y_OR_N" "y" "" ""`
[ -n "${ALLOW_CI_FAILOVER}" ] || exit 1

MAX_NUM_LOCAL_RESTARTS=`get_sap_config_param "${_INST_NAME}" "PRIV_" "NUM_CI_RESTARTS_ON_LOCAL_NODE" "NUMBER" "1" "" ""`
[ -n "${MAX_NUM_LOCAL_RESTARTS}" ] || exit 1

NUM_PROBE_SUCCESSES_TO_RESET_RESTART_CNT=`get_sap_config_param "${_INST_NAME}" "PRIV_" "NUM_PROBE_SUCCESSES_TO_RESET_RESTART_CNT" "NUMBER" "60" "" ""`
[ -n "${NUM_PROBE_SUCCESSES_TO_RESET_RESTART_CNT}" ] || exit 1

STOP_ALL_RUNTIME=`get_sap_config_param "${_INST_NAME}" "PRIV_" "TIME_ALLOWED_TO_STOP_ALL_INSTANCES_BEFORE_CI_START" "NUMBER" "60" "" ""`
[ -n "${STOP_ALL_RUNTIME}" ] || exit 1


#
# Set path for SAP executable utilities, and record the directory this
# script was invoked from.  The sapmon helper is looked up in that
# directory.
#
# $0 and ${PATH0} are quoted so that an installation path containing
# whitespace or glob characters is passed through as a single word
# instead of being split by the shell.
#
SAPEXE=/usr/sap/${SAPSID}/SYS/exe/run

PATH0=`dirname "$0"`


#
# Test for sapmon code: log an error and exit with status 1 if it is
# missing or not executable.
#
if [ ! -x "${PATH0}/sapmon" ]; then
	logerr "$prefix.4015" `gettext "Cannot execute ${PATH0}/sapmon.  Exiting."`
	exit 1
fi


#
# Probe state, all starting from zero:
#   - per-probe counters
#   - per-probe return codes
#   - restart / consecutive-success bookkeeping used by the main loop
#
cicnt=0 dbcnt=0 proccnt=0 mscnt=0

procprobe_rc=0 diaprobe_rc=0 dbprobe_rc=0 msprobe_rc=0 probeall_rc=0

num_restarts=0 num_passed_probes=0 curr_probe_status=0

#
# SAP may still be coming up when the probe is first launched, so
# begin in grace mode before switching to normal probing.
#
grace_probe

#
# Normal SAP probing: loops forever, one round of probes per
# SAP_PROBE_INTERVAL.
#
while : ; do

	#
	# Wait one probe interval before each round of checks.
	#
	sleep $SAP_PROBE_INTERVAL

	#
	# Count of failed probes in this round.  It starts at 0 and is
	# bumped by 1 for each probe that returns non-zero, so a value
	# of 0 at the end of the round means every counted probe passed.
	#
	curr_probe_status=0

	#
	# Critical SAP processes.
	#
	procprobe || curr_probe_status=`expr $curr_probe_status + 1`

	#
	# The dialog check (diaprobe) is currently disabled.
	#

	#
	# Message server.
	#
	msprobe || curr_probe_status=`expr $curr_probe_status + 1`

	#
	# SAP database.  The return code is intentionally not added to
	# curr_probe_status because HA-SAP does not failover or restart
	# based on DB failures.
	#
	dbprobe

	#
	# Maintain the number of consecutive all-pass rounds: a clean
	# round extends the streak, any failure starts it over.
	#
	case $curr_probe_status in
	0)
		num_passed_probes=`expr $num_passed_probes + 1`
		;;
	*)
		num_passed_probes=0
		;;
	esac

	#
	# Nothing more to do this round until the streak has reached
	# the number of successes needed to reset the restart count.
	#
	[ $num_passed_probes -ge "${NUM_PROBE_SUCCESSES_TO_RESET_RESTART_CNT}" ] || continue

	if [ "${NUM_PROBE_SUCCESSES_TO_RESET_RESTART_CNT}" -eq 0 ]; then
		#
		# A threshold of 0 means the restart count is never
		# reset; only the streak counter is cleared below.
		#
		:
	elif [ $num_restarts -gt 0 ]; then
		#
		# Report and clear the restart count, but only when a
		# local restart has actually taken place.
		#
		lognotice "$prefix.2077" `gettext "Resetting the restart count from ${num_restarts} to 0 after ${num_passed_probes} successful probes.  Upon the next failure, the probe will try to restart SAP ${MAX_NUM_LOCAL_RESTARTS} time(s) on the local node."`
		num_restarts=0
	fi

	#
	# Start a fresh streak.
	#
	num_passed_probes=0

done

# The hasap_probe code ends here
#######################################################################
