#! /usr/bin/ksh
#
# Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#
# ident	"@(#)run_reserve.ksh	1.52	03/10/28 SMI"
#
#
# the following invocations are evaluated:
#
# run_reserve	-c node_join
#		-c release_shared_scsi2 -n joining_node
#		-c make_primary -s service_name -C service_class
#		-c make_exclusive -s service_name -C service_class
#		-c primary_to_secondary -s service_name -C service_class
#		-c fence_node -f fenced_node
#		-c enfailfast_all
#		-c reset_shared_bus
#
# the following invocations currently require no disk fencing:
#
# run_reserve	-c make_secondary
#		-c add_secondary
#		-c remove_secondary
#
# The options -o -d -a are also currently ignored.
#
# The following invocations are also available:
#
# run_reserve	-c node_join -x
#		(Prevents running of 'release_shared_scsi2' on other cluster nodes.
#		 This is used by scgdevs since scgdevs executes on all nodes.)
#		-c node_join -S
#		(Replaces running 'release_shared_scsi2' on other cluster nodes with
#		 running 'node_join -x' on other cluster nodes.  This is used by
#		 'scdidadm -C' to ensure scrubbing may be done when only some nodes
#		 are still connected to the device.)
#
#
# This script is called from the rc script /etc/rc2.d/S01MOUNTGFSYS when a node
# is joining the cluster to ensure that the node has access to all attached
# disks.  It is also called during HA device service transitions (changes to
# primary node) in order to fence non-cluster nodes from the disks in that
# service. The main functions of this script are:
#
# 1) for make_primary and primary_to_secondary : determine which disks are
#	part of the service whose primary is changing (except for DISK services,
#	the reservation program determines the list of disks for these services)
# 2) call the reservation program, which actually performs the disk reservations
# 3) perform volume manager (SUNWmd, SUNWvxvm)
#	actions (take/release, import/deport)
#
#
# Transitions:
#
# node_join - Invoked when a node is joining the cluster to  ensure that it has
#		access to all attached disks.  The reservation program called by
#		this script will spawn a 'release_shared_scsi2' invocation of
#		this script on all other cluster nodes, so that they may release
#		scsi-2 reservations on disks they share with the joining node.
# release_shared_scsi2 - Invoked on all cluster nodes (except the joining node)
# 		by the reservation program during node_join transition. Makes
#		use of clexecd to accomplish this. Responsible for dropping
#		scsi-2 reservations held on disks shared with the joining node.
# make_primary / make_exclusive - Invoked through the HA device services
#		framework when a node is promoted to being the primary node for
#		an HA device service.  Responsible for fencing non-cluster nodes
#		from the disks in that service.  Also enables failfast, so this
#		node will panic if it loses access to the disks (receives
#		'reservation conflict' error).  Performs sds take and vxvm
#		import operations.
# primary_to_secondary - Invoked through the HA device services framework when a
#		node is demoted from primary to secondary (potential primary).
#		Responsible for turning off failfast for the disks in that
#		service.  Performs sds release and vxvm deport operations.
# fence_node - Fence all devices shared between this node and the specified node.
#		It is currently used to support OPS without CVM.
# enfailfast_all - Enables failfast for all multiported devices connected to this
#		node.  It is currently used to support OPS without CVM.
# reset_shared_bus - Initiate a scsi-bus reset for all scsi buses shared with a
#		non-cluster node.  This is to ensure that the bus is not wedged
#		attempting to contact a node which has just died.
#



# reservation program that performs the actual disk reservations
resv_prog=/usr/cluster/lib/sc/reserve

# values filled in from the command line options
command=
service_name=
service_class=
joining_node=
fenced_node=
opt_arg=

# temporary disk list file: the prefix carries this shell's pid, the
# full name is built once the command is known
diskfile_beg=/tmp/reservation_diskfile_$$_
diskfile=

# when this file exists node_join returns success without doing anything
upgrade_flag_file=/scnoreservedisks

# retry policy for volume manager commands:
# number of retries and seconds to sleep between them
retry_num=3
retry_interval=5

# replaced by the real location if a metaset binary is found
metaset_path=NOT_CONFIGURED

# flags to tell if we need to back out import if failure occurs
needVxVMdeport_on_error=0
needSDSrelease_on_error=0


# Locate the metaset binary.  The Solaris 8 location is tried first,
# then the Solaris 7 location; the first executable one wins.  When
# neither is executable metaset_path is left untouched.
for metaset_candidate in /usr/sbin/metaset /usr/opt/SUNWmd/sbin/metaset
do
	if [ -x $metaset_candidate ]
	then
		metaset_path=$metaset_candidate
		break
	fi
done

# optional program invoked with "start" / "stop" and the service name
# around primary transitions, but only when it is executable
NWS_RECONFIG=/usr/opt/SUNWesm/cluster/sbin/reconfig

###########
# functions
###########
# cleanup function called before all exits
#
# Removes the temporary disk list file together with its scratch
# companions (.tmp, .tmp2 ... .tmp8, .did.tmp) and terminates the script.
#
#	$1	exit status of the script (0 when omitted)
#
# Nothing is removed while diskfile is still empty or holds the
# NOT_USED placeholder that do_DISK stores in it: in those cases the
# names below would expand to relative paths such as ".tmp" or
# "NOT_USED.tmp" and delete unrelated files in the current directory.
clean_exit ()
{
	if [ -n "$diskfile" ] && [ "$diskfile" != NOT_USED ]
	then
		rm -f "$diskfile" "$diskfile".tmp "$diskfile".tmp2 "$diskfile".tmp3 "$diskfile".tmp4 "$diskfile".tmp5 "$diskfile".tmp6 "$diskfile".tmp7 "$diskfile".tmp8 "$diskfile".did.tmp
	fi

	exit "${1:-0}"
}

# Check the return value of a command, exiting if it is non-zero.
#
# The status is not passed as an argument: it is read from the global
# variable retval, which the caller sets right before calling.
#
#	$1	name of the command, used only in the error message
#
# On failure the error is reported, a VxVM import or SDS take flagged
# through needVxVMdeport_on_error / needSDSrelease_on_error is backed
# out, and the script leaves through clean_exit with status $retval.
chk_retval_exit ()
{
	# previous command succeeded, nothing to do
	[ $retval != 0 ] || return 0

	echo $(gettext "$0:  $1 failed during $command, returned $retval")

	#
	# Back out VxVM import / SDS take if needed.
	#
	# If deport/release fails, the node is halted lest the disk group
	# should end up being imported on two nodes. If failback is not
	# enabled, we should reboot the node instead. Filing a separate
	# bug for that.
	#
	if [ $needVxVMdeport_on_error != 0 ]
	then
		/usr/sbin/vxdg deport $service_name
		retval1=$?
		if [ $retval1 != 0 ]
		then
			# same text goes to stdout and to the console
			fatal_msg=$(gettext "Fatal error: could not deport VxVM diskgroup $service_name. Halting node.")
			echo $fatal_msg
			echo $fatal_msg >> /dev/console
			halt
		fi
	fi

	if [ $needSDSrelease_on_error != 0 ]
	then
		$metaset_path -C release -s $service_name
		retval1=$?
		if [ $retval1 != 0 ]
		then
			# same text goes to stdout and to the console
			fatal_msg=$(gettext "Fatal error: could not release SDS diskset $service_name. Halting node ")
			echo $fatal_msg
			echo $fatal_msg >> /dev/console
			halt
		fi
	fi

	clean_exit $retval
}



#
# Check mount command to see if there are any local filesystems still
# mounted on the rawdisk device group.  Return 0 if mounted local
# filesystems are found, 1 otherwise.
#
# The output of mount is saved in "$diskfile".tmp and read back line by
# line as "mntpoint on device options ...".  A line counts as a hit when
#	- the device is below /dev/global/dsk/,
#	- dgconv maps that device to the device group $service_name, and
#	- the mount options do not contain "global".
#
# Failures of mount, grep or dgconv end the script through
# chk_retval_exit.
#
chk_for_mounted_filesystems()
{
	mount > "$diskfile".tmp
	retval=$?
	chk_retval_exit mount

	while read mntpoint on device options other
	do
		# look for mounts on rawdisk device groups
		echo "$device" | grep "/dev/global/dsk/" > /dev/null 2>&1
		retval=$?
		if [ $retval = 1 ]
		then
			# not a rawdisk device
			continue
		fi
		# any status other than 0 or 1 means grep itself failed
		chk_retval_exit grep

		# see if this is our device group
		# grab d- part of /dev/global/d-s-
		device=$(basename "$device")
		device=$(echo "$device" | sed 's/s.//')
		# see which rawdisk device group this device belongs to
		mountdg=$(/usr/cluster/lib/dcs/dgconv -d "$device")
		retval=$?
		chk_retval_exit dgconv
		if [ "$mountdg" != "$service_name" ]
		then
			# not our rawdisk device group
			continue
		fi

		# look for non-global mounts
		echo "$options" | grep -v global > /dev/null 2>&1
		retval=$?
		if [ $retval = 1 ]
		then
			# is a global mount
			continue
		fi
		chk_retval_exit grep

		# local filesystem mounted on our device group
		return 0
	done < "$diskfile".tmp

	return 1
}



# perform operations for make_primary or primary_to_secondary of -C DISK
#
# For primary_to_secondary the transition is refused (exit status 1)
# while local filesystems are still mounted on the device group.
# Otherwise the reservation program is called; for DISK services it
# computes the list of disks itself, so the placeholder NOT_USED is
# passed as the disk file.  The global diskfile is deliberately left
# alone so that clean_exit still removes the scratch file written by
# chk_for_mounted_filesystems.
#
# Reads the globals command, service_name, service_class, opt_arg and
# resv_prog; sets retval.  opt_arg is expanded unquoted on purpose, it
# may hold several flags separated by blanks.
do_DISK ()
{
	if [ "$command" = primary_to_secondary ]
	then
		#
		# There could be local filesystems still mounted.
		# Need to fail primary_to_secondary if this is the case.
		#
		chk_for_mounted_filesystems
		retval=$?
		if [ $retval = 0 ]
		then
			echo File systems still mounted on device group $service_name, unable to shutdown device group.

			echo File systems still mounted on device group $service_name, unable to shutdown device group. >> /dev/console
			echo Please unmount file systems and re-try operation. >> /dev/console
			# leave through clean_exit so the scratch files are removed
			clean_exit 1
		fi
	fi

	# call reservation program, which will compute the list of disks
	$resv_prog -c $command -d NOT_USED -s $service_name -C $service_class $opt_arg
	retval=$?
	if [ "$command" = make_primary ] && [ $retval != 0 ]
	then
		#
		# failed to become primary
		# make sure we didn't enable any failfasts
		#
		$resv_prog -c primary_to_secondary -d NOT_USED -s $service_name -C $service_class $opt_arg
	fi
	chk_retval_exit reservation_program
}

# perform operations for make_primary or primary_to_secondary of -C SUNWmd
#
# Asks metaset for the disks in diskset $service_name (retrying up to
# retry_num times, retry_interval seconds apart), rewrites the dsk
# paths to rdsk paths into $diskfile and hands that file to the
# reservation program.  When a make_primary reservation fails the
# reservation program is run again as primary_to_secondary before the
# failure is reported.  Sets retval; leaves through clean_exit /
# chk_retval_exit on error.
do_SUNWmd ()
{
	if [ $metaset_path = NOT_CONFIGURED ]
	then
		echo $(gettext "$0: metaset binary not found")
		clean_exit 1
	fi

	# get disks in this metaset, retrying on failure
	how_many=0
	while :
	do
		$metaset_path -s $service_name -C disksin > "$diskfile".tmp
		retval=$?
		[ $retval != 0 ] || break

		# out of retries: report the failure and exit
		if [ $how_many = $retry_num ]
		then
			chk_retval_exit $metaset_path
		fi
		echo $(gettext "$0:  $metaset_path failed during $command, returned $retval, will retry in $retry_interval seconds")
		sleep $retry_interval
		how_many=$(($how_many + 1))
	done

	# change  /dev/did/dsk/dn  to  /dev/did/rdsk/dn
	sed 's,/dsk,/rdsk,' "$diskfile".tmp > "$diskfile"
	retval=$?
	chk_retval_exit sed

	# call reservation program
	$resv_prog -c $command -d $diskfile -s $service_name -C $service_class $opt_arg
	retval=$?
	if [ $command = make_primary ] && [ $retval != 0 ]
	then
		#
		# failed to become primary
		# make sure we didn't enable any failfasts
		#
		$resv_prog -c primary_to_secondary -d $diskfile -s $service_name -C $service_class $opt_arg
	fi
	chk_retval_exit reservation_program
}

# perform operations for make_primary or primary_to_secondary of -C SUNWvxvm
#
# Writes the names of the disks whose dgname equals $service_name, as
# reported by "vxdisk -s list", into $diskfile.
#
#	$1	1: also call the reservation program with that disk list
#		anything else: only rebuild the disk list
#
# When a make_primary reservation fails the reservation program is run
# again as primary_to_secondary before the failure is reported.
do_SUNWvxvm ()
{
	# get info on all disks
	/usr/sbin/vxdisk -s list > "$diskfile".tmp
	retval=$?
	chk_retval_exit /usr/sbin/vxdisk

	# extract disk names for disks in this disk group:
	# remember the most recent "Disk:" name and print it when the
	# dgname line that follows names our disk group
	awk '
		/Disk:/  { disk = $2 }
		/dgname/ { if ($2==diskgroup) print disk }
	' diskgroup=$service_name "$diskfile".tmp > "$diskfile"
	retval=$?
	chk_retval_exit awk

	# this may get called twice, only call resv on first time through
	[ $1 = 1 ] || return 0

	# call reservation program
	$resv_prog -c $command -d $diskfile -s $service_name -C $service_class $opt_arg
	retval=$?
	if [ $command = make_primary ] && [ $retval != 0 ]
	then
		#
		# failed to become primary
		# make sure we didn't enable any failfasts
		#
		$resv_prog -c primary_to_secondary -d $diskfile -s $service_name -C $service_class $opt_arg
	fi
	chk_retval_exit reservation_program
}

##############
# script start
##########################
# get command line options
##########################
# -c, -s, -C, -n and -f each store their argument in a variable.
# -x, -m and -S are not interpreted here; each is prepended to opt_arg,
# which is later handed to the reservation program.
# -o, -d and -a take an argument that is discarded.
while getopts c:s:C:o:d:a:n:f:xmS name
do
	case $name in
	c)	command="$OPTARG" ;;
	s)	service_name="$OPTARG" ;;
	C)	service_class="$OPTARG" ;;
	n)	joining_node="$OPTARG" ;;
	f)	fenced_node="$OPTARG" ;;

	#
	# suppress calling of release_shared_scsi2
	# during calls from scgdevs
	#
	x)	opt_arg="-x $opt_arg" ;;

	# turn on debug messages
	m)	opt_arg="-m $opt_arg" ;;

	#
	# replace calling of release_shared_scsi2 with
	# node_join -x during calls from scdidadm -C
	#
	S)	opt_arg="-S $opt_arg" ;;

	# dcs passes these to us, but we don't use them right now
	# may be used later to optimize some things
	o|d|a)	;;

	# anything else is what getopts reports for a bad option
	*)	echo $0:  $(gettext "illegal command line option")
		clean_exit 1
		;;
	esac
done

############################
# check command line options
############################
# Every check that fails prints a message and leaves through
# clean_exit 1.  The tests use [[ ]] and case so that option values
# containing blanks or test operators are compared as plain strings
# instead of being re-parsed by the test command.

# a command is always required
if [[ -z $command ]]
then
	echo $0:  $(gettext "command not specified")
	clean_exit 1
fi

case $command in
make_secondary|add_secondary|remove_secondary)
	# for these state transitions the reservation program need do nothing
	clean_exit 0
	;;
make_exclusive)
	# treat both the same
	command=make_primary
	;;
esac

case $command in
make_primary|primary_to_secondary)
	# both need a known service class and a service name
	if [[ -z $service_class ]]
	then
		echo $0:  $(gettext "service_class not specified")
		clean_exit 1
	fi
	case $service_class in
	DISK|SUNWmd|SUNWvxvm|SUNWcvm)
		;;
	*)
		echo $0:  $(gettext "illegal service_class:")  $service_class
		clean_exit 1
		;;
	esac

	if [[ -z $service_name ]]
	then
		echo $0:  $(gettext "service_name not specified")
		clean_exit 1
	fi
	;;
release_shared_scsi2)
	if [[ -z $joining_node ]]
	then
		echo $0:  $(gettext "joining_node not specified")
		clean_exit 1
	fi
	;;
fence_node)
	if [[ -z $fenced_node ]]
	then
		echo $0:  $(gettext "fenced_node not specified")
		clean_exit 1
	fi
	;;
esac

#################
# perform command
#################
# One branch per command.  Every branch reports what it is about to do,
# calls the reservation program and checks its status through
# chk_retval_exit; make_primary and primary_to_secondary additionally
# drive the volume manager.  Falling out of the chain means success.

######################
# release_shared_scsi2
######################
if [ "$command" = release_shared_scsi2 ]
then
	echo `gettext "releasing reservations for scsi-2 disks shared with"` $joining_node

	# call reservation program
	$resv_prog -c $command -j $joining_node $opt_arg -h `hostname`
	retval=$?
	chk_retval_exit reservation_program
###########
# node_join
###########
elif [ "$command" = node_join ]
then
	echo `gettext "obtaining access to all attached disks"`

	#
	# If this is a boot during upgrade simply return success
	# we will be run later when the system is ready.
	# This prevents us from fencing off active SC2.2 nodes.
	#
	if [ -f $upgrade_flag_file ]
	then
		clean_exit 0
	fi

	# call reservation program
	$resv_prog -c $command -h `hostname` $opt_arg
	retval=$?
	chk_retval_exit reservation_program
##############
# make_primary
##############
elif [ "$command" = make_primary ]
then
	echo `gettext "becoming primary for"` $service_name

	# disk list file for this transition
	diskfile="$diskfile_beg""$command"

	if [ "$service_class" = DISK ]
	then
		# call reservation program
		do_DISK
	elif [ "$service_class" = SUNWmd ]
	then
		# get list of disks and call reservation program
		do_SUNWmd

		# tell SUNWmd to take this volume
		# execute the takeover in RT -- BugId 4337278
		# when exec'd by clexecd, this will happen
		$metaset_path -C take -f -s $service_name
		retval=$?
		if [ $retval != 0 ]
		then
			# these failures result in read-only ownership
			if [ $retval = 66 ]
			then
				echo `gettext "Stale database for diskset"` $service_name
			elif [ $retval = 2 ]
			then
				echo `gettext "Tagged data encountered for diskset"` $service_name
			elif [ $retval = 3 ]
			then
				echo `gettext "Only 50% replicas and 50% mediator hosts available for diskset"` $service_name
			fi

			#
			# XXX if it's not one of the above errors, we should
			# really retry the metaset -C take
			#

			# make sure we haven't taken read-only ownership
			# execute in RT -- BugId 4337278
			# when exec'd by clexecd, this will happen
			$metaset_path -s $service_name -C release

			# disable failfast since this node did not succeed
			$resv_prog -c primary_to_secondary -d $diskfile -s $service_name -C $service_class $opt_arg
			clean_exit $retval
		fi
	elif [ "$service_class" = SUNWvxvm ]
	then
		#
		# Try to import, this will fail if the group is imported elsewhere
		# but allow us to get current info on which disks are in the group.
		# Redirect standard error, so user won't see bogus error message
		# during failovers.
		#
		/usr/sbin/vxdg -t import $service_name > /dev/null 2>&1
		vxretval=$?
		#
		# error 12 = diskgroup already imported, this is not an error,
		# This can occur if this is a make_primary after a primary_to_secondary
		# failure during which the disk group was not deported.
		#
		if [ $vxretval = 12 ]
		then
			vxretval=0
		fi

		# flag import if we succeeded
		if [ $vxretval = 0 ]
		then
			needVxVMdeport_on_error=1
		fi

		# get list of disks and call reservation program
		do_SUNWvxvm 1

		# if the first import failed, now do a forced import
		if [ $vxretval != 0 ]
		then
			/usr/sbin/vxdg -C -t -f import $service_name
			retval=$?

			how_many=0
			while [ $retval != 0 ]
			do
				if [ $how_many = $retry_num ]
				then
					#
					# Disable failfast since this node
					# did not succeed in becoming primary.
					#
					$resv_prog -c primary_to_secondary -d $diskfile -s $service_name -C $service_class $opt_arg
					chk_retval_exit /usr/sbin/vxdg
				fi
				echo `gettext "$0:  /usr/sbin/vxdg failed during $command, returned $retval, will retry in $retry_interval seconds"`
				sleep $retry_interval
				/usr/sbin/vxdg -C -t -f import $service_name
				retval=$?
				how_many=$(($how_many + 1))
			done

			# at this point we have imported disk group
			needVxVMdeport_on_error=1

			#
			# Now we need to see if the diskset changed between
			# the first and second import.
			#
			cp $diskfile "$diskfile".orig
			rm -f $diskfile "$diskfile".tmp "$diskfile".tmp2 "$diskfile".tmp3 "$diskfile".tmp4 "$diskfile".tmp5 "$diskfile".tmp6 "$diskfile".tmp7 "$diskfile".tmp8 "$diskfile".did.tmp
			do_SUNWvxvm 0
			diff $diskfile "$diskfile".orig
			diffretval=$?
			# the saved copy is only needed for the comparison; remove
			# it now so a failure below does not leave it behind
			rm -f "$diskfile".orig
			if [ $diffretval = 1 ]
			then
				# diskset has changed, redo reservations
				$resv_prog -c $command -d $diskfile -s $service_name -C $service_class $opt_arg
				retval=$?
				if [ $retval != 0 ]
				then
					$resv_prog -c primary_to_secondary -d $diskfile -s $service_name -C $service_class $opt_arg
				fi

				chk_retval_exit reservation_program
			fi
		fi

		# enable all volumes for this disk group
		/usr/sbin/vxrecover -g $service_name -s -b
		retval=$?
		how_many=0
		while [ $retval != 0 ]
		do
			if [ $how_many = $retry_num ]
			then
				# couldn't enable volumes, deport diskgroup
				/usr/sbin/vxdg deport $service_name
				if [ $? = 0 ]
				then
					#
					# The group is already deported: clear the
					# flag so chk_retval_exit does not deport it
					# a second time (a failing deport there
					# halts the node).  If this deport failed
					# the flag stays set and chk_retval_exit
					# tries once more.
					#
					needVxVMdeport_on_error=0
				fi

				#
				# Disable failfast since this node did not
				# succeed in becoming primary.
				#
				$resv_prog -c primary_to_secondary -d $diskfile -s $service_name -C $service_class $opt_arg
				# retval still holds the vxrecover status
				chk_retval_exit /usr/sbin/vxrecover
			fi
			echo `gettext "$0:  /usr/sbin/vxrecover failed during $command, returned $retval, will retry in $retry_interval seconds"`
			sleep $retry_interval
			/usr/sbin/vxrecover -g $service_name -s -b
			retval=$?
			how_many=$(($how_many + 1))
		done
	elif [ "$service_class" = SUNWcvm ]
	then
		# get list of disks and call reservation program
		do_SUNWvxvm 1
	fi

	# run the optional reconfig program now that we are primary
	if [ -x "$NWS_RECONFIG" ]
	then
		$NWS_RECONFIG start $service_name
	fi

######################
# primary_to_secondary
######################
elif [ "$command" = primary_to_secondary ]
then
	echo `gettext "no longer primary for"` $service_name

	# run the optional reconfig program before giving up the service
	if [ -x "$NWS_RECONFIG" ]
	then
		$NWS_RECONFIG stop $service_name
	fi

	# disk list file for this transition
	diskfile="$diskfile_beg""$command"

	if [ "$service_class" = DISK ]
	then
		# call reservation program
		do_DISK
	elif [ "$service_class" = SUNWmd ]
	then
		# get list of disks and call reservation program
		do_SUNWmd

		# tell SUNWmd to release this volume
		# execute in RT -- BugId 4337278
		# when exec'd by clexecd, this will happen
		$metaset_path -C release -s $service_name
		retval=$?
		how_many=0
		while [ $retval != 0 ]
		do
			if [ $how_many = $retry_num ]
			then
				chk_retval_exit $metaset_path
			fi
			echo `gettext "$0:  $metaset_path failed during $command, returned $retval, will retry in $retry_interval seconds"`
			sleep $retry_interval
			$metaset_path -C release -s $service_name
			retval=$?
			how_many=$(($how_many + 1))
		done
	elif [ "$service_class" = SUNWvxvm ]
	then
		# get list of disks and call reservation program
		do_SUNWvxvm 1

		# deport the disk group
		/usr/sbin/vxdg deport $service_name
		retval=$?
		how_many=0
		while [ $retval != 0 ]
		do
			if [ $how_many = $retry_num ]
			then
				chk_retval_exit /usr/sbin/vxdg
			fi
			echo `gettext "$0:  /usr/sbin/vxdg failed during $command, returned $retval, will retry in $retry_interval seconds"`
			sleep $retry_interval
			/usr/sbin/vxdg deport $service_name
			retval=$?
			how_many=$(($how_many + 1))
		done
	elif [ "$service_class" = SUNWcvm ]
	then
		# get list of disks and call reservation program
		do_SUNWvxvm 1
	fi
############
# fence_node
############
elif [ "$command" = fence_node ]
then
	echo `gettext "fencing node $fenced_node from shared devices"`

	# call reservation program
	$resv_prog -c $command -h `hostname` -f $fenced_node $opt_arg
	retval=$?
	chk_retval_exit reservation_program
################
# enfailfast_all
################
elif [ "$command" = enfailfast_all ]
then
	echo `gettext "enabling failfast on shared devices"`

	# call reservation program
	$resv_prog -c $command -h `hostname` $opt_arg
	retval=$?
	chk_retval_exit reservation_program
##################
# reset_shared_bus
##################
elif [ "$command" = reset_shared_bus ]
then
	echo `gettext "resetting scsi buses shared with non-cluster nodes"`

	# call reservation program
	$resv_prog -c $command -h `hostname` $opt_arg
	retval=$?
	chk_retval_exit reservation_program
else
	echo $0:  `gettext "illegal command secification:"`  -c $command
	clean_exit 1
fi

# the command completed without error
clean_exit 0
