#! /bin/sh
#
# ident "@(#)scdidadm.sh	1.18	01/03/28 SMI"
#
# Copyright (c) 1997-1998 by Sun Microsystems, Inc.
# All rights reserved.
#

# Search the standard system directories first, then whatever the caller
# had in PATH, and finally the cluster binaries.
PATH=/usr/bin:/bin:/etc:/sbin:/usr/sbin:${PATH}:/opt/SUNWcluster/bin
export PATH

# Quote $0 so that an invocation path containing whitespace is handed to
# basename as a single operand.
PROG=`basename "$0"`
BINDIR=/opt/SUNWcluster/bin
SC_CONFIGDIR=/etc/opt/SUNWcluster/conf
SC_DFLT_CLUSTERNAME=${SC_CONFIGDIR}/default_clustername

# Define message file name and location
TEXTDOMAIN=scdidadm; export TEXTDOMAIN
TEXTDOMAINDIR=/opt/SUNWcluster/locale; export TEXTDOMAINDIR

# Values passed with -t to the call_rcp / call_rsh / call_fdl_rshstatus
# helpers (presumably seconds -- confirm against librcprsh).
SHORT_RSH_TIMEOUT=30
LONG_RSH_TIMEOUT=600
# Scratch file that receives the output of the most recent command so it
# can be quoted in an error message.
TMPERR=/var/opt/SUNWcluster/run/didadm.err.$$

# Installed did configuration, and the per-process work copy that is
# built up before being renamed over it.
DIDFILE=/etc/did.conf
TMPDIDDIR=/var/opt/SUNWcluster/run
TMPDIDFILE=/var/opt/SUNWcluster/run/did.conf.new.$$

# Mutually exclusive options:
#
# -r	perform device id discovery as part of initial configuration or
#	after having added new disk(s)
#
# -R	perform device id re-discovery after replacing a disk (Repair)
#
# -l	List on standard output the local devices in the did
#	configuration file.  The output of this command can be
#	customized using the -o flag.
#
# -L	List on standard output all the paths, including those
#	on remote hosts, of devices in the did configuration
#	file.  The output of this command can be customized
#	using the -o flag.
#
# Localized usage text, one line per invocation form.  Usage() prints it
# and supplies the program name for each of the three %s conversions.
usage="`gettext 'usage:	%s -r [-H host,...]\n\
	%s -R path | instance_number\n\
	%s -l | -L [-h] [-o fmt] [ path | instance_number ]\n'`"

#
# Print the usage message on standard error and exit with status 1.
# Each of the three %s conversions in ${usage} receives the program name.
#
Usage ()
{
	# $PROG is quoted: unquoted, a name containing whitespace expands to
	# extra operands and printf then repeats the whole format for them.
	printf "${usage}\n" "$PROG" "$PROG" "$PROG" >&2
	exit 1
}


#
# Check that a remote shell can be started on a peer.
#
# $1	name of the remote host to probe
#
# Runs /bin/true on the host through call_rsh with stderr captured in
# $TMPERR.  When the probe fails and the captured text contains
# "permission denied", the operator is told to check /.rhosts on that
# host, $TMPERR is removed and the script exits with status 7.  Any other
# outcome, including other kinds of failure, simply returns.
#
test_rsh ()
{
	hname=$1
	call_rsh $hname -n $hname /bin/true > /dev/null 2> $TMPERR
	if [ $? -ne 0 ]; then
		grep "permission denied" $TMPERR > /dev/null 2>&1
		if [ $? -eq 0 ]; then
			lmsg=`gettext \
		"Check /.rhosts on %s, then re-run this command"`
			# Name the host that was actually probed rather than
			# whatever the caller's loop variable happens to hold.
			printf "${lmsg}\n" "$hname"
			rm -f $TMPERR
			exit 7
		fi
	fi
}


#
# Error2: report a failure that left nothing changed, discard the scratch
# error file and exit with status 2.
#
Error2 ()
{
	printf "`gettext '%s failed, but no side-effects occurred'`\n" "$PROG"
	rm -f ${TMPERR}
	exit 2
}


#
# Error3: report a failure that happened after changes were already made,
# discard the scratch error file and exit with status 3.
#
Error3 ()
{
	printf "`gettext '%s failed; some side-effects occurred'`\n" "$PROG"
	rm -f ${TMPERR}
	exit 3
}


#
# Error4: silent failure exit.  Prints nothing itself; it only discards
# the scratch error file and leaves with status 4.
#
Error4 ()
{
	rm -f ${TMPERR}
	exit 4
}


#
# Error6: explain that the did entries in name_to_major differ between
# nodes, discard the scratch error file and exit with status 6.
#
Error6 ()
{
	why=`gettext \
	    "The did entries in name_to_major must be the same on all nodes."`
	todo=`gettext "Correct the problem, then re-run scdidadm."`
	printf "${why}\n${todo}\n"
	rm -f ${TMPERR}
	exit 6
}


#
# Get remote HA private links and set $R_LINKS.
# Read cdb file to determine connections.
# Called when Hflag not used.
#
# This node is node 0 in the cdb file.  Its own private link must be
# listed and must answer ping.  The links of nodes 1 up to
# cluster.number.nodes - 1 are then looked up and pinged in turn; every
# problem is reported before the script gives up through Error4.
# On success R_LINKS (and ALL_HAPRIV) hold the blank-separated names of
# the remote private links.
#
get_remote_halinks ()
{
	ALL_HAPRIV=""

	# first get and check my own HA private link
	my_hahost=`${BINDIR}/cdbmatch cluster.node.0.hahost ${cdbfile}`
	if [ -z "$my_hahost" ]; then
		lmsg=`gettext \
"Cannot get my own private link from cdb file.\n\
Re-run command using\n\
	%s %s -H <hostname,...>\n\
having made sure that the specified hostname addresses are up\n\
and that the appropriate hostname entries exist in /.rhosts files."`
		printf "${lmsg}\n" "$PROG" "$OPT"
		Error4
	else
		# check our own HA private link
		ping $my_hahost > /dev/null 2>&1
		if [ $? -ne 0 ]; then
			my_iface0=`${BINDIR}/cdbmatch \
			    cluster.node.0.if.0 ${cdbfile}`
			my_iface1=`${BINDIR}/cdbmatch \
			    cluster.node.0.if.1 ${cdbfile}`
			lmsg=`gettext \
"Our own HA private link %s appears to be down.\n\
Bring it up on interface %s or %s, or re-execute this command using\n\
	%s %s -H <hostname,...>\n\
having made sure that the specified hostname addresses are up\n\
and that the appropriate hostname entries exist in /.rhosts files."`
			printf "${lmsg}\n" $my_hahost $my_iface0 $my_iface1 \
				"$PROG" "$OPT"
			Error4
		fi
	fi

	# get # of nodes in configuration
	num_hosts=`${BINDIR}/cdbmatch cluster.number.nodes ${cdbfile}`
	# drop any surrounding blanks before validating the value
	num_hosts=`echo $num_hosts`
	# The count bounds the loop below, so it has to be a plain number.
	# With an empty or malformed value the loop test would fail on every
	# pass and the loop would never end.
	case "$num_hosts" in
	'' | *[!0-9]*)
		lmsg=`gettext "Cannot get number of nodes from cdb file: %s"`
		printf "${lmsg}\n" "${cdbfile}"
		Error4
		;;
	esac

	# get HA private link for each node and add to list
	# we are node 0; start with node 1
	n=1
	wentwrong=0
	while [ ${n} -lt ${num_hosts} ]; do
		hahost=`${BINDIR}/cdbmatch cluster.node.${n}.hahost ${cdbfile}`
		if [ -z "$hahost" ]; then
			lmsg=`gettext "Cannot get private link for node %d"`
			printf "${lmsg}\n" $n
			wentwrong=1
		elif ping $hahost > /dev/null 2>&1; then
			ALL_HAPRIV="${ALL_HAPRIV} ${hahost}"
		else
			wentwrong=1
			nodename=`${BINDIR}/cdbmatch \
			    cluster.node.${n}.hostname ${cdbfile}`
			iface0=`${BINDIR}/cdbmatch \
			    cluster.node.${n}.if.0 ${cdbfile}`
			iface1=`${BINDIR}/cdbmatch \
			    cluster.node.${n}.if.1 ${cdbfile}`
			lmsg=`gettext \
"Remote HA private link %s appears to be down.\n\
Bring it up on interface %s or %s on node %s,\n\
or re-execute this command using\n\
	%s %s -H <hostname,...>\n\
having made sure that the specified hostname addresses are up\n\
and that the appropriate hostname entries exist in /.rhosts files."`
			printf "${lmsg}\n" $hahost $iface0 $iface1 $nodename \
			    "$PROG" "$OPT"
		fi
		n=`expr ${n} + 1`
	done
	# Only give up once every node has been examined, so that a single
	# run lists all the links that need attention.
	if [ $wentwrong -eq 1 ]; then
		Error4
	fi

	if [ -z "$ALL_HAPRIV" ]; then
		lmsg=`gettext \
"Cannot get names of HA private links.\n\
Re-run command using\n\
	%s %s -H <hostname,...>\n\
having made sure that the specified hostname addresses are up\n\
and that the appropriate hostname entries exist in /.rhosts files."`
		printf "${lmsg}\n" "$PROG" "$OPT"
		Error4
	fi

	R_LINKS=$ALL_HAPRIV
}


#
# Probe the remote shell connection to every host in $R_LINKS.
# With -H the list was already supplied on the command line; without it
# the list is first derived from the cdb file by get_remote_halinks.
#
test_remote_conns ()
{
	# no -H given: get_remote_halinks fills in R_LINKS
	[ $Hflag -eq 0 ] && get_remote_halinks

	# the loop variable stays "i": it is a global that other code in
	# this file also reads
	for i in $R_LINKS
	do
		test_rsh $i
	done
}


#
# Verify that the did driver has the same name_to_major entry on every
# node.
#
# The local /etc/name_to_major entry is compared with a copy of the same
# file fetched from each host in $R_LINKS.  All hosts are examined before
# giving up, so one run reports every disagreement.  Leaves through
# Error6 (status 6) when the local entry is missing, or when any host
# could not be checked, has no entry or has a different one.  Exits 6
# directly when a copy that reported success produced no file.
#
# XXX re-write using awk?
#
major_num_check ()
{
	if [ $debugflag -eq 1 ]; then
		lmsg=`gettext \
		    "Verifying did major number is the same on all nodes"`
		printf "${lmsg}\n"
	fi
	maj_status=0
	admin_maj=`grep "^did" /etc/name_to_major`
	if [ $? -ne 0 ]; then
		lmsg=`gettext \
		    "No did entry exists in name_to_major."`
		printf "${lmsg}\n"
		Error6
	fi
	for i in $R_LINKS; do
		call_rcp -t $SHORT_RSH_TIMEOUT $i \
		    ${i}:/etc/name_to_major ${TMPDIDDIR}/maj.$i > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext \
			    "rcp of name_to_major from %s failed/timed-out: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			# a failed copy may have left a partial file behind
			rm -f ${TMPDIDDIR}/maj.$i
			maj_status=1
			continue
		fi
		if [ ! -f ${TMPDIDDIR}/maj.$i ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "rcp of name_to_major from %s failed: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			# Exit directly rather than through Error6, whose
			# text about differing entries does not apply here,
			# but still remove the scratch file as Error6 would.
			rm -f $TMPERR
			exit 6
		fi
		non_admin_maj=`grep "^did" ${TMPDIDDIR}/maj.$i`
		if [ $? -ne 0 ]; then
			lmsg=`gettext \
			    "No did entry exists in %s name_to_major."`
			printf "${lmsg}\n" $i
			maj_status=1
		elif [ "$admin_maj" != "$non_admin_maj" ]; then
			lmsg="`gettext \
'did name_to_major entry on %s (%s) is\n\
different from the entry on this node (%s)'`"
			printf "${lmsg}\n" $i "$non_admin_maj" "$admin_maj"
			maj_status=1
		fi
		rm -f ${TMPDIDDIR}/maj.$i
	done
	if [ $maj_status -eq 1 ]; then
		Error6
	fi
}


#
# common functions to be performed for -r, -R
#
# Reads the globals OPT, DEVICE, ACTION and debugflag set by option
# parsing, plus the path and timeout settings from the top of the file.
#
# Sequence:
#   1. find the cluster's cdb file and insist on running on node 0
#   2. check remote access and the did major number on all nodes
#   3. create $DIDFILE if it is missing, then run didadm $OPT locally,
#      saving the result in $TMPDIDFILE
#   4. pass $TMPDIDFILE through each remote node in turn, then send the
#      accumulated result to all of them
#   5. rename $TMPDIDFILE to $DIDFILE and load it into the driver, first
#      here and then on each remote node; unless a disk is being replaced
#      also run drvconfig and devlinks
#
# Failures up to and including the local rename leave through Error2
# (status 2), later ones through Error3 (status 3).  Exits 5 when not run
# on node 0.  Returns 0 on success.
#
didadm_common ()
{
	# get name of cluster
	if [ ! -f ${SC_DFLT_CLUSTERNAME} ]; then
		lmsg=`gettext "Default cluster file not found: %s. Exiting."`
		printf "${lmsg}\n" "${SC_DFLT_CLUSTERNAME}"
		Error2
	fi

	clustname="`cat ${SC_DFLT_CLUSTERNAME}`"
	# exit if name is null
	if [ -z "${clustname}" ] ; then
		lmsg=`gettext "Default cluster name is null. Exiting"`
		printf "${lmsg}\n"
		Error2
	fi

	# get name of cdb file
	cdbfile=${SC_CONFIGDIR}/${clustname}.cdb

	if [ ! -f ${cdbfile} ]; then
		lmsg=`gettext "Cannot open cdb file: %s. Exiting."`
		printf "${lmsg}\n" "${cdbfile}"
		Error2
	fi

	# get name of first node
	nodezero=`${BINDIR}/cdbmatch cluster.node.0.hostname ${cdbfile}`

	# make sure the command is running on node 0
	LOCALHOST=`uname -n`
	if [ "$LOCALHOST" != "$nodezero" ]; then
		lmsg=`gettext "Command must be run on node 0: %s"`
		printf "${lmsg}\n" $nodezero
		exit 5
	fi

	# sets R_LINKS for later use by present routine
	test_remote_conns

	# verify that did major number is the same on all nodes
	major_num_check

	# Create /etc/did.conf if it doesn't already exist.
	#
	if [ ! -f $DIDFILE ]; then
		if [ $debugflag -eq 1 ]; then
			lmsg=`gettext "Creating %s"`
			printf "${lmsg}\n" $DIDFILE
		fi

		# Capture the output: the failure message below quotes
		# $TMPERR, which would otherwise still hold the text of an
		# earlier, unrelated command.
		didadm -f /dev/null -r -s $DIDFILE > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "Could not create %s: %s"`
			printf "${lmsg}\n" $DIDFILE "$tmperrmsg"
			Error2
		fi
	fi

	# -r: Discover all disks with device ids on this node.
	#
	# -R: Perform repair procedure on $DEVICE.
	#
	# The new config file will be saved in $TMPDIDFILE.
	#

	if [ $debugflag -eq 1 ]; then
		lmsg=`gettext "Running didadm %s locally"`
		printf "${lmsg}\n" $OPT
	fi

	didadm $OPT -f $DIDFILE -s $TMPDIDFILE $DEVICE >$TMPERR 2>&1
	if [ $? -ne 0 ]; then
		tmperrmsg=`cat $TMPERR`
		lmsg=`gettext "didadm failed: %s"`
		printf "${lmsg}\n" "$tmperrmsg"
		Error2
	fi

	# For each other node $i,
	#   * rcp the new config file from node 0 to each remote node
	#   * remotely run didadm against that new config file, performing
	#     (re)discovery
	#   * save the resulting information in ${i}:$TMPDIDFILE
	#   * copy that new file back to node 0 for use on the next
	#     node in the list
	#
	for i in $R_LINKS; do
		call_rcp -t $SHORT_RSH_TIMEOUT $i \
		    $TMPDIDFILE ${i}:${TMPDIDDIR} > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "rcp to %s failed/timed-out: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			Error2
		fi

		if [ $debugflag -eq 1 ]; then
			lmsg=`gettext "Running didadm on %s"`
			printf "${lmsg}\n" $i
		fi

# XXX scdidadm -R needs to be tested for both these cases:
# XXX 1) disk replacement on node0, 2) disk replacement on nodeN where
# node0 is not connected to this disk.
		# To support arbitrary topologies in SC 2.2, we now need to
		# call didadm -R on all nodes because we don't know on which
		# node the replacement was made.

		# Use a large timeout in case ssd devices are not
		# currently probed out.  (workaround)
		# NOTE(review): no -t is passed here, so the timeout is
		# whatever call_rsh defaults to, and $DEVICE is not handed
		# to the remote didadm as it is to the local one -- confirm
		# both against librcprsh and didadm.
		call_rsh $i -n $i didadm $OPT -f $TMPDIDFILE -s $TMPDIDFILE \
		    > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "didadm failed on %s: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			Error2
		fi

		call_rcp -t $SHORT_RSH_TIMEOUT $i \
		     ${i}:$TMPDIDFILE $TMPDIDDIR > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "rcp from %s failed/timed-out: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			Error2
		fi
	done

	# For each other node $i,
	#   * rcp the latest config file with the accumulated information
	#     from node 0 to each remote node
	#
	# Goes through call_rcp with a timeout like every other copy in
	# this routine, so an unresponsive node cannot stall the run.
	for i in $R_LINKS; do
		call_rcp -t $SHORT_RSH_TIMEOUT $i \
		    $TMPDIDFILE ${i}:${TMPDIDDIR} > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "rcp to %s failed/timed-out: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			Error2
		fi
	done

	sync

	# Rename the config file to its real name on node 0.
	#
	mv $TMPDIDFILE $DIDFILE > $TMPERR 2>&1
	if [ $? -ne 0 ]; then
		tmperrmsg=`cat $TMPERR`
		lmsg=`gettext "Could not rename %s to %s: %s"`
		printf "${lmsg}\n" $TMPDIDFILE $DIDFILE "$tmperrmsg"
		Error2
	fi

	# Initialize the driver and upload the config table into the kernel.
	#
	didadm -u -i -f $DIDFILE > $TMPERR 2>&1
	if [ $? -ne 0 ]; then
		tmperrmsg=`cat $TMPERR`
		lmsg=`gettext "didadm -u -i failed: %s"`
		# one %s, one operand: this is the local run, no host name
		printf "${lmsg}\n" "$tmperrmsg"
		Error3
	fi

	#
	# Only need to run drvconfig, devlinks on initial configuration
	# or after adding disks.  Not needed for disk replacement procedure.
	#
	if [ $ACTION != replace_disk ]; then

		lmsg=`gettext \
		    "Configuring /devices and /dev; this may take a while."`
		printf "$lmsg\n"

		if [ $debugflag -eq 1 ]; then
			lmsg=`gettext "Running drvconfig locally"`
			printf "${lmsg}\n"
		fi
		drvconfig >$TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "drvconfig failed: %s"`
			printf "${lmsg}\n" "$tmperrmsg"
			Error3
		fi
		if [ $debugflag -eq 1 ]; then
			lmsg=`gettext "Running devlinks locally"`
			printf "${lmsg}\n"
		fi
		devlinks >$TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "devlinks failed: %s"`
			printf "${lmsg}\n" "$tmperrmsg"
			Error3
		fi
	fi

	# A failed remote sync is reported but does not stop the run.
	for i in $R_LINKS; do
		call_rsh  -t $SHORT_RSH_TIMEOUT $i -n $i \
		    fdl_timedrun 60 sync </dev/null > $TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "sync failed/timed-out on %s: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
		fi
	done

	# Rename the config file to its real name on every remote node.
	for i in $R_LINKS; do
		call_fdl_rshstatus -t $SHORT_RSH_TIMEOUT $i -n $i \
		     mv $TMPDIDFILE $DIDFILE >$TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "mv failed on %s: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			Error3
		fi
	done

	# Load the table on every remote node and, unless a disk is being
	# replaced, rebuild /devices and /dev there as well.
	for i in $R_LINKS; do
		call_fdl_rshstatus -t $SHORT_RSH_TIMEOUT $i -n $i \
		     didadm -u -i -f $DIDFILE >$TMPERR 2>&1
		if [ $? -ne 0 ]; then
			tmperrmsg=`cat $TMPERR`
			lmsg=`gettext "didadm failed on %s: %s"`
			printf "${lmsg}\n" $i "$tmperrmsg"
			Error3
		fi

		if [ $ACTION != replace_disk ]; then
			if [ $debugflag -eq 1 ]; then
				lmsg=`gettext "Running drvconfig on %s"`
				printf "${lmsg}\n" $i
			fi
			call_fdl_rshstatus -t $LONG_RSH_TIMEOUT $i -n $i \
			     drvconfig >$TMPERR 2>&1
			if [ $? -ne 0 ]; then
				tmperrmsg=`cat $TMPERR`
				lmsg=`gettext "drvconfig failed on %s: %s"`
				printf "${lmsg}\n" $i "$tmperrmsg"
				Error3
			fi

			if [ $debugflag -eq 1 ]; then
				lmsg=`gettext "Running devlinks on %s"`
				printf "${lmsg}\n" $i
			fi
			call_fdl_rshstatus -t $LONG_RSH_TIMEOUT $i -n $i \
			     devlinks >$TMPERR 2>&1
			if [ $? -ne 0 ]; then
				tmperrmsg=`cat $TMPERR`
				lmsg=`gettext "devlinks failed on %s: %s"`
				printf "${lmsg}\n" $i "$tmperrmsg"
				Error3
			fi
		fi
	done

	rm -f $TMPERR

	return 0
}


#
# Main
#

#
# Check for root
#
# The first blank-separated word printed by id is compared with
# "uid=0(root)"; anything else is refused.
#
uid=`id` || {
	printf "`gettext '%s:  Cannot get user id'`\n" "${PROG}" >&2 
	exit 1
}
uid=`set -- ${uid}; echo $1`
case "${uid}" in
"uid=0(root)")
	;;
*)
	printf "`gettext '%s: This program must be executed by root'`\n" \
	    "${PROG}" >&2 
	exit 1
	;;
esac

#
# Pick up the rcp/rsh library.
#
# It provides the call_rcp, call_rsh and call_fdl_rshstatus helpers used
# above.  A missing, empty or unreadable library is fatal.
#
LIBRCPRSH=${BINDIR}/librcprsh
if [ -s $LIBRCPRSH ]; then
	. $LIBRCPRSH
	if [ $? -ne 0 ]; then
		# Pass the values as operands so the %s conversions are
		# actually filled in, end the line, and report on stderr
		# like the other start-up errors.
		printf "`gettext '%s: Error sourcing file %s'`\n" \
		    "${PROG}" "$LIBRCPRSH" >&2
		exit 1
	fi
else
	printf "`gettext 'File missing or empty: %s'`\n" "$LIBRCPRSH" >&2
	exit 1
fi

#
# Must be at least one argument.
#
if [ $# -eq 0 ]; then
	Usage
fi

#
# Reject a second action option.  Called by each of -r, -R, -l and -L
# before it records itself in ACTION.
#
check_one_action ()
{
	if [ -n "$ACTION" ]; then
		lmsg=`gettext "May only specify one of {-r,-R,-l,-L}."`
		printf "$lmsg\n"
		Usage
	fi
}

#
# Option parsing.
#
# -r, -R, -l, -L  select the action (mutually exclusive)
# -H host,...     remote hosts to use instead of those in the cdb file
# -h, -o fmt      collected in $format and passed on to didadm -l / -L
# -d              print progress messages; it only affects the -R
#                 message below when given before -R
#
# Start from a known state.  ACTION decides what the command does and
# OPT / DEVICE are handed to didadm, so none of them may be inherited
# from variables of the same name in the caller's environment.
#
ACTION=
DEVICE=
OPT=
debugflag=0
Hflag=0
format=""
while getopts rR:lLho:H:d c
do
	case $c in

	r)	check_one_action
		ACTION=discover
		DEVICE=
		OPT="-r"
		;;

	R)	check_one_action
		ACTION=replace_disk
		DEVICE=$OPTARG
		OPT="-R"
		if [ $debugflag -eq 1 ]; then
			lmsg=`gettext \
			    "Performing device id rediscovery for DEVICE %s"`
			printf "${lmsg}\n" $DEVICE
		fi
		;;

	h)	format="-h $format"
		;;

	l)	check_one_action
		ACTION=list
		;;

	L)	check_one_action
		ACTION=Biglist
		;;

	d)	debugflag=1
		;;

	H)	if [ $Hflag -eq 1 ]; then
			lmsg=`gettext "May only specify -H flag once."`
			printf "$lmsg\n"
			Usage
		fi
		R_LINKS=$OPTARG
		if [ -z "$R_LINKS" ]; then
			lmsg=`gettext "-H argument is the null string."`
			printf "$lmsg\n"
			Usage
		fi
		# convert ','-separated hostname list argument to expected
		# R_LINKS ' '-separated internal hostname list.
		R_LINKS=`echo ${R_LINKS} | tr , " "`
		Hflag=1
		;;

	o)	format="-o $OPTARG $format"
		;;

	*)	Usage
		;;

	esac
done

# Pick up the optional path | instance_number operand of -l and -L.
# At most one operand may remain once the options have been consumed.
shift `expr $OPTIND - 1`
if [ "$1" != "" ]; then
	format="$format $1"
	# Too many args.
	[ "$2" = "" ] || Usage
fi

# Carry out the selected action.  Listing reports didadm's own exit
# status; discovery and repair exit from within didadm_common on failure.
exitstatus=0
listflag=
case "$ACTION" in
replace_disk | discover)
	didadm_common
	;;
list)
	listflag=-l
	;;
Biglist)
	listflag=-L
	;;
*)
	Usage
	;;
esac

if [ -n "$listflag" ]; then
	didadm $listflag $format
	exitstatus=$?
fi

exit $exitstatus


