#! /bin/sh
#
# ident "@(#)librcprsh.sh 1.3     01/03/28 SMI"	
#
# Copyright (c) 1993-1998 by Sun Microsystems, Inc.
# All rights reserved.
#
# Specification: 
#
# Library of shell functions for rcp and rsh.  
#
# Preconditions for using the library:
# (1) Assumes that utilities.sh has already been shell source included
# by the client shell script.
# (2) Assumes that PATH is initialized properly.
# Does NOT assume that ha_env environment file exists
#
# This library script optionally can be told to impose some error handling;
# the error handling is appropriate ONLY when this script is being 
# used by a cluster transition step, directly or indirectly.
# The error handling can cause a cluster reconfiguration.
# The error handling is off by default.  To enable the error handling, 
# the caller should call the function:
#	librcprsh_error_on
# before calling any of the other routines of this library.
#
# The functions of the library are:
#    call_rcp  call_rsh  call_fdl_rshstatus
# Every other function in this file is internal to the library, and
# should not be called by clients.
#
# Constant symbols for rcp and rsh (minimal) support, known to clients
#
# Timeouts for rsh and rcp, and special exit code for them to use
# in combination with fdl_timedrun.
# (29-Jan-1996 Increased RSHTIMEOUT from 30 to 120).
# These symbols are regarded as being exported by this library to its
# clients:
# Default timeout, in seconds, applied when a caller does not pass -t.
LIBRCPRSH_TIMEOUT=120
# Default exit code returned when the rcp/rsh times out and the caller
# does not pass -e.
LIBRCPRSH_TIMEOUTEC=99

#
# pick up our library of shell routines
#
# Quote the path so a BINDIR containing whitespace still resolves to one file.
. "${BINDIR}/utilities"
if [ $? -ne 0 ]; then
	# NOTE(review): log_info is presumably provided by the utilities
	# library itself (or pre-loaded by the client) -- confirm.  If it is
	# not available because the library failed to load, fall back to a
	# plain message on stderr so the failure is still reported.
	log_info "XXXXX" " Cannot find utilities library" ||
	    echo "librcprsh: Cannot find utilities library ${BINDIR}/utilities" >&2
	exit 1
fi

########################################################################
#
# Implementation: (everything that follows)
# 
########################################################################

# 
# Global variable LIBRCPRSH_FAILEDLIST keeps track of which
# remote links have had an rsh/rcp timeout such that we
# do not want to pay the expense of the timeout again.
#
LIBRCPRSH_FAILEDLIST=""

#
# HA_CLUSTER: cluster name passed to "clustm reconfigure" when this
# library decides to reconfigure.  Keeps any non-empty value already
# set by the caller; otherwise falls back to "hadf".
#
HA_CLUSTER=${HA_CLUSTER:-hadf}
export HA_CLUSTER

#
# Error handling flag: 0 (off) until librcprsh_error_on is called.
#
LIBRCPRSH_ERRORHANDLING=0

#
# librcprsh_error_on  -- turn on error handling
#
librcprsh_error_on()
{
    # Switch the library into error-handling mode: from now on
    # librcprsh_failed goes beyond logging and may trigger a cluster
    # reconfiguration.  Takes no arguments.
    LIBRCPRSH_ERRORHANDLING=1
    return 0
}


#
# librcprsh_failed remotehostprivatelink
#
# Internal routine to this library.
# Called when an rcp/rsh to that remotehostprivatelink fails
#
librcprsh_failed()
{
    # Record the failing link in LIBRCPRSH_FAILEDLIST (so that
    # librcprsh_common will not retry it) and log the failure.
    RRF_ARG="$1"
    LIBRCPRSH_FAILEDLIST="$LIBRCPRSH_FAILEDLIST $RRF_ARG"
    logerr "rcp/rsh to other host via private link $RRF_ARG failed/timedout"

    # With error handling switched off, logging is all we do.
    if [ $LIBRCPRSH_ERRORHANDLING -eq 0 ]; then
        return 0
    fi

    ping $RRF_ARG > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        # The ping failed, so one of the following holds:
        #   (i)   the private link we had been using ($RRF_ARG) is no good,
        #   (ii)  both private links are no good,
        #   (iii) the sibling is down, or
        #   (iv)  the sibling isn't running clustd.
        # Cases ii, iii and iv are basically the same: clustm reconfigure
        # lets this node's clustd figure out what to do.
        # For case (i) the rsh/rcp command that just failed should really
        # be retried on the other link; restarting cluster reconfiguration
        # from the beginning forces that retry.
        # So in every case we trigger a cluster reconfigure.
        logerr "ping $RRF_ARG failed, doing cluster reconfiguration"
        if clustm reconfigure $HA_CLUSTER >/dev/null 2>&1 ; then
            # Reconfiguration was started: terminate the calling script.
            exit 0
        fi
        logerr "clustm reconfigure failed"
        return $?
    fi

    # The ping worked, so the sibling host is still up but its
    # rsh/rcp/inetd aren't working.  Just punt and return to the caller.
    # Ultimately this may cause the two servers to disagree over which
    # server should own what diskset and/or which diskset(s) are in
    # MAINT mode -- that can cause one server to forcibly take over a
    # diskset that the other server hasn't released, causing the latter
    # to crash.  And we have no guarantee that the one that crashes is
    # the one that was sicker.
    # We could try to diagnose the sibling's inetd, say with
    # "mconnect -p 13", but we cannot do anything about it anyway if
    # that inetd isn't working.
    logerr "rsh/rcp failure may produce errors regarding which physical hosts should master what logical hosts"
    config_diff_err
    return $?
}
	

librcprsh_usage_err()
{
    # Report, via logerr, that one of the public entry points was called
    # with too few arguments.  Internal to this library.
    logerr "Source code error: too few arguments to \
call_rcp/call_rsh/call_fdl_rshstatus"
}


librcprsh_common()
{
    # Internal common routine to this library.
    # Usage: librcprsh_common [-e errorexitcode] [-t timeoutsecs] \
    #    remoteprivatelink  <same arguments as the program to run>
    # If both -e and -t are specified, the -e must come first.
    # The timeout will default to $LIBRCPRSH_TIMEOUT if the -t is absent.
    # Uses global variable LIBRCPRSH_PROGRAM as the program to run.
    # Fails quickly if we've failed before with this link.
    # Returns:
    #   1                     on a usage error (too few arguments);
    #   $LIBRCPRSH_TIMEOUTEC  (overridden by the -e flag) if the rcp/rsh
    #                         times out, reports 'Connection refused', or
    #                         fails while the link no longer answers ping;
    #   otherwise the exit status of the command that was run.
    # Sets the globals LIBRCPRSH_EECODE, LIBRCPRSH_TEMPTIMEOUT,
    # LIBRCPRSH_LINK, LIBRCPRSH_TMPD, LIBRCPRSH_TMPERR and LIBRCPRSH_RC
    # (this is plain sh, there are no local variables).
    if [ "$1" = "-e" ]; then
        if [ $# -lt 4 ]; then
            librcprsh_usage_err
            return 1
        fi
        LIBRCPRSH_EECODE="$2"
        shift
        shift
    else
        LIBRCPRSH_EECODE=$LIBRCPRSH_TIMEOUTEC
    fi

    if [ "$1" = "-t" ]; then
        if [ $# -lt 4 ]; then
            librcprsh_usage_err
            return 1
        fi
        LIBRCPRSH_TEMPTIMEOUT="$2"
        shift
        shift
    else
        LIBRCPRSH_TEMPTIMEOUT=$LIBRCPRSH_TIMEOUT
    fi

    # Need at least the link and one argument for the program.
    if [ $# -lt 2 ]; then
        librcprsh_usage_err
        return 1
    fi

    LIBRCPRSH_LINK="$1"
    shift

    if is_member "$LIBRCPRSH_LINK" "$LIBRCPRSH_FAILEDLIST" ; then
        logerr "rcp/rsh to sibling host $LIBRCPRSH_LINK failed recently, not retrying"
        logerr "command was: $*"
        return "$LIBRCPRSH_EECODE"
    fi

    # Compute a temporary directory and a temporary file name for stderr.
    # (Note: this also assigns the default to HA_TMP if it was unset.)
    LIBRCPRSH_TMPD=${HA_TMP:=/var/opt/SUNWhadf/hadf/tmp}
    if [ ! -d "$LIBRCPRSH_TMPD" ]; then
        LIBRCPRSH_TMPD=/var/opt/SUNWcluster/run
    fi
    LIBRCPRSH_TMPERR=$LIBRCPRSH_TMPD/librcprsh.tmperr.$$

    # We redirect stderr into a temporary file so that we can grep
    # for 'Connection refused' below.
    # Evade internationalization: Must set locale to vanilla.
    # "$@" (rather than $*) hands each remaining argument to the program
    # exactly as the caller supplied it: no re-splitting on whitespace
    # and no local pathname expansion of patterns meant for the remote side.
    LC_ALL=C hatimerun -e "$LIBRCPRSH_EECODE" -k TERM -t "$LIBRCPRSH_TEMPTIMEOUT" \
        fdl_rcomm -e "$LIBRCPRSH_EECODE" "$LIBRCPRSH_LINK" "$LIBRCPRSH_PROGRAM" "$@" \
            2>"$LIBRCPRSH_TMPERR"
    LIBRCPRSH_RC=$?

    # Worst case, rcp of a file from the other host to this host
    # can exit zero (success) even when the inetd on the other host
    # is down and the work did not actually get done.  When the
    # other host's inetd is down, rcp will print 'Connection refused'
    # on stderr.  Check for that case and treat it like a timeout:
    if grep 'Connection refused' "$LIBRCPRSH_TMPERR" >/dev/null ; then
        LIBRCPRSH_RC=$LIBRCPRSH_EECODE
    fi

    # Copy the temporary stderr file back onto stderr where the
    # caller expects it to be:
    cat "$LIBRCPRSH_TMPERR" 1>&2

    if [ "$LIBRCPRSH_RC" -ne 0 ] && [ "$LIBRCPRSH_RC" -ne "$LIBRCPRSH_EECODE" ]; then
        # See if ping of the remoteprivatelink still works; if not, treat
        # that like the timeout case:
        if ping "$LIBRCPRSH_LINK" >/dev/null 2>&1 ; then
            :
        else
            LIBRCPRSH_RC=$LIBRCPRSH_EECODE
        fi
    fi

    if [ "$LIBRCPRSH_RC" -eq "$LIBRCPRSH_EECODE" ]; then
        # librcprsh_failed may exit the whole script (after a successful
        # "clustm reconfigure"), so log the captured stderr and remove the
        # temporary file BEFORE calling it; otherwise the diagnostics are
        # lost and the file is left behind on that path.
        log_err_file "$LIBRCPRSH_TMPERR"
        logerr "command was: $*"
        rm -f "$LIBRCPRSH_TMPERR" >/dev/null 2>&1
        librcprsh_failed "$LIBRCPRSH_LINK"
    fi
    rm -f "$LIBRCPRSH_TMPERR" >/dev/null 2>&1
    return "$LIBRCPRSH_RC"
}
    

call_rcp()
{
    # Usage: call_rcp [-e errorexitcode] [-t timeoutsecs] remoteprivatelink <same arguments as rcp command>
    # where errorexitcode if absent defaults to $LIBRCPRSH_TIMEOUTEC
    # and timeoutsecs if absent defaults to $LIBRCPRSH_TIMEOUT.
    # Will return errorexitcode if the rcp times out; otherwise returns
    # whatever librcprsh_common returns.
    LIBRCPRSH_PROGRAM="rcp"
    # Forward the arguments with "$@" (not $*) so that each one arrives
    # exactly as the caller wrote it, without being re-split on
    # whitespace or glob-expanded on the local host.
    librcprsh_common "$@"
    return $?
}


call_rsh()
{
    # Usage: call_rsh [-e errorexitcode] [-t timeoutsecs] remoteprivatelink <same arguments as rsh command>
    # where errorexitcode if absent defaults to $LIBRCPRSH_TIMEOUTEC
    # and timeoutsecs if absent defaults to $LIBRCPRSH_TIMEOUT.
    # Will return errorexitcode if the rsh times out; otherwise returns
    # whatever librcprsh_common returns.
    LIBRCPRSH_PROGRAM="rsh"
    # Forward the arguments with "$@" (not $*) so that each one arrives
    # exactly as the caller wrote it, without being re-split on
    # whitespace or glob-expanded on the local host.
    librcprsh_common "$@"
    return $?
}


call_fdl_rshstatus()
{
    # Usage: call_fdl_rshstatus [-e errorexitcode] [-t timeoutsecs] \
    #     remoteprivatelink <same arguments as fdl_rshstatus command>
    # where errorexitcode if absent defaults to $LIBRCPRSH_TIMEOUTEC
    # and timeoutsecs if absent defaults to $LIBRCPRSH_TIMEOUT.
    # Will return errorexitcode if the command times out; otherwise
    # returns whatever librcprsh_common returns.
    LIBRCPRSH_PROGRAM="fdl_rshstatus"
    # Forward the arguments with "$@" (not $*) so that each one arrives
    # exactly as the caller wrote it, without being re-split on
    # whitespace or glob-expanded on the local host.
    librcprsh_common "$@"
    return $?
}
