#! /bin/ksh
#
#pragma ident       "@(#)dns_probe.shi 1.9     01/03/28 SMI"
#
#	Copyright 12/20/96 Sun Microsystems, Inc.  All Rights Reserved.
#

# Usage: dns_probe <instance name>
# Started up in the background via pmfd in dns_fm_start during reconfiguration.

#
# The probe is not called in the context of the data-service methods,
# so the framework binary directories are appended to PATH explicitly.
#
PATH="${PATH}:/opt/SUNWcluster/bin:/opt/SUNWcluster/ha/dns"

# First positional argument: the instance name (see Usage above).
INST_NAME="$1"

#
#	Copyright 11/18/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#
#pragma ident "@(#)ds_boiler	1.1 97/06/12 SMI"
#
#ident "@(#)ds_boiler		1.7	96/11/18 SMI"
#
# common boiler for HA Internet Pro data services
#
#


# Program identity and syslog settings.  The LOGGER / HA_SLOG* values are
# presumably consumed by the logging helpers in ds_utilities -- confirm there.
ARGV0=$(basename $0)
prog_path=$(dirname $0)
LOGGER=logger
HA_SLOGTAG=hadf
HA_SLOGFACILITY=$(haget -f syslog_facility)

# Pull in the utilities shared by the HA data services (found via PATH).
. ds_utilities

# Put this service's clust_progs directory on PATH.  When the program
# already lives under a clust_progs directory, its own directory suffices.
if expr "$prog_path" : '.*/clust_progs' >/dev/null 2>&1; then
	PATH="${prog_path}:${PATH}"
else
	PATH="${prog_path}:${prog_path}/../clust_progs:${PATH}"
fi

# Same treatment for this service's fault_progs directory.
if expr "$prog_path" : '.*/fault_progs' >/dev/null 2>&1; then
	PATH="${prog_path}:${PATH}"
else
	PATH="${prog_path}:${prog_path}/../fault_progs:${PATH}"
fi

#
# Host names of the local node and of every other node, kept for use by
# later hactl commands.
#
REMOTEHOSTS=
LOCALHOST=$(uname -n) || {
	logerr `gettext "Cannot obtain name of local host"`
	exit 1
}

# Every physical host that is not the local one counts as a remote host.
PHYS_HOSTS="$(haget -f all_physical_hosts)"
for phys_host in $PHYS_HOSTS; do
	if [ "$phys_host" = "$LOCALHOST" ]; then
		continue
	fi
	REMOTEHOSTS="$REMOTEHOSTS $phys_host"
done
#! /bin/ksh 
#
#pragma ident   "@(#)do_service.m4 1.7     00/06/01 SMI"
#
#	Copyright 12/20/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#
#

#
# Path of the named boot file.  It is assigned here rather than hard-coded
# at each use so that it could be chosen at run time should an OS release
# require it (the initial motivation came from Orion).
#
NAMED_CONF_FILE=/etc/named.boot

# Common prefix of every syslog message id emitted by the HA-DNS methods.
SYSLOG_PREFIX="SUNWcluster.ha.dns"

# Positional arguments: comma-separated lists of logical hosts that are,
# and are not, mastered by this node.
ARG_MASTERED=$1
ARG_NOT_MASTERED=$2

# Turn each comma-separated list into a blank-separated sh word list.
MASTERED="$(echo $ARG_MASTERED | tr ',' ' ')"
NOT_MASTERED="$(echo $ARG_NOT_MASTERED | tr ',' ' ')"

# Historical configuration-file handling, retained for reference:
# This file is replicated on both servers.
## HADNS_CONFIG_FILE=/etc/opt/SUNWhadns/hadsconf
## if [ ! -f $HADNS_CONFIG_FILE ]; then
##	logerr `gettext "$HADNS_CONFIG_FILE doesn't exist"`
##	exit 1
## fi
## source_env_file $HADNS_CONFIG_FILE

#
# Have the parser load the DNS configuration.  Per the original note the
# helper logs its own error message on failure, so none is logged here;
# the script simply exits.
#
source_env DNS || exit 1

#
# Seconds to wait for SIGTERM to stop a process before giving up.
# This should really come from the config file.
#
STOP_TIMEOUT=15

#
# bundle_do_svc <action>
#
# is called for each instance; it carries out <action> for the instance
# described by the current _INST_* variables.
#
# Actions:
#   start                  sanity-check the instance, then start it via pmfadm
#   stop | abort           stop the instance via pmfadm (TERM, then KILL)
#   fm_start               start the probe via pmfadm as <instance>.probe
#   fm_stop                stop the probe via pmfadm (TERM, then KILL)
#   fm_check_this_host_ok  nslookup the local host name if the instance's
#                          logical host is listed in $MASTERED
# Any other action is ignored.
#
# This function never returns to its caller: every path ends in "exit",
# with status 0 on success or when there is nothing to do, and status 1
# after an error has been logged.
#
# Globals read: SYSLOG_PREFIX, NAMED_CONF_FILE, STOP_TIMEOUT, MASTERED,
#               LOCALHOST and the _INST_* instance variables.
#
bundle_do_svc ()
{
	action=$1
	typeset prefix
	prefix="$SYSLOG_PREFIX.$action"

	case $action in

	'start')

		# First do some error checking.  The operands are quoted so
		# that an empty variable fails the check instead of silently
		# turning the test into a one-argument "non-empty string" test.

		if [ -f "${NAMED_CONF_FILE}" ]; then
			lognotice "$prefix.2050"\
			    `gettext "${NAMED_CONF_FILE} exists but should not."`
		fi

		if [ ! -d "${_INST_CONF_DIR}" ]; then
			logerr "$prefix.4040"\
`gettext "DNS missing ${_INST_CONF_DIR} directory."`
			exit 1
		fi

		# NOTE(review): when NAMED_CONF_FILE is an absolute path this
		# tests <conf dir>/<absolute path>; confirm that is intended.
		if [ ! -f "${_INST_CONF_DIR}/${NAMED_CONF_FILE}" ]; then
			logerr "$prefix.4041"\
`gettext "DNS missing ${_INST_CONF_DIR}/${NAMED_CONF_FILE} configuration file."`
			exit 1
		fi

		if [ ! -x "${_INST_START}" ]; then
			logerr "$prefix.4042"\
`gettext "${_INST_START} is not executable."`
			exit 1
		fi

		if [ ! -x /usr/sbin/in.named ]; then
			logerr "$prefix.4043"\
`gettext "/usr/sbin/in.named is not executable"`
			exit 1
		fi

		# The process monitor facility calls the start program,
		# passing to it the instance-specific information it needs.
		# Note that we're using pmf to start/stop, but not to probe.
		# The start program, dns_start, takes the DNS database dir
		# as its argument.

		# Only pass the retry options when retrying is enabled.
		if [ "${_INST_RETRY}" = "n" ]; then
			pmfadm -c ${_INST_NAME} \
			   ${_INST_START} ${_INST_CONF_DIR}
		else
			pmfadm -c ${_INST_NAME}          \
			       -n ${_INST_RETRY_TIMES}    \
			       -t ${_INST_RETRY_INTERVAL} \
			       -a ${_INST_PROBE_CALLBACK_1} \
			   ${_INST_START} ${_INST_CONF_DIR}
		fi

		if [ $? -ne 0 ]; then
			logerr "$prefix.4044" `gettext "pmfadm failed to start DNS instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2040" `gettext "Started DNS instance ${_INST_NAME}"`
		fi
	;;

	'stop' | 'abort')

		# remove in.named from pmfd's queue and then kill it:
		# TERM first, KILL only if the TERM request fails.
		pmfadm -s ${_INST_NAME} -w  ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME} KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4045" `gettext "pmfadm failed to stop DNS instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2041" `gettext "Stopped DNS instance ${_INST_NAME}"`
		fi
	;;

	'fm_start')

		# Check whether probe is required on this node.

		need_to_run_probe ${_INST_LOGICAL_HOST} ${LOCALHOST}

		if [ $? -ne 0 ]; then
			exit 0
		fi


		# pmf starts dns_probe
		# dns_probe runs until dns_fm_stop kills it.
		# Don't start probe if diskset is in maintenance mode.

		# If HA-DNS's diskset is in maint mode, exit now.
		MAINT=`haget -f is_maint -h ${_INST_LOGICAL_HOST}`
		if [ "$MAINT" = "1" ]; then
			exit 0
		fi

		pmfadm -c ${_INST_NAME}.probe ${_INST_PROBE_PROG_1} ${_INST_NAME}

		if [ $? -ne 0 ]; then
			logerr "$prefix.4046"\
			  `gettext "pmfadm failed to start DNS probe instance ${_INST_NAME}.probe "`
			exit 1
		else
			lognotice "$prefix.2042" `gettext "Started DNS probe instance ${_INST_NAME}.probe"`
		fi
	;;

	'fm_stop')

		# If probe not running, do nothing
		ha_svc_not_running ${_INST_NAME}.probe && exit 0


		# stop monitoring dns_probe, then kill it.
		# The KILL fallback must name the probe tag
		# (<instance>.probe); the bare instance tag belongs to the
		# DNS server started by the 'start' action.
		pmfadm -s ${_INST_NAME}.probe -w ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME}.probe KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4047"\
			 `gettext "pmfadm failed to stop DNS probe instance ${_INST_NAME}.probe"`
			exit 1
		else
			lognotice "$prefix.2043" `gettext "Stopped DNS probe instance ${_INST_NAME}.probe"`
		fi
	;;

	'fm_check_this_host_ok')

		# If this host is not providing name service, just return.

		# If the HA-DNS logical host is not
		# currently mastered by this machine, exit now.

		is_member "${_INST_LOGICAL_HOST}" "$MASTERED"
		if [ $? -ne 0 ]; then
			exit 0
		fi

		LOCALHOST=`uname -n`

		# Otherwise, probe name service now.
		# If dead, request will time out
		# in ${_INST_PROBE_TIMEOUT_1} secs.
		# Note: nslookup default timeout is 1.5 minutes.
		hatimerun -t ${_INST_PROBE_TIMEOUT_1} /usr/sbin/nslookup \
		    $LOCALHOST > /dev/null 2>&1
		if [ $? -ne 0 ]; then
			logerr "$prefix.4048"\
`gettext "This server is supposed to be providing DNS service, but isn't"`
			exit 1
		fi
	;;

	esac

	exit 0
}
#include_boiler


#
# Probe main line.
#
# Reads the probe parameters of instance $INST_NAME, then loops forever:
# sleep, look the HA-DNS logical host up through the HA-DNS server itself,
# and on failure either restart DNS (when the logical host is mastered by
# this node) or, after one grace interval, request a takeover.
#
prefix="SUNWcluster.ha.dns.probe"

set_inst_name ${INST_NAME}

if [ -z "$INST_NAME" ]; then
	logerr "$prefix.4050" `gettext "Usage: $ARGV0 <instance>"`
	exit 1
fi

MASTERED_LOGICAL_HOSTS="`haget -f mastered`"

DNS_CONFIG=`get_config_param $INST_NAME CONF_DIR`
# required parameter
if [ -z "$DNS_CONFIG" ]; then
	logerr "$prefix.4051" \
	    `gettext "DNS_CONFIG value not set for instance $INST_NAME"`
	exit 1
fi

DNS_HOST=`get_config_param $INST_NAME LOGICAL_HOST`
# Without a logical host name there is nothing to probe: nslookup would be
# run without arguments and the server-name comparison below could never
# succeed.  (Message id 4053 is new; it is not used elsewhere in this file.)
if [ -z "$DNS_HOST" ]; then
	logerr "$prefix.4053" \
	    `gettext "LOGICAL_HOST value not set for instance $INST_NAME"`
	exit 1
fi

DNS_PROBE_INTERVAL=`get_config_param $INST_NAME PROBE_1_INTERVAL`
# The parser requires this to be set, but guard anyway: with an empty
# value "sleep" fails immediately and the probe loop spins without pause.
# (Message id 2065 is new; it is not used elsewhere in this file.)
if [ -z "$DNS_PROBE_INTERVAL" ]; then
	lognotice "$prefix.2065"\
`gettext "INTERVAL value not set for instance $INST_NAME; using 60 seconds"`
	DNS_PROBE_INTERVAL=60
fi
# the parser doesn't check for negative values
if [ "$DNS_PROBE_INTERVAL" -lt 0 ]; then
	lognotice "$prefix.2060"\
`gettext "INTERVAL value is negative for instance $INST_NAME; using 60 seconds"`
	DNS_PROBE_INTERVAL=60
fi

DNS_PROBE_TIMEOUT=`get_config_param $INST_NAME PROBE_1_TIMEOUT`
# optional parameter, parser doesn't check for <= 0 values
if [ -z "$DNS_PROBE_TIMEOUT" ]; then
	lognotice "$prefix.2061"\
`gettext "TIMEOUT value not set for instance $INST_NAME; using 60 seconds"`
	DNS_PROBE_TIMEOUT=60
fi
# what timeout value is too low?
if [ "$DNS_PROBE_TIMEOUT" -le 0 ]; then
	lognotice "$prefix.2062"\
`gettext "TIMEOUT is <= zero for instance $INST_NAME; resetting to 60 seconds"`
	DNS_PROBE_TIMEOUT=60
fi

DNS_TAKEOVER=`get_config_param $INST_NAME PROBE_1_TAKEOVER`
# optional parameter
if [ -z "$DNS_TAKEOVER" ]; then
	lognotice "$prefix.2063"\
	   `gettext "TAKEOVER value not set for instance $INST_NAME; using 'y'"`
	DNS_TAKEOVER=y
fi

# We expect this file to live on both servers.
# A missing file is only reported; the probe keeps going.
if [ ! -f /etc/resolv.conf ]; then
	logerr "$prefix.4052" `gettext "Missing /etc/resolv.conf file"`
fi

# LOCAL records whether the logical host is mastered by this node.  It is
# computed once, before the loop, and is not refreshed afterwards.
LOCAL=no
is_member "$DNS_HOST" "$MASTERED_LOGICAL_HOSTS"
if [ $? -eq 0 ]; then
	# DNS_HOST is running locally
	LOCAL=yes
fi

# Scratch file that receives the nslookup output of each probe.
DNSPROBEFILE=/var/opt/SUNWcluster/run/.`basename $0`.$INST_NAME
DNSGRACE=0     # Set once a sibling failure has been given its grace interval
RETRY=0        # Number of local restarts attempted so far
FAIL=0         # This is not cleared every loop
probefail=0    # This is cleared every loop

while : ; do

	probefail=0

	# Take a nap here, instead of at the end of loop.
	# At start-up, this gives more time to the server to start

	sleep "$DNS_PROBE_INTERVAL"


	# nslookup normally times out in 1.5 minutes, or goes to
	# the next dns server on the /etc/resolv.conf list.
	#
	# Instead of looking up the "physical host", look up the HA-DNS
	# logical host itself, using the logical host as the server to
	# query.  We don't want to depend on what named.db looks like
	# in the DNS setup.
	#
	hatimerun -t "$DNS_PROBE_TIMEOUT" /usr/sbin/nslookup \
		"$DNS_HOST" "$DNS_HOST" > "$DNSPROBEFILE" 2>&1

	if [ $? -ne 0 ]; then
		probefail=1
	fi

	# We don't want nslookup to contact other nameservers if it
	# fails to resolve the name locally.  However, if nslookup can't
	# contact the server given on the command line it will resort to
	# the servers listed in /etc/resolv.conf, and may succeed even
	# if the HA-DNS server is dead/stuck.
	#
	# We are saved by the fact that nslookup reports the name of the
	# DNS server which resolved the query.  Comparing that name (up
	# to its first dot) with the logical host makes sure the reply
	# came from the HA-DNS server.


	if [ $probefail -eq 0 ]; then
		SERVER=` awk ' $1=="Server:" { print $2 }' "$DNSPROBEFILE" | awk -F. ' { print $1 } ' `
		if [ -z "$SERVER" ]; then
			probefail=1
		else
			# Quoted so that an unexpected value (for example
			# several "Server:" lines) counts as a mismatch
			# rather than as a test syntax error that would
			# leave probefail at 0.
			if [ "$SERVER" != "$DNS_HOST" ]; then
				probefail=1
			fi
		fi
	fi


	if [ $probefail -ne 0 ]; then
		FAIL=1
		# if running locally, restart it
		if [ "$LOCAL" = "yes" ]; then
			logerr "$prefix.5000" `gettext "dns failed locally"`
			RETRY=`expr $RETRY + 1`
			logerr "$prefix.5001" \
			    `gettext "restarting dns; restart number $RETRY"`
			dns_svc_start "$MASTERED_LOGICAL_HOSTS" ""
		else
			logerr "$prefix.5002" `gettext "dns failed on sibling"`
			# give sibling chance to restart dns
			# before doing a takeover
			if [ $DNSGRACE -eq 0 ]; then
				DNSGRACE=1
				continue
			fi
			if [ "$DNS_TAKEOVER" = "y" ]; then
				# $REMOTEHOST set in ds_boiler
				# hactl -t -s dns -p $REMOTEHOST
				# NOTE(review): CURRENT_MASTER is not read
				# anywhere in this file; confirm whether it
				# is still needed.
				CURRENT_MASTER="`haget -f master -h $DNS_HOST`"
				pmfadm -c ${INST_NAME}.hactl hactl -t -s dns -l $DNS_HOST

			fi
		fi
	else
		# Report recovery only after a previously logged failure.
		if [ $FAIL -eq 1 ]; then
			lognotice "$prefix.2064"\
			`gettext "DNS instance ${INST_NAME} is up and running"`
		fi
		FAIL=0
		DNSGRACE=0
	fi


done
