#!/opt/SUNWstade/bin/perl -I/opt/SUNWstade/lib
#<copyright>
# ----------------------------------------------------------
# Sun Proprietary/Confidential Code
# Copyright 2001, Sun Microsystems, Inc. All rights reserved.
# ----------------------------------------------------------
#</copyright>

#
#  $Id: rasagent,v 1.135 2003/11/07 05:19:00 mckenney Exp $
#
use strict 'vars', 'subs';
use Getopt::Std;
use System;
use Solution;
use Timelapse;
use Timer;
use PDM;
use PDM::ConfigFile;
use Debug;
use Repeat;
use UNIVERSAL qw( isa ) ;

use vars qw(
   $util $MASTER_LOC $RASPORT  %opts  $LB $VERSION $SIGTERM  
   $HOME $DATA %DONE $CONF $CRON_LOC $SLAVE $PIDfile $renv $devices  
   $hosts $frus $device $notifs $FORCE $HOST_ID $HOST_NAME %HMAP  %AMAP $SE_CFG
);

# Command-line handling.
# When the OPTS environment variable is set it replaces the command-line
# arguments entirely.
if ($ENV{OPTS}) {
  # split(' ', ...) skips leading whitespace.  Splitting on / +/ produced an
  # empty first element for a value such as " -c", and getopts stops at the
  # first argument that does not start with '-', silently dropping every
  # option.
  @ARGV = split(' ', $ENV{OPTS});
}

# -F <file> and -s are documented in usage() and read further down
# ($opts{F}, $opts{s}); they have to be declared here, otherwise getopts
# rejects them as unknown options and the script only prints the usage.
if (!getopts("MHAfeD:hc?Pd:TvF:s", \%opts)) {
    print " $Getopt::Std::ERROR \n";
    usage();
}

# Test mode implies debug level 2 unless an explicit level was given.
$opts{d}     = 2 if ($opts{T} && !$opts{d});
$FORCE       = 1 if ($opts{f});
# "my $x = ... if COND" has undefined behaviour (see perlsyn); declare the
# variable unconditionally and make only the value depend on the option.
my $AUDIT    = $opts{A} ? 1 : undef;

System->set_bad_health(1) if ($opts{H});
System->set_eventMaxOff($opts{M});
System->set_audit($AUDIT);

# Passed to the report/state push calls at the end of the run; never set here.
my $force_refresh;

# Force the C locale for the rest of the run.
$ENV{LANG}   = 'C';
$ENV{LC_ALL} = 'C';

System->set_testMode($opts{T} || 0);
# -e is a plain flag, so the event directory is 1 with -e and "2" without.
my($e) = $opts{e} || "2";
System->set_eventDir($e);

# Echo the active run modes on STDERR.
my($op);
$op .= "Test on,"    if ($opts{T});
$op .= "Debug on,"   if ($opts{d});
$op .= "Print only," if ($opts{P});
if ($op) {
  chop($op);   # drop the trailing comma
  print STDERR "Options: $op\n";
}

# Both of these print and exit.
&version if ($opts{v});
&usage if ($opts{"?"} || $opts{"h"});

# Locate the installation directory: unless -D overrides it, take the first
# @INC entry and drop its last path component.
my($local_p) = $INC[0];
my($ix) = rindex($local_p, "/");

$HOME   = $opts{D} || substr($local_p,0,$ix);

System->set_home($HOME);
# Remove the snapshot log left by a previous run.
unlink "$HOME/DATA/snapshot.log";

if ($opts{c}) {  # from crons
  $CRON_LOC = "$HOME/System/cron_started";
  # When the marker file does not exist yet, this invocation only writes it
  # (containing Util->get_today) and exits without running the agents.
  if (!-f $CRON_LOC) {
     open(O, ">" . $CRON_LOC); print O Util->get_today; close(O);
     exit;
  }
}

if ($opts{M}) {  # remove Maximums Databases 
  unlink "$HOME/DATA/Max_email";
  unlink "$HOME/DATA/Max_events";
  unlink "$HOME/DATA/thresholds.cache";
}

Debug->level($opts{d});
# Message catalogue used for every $LB->{...} / $LB->expand(...) below.
$LB = Labels->read('rasagent');

# CHECK START FILE
# Without DATA/start the agent refuses to run, unless the debug level is 2
# or higher.
if (! -f "$HOME/DATA/start" && Debug->level()  < 2) { 
   Debug->print2($LB->{no_start});
   exit;
}

$VERSION   = "1.0";
$SIGTERM   = 15;
$DATA      = "$HOME/DATA";
$PIDfile   = "$DATA/pid";
# Configuration file: -F overrides the default DATA/rasagent.conf.
$CONF      = $opts{F} || "$DATA/rasagent.conf";

System->set_config($CONF);
System->set_schemaVersion("1.0");
# The start time of this run doubles as its run id.
System->set_runId(time);

$HOST_ID     = System->hostid;
$HOST_NAME   = System->hostname;
my $today    = Util->today;
System->set_ipno(Util->name2ip($HOST_NAME));

# Make sure DATA/Commands exists.
my $CD = "$HOME/DATA/Commands";
mkdir $CD, 0777 if (!-d $CD);

if (Debug->level() > 2) {
   print $LB->expand('new_snapshot', "/DATA/snapshot.log") . "\n";
}

$RASPORT     = System->getConfigPort($HOME);
System->set_rasport($RASPORT);
# The presence of DATA/SECURE switches the "secure" setting to "Y".
System->set_secure((-f "$HOME/DATA/SECURE") ? "Y" : "N");

PDM::ConfigFile->configFromMaster();

# Read the configuration and publish its parts through System.
($renv, $devices, $hosts, $notifs) = PDM::ConfigFile->read();

System->set_renv($renv);
System->set_configDevices($devices);
System->set_configHosts($hosts);

#
#  FIX HOSTNAME
#
# On a Sun-Solution rack (solution other than "N"), detect that the system
# hostname no longer matches the one stored in the configuration, then rename
# or drop the per-host data files, rewrite the configuration and rebuild the
# topology under the new name.
my $sys_hostname = System->hostname();

if ($renv->{solution} ne "N" && $renv->{hostname} ne $sys_hostname ) {
   # Both host names are now properly quoted; the closing quotes were
   # missing from this message.
   Debug->print2("The hostname of this Sun-Solution $renv->{solution_model} ".
                 "changed from '$renv->{hostname}' to '$sys_hostname'");
   my $D = System->get_home() . "/DATA/";
   unlink "$D/REMOTE_LISTS/$renv->{hostname}";
   rename "$D/Proc/$renv->{hostname}" , "$D/Proc/$sys_hostname";
   rename "$D/state/$renv->{hostname}", "$D/state/$sys_hostname";

   # Device names that start with the old hostname get that prefix replaced
   # by the short form of the new hostname.
   my $lh = length($renv->{hostname});
   my $short = Util->shortHostname($sys_hostname);
   foreach my $d (@$devices) {
      if (substr($d->{name},0,$lh) eq $renv->{hostname}) {
         $d->{name} = $short . substr($d->{name},$lh);
      }
   }
   unlink "$D/topo/$renv->{hostname}";
   unlink "$D/OLD_REPORTS/host:$renv->{hostname}";
   unlink "$D/cache_st_last";

   # Persist the new hostname together with the renamed devices.
   $renv->{hostname} = $sys_hostname;
   PDM::ConfigFile->write($renv, $devices, $hosts, $notifs);

   require TO;
   require Events;
   TO->clearTopo($sys_hostname);
   my $to = TO->readTopo($sys_hostname,1);
   TO->copyTopo($sys_hostname,"MERGE-MASTER");
}


# Default command timeouts, applied only when the configuration has none.
$renv->{"timeout.luxadm"}  = 200 if (!$renv->{"timeout.luxadm"});
$renv->{"timeout.rm6"}     = 200 if (!$renv->{"timeout.rm6"});
$renv->{"timeout.discman"} = 600 if (!$renv->{"timeout.discman"});

# A non-empty master location means this host runs as a slave.
$MASTER_LOC          = Util->findMaster();
$renv->{hostid}      = $HOST_ID;

#$renv->{logfile}    = "/var/adm/messages" if (!$renv->{logfile});

# Role text used in the 'running' banner printed later.
my($s1) = ($MASTER_LOC) ? "SLAVE, MASTER=>$MASTER_LOC" : "MASTER";

$util = Util->new({  data  => $DATA, home  => $HOME, renv  => $renv });
  
# CHECK AGENT RUN FREQUENCY
# Skipped in test mode and at debug level 2 or higher.  Otherwise the run is
# abandoned when is_Xmins("frequency.cache", frequency/2) is false; the
# message is only printed when not started from cron.
if (!$opts{T} && ($opts{d} < 2)) {
    if ($renv->{frequency} && !$util->is_Xmins("frequency.cache", $renv->{frequency}/2)) {
       if (!$opts{c}) {  # not from crons
           my($elapsed) = sprintf("%.1f", $Util::Xmin_value);
           print STDERR $LB->expand('freq_err', $renv->{frequency}, $elapsed) . "\n";
       }
       exit;
    }
}

# cron_conflicts receives the number of configured devices; it uses that
# count to scale how long a previous agent may run before it is killed.
if (&cron_conflicts($#$devices+1) ) { # abort if agent is already running
   die($LB->{cron_conflict});
}

###############################################
#  START MONITORING
###############################################
# The remaining modules are loaded only now, after the early-exit checks.
require PDM;
require FSA;
require Modules;
require Scheduler;
require TO;
require Util::Http;
require Events;
require Agent;

# NOTE(review): the interface name "dmfe1" is hard-coded; presumably the
# rack-internal network interface -- confirm.
$renv->{"solution_prefix"} = Util->ipPrefix(System->ifconfig("dmfe1"));

# Write the PID file, restart the SNMP daemons on solution racks, then clean
# old state and DataHost log-line files.
&create_lock_file();
&restart_daemons($renv);
State->cleanState();
&clear_data_host($renv);

print STDERR $LB->expand('running', $CONF, $today, "$s1 $HOST_NAME") . "\n";

# LOOK FOR CONFLICTS WITH DIAGS.
# Skipped with -f.  A true $st_found makes the agent loop below skip every
# agent except MESSAGE and STOOLS4.
my($st_found);
if (!$FORCE) {
  if (($st_found= Agent->stor_conflicts()) ) {
     Debug->print2($LB->expand('diag_conflict',  $st_found));
  }
}
System->set_stFound($st_found);

# ALTERNATE MASTER CHECK
#
# Default the role to "M" when no role is configured and DATA/MASTER does
# not exist.
$renv->{role} = "M" if (!-f System->get_home() . "/DATA/MASTER" && !$renv->{role});

# Alternate-master ("AM") handling: take over when the master cannot be
# pinged, and hand the role back once it answers again.
if ($renv->{role} eq "AM") {
   # ALTACTIVE holds the master this host is currently standing in for;
   # it is written on take-over and removed on release below.
   my $active = Util->file2string("ALTACTIVE");
   my $M = $MASTER_LOC || $active;
   my $alive;
   if (!$M) {
     Debug->err(TEXT => $LB->{no_master});
     exit(1);
   } else {
     $alive = Util->ping($M);
   }
   my $new_master;

   if (!$alive &&  !$active) { # take over
      Debug->print2($LB->expand('cannot_ping', $MASTER_LOC));

      # Advertise this host (by IP when it resolves, else by name) as master.
      $new_master = Util->name2ip($renv->{hostname}) || $renv->{hostname};
      Events->saveEvent("AM+",$renv->{hostname}, $MASTER_LOC);

      # Remember the master that was replaced, then run as master.
      Util->string2file($MASTER_LOC, "ALTACTIVE");
      Util->clearMaster();
      $MASTER_LOC = undef;

   } elsif ($alive && $active) {  # release
      # NOTE(review): $new_master is assigned before the INFO1 probe, so the
      # slave update further down still runs when the reply does not match
      # /version/ -- confirm this is intended.
      $new_master = $active;
      my $info = Util::Http->getCommand($new_master,"INFO1", 10);
      if ($info =~ /version/) {
        Events->saveEvent("AM-", $renv->{hostname} , $active);

        Util->setMaster($new_master);
        unlink System->get_home() . "/DATA/ALTACTIVE";
        $MASTER_LOC = $new_master;
        Debug->print2("Can ping $M, giving-up master...");
      }

   }
   # Tell every host that is neither master nor alternate master about the
   # new master, then rewrite the configuration.
   if ($new_master) {
      foreach my $h (@$hosts) {
        next if ($h->{role} eq "AM" || $h->{role} eq "M");
        my $ip = $h->{ipno} || Util->name2ip($h->{hostname});
        # The result of saveFile is not checked.
        my($err, $ans) = Util::Http->saveFile($ip, "MASTER", $new_master, 20);
      }
      PDM::ConfigFile->write($renv, $devices, $hosts, $notifs);
   }
}

# Housekeeping gated by Repeat->timer("agent", 60): write the WWN map,
# purge old scheduler tests and rotate the logs.
if (Repeat->timer("agent", 60)) {  # hourly operations
   TO->write_wwn_map();
   Scheduler->cleanUp(2); # remove 2 days old tests
   &rotate_logs();
}

# Record components that are currently locked so they can be skipped.
&check_locks($renv, $devices);

# The PDM object is handed to every agent, health monitor and provider below.
my($pdm) = PDM->new({  dir => "$HOME/DATA",
                      renv => $renv ,
                   devices => $devices,
                     hosts => $hosts,
                    notifs => $notifs,
                    });

# Optional auto-discovery from /etc/deviceIP.conf, gated by
# Timer->isXdays("auto_discover", 1) returning "YES".
if ($renv->{auto_discover} eq "auto") {
  my $timer = Timer->isXdays("auto_discover", 1);
  if ($timer eq "YES") {
    Debug->print2("AutoDiscovery from deviceIP.conf");
    require Logic::SWITCH;
    my($err, $nodes, $info) = Logic::SWITCH->addFromFile({
                                host => "", TEXT => 1, silent => 1,
                                monitor_on => 1,
                                fileToUse => "/etc/deviceIP.conf",
                                select_host => "local" } );
    # Only the returned nodes are logged; $err and $info are not examined.
    foreach my $n (@$nodes) {
       Debug->print2("Adding $n->{ipno}");
    }
  }
}


# Ordered list of agent module names to run.
my $agent_list = &list_agents($pdm);

Debug->dump('PDM', $pdm, 1);
# load_map fills %HMAP/%AMAP and returns the raw grep lines of the Health
# modules, which load_health_monitors scans for INITIAL/FINAL.
my $LH = &load_map();
&load_health_monitors($pdm, $LH);  # only INITIAL or FINAL

$pdm->initialHealth();  # check if slaves work and get their events.

# %LOADED tracks the health monitors already loaded by load_health();
# $ras_flag is passed to every agent's RUN and is never assigned here.
my( @OUT, $ras_flag, %LOADED);

require Agent::SE2;
my $skip = Agent::SE2->sequencer_status();  # check if the rack is off


# Main agent loop: load, initialise and run each agent module in turn.
foreach my $modname (@$agent_list) {
   # When sequencer_status() reported true, only the SE2 agent is run.
   next if ($skip && $modname ne "SE2");
   next if (Solution->exclude($renv->{solution}, $modname));
   # With a diagnostics conflict only MESSAGE and STOOLS4 are allowed to run
   # (substring test against that fixed list).
   if ($st_found) {
      if ( index("MESSAGE,STOOLS4", $modname) < 0 ) {
         Debug->print2("\n" . $LB->expand('exe_agent', $modname));
         Debug->print2($LB->{diag_skip});
         next;
      }
   }

   # SE and SE2 agents do 2 things, the rack from inside and from outside.
   # the module always runs for the inside at least.
   # TOPO, SE and SE2 therefore run even when their category is not selected.
   if (!Agent->category_selected($modname) && index("TOPO,SE,SE2", $modname) < 0) {
      Debug->print2("\n" . $LB->{skip} . " $modname.");
      next;
   }
   # Load Agent/<modname>.pm at run time; a failing module is reported and
   # skipped instead of aborting the run.
   if (!eval "require \"Agent/${modname}.pm\"") {
        Debug->err(LOADING => "$modname: $@ \n");
        next;
   }
   
   Debug->print1("\n" . $LB->expand('exe_agent', $modname) );

   # Load the health monitors mapped to this agent; skip the agent when one
   # of them fails to load.
   next if (!&load_health($modname) );
   my $f = "Agent::$modname";
   my $agent = $f->new();

   $agent->init({  data  => $DATA, 
                util  => $util,
                pdm   => $pdm, 
             hostname => $HOST_NAME,
                renv  => $renv, 
               master => $MASTER_LOC,
             });

   # An agent without a RUN method is reported, not fatal.
   if ($agent->can('RUN')) {
      $agent->RUN($ras_flag);

   } else {
      Debug->err('RUN_MISSING', ref($agent) );   
   }
}

# End of run: final health pass, event processing (providers on a master,
# message storage on a slave), serialisation, report/state push and PID
# file removal.
$pdm->finalHealth();  # final health: heartbeat..

$DB::single = 1;   # breakpoint when run under the Perl debugger
Debug->print2("\n" . $LB->{exe_pro});

my $eventCount = $pdm->getEventCount();
Debug->print2("$eventCount new Events");

require Agent::T3;
Agent::T3->slice_map();

if ($MASTER_LOC) {  # a slave
  $pdm->storeMessages;
  #$pdm->push($MASTER_LOC);

} else {
   FSA->RUN();
   my $providers = Modules->load("Provider");

   foreach my $p (@$providers) {
      my $pro = "Provider::$p";
      my $f = $pro->new($pdm);

      if ($f->can('RUN')) {
         Debug->print1("\nProvider $p ");
         $f->RUN($pdm);
      } else {
         # $pro is a plain class-name string, so ref($pro) was always empty
         # and the error never named the provider.  Report the class of the
         # object, as the agent loop does, falling back to the name.
         Debug->err('RUN_MISSING', ref($f) || $pro );
      }
   }
}

# -s: do not save the data found.
if ( !$opts{s}) {
    #Agent->save_all_caches($util);  # save all caches.
    $pdm->serialize();
}

Report->pushReportsList($MASTER_LOC, $force_refresh);

if ($MASTER_LOC) {  # a slave
  State->push($MASTER_LOC, $force_refresh);
} else {
  State->sev_summary();
}
Timelapse->save_all;

# Record the completion time for the run-frequency check, then release the
# PID lock.
Util->set_Xmins("frequency.cache");
Debug->print(PID => $LB->{remove_pid} . " (REMOVE_PID)");
unlink $PIDfile;

exit;



##########################################################
#                  SUBROUTINES
##########################################################

# usage()
# Print the command-line help on STDOUT and terminate the program.
# Never returns.
#
# NOTE(review): the option parser also accepts -e (it selects event
# directory 1 instead of the default "2"); its purpose is not visible in
# this file, so it stays undocumented here.
sub usage {
  print "
  usage: rasagent 
   -d <level> : Debug (2=most trace, 3=more details, write snapshot.log in /DATA)
   -? -h      : Help
   -f         : Force, run all agents.
   -M         : Clear Email/Events/Thresholds maximum database.
   -v         : version
   -D <dir>   : use <dir> as the home directory.
   -F <file>  : run against a different config file.
   -H         : force all health test to fail.
   -c         : started from the cron.
   -A         : audit all devices, including revision check.
   -T         : test mode (will not save cache or email)
   -P         : Print only, no email
   -s         : do not save the data found
    
";
  exit;
}

# clear_data_host($renv)
# Delete every file whose name contains "loglines" from $HOME/DATA/DataHost
# when the configured categories do not include "datahost".
#   $renv - configuration hash; only {categories} is read.
sub clear_data_host {
  my($renv) = @_;

  # Leave the directory alone while the datahost category is selected.
  return if (index($renv->{categories}, "datahost") >= 0);

  my $dir = "$HOME/DATA/DataHost";
  opendir(W, $dir);
  my @stale = grep { /loglines/ } readdir(W);
  closedir(W);

  foreach my $name (@stale) {
     unlink "$dir/$name";
  }
}

# create_lock_file()
# Write the current process id to $PIDfile.  When the file cannot be
# opened, log the problem and terminate the program.
sub create_lock_file {
  Debug->print(PID => $LB->{create_pid});

  unless (open(O,">$PIDfile")) {
    Debug->print(PID =>  $LB->expand('cannot_write_pid', $PIDfile));
    exit;
  }

  print O $$;
  close(O);
}

# restart_daemons($renv)
# On a solution rack (solution other than "N") make sure snmptrapd and snmpd
# are running, starting them through the scripts in $HOME/sysbin when the
# process list returned by System->findProcess does not mention them.
#   $renv - configuration hash; only {solution} is read.
sub restart_daemons {
  my($renv) = @_;
  return if ($renv->{solution} eq "N");   # INDY/MASERATI only

  # Each lookup has its own variable; the second one used to re-declare
  # "my $l" in the same scope, masking the first.
  my $trapd = System->findProcess("/snmptrapd");
  if ("@$trapd" !~ /snmptrapd/) {
    system("$HOME/sysbin/start_trap");
  }

  my $snmpd = System->findProcess("/snmpd");
  if ("@$snmpd" !~ /snmpd/) {
    system("$HOME/sysbin/start_snmp");
  }
}

# rotate_logs()
# Hand each agent log file, in a fixed order, to System->rotate.
sub rotate_logs {
  my @logs = (
    "$HOME/log/rashstart.log",
    "$HOME/log/cron.log",
    "$HOME/log/errors.log",
    "$HOME/log/snmpagent.log",
    "$HOME/log/scheduler.log",
    "$HOME/DATA/Events.log",
    "/var/adm/trap_messages",
  );

  foreach my $log (@logs) {
    System->rotate($log);
  }
}


# check_locks($renv, $devices)
# Build a comma-separated list of components that are currently locked and
# hand it to System->set_se_conflict (undef when nothing is locked).
#   $renv    - configuration hash; {hostname} and {solution} are read.
#   $devices - array ref of device hashes; {ipno} and {name} are read.
# Two sources are consulted:
#   * when /var/adm/log/SEcfglog exists, the lock files under
#     /opt/SUNWsecfg/etc, reported as "<short hostname>-<locks>";
#   * on an "se2" solution, /tmp entries whose name starts with an IP
#     address, mapped to the device configured with that address.
sub check_locks {
  my($renv, $devices) = @_;

  my $SE_CFG = "/var/adm/log/SEcfglog";
  my $list;

  if (-f $SE_CFG) {  # INDY
    my $enter_list;
    if (open(O, "/bin/ls /opt/SUNWsecfg/etc/.[stv]*lock 2>/dev/null|")) {
      my $l;
      while (defined($l = <O>)) {
        # NOTE(review): substr(..., 5, -6) keeps most of the directory part
        # of the path; presumably only the lock name was wanted -- confirm
        # against the consumer of set_se_conflict before changing it.
        $enter_list .= substr($l, 5, -6) . ",";
      }
      close(O);
    }
    if ($enter_list) {
      Debug->print2("SE_CFG: Locks found on $enter_list");
      $list = Util->shortHostname($renv->{hostname}) . "-" . $enter_list;
    }
  }

  if ($renv->{solution} eq "se2") {
    # IP address -> configured device name.
    my %MAP;
    foreach my $d (@$devices) {
      $MAP{$d->{ipno}} = $d->{name};
    }
    # Only read the directory when it could actually be opened.
    if (opendir(D, "/tmp")) {
      my $f;
      # defined(): an entry literally named "0" must not end the scan.
      while (defined($f = readdir(D))) {
         next if ($f !~ /^\d+\.\d+\.\d+\.\d+/);
         $list .= "$MAP{$f},";
      }
      closedir(D);
    }
  }

  System->set_se_conflict($list); # skip this component;
}
  

# load_map()
# Build the report-category maps by grepping the module sources under
# <home>/lib:
#   %HMAP{category} .= "<HealthModule>|"  for each Health/*.pm line that
#                                         contains "reportRequest" and
#                                         names a Report::CAT_<category>;
#   %AMAP{AgentModule} .= "<category>|"   for each Agent/*.pm line that
#                                         names a Report::CAT_<category>.
# Returns a reference to the raw grep output lines of the Health modules.
sub load_map {
  my $grep = (-x "/usr/bin/grep")? "/usr/bin/grep" : "/bin/grep";

  # The two directories get distinct variables; the second one used to
  # re-declare "my $D" in the same scope.
  my $health_dir = System->get_home() . "/lib/Health";
  my @LH = `$grep reportRequest $health_dir/*.pm`;

  foreach my $l (@LH) {
     my($module, $category) = _module_and_category($l);
     $HMAP{$category} .= $module . "|" if (defined $category);
  }

  my $agent_dir = System->get_home() . "/lib/Agent";
  my @LA = `$grep Report::CAT_ $agent_dir/*.pm`;

  foreach my $l (@LA) {
     my($module, $category) = _module_and_category($l);
     $AMAP{$module} .= $category . "|" if (defined $category);
  }
  return \@LH;
}

# _module_and_category($grep_line)
# Split one "path/Name.pm:source text" grep line into the module name
# ("Name") and the first Report::CAT_<category> it mentions.  Returns an
# empty list when the line names no category.
sub _module_and_category {
  my($line) = @_;
  my($fn, $rest) = split(/\:/, $line, 2);
  my $ix = rindex($fn, "/");
  $fn = substr($fn,$ix+1) if ($ix >= 0);
  if ($rest =~ /Report::CAT_(\w+)/) {
     return (substr($fn,0,-3), $1);   # drop the ".pm" suffix
  }
  return ();
}

# load_health($modname)
# Load and instantiate the health monitors associated with an agent module:
# every report category listed for the agent in %AMAP is looked up in %HMAP,
# and each monitor not yet recorded in %LOADED is required and constructed
# with the PDM object.
# Returns 1 on success, undef as soon as a monitor fails to load.
sub load_health {
   my($modname) = @_;

   my $categories = $AMAP{$modname};
   chop($categories);                    # drop the trailing "|"

   foreach my $category (split(/\|/, $categories)) {
      my $monitors = $HMAP{$category};
      chop($monitors);                   # drop the trailing "|"

      foreach my $monitor (split(/\|/, $monitors)) {
         next if ($LOADED{$monitor});

         # Marked before the load attempt, so a failing monitor is not
         # retried for later agents.
         $LOADED{$monitor} = 1;
         Debug->print2("Loading Health '$monitor'.");

         unless (eval "require \"Health/$monitor.pm\"") {
            print STDERR "Error load_health_monitor: $@ \n";
            return undef;
         }
         my $class = "Health::$monitor";
         $class->new($pdm);
      }
   }
   return 1;
}


# load_health_monitors($pdm, $list)
# Load and instantiate the health monitors whose source lines mention
# INITIAL or FINAL.
#   $pdm  - PDM object passed to each monitor constructor.
#   $list - array ref of "file:source text" grep lines (as returned by
#           load_map).
# A monitor that fails to load is reported on STDERR and skipped.
sub load_health_monitors {
  my($pdm, $list) = @_;

  # Comma-delimited list of module files (without ".pm") to consider.
  my($list2) = ",";
  foreach my $l (@$list) {
      my($fn, $rest) = split(/\:/, $l, 2);
      if ($l =~ /INITIAL/ || $l =~ /FINAL/) {
          $list2 .= substr($fn,0,-3) . ",";
      }
  }

  my $mods = Modules->read("Health", "Slave");

  push(@$mods, 'Slave') if (!$MASTER_LOC);  # slave_health only runs on the master

  foreach my $health_monitor (@$mods) {
      # NOTE(review): this is a substring test, so a monitor whose name is
      # contained in another listed path also matches -- confirm.
      next if (index($list2, $health_monitor) < 0);
      Debug->print2("Loading Health '$health_monitor'.");
      if (eval "require \"Health/${health_monitor}.pm\"") {
         my $f = "Health::$health_monitor";
         $f->new($pdm);   # create hm, it will register with the pdm.
      } else {
         print STDERR "Error load_health_monitor: $@ \n";
      }
  }

}


# list_agents($pdm)
# Return a reference to the ordered list of agent module names to run.
# The names come from Modules->read("Agent"), sorted, with any name
# containing "Parent" dropped.  TOPO, HOST, SE and SE2 are moved behind the
# ordinary agents, and SAN is placed last.
#   $pdm - accepted for interface compatibility; not used.
sub list_agents {
  my($pdm) = @_;

  # Each list is declared exactly once; @mods2 used to be declared twice in
  # this scope, next to several unused variables.
  my(@mods2, @san, @san2);

  my $mods = Modules->read("Agent");

  #  push topo and san at the end of the agent list
  foreach my $m (sort @$mods) {
     next if ($m =~ /Parent/);
     if (index(",SAN,", ",$m,") >= 0) {
        push(@san2, $m);
     } elsif (index(",TOPO,HOST,SE,SE2,", ",$m,") >= 0) {
        push(@san, $m);
     } else {
        push(@mods2, $m);
     }
  }
  push(@mods2, @san, @san2);

  return \@mods2;
}

# version()
# Print the product banner followed by the first line of
# /opt/SUNWstade/System/config, then terminate the program.  Never returns.
sub version {
  # The command used to end in a stray double quote, which made the shell
  # fail on an unterminated string and left $v empty.
  my($v) = `/usr/bin/cat /opt/SUNWstade/System/config`;
  print "
  Storage Automated Diagnostic Environement $v \n";
  exit;

}




# CRON CONFLICTS
# 0 = OK
# 1 = exit

# cron_conflicts($device_cnt)
# Decide whether this run has to abort because a previous agent is still
# running.
#   $device_cnt - number of configured devices; a running agent is left
#                 alone until its PID file is older than 600 seconds per
#                 device.
# Returns 0 when the run may proceed (no PID file, empty PID file, or the
# recorded process is gone) and 1 when it must exit (a previous agent is
# still running, or could not be killed).
# A previous agent that has exceeded its time budget is killed, first with
# $SIGTERM and on the third attempt with signal 9; a warning mail is sent
# when $repeat->read() allows it.
sub cron_conflicts {
  my($device_cnt) = @_;
  my($cnt, $created, $PID, $pspid, $children);
  my $renv = System->get_renv();

  return 0 if (!-f $PIDfile);  # no pidfile

  if (open(O,$PIDfile)) {
     $PID = <O>;
     close(O);
  }
  # Strip a trailing newline or stray blanks so the string comparison with
  # the pid reported by findProcesses cannot fail on whitespace.
  $PID =~ s/\s+//g if (defined $PID);

  $cnt = 0;
  my $sig = $SIGTERM;
  my $repeat = Repeat->new("/tmp/ras_cron_conflicts", 60*60);
  if (!$PID) {
     $repeat->clear();
     return 0;
  }

  # The loop only ends through one of the returns below.
  while (1) {
     $cnt++;
     if ($cnt > 3) {
        my $last_command = Util->readf("last_run_command");
        Debug->err('FAILING_TO_KILL',$PID, $pspid . ", last command executed: $last_command");
        return 1;
     }
     $sig = 9 if ($cnt == 3);
     # Assign to the variables declared at the top of the sub.  A "my" here
     # used to shadow $pspid, so the FAILING_TO_KILL report above always
     # showed an empty pid.
     ($pspid, $children) = Util->findProcesses($PID);

     if ($pspid eq $PID) {
       $created =  (stat($PIDfile))[9];
       if (time - $created > (600*$device_cnt)) {
         if ($repeat->read()) {
            require Mail;
            my $hn = System->hostname();
            my $last_command = Util->readf("last_run_command");
            Mail->mail($renv->{admin_email}, "Storage A.D.E.", 
              "CRON_CONFLICT on $hn", "\nStorage A.D.E. agent (pid=$pspid) on host $hn appears to be hung. ".
              "The next cron agent is attempting to kill it. Verify that your system ".
              "is working normally and that the 'luxadm' command works.\n" .
              "last command executed: $last_command \n"
            );
         }
         Debug->err(CRON_CONFLICT => "killing $pspid, @$children");
         kill $sig, $pspid;
         kill $sig, @$children if ($#$children >= 0);
         sleep(1);
       } else {
         Debug->print('RASAGENT_RUNNING', $pspid);
         return 1;
       }
     } elsif (!$pspid) {
       # The recorded process no longer exists: stale PID file.
       unlink $PIDfile;
       Debug->print('PID' => "Remove File");
       return 0;
     } elsif ($pspid ne $PID) {
       Debug->err('WRONG_PROGRAM', $pspid);
       # Only signal a value that is entirely numeric: a string that merely
       # contains digits would numify to something else (possibly 0, which
       # signals the caller's own process group).
       kill $sig, $pspid if ($pspid =~ /^\s*\d+\s*$/);
     }
  }
}





# datetime()
# Return the current local time formatted as "MM-DD-YYYY HH:MM:SS".
sub datetime {
  my($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
  # localtime months are zero-based and years are counted from 1900.
  return sprintf("%2.2d-%2.2d-%d %2.2d:%2.2d:%2.2d",
                 $mon + 1, $mday, $year + 1900, $hour, $min, $sec);
}

#######################
#     SIGNALS
#   sub die_signal {
#     my($sig) = @_;
#     print "Caught a signal SIG$sig -- aborting \n";
#     exit(1);
#   }
#   
#   sub catch_signal {
#     my($sig) = @_;
#     Debug->print3("Caught a signal SIG$sig") if ($sig ne "CHLD");
#   }
#   
#   $SIG{CHLD} = \&catch_signal; # 'IGNORE';
#   $SIG{TERM} = \&catch_signal;
#   $SIG{INT}  = \&die_signal;
#   
#   foreach my $sig ('SEGV', 'QUIT','INT','ILL','TRAP','IOT','ABRT',
#                     'EMT','FPE','KILL','BUS') {
#       $SIG{$sig} = \&die_signal;
#   }
