#!/usr/bin/perl
#
# Copyright (C) Nikhef 2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author:
#     Mischa Sall\'e <msalle@nikhef.nl>
#     NIKHEF Amsterdam, the Netherlands
#
########################################################################
#
# Nagios probe to test functioning of gLExec
#
# Nagios state can be one of the following:
# - Missing glexec command: CRITICAL
# - input proxies empty:    UNKNOWN
# - short timeout exceeded: WARNING
# - timeout exceeded:       CRITICAL
# - gLExec exit codes:
#   0   glexec succeeded:   OK
#   201 Client error:       CRITICAL
#   202 Internal error:     CRITICAL
#   203 Auth error:         CRITICAL
#   204 Overlap:            CRITICAL
#   126 execve failed:      WARNING
#   128+n signal:           WARNING
#   !=0 rc of payload:      WARNING
#
########################################################################

# DEFAULTS
# These are file-scoped lexicals, so they are visible in every package
# defined further down in this file.
# Note: version contains multiple dots, and hence is a string
my $probeversion="0.3.2";

# Note the following defaults can be overridden using cmdline options
# (-t, -c, -w and -e respectively). All times are in seconds.
my $deftimeout=10;	# Overall timeout for probe (when to send SIGKILL)
my $defcritical=8;	# When to send SIGTERM
my $defwarning=5;	# When to warn about slow running
my $defpayload="id -a";	# Which payload to run (command plus arguments)

########################################################################
# Logging package
#   Keeps an internal log trace which can be dumped with get_log.
#   The log level and the trace are class-wide: every logger object (and
#   every class-method call such as logger->log_func(...)) shares them.
########################################################################
package logger;
use strict;
use warnings;
{
    # Initialised here so that log_func() is usable (at level 0) even
    # when it is called before new().
    my $loglevel=0;
    my @logstring;

    # Constructor: new([loglevel])
    #   Resets the log level to the given value, or to 0 when it is
    #   omitted or undefined. Returns the new object.
    sub new {
        my $classname=shift;
        my $level=shift;
        # Two-argument bless, so subclasses get objects of their own class
        my $self=bless {}, (ref($classname) || $classname);
        if (defined $level) {
            $self->set_loglevel($level);
        } else {
            $loglevel=0;
        }
        return $self;
    }

    # Sets loglevel: set_loglevel(level)
    sub set_loglevel {
        my $self=shift;
        my $level=shift;
        $loglevel=$level;
    }

    # Logging function: log_func(priority, "logstring\n", ...);
    #   Stores the given strings when priority does not exceed the
    #   current log level, and silently drops them otherwise.
    sub log_func {
        my $self=shift;
        my $prio=shift;
        return if ($prio > $loglevel);
        push @logstring,@_;
    }

    # Dumps log: prints all stored entries, in order, to the currently
    # selected output handle. Entries are printed one at a time, exactly
    # as they were stored.
    sub get_log {
        my $self=shift;
        foreach my $myentry ( @logstring )  {
            print $myentry;
        }
    }
}

########################################################################
# Nagios status printing package
#   Records the nagios status (code, summary, optional performance data)
#   and prints it on request. The state is class-wide, so the package
#   can be used through plain class-method calls.
########################################################################
package nagstat;
{
    my $code;
    my $summary;
    my $perfdata;
    my @stat;

    # Constructor: resets the state to "unknown" (3) without summary or
    # performance data and (re)fills the table of status names.
    sub new()   {
	my $classname=shift;
	($code,$summary,$perfdata)=(3,undef,undef);
	@stat=qw(OK WARNING CRITICAL UNKNOWN);
	return bless {};
    }

    # Set nagios code (0-3) plus summary: set_status(code,"summary")
    # Only the first call has effect; once a summary is present, later
    # calls are ignored.
    sub set_status($$)   {
	my ($self,$newcode,$newsummary)=@_;
	unless (defined $summary)	{
	    $code=$newcode;
	    $summary=$newsummary;
	}
    }

    # Set internal performance data: set_perfdata("perfdata")
    sub set_perfdata($)   {
	my ($self,$newperfdata)=@_;
	$perfdata=$newperfdata;
    }

    # Prints "<STATUS>: <summary>", followed by "|<perfdata>" when
    # performance data has been set. A missing summary is replaced by
    # "unknown status". Return value is the code (0-3).
    sub get_status	{
	$summary="unknown status" unless (defined $summary);
	my $line=$stat[$code].": ".$summary;
	$line.="|".$perfdata if (defined $perfdata);
	print $line."\n";
	return $code;
    }
}

########################################################################
# Inter process communication package for nagios probes
#   Runs the probe command as a child process and installs signal
#   handlers. On SIGALRM the child is sent SIGTERM (when the term time
#   is below the kill time) and, on the following alarm, SIGKILL. On
#   SIGINT/SIGTERM the child is sent SIGTERM and the probe exits via the
#   configured exit function.
#   The first alarm has to be armed by the caller; the handler re-arms
#   it itself after sending SIGTERM.
#   All state is class-wide: one probe process is handled at a time.
########################################################################
package probeipc;
use POSIX ":sys_wait_h";
use Time::HiRes qw(alarm);
{
    my $pid;		# pid of the probe process, -1 when not started
    my $wpid;		# last waitpid() result, 0 means: not reaped yet
    my $status;		# wait status ($?) saved when waitpid() was called
    my $numsent;	# 1 once SIGTERM has been sent by alarm_handler
    my $killtime;	# time after which to send SIGKILL
    my $termtime;	# time after which to send SIGTERM
    my $exitfunc;	# code reference called to leave the probe

    # Constructor: new(exitfunc,[kill time], [term time])
    #   exitfunc  code reference, called when the probe has to exit from
    #             within a signal handler; it is expected not to return
    #   kill time defaults to 10, also when 0 is given
    #   term time defaults to the kill time, also when 0 is given
    # Resets the process state and installs the handlers for SIGALRM,
    # SIGINT and SIGTERM. Dies when exitfunc is missing.
    sub new {
        my $classname=shift;
        my $func=shift or die ($classname."::new() needs exitfunc arg\n");
        my $kill=(shift or 10); # probe default timeout is 10
        my $term=(shift or $kill);
        # Two-argument bless, so subclasses get objects of their own class
        my $self=bless {}, (ref($classname) || $classname);
        $self->set_exitfunc($func);
        $self->set_killtime($kill);
        $self->set_termtime($term);
        $pid=-1;
        $wpid=0;
        $status=0;
        $numsent=0;
        $SIG{'ALRM'} = \&alarm_handler;
        $SIG{'INT'} = \&int_handler;
        $SIG{'TERM'} = \&int_handler;
        return $self;
    }

    # Sets time after which to send SIGKILL
    sub set_killtime {
        my $self=shift;
        $killtime=shift;
    }

    # Sets time after which to send SIGTERM
    sub set_termtime {
        my $self=shift;
        $termtime=shift;
    }

    # Sets function to call when exiting after sending a SIGKILL
    sub set_exitfunc {
        my $self=shift;
        $exitfunc=shift;
    }

    # Signal handler for SIGALRM
    #   No probe process yet:      set CRITICAL status and exit.
    #   Probe already reaped:      nothing to do.
    #   Probe still running:       send SIGKILL and exit when no separate
    #                              term time is left or SIGTERM was sent
    #                              before; otherwise send SIGTERM and
    #                              re-arm the alarm for the remaining
    #                              (kill time - term time).
    sub alarm_handler {
        if ($pid<0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            nagstat->set_status(2,"probe killtime exceeded");
            $exitfunc->();
        }
        # Either is or was a process: test status
        logger->log_func(2,"subprocess is/was running with pid ".$pid."\n");
        if ($wpid!=0) { # process has finished (>0) or waitpid() failed (-1)
            return;
        }
        # Get status without blocking; keep it for wait_probe()
        $wpid=waitpid($pid,WNOHANG);
        $status=$?;
        if ($wpid==0) { # Still running
            if ($killtime<=$termtime || $numsent==1) {
                logger->log_func(2,"Sending SIGKILL to ".$pid."\n");
                kill(9,$pid);
                nagstat->set_status(2,"probe timeout exceeded");
                $exitfunc->();
            }
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15,$pid);
            $numsent=1;
            alarm($killtime-$termtime);
            nagstat->set_status(2,"probe critical time exceeded");
        }
        return;
    }

    # Signal handler for SIGINT and SIGTERM
    #   Sets CRITICAL status, sends SIGTERM to the probe process when it
    #   is still running and always leaves via the exit function.
    sub int_handler {
        my ($sig)=@_;

        logger->log_func(2,"Caught SIG$sig\n");
        nagstat->set_status(2,"probe interrupted with SIG$sig");
        if ($pid<0) { # No pid, nothing to do
            logger->log_func(2,"Payload hasn't started yet\n");
            $exitfunc->();
        }
        # Either is or was a process: test status
        if ($wpid!=0) { # process has finished (>0) or waitpid() failed (-1)
            logger->log_func(2,"Subprocess with pid ".$pid." already finished\n");
            $exitfunc->();
        }
        # Get status
        $wpid=waitpid($pid,WNOHANG);
        $status=$?;
        if ($wpid==0) { # Still running: send SIGTERM
            logger->log_func(2,"Sending SIGTERM to ".$pid."\n");
            kill(15,$pid);
        }
        $exitfunc->();
    }

    # Wait for the probe process and return (exitcode, signal number).
    #   When the process has not been reaped by a signal handler yet,
    #   this blocks until it exits. The values are always derived from
    #   the saved wait status, so it does not matter whether the process
    #   was reaped here or in a handler. Both values are undefined when
    #   waitpid() returned something other than the probe pid or -1.
    sub wait_probe {
        my $self=shift;
        my ($rc,$signo);

        if ($wpid<=0) {
            # Not reaped by a signal handler: wait for it here
            $wpid=waitpid($pid,0);
            $status=$? if ($wpid==$pid);
        }
        # Probe is gone: pending alarm is no longer needed
        alarm(0) if ($wpid==$pid);
        if ($wpid==$pid || $wpid==-1) {
            # For -1 (waitpid failed) fall back on the last saved status
            $rc=$status >> 8;
            $signo=$status & 127;
        }
        return ($rc,$signo);
    }

    # Starts specified command: run_probe("command",\$rc,\$signo) and returns 0
    # on normal exit, or 1 when command cannot be started.
    # The command is run with stderr merged into stdout; its output is
    # stored as log info (level 3). Exit code and signal number of the
    # command are stored in $rc and $signo. Nagios status is set when
    # applicable.
    sub run_probe {
        my ($self,$command,$rc,$signo)=@_;

        # Start command. Three-argument pipe open with a lexical handle;
        # the single command string is still handed to the shell, which
        # takes care of the redirection and the argument splitting.
        $pid=open(my $out,'-|',$command." 2>&1");
        if (!defined($pid)) {
            $pid=-1;	# keep the "no process" marker for the handlers
            alarm(0);
            nagstat->set_status(2,"Failed to run $command");
            return 1;
        }
        while (my $line=<$out>) {
            logger->log_func(3,$line);
        }
        ($$rc,$$signo)=$self->wait_probe();
        # $out is closed implicitly on return, after the exit status has
        # been collected above.
        return 0;
    }
}

########################################################################
# Running main probe package
########################################################################
package main;
use strict;
use warnings;

use Env qw(@PATH GLEXEC_LOCATION GLITE_LOCATION X509_USER_PROXY GLEXEC_CLIENT_CERT);
use Getopt::Long qw(:config no_ignore_case bundling);
use Time::HiRes qw(time alarm);

# Run-time settings, filled in by getopts(). All times are in seconds.
my $timeout;	# Total maximum runtime for probe
my $critical;	# Time after which to kill gLExec
my $warning;	# Time after which to warn about slow gLExec
my $payload;	# Payload plus arguments: relative uses $PATH to find
my $verbose;	# Verbosity level: number of -v flags, undefined when none

# Prints the full help text, including the current default values, and
# exits with status 0. The program name shown is $0 without its directory.
sub usage() {
    my $name=$0;
    $name =~ s/.*\///;
    my $helptext=<<EOHELP;
Usage: $name [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default: $deftimeout sec
 -w|--warning <timeout>          runtime after which to warn, default: $defwarning sec
 -c|--critical <timeout>         runtime after which the probe is to be killed,
                                 default: $defcritical sec
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <cmd>              command to be executed by gLExec
                                 default: \"$defpayload\"
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 --help                          show this helptext
 -h                              show short usage information
EOHELP
    print $helptext;
    exit 0;
}

# Prints the one-line usage summary. Does not exit; returns the (true)
# result of print, which callers use in "shortusage and exit" chains.
sub shortusage()	{
    my $name=$0;
    $name =~ s/.*\///;
    print "Usage: $name [options]\n";
}

# Prints the program name ($0 without its directory) and the probe
# version on one line. Does not exit; returns the (true) result of print.
sub version()	{
    my $name=$0;
    $name =~ s/.*\///;
    print "$name version: $probeversion\n";
}

# Parses command line options and sets global variables
#   Starts from the built-in defaults and overrides them with the values
#   given on the command line. -x and -g are stored directly in the
#   X509_USER_PROXY and GLEXEC_CLIENT_CERT environment variables.
#   Exits with 0 after printing help, short help or version, and with
#   3 (nagios UNKNOWN) when the command line cannot be parsed.
sub getopts()	{
    my $version;
    my $help;
    my $shorthelp;

    $timeout=$deftimeout;
    $critical=$defcritical;
    $warning=$defwarning;
    $payload=$defpayload;
    # H|host, p|port and u|url have no destination and take no value:
    # they are accepted and otherwise ignored.
    # NOTE(review): presumably present for compatibility with frameworks
    # that always pass these options -- confirm.
    my $parsed_ok=GetOptions(
	"t|timeout=f" => \$timeout,
	"c|critical=f" => \$critical,
	"w|warning=f" => \$warning,
	"x|x509-user-proxy=s" => \$X509_USER_PROXY,
	"g|glexec-client-cert=s" => \$GLEXEC_CLIENT_CERT,
	"e|execute=s" => \$payload,
	"v|verbose+" => \$verbose,
	"help+" => \$help,
	"h+" => \$shorthelp,
	"V|version+" => \$version,
	"H|host",
	"p|port",
	"u|url"
	);
    if (!$parsed_ok)	{
	# GetOptions has already reported the offending option on stderr.
	# usage() cannot be used here: it exits with 0 itself, which
	# nagios would interpret as OK.
	shortusage();
	exit(3);
    }

    if ($help)	{
	usage();
	exit(0);
    }
    if ($shorthelp)	{
	shortusage();
	exit(0);
    }
    if ($version)	{
	version();
	exit(0);
    }

    # Client certificate defaults to the user proxy
    if (!defined $GLEXEC_CLIENT_CERT)	{
	$GLEXEC_CLIENT_CERT=$X509_USER_PROXY;
    }
    # Negative times make no sense; SIGTERM is never sent later than
    # the overall timeout
    $timeout=0 if ($timeout<0);
    $critical=0 if ($critical<0);
    $warning=0 if ($warning<0);
    $critical=$timeout if ($timeout<$critical);
}

# Exit function: prints the nagios status line first, then dumps the
# collected log, and exits with the nagios status code (0-3).
sub nagios_exit() {
    # The status line has to precede the log output
    my $exitcode=nagstat->get_status();
    logger->get_log();
    exit($exitcode);
}

# Finds gLExec in path and pre-specified directories
#   Search order: $GLEXEC_LOCATION/sbin, the directories in $PATH,
#   $GLITE_LOCATION/sbin and finally a fixed list of system directories.
#   Side effects: sets GLITE_LOCATION to /opt/glite when it is unset and
#   sets PATH to "." when it is empty (both are environment variables).
#   Returns the full path of the first executable file found, or undef.
sub find_glexec   {
    my @default_path=("/usr/local/sbin","/usr/sbin","/sbin",
		      "/usr/local/bin","/usr/bin");

    # Try GLEXEC_LOCATION
    if (defined $GLEXEC_LOCATION) {
	logger->log_func(3,"GLEXEC_LOCATION=".$GLEXEC_LOCATION."\n");
	my $candidate=$GLEXEC_LOCATION."/sbin/glexec";
	# -f: a directory with search permission is not a command
	if (-f $candidate && -x _)    {
	    logger->log_func(2,"gLExec found at ".$candidate."\n");
	    return $candidate;
	}
	logger->log_func(2,"gLExec NOT found at \$GLEXEC_LOCATION\n");
    }

    # Try GLITE_LOCATION
    $GLITE_LOCATION="/opt/glite" if (!defined $GLITE_LOCATION);
    logger->log_func(3,"GLITE_LOCATION=".$GLITE_LOCATION."\n");

    # Only replace PATH when it is really empty: a PATH with a single
    # entry is perfectly valid and must be left alone
    @PATH=(".") if (!@PATH);

    for my $dir (@PATH,$GLITE_LOCATION."/sbin",@default_path) {
	logger->log_func(3,"Looking for glexec in ".$dir."\n");
	my $candidate=$dir."/glexec";
	if (-f $candidate && -x _)    {
	    logger->log_func(2,"gLExec found at ".$candidate."\n");
	    return $candidate;
	}
    }
    return undef;
}
    
# Translates the result of the gLExec run into a nagios status:
# glexec_to_nagios(exitcode, signal number, runtime in seconds)
#   Performance data (the runtime) is set whenever the exit code is 0.
#   Death by signal is tested first: in that case the exit code part of
#   the wait status is 0 and would otherwise be mistaken for success.
#   Returns 0 for a slow success and 1 in all other cases.
#   NOTE(review): that return value looks inverted with respect to the
#   other functions (0 = ok), but no caller in this file uses it, so it
#   is left unchanged -- confirm.
#   NOTE(review): the table in the file header lists signals and non-zero
#   payload exit codes as WARNING, while CRITICAL is reported here; the
#   severity is left unchanged -- confirm which one is intended.
sub glexec_to_nagios($$$)    {
    my ($rc,$signo,$dt)=@_;

    # No exit status available at all: do not pretend success
    if (!defined $rc)	{
	nagstat->set_status(3,"could not determine exit status of gLExec");
	return 1;
    }
    $signo=0 if (!defined $signo);

    # Nagios performance data needs the form label=value[UOM];warn;crit;min
    if ($rc==0)	{
	nagstat->set_perfdata("time=${dt}s;$warning;$critical;0");
    }

    if ($signo!=0)	{
	nagstat->set_status(2,"exit due to signal $signo ($rc)");
	return 1;
    }

    if ($rc==0) {
	if ($dt>=$warning)  {
	    nagstat->set_status(1,"gLExec took long time to succeed");
	    return 0;
	}
	nagstat->set_status(0,"Success");
	return 1;
    }

    # Exit codes with a specific meaning: [nagios code, summary]
    my %result_for=(
	126 => [1,"executable $payload can't be executed ($rc)"],
	201 => [2,"client error ($rc)"],
	202 => [2,"system error ($rc)"],
	203 => [2,"authorization error ($rc)"],
	204 => [2,"exit code overlap error ($rc)"],
    );
    if (exists $result_for{$rc})	{
	nagstat->set_status(@{$result_for{$rc}});
    } else {
	nagstat->set_status(2,
	    "executable $payload failed with non-zero exit code ($rc)");
    }
    return 1;
}

# Find gLExec command, payload command (when relative), runs it and returns
# status
#   Arms the alarm for the critical time, checks that the proxy files
#   exist and are not empty, runs "<glexec> <payload>" and translates the
#   result into a nagios status. The elapsed time is measured from the
#   start of this function and rounded to milliseconds.
#   Returns 1 when the probe could not be run, otherwise the return
#   value of glexec_to_nagios().
sub run_glexec()    {
    my $exitcode;
    my $signo;

    # Make sure to have starttime
    my $t1=time();

    # Set alarm before looking for gLExec to prevent NFS timeouts
    alarm($critical);

    # Find glexec command
    my $glexec=find_glexec();
    if (!defined $glexec) {
	nagstat->set_status(2,"glexec command not found");
	return 1;
    }

    # Check proxies (-s is only true for an existing, non-empty file)
    if (!defined $X509_USER_PROXY)   {
	nagstat->set_status(3,"\$X509_USER_PROXY is unset.");
	return 1;
    }
    if (! -s $X509_USER_PROXY)	{
	nagstat->set_status(3,
	    "\$X509_USER_PROXY does not point to a nonempty file.");
	return 1;
    }
    if (!defined $GLEXEC_CLIENT_CERT || ! -s $GLEXEC_CLIENT_CERT)	{
	nagstat->set_status(3,
	    "\$GLEXEC_CLIENT_CERT does not point to a nonempty file.");
	return 1;
    }

    # Find full path for payload if it's relative
    if ($payload !~ /^\//)	{
	# Command name is everything up to the first space
	(my $name=$payload) =~ s/ .*//;
	for my $dir (@PATH) {
	    logger->log_func(3,"Looking for ".$name." in ".$dir."\n");
	    my $fullname=$dir."/".$name;
	    # -f: do not mistake a directory for the command
	    if (-f $fullname && -x _)    {
		# $name is a literal prefix of $payload: replace it
		# without treating it as a regular expression
		$payload=$fullname.substr($payload,length($name));
		logger->log_func(2,"Payload set to ".$payload."\n");
		last;
	    }
	}
    }

    # Run actual probe in child process
    if (probeipc->run_probe("$glexec $payload",\$exitcode,\$signo)!=0)	{
	return 1;
    }

    # Probe exited: find exit status
    my $t2=time();
    my $dt=int(($t2-$t1)*1000+0.5)/1000;
    return glexec_to_nagios($exitcode,$signo,$dt);
}

# Main program. The objects returned by the constructors below are not
# kept: the three helper packages hold their state in class-wide
# variables and are used through class-method calls.

# Parse commandline options
getopts();

# Initialize logger and set loglevel
# ($verbose is undefined without -v; the logger then uses level 0)
logger->new($verbose);

# Initialize objects
nagstat->new();

# Initialize signal handling
# (exit via nagios_exit, SIGKILL after $timeout, SIGTERM after $critical)
probeipc->new(\&nagios_exit,$timeout,$critical);

# run actual gLExec probe
# (return value is not used: the outcome is recorded in nagstat)
run_glexec();

# Dump nagios status, log and exit
nagios_exit();

