"""
Nagios probe for ARC CEs
"""

import arc, argparse, logging, os, pipes, shutil, time
from arcnagios import nagutils, vomsutils, persistence
from arcnagios.nagutils import ServiceReport, ServiceOK, \
	ServiceCRITICAL, ServiceUNKNOWN
from arcnagios.arcutils import arcstat, arcprune, ParseError, \
	terminal_job_states, nonterminal_job_states
from arcnagios.jobplugins import load_jobplugin

log = logging.getLogger(__name__)

def write_jsdl(jsdl_path, script_path, *script_args, **kwargs):
    """Write a JSDL job description to `jsdl_path`.

    The job executes the basename of `script_path` with `script_args` as
    its arguments, and `script_path` itself is staged in as a local file.

    Required keyword arguments:
      job_name -- text of the <JobName> element.
      application_name -- text of the <ApplicationName> element.

    Optional keyword arguments:
      output, error -- names of the stdout and stderr files, defaulting to
          'stdout.txt' and 'stderr.txt'.
      staged_inputs, staged_outputs -- sequences of
          (filename, url, urloptions) triples written as <DataStaging>
          sources and targets, respectively.
      runtime_environments -- iterable of runtime environment names.
      queue_name -- name of the target queue; left out if false.

    Every value is XML-escaped before it is inserted, so that characters
    like '&' in URLs cannot produce an ill-formed document.

    Raises KeyError if job_name or application_name is missing.
    """
    # Local import: the rest of the file does not need this module.
    from xml.sax.saxutils import escape

    def xml_text(value):
        # Format the value the way '%s' would, then escape '&', '<', '>'.
        return escape('%s' % (value,))

    def uri_options(options):
        return ''.join(['<URIOption>%s</URIOption>' % xml_text(option)
                        for option in options])

    log.debug('Writing job description to %s.'%jsdl_path)

    script_name = os.path.basename(script_path)
    command = '<Executable>%s</Executable>' % xml_text(script_name) \
        + ''.join(['\n\t<Argument>%s</Argument>' % xml_text(arg)
                   for arg in script_args])
    # Collected before the file is opened, so that a missing required
    # keyword does not leave an empty job description behind.
    fields = {
        'job_name': xml_text(kwargs['job_name']),
        'application_name': xml_text(kwargs['application_name']),
        '_command': command,
        'output': xml_text(kwargs.get('output', 'stdout.txt')),
        'error': xml_text(kwargs.get('error', 'stderr.txt')),
        'script_name': xml_text(script_name),
        'script_path': xml_text(script_path),
    }
    runtime_environments = kwargs.get('runtime_environments', [])
    queue_name = kwargs.get('queue_name', None)

    header = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<JobDefinition xmlns="http://schemas.ggf.org/jsdl/2005/11/jsdl"\n'
        '\t       xmlns:jsdl-arc="http://www.nordugrid.org/ws/schemas/jsdl-arc">\n'
        '  <JobDescription>\n'
        '    <JobIdentification>\n'
        '      <JobName>%(job_name)s</JobName>\n'
        '    </JobIdentification>\n'
        '    <Application>\n'
        '      <ApplicationName>%(application_name)s</ApplicationName>\n'
        '      <POSIXApplication xmlns="http://schemas.ggf.org/jsdl/2005/11/jsdl-posix">\n'
        '\t%(_command)s\n'
        '        <Output>%(output)s</Output>\n'
        '        <Error>%(error)s</Error>\n'
        '\t<WallTimeLimit>600</WallTimeLimit>\n'
        '      </POSIXApplication>\n'
        '    </Application>\n'
        '    <DataStaging>\n'
        '      <FileName>%(script_name)s</FileName>\n'
        '      <Source><URI>file:%(script_path)s</URI></Source>\n'
        '      <CreationFlag>overwrite</CreationFlag>\n'
        '    </DataStaging>\n') % fields
    # Filled with (filename, direction, url, options, direction).
    staging_template = (
        '    <DataStaging>\n'
        '      <FileName>%s</FileName>\n'
        '      <%s><URI>%s</URI>%s</%s>\n'
        '      <DeleteOnTermination>false</DeleteOnTermination>\n'
        '      <CreationFlag>overwrite</CreationFlag>\n'
        '    </DataStaging>\n')

    fh = open(jsdl_path, 'w')
    try:
        fh.write(header)
        for direction, key in [('Source', 'staged_inputs'),
                               ('Target', 'staged_outputs')]:
            for filename, url, urloptions in kwargs.get(key, []):
                fh.write(staging_template
                         % (xml_text(filename), direction, xml_text(url),
                            uri_options(urloptions), direction))
        if runtime_environments or queue_name:
            fh.write('      <Resources>')
            if queue_name:
                fh.write('\t<jsdl-arc:QueueName>%s</jsdl-arc:QueueName>\n'
                         % xml_text(queue_name))
            if runtime_environments:
                fh.write('\t<jsdl-arc:RunTimeEnvironment>\n')
                for rte in runtime_environments:
                    fh.write('\t  <jsdl-arc:Software>\n'
                             '\t    <jsdl-arc:Name>%s</jsdl-arc:Name>\n'
                             '\t  </jsdl-arc:Software>\n' % xml_text(rte))
                fh.write('\n\t</jsdl-arc:RunTimeEnvironment>\n')
            fh.write('      </Resources>')
        fh.write('  </JobDescription>\n'
                 '</JobDefinition>\n')
    finally:
        # Close the file also when a write or a malformed staging triple
        # raises.
        fh.close()

class JobInfo(persistence.PersistentObject):
    """Persistent record describing one submitted probe job.

    `persistent_attributes` maps each stored attribute name to the
    `persistence` type descriptor used for it.
    """

    persistent_attributes = {
        'submission_time': persistence.pt_int,
        'host': persistence.pt_str,
        'job_tag': persistence.pt_str_opt,
        'termination_service': persistence.pt_str_opt,
        'job_id': persistence.pt_str,
        'job_state': persistence.pt_str,
        'check_time': persistence.pt_int_opt,
        'fetch_attempts': persistence.pt_int_opt,
        'stored_urls': persistence.pt_str_list,
        'tests': persistence.pt_str_list,
    }

# COMPAT: To load old job descriptions. Only needed for one release to convert
# existing descriptions.
def compat_load_job_info(path):
    """Load a job description stored in the old line-based format.

    The file at `path` holds one value per line, in the order given by
    `active_job_attrs` below, followed by a final line with the
    whitespace-separated stored URLs.  The literal ``__none`` stands for a
    missing value.  Files written before the job tag was introduced have
    one line less and are loaded with a job tag of None.

    Returns a `JobInfo`.  Raises ParseError if the number of lines is wrong
    or a value cannot be converted to its type.
    """
    active_job_attrs = [
        (int, 'submission_time'),
        (str, 'host'),
        (str, 'job_tag'),
        (str, 'termination_service'),
        (str, 'job_id'),
        (str, 'job_state'),
        (int, 'check_time'),
        (int, 'fetch_attempts'),
    ]

    fh = open(path)
    try:
        vs = [ln.strip() for ln in fh]
    finally:
        # Close the file even if reading fails.
        fh.close()

    # COMPAT: Addition of job_tag. Remove later.
    if len(vs) == 8:
        vs = vs[0:2] + ['__none'] + vs[2:]

    # One line per attribute plus the trailing line of stored URLs.
    expected_count = len(active_job_attrs) + 1
    if expected_count != len(vs):
        msg = 'Malformed job info %s, expecting %d, not %d elements.' \
                % (path, expected_count, len(vs))
        raise ParseError(msg)
    stored_urls = vs.pop().split()
    d = {}
    for (t, k), v in zip(active_job_attrs, vs):
        if v == '__none':
            d[k] = None
        else:
            try:
                d[k] = t(v)
            except ValueError:
                raise ParseError('Bad value %s for %s in job info %s'
                                 %(v, k, path))
    return JobInfo(stored_urls = stored_urls, **d)

class ARCCEProbe(nagutils.NagiosPlugin, vomsutils.NagiosPluginVomsMixin):
    """Nagios probe to test ARC CEs.  The probe has two sub-commands
    implemented by `check_submit` and `check_monitor`.  The former is run on
    all CEs, while the latter is run to collect submitted jobs."""

    # Name of this probe.  NOTE(review): not read in this file, presumably
    # consumed by the nagutils.NagiosPlugin base class -- confirm.
    probe_name = 'ARCCE'
    # Configuration section names; 'arc-ce' is the deprecated spelling (see
    # the COMPAT warning issued by parse_args).
    main_config_section = ['arcce', 'arc-ce']

    # Names of the files and directories kept in each per-host working
    # directory (see workdir_for):
    # - file to which arcsub writes the ID of the submitted job,
    JOBID_FILENAME = 'active.jobid'
    # - generated job description,
    JSDL_FILENAME = 'job.jsdl'
    # - generated job script,
    JOB_SCRIPT_FILENAME = 'job.sh'
    # - directory into which arcget downloads the job output,
    JOB_OUTPUT_DIRNAME = 'job_output'
    # - persistent JobInfo record of the current job.
    ACTIVE_JOB_FILENAME = 'active.map'

    # If set, directory prepended to the ARC command names in run_arc_cmd.
    _arc_bindir = None

    # NOTE(review): not read in this file; check_submit uses
    # self.opts.prev_status instead -- confirm whether this is still needed.
    prev_status = None

    @property
    def top_workdir(self):
        """The parent directory of the per-host working directories.

        Taken from --top-workdir if given, otherwise ``arcce/%(voms)s``
        below the plugin spool directory.  In either case the value is a
        template which is expanded with the parsed options.
        """
        if self.opts.top_workdir is not None:
            template = self.opts.top_workdir
        else:
            template = os.path.join(self.opts.plugins_spooldir,
                                    'arcce/%(voms)s')
        return template % vars(self.opts)

    def workdir_for(self, host, job_tag):
        """Return the working directory for jobs on `host` tagged `job_tag`.

        The directory is named after the host, with ``#`` and the job tag
        appended when a job tag is given.
        """
        dirname = host
        if job_tag:
            dirname = host + '#' + job_tag
        return os.path.join(self.top_workdir, dirname)

    def __init__(self):
        """Set up the command-line parser with the global options and the
        ``submit``, ``monitor`` and ``clean`` sub-commands, and create the
        ARC user configuration object."""

        nagutils.NagiosPlugin.__init__(self)

        # Options shared by all sub-commands.
        main_ap = self.argparser
        main_ap.add_argument('--fqan', dest = 'fqan')
        main_ap.add_argument(
            '--top-workdir', dest = 'top_workdir',
            help = 'Parent directory of per-VO probe working directories.')

        self.argsubparsers = main_ap.add_subparsers(dest = 'metric_name')

        # Options of the submit sub-command.
        submit_ap = self.argsubparsers.add_parser('submit')
        submit_ap.add_argument(
            '--prev-status', dest = 'prev_status', type = int,
            default = 0, metavar = '{0..3}',
            help = 'The previous Nagios status for this metric.')
        submit_ap.add_argument(
            '--stage-input', dest = 'staged_inputs',
            default = [], action = 'append', metavar = 'URL',
            help = 'DEPRECATED, please use --test with the staging plugin. '
                   'Stage the existing URL as an input and check for it '
                   'in the job script. '
                   'The local file name will be the basename of URL, or '
                   'you can specify an alternative name by prefixing '
                   'the URL with ALTNAME=.')
        submit_ap.add_argument(
            '--stage-output', dest = 'staged_outputs',
            default = [], action = 'append', metavar = 'URL',
            help = 'DEPRECATED, please use --test with the staging plugin. '
                   'Create a file in the job script and stage it as URL. '
                   'The local file name will be the basename of URL, or '
                   'you can specify an alternative name by prefixing '
                   'the URL with ALTNAME=.')
        submit_ap.add_argument(
            '--termination-service', dest = 'termination_service',
            default = '',
            help = 'The name (NAGIOS "description") of the passive '
                   'service to which to submit the results.')
        submit_ap.add_argument(
            '--job-submit-timeout', dest = 'job_submit_timeout',
            type = int, default = 600,
            help = 'Timeout for job submission.')
        submit_ap.add_argument(
            '--job-discard-timeout', dest = 'job_discard_timeout',
            type = int, default = 6 * 3600,
            help = 'Timeout before discarding a job.')
        submit_ap.add_argument(
            '--ce', dest = 'ce',
            help = 'URL for connecting to the CE, using the same format '
                   'as the -c option of arcsub(1).')
        submit_ap.add_argument(
            '--queue', dest = 'queue',
            help = 'Target queue name. If unspecified, let ARC choose it.')
        submit_ap.add_argument(
            '--job-tag', dest = 'job_tag',
            help = 'A short string suitable in directory names to '
                   'distinguish different submission services for the '
                   'same hostname.')
        submit_ap.add_argument(
            '--job-description', dest = 'job_description',
            help = 'Use this job description instead of generating one.  '
                   'In this case --stage-input options are ignored and '
                   'URLs passed to --stage-output will be deleted when '
                   'the job finishes.')
        submit_ap.add_argument(
            '--test', dest = 'tests', action = 'append', default = [],
            metavar = 'TESTNAME',
            help = 'Add an additional test described in the configuration '
                   'file under the section "arcce.TESTNAME"')
        submit_ap.add_argument(
            '--runtime-environment', dest = 'runtime_environments',
            action = 'append', default = [], metavar = 'RTE',
            help = 'Request the given runtime environment.')

        # Options of the monitor sub-command.
        monitor_ap = self.argsubparsers.add_parser('monitor')
        monitor_ap.add_argument(
            '--ce', dest = 'ces',
            default = [], action = 'append', metavar = 'CE',
            help = 'Pass one or more times to restrict monitoring '
                   'to the given CEs.')
        monitor_ap.add_argument(
            '--termination-service', dest = 'termination_service',
            default = 'ARCCE Job Termination',
            help = 'Default service to submit result to if not specified '
                   'when submitting the job.')

        # Options of the clean sub-command.
        clean_ap = self.argsubparsers.add_parser('clean')
        clean_ap.add_argument(
            '--timeout', dest = 'timeout',
            type = int, default = 20, help = 'Timeout')
        clean_ap.add_argument(
            '--max-age', dest = 'max_age',
            type = int, default = 604800,
            help = 'Max age before jobs info is cleaned.')

        self._user_config = arc.UserConfig()

    def parse_args(self, args):
        """Parse ARCCE-specific command-line options.

        After the base-class parsing, the ``submit`` metric requires --host,
        and its --stage-input and --stage-output specifications are
        converted to ``(filename, url, urloptions)`` triples stored as the
        lists `self.staged_inputs` and `self.staged_outputs`.

        Raises argparse.ArgumentError if --host is missing for ``submit``.
        """

        def parse_staged(spec):
            # SPEC has the form [ALTNAME=]URL[;URLOPTION]...
            fields = spec.split(';')
            spec, urloptions = fields[0], fields[1:]
            if '=' in spec:
                filename, url = spec.split('=', 1)
            else:
                filename, url = os.path.basename(spec), spec
            return (filename, url, urloptions)

        nagutils.NagiosPlugin.parse_args(self, args)
        if self.opts.metric_name == 'submit':
            if not self.opts.host:
                # ArgumentError takes (argument, message); with the message
                # alone the constructor itself fails with a TypeError.
                raise argparse.ArgumentError(
                        None, 'The --host option is required.')
            # Real lists are needed, since check_submit appends to them.
            self.staged_inputs = \
                [parse_staged(spec) for spec in self.opts.staged_inputs]
            self.staged_outputs = \
                [parse_staged(spec) for spec in self.opts.staged_outputs]

        # COMPAT 2011-11-22. Also remove arc-ce fallback above.
        if self.config.has_section('arc-ce'):
            self.log.warn('The configuration section arc-ce is deprecated. '
                          'Please rename it to arcce.')

    def _cleanup_job_state(self, host, job_tag):
	self.log.debug('Cleaning up job files for %s.'%host)
	workdir = self.workdir_for(host, job_tag)
	for filename in [self.ACTIVE_JOB_FILENAME, self.JSDL_FILENAME,
			 self.JOBID_FILENAME]:
	    try:
		os.unlink(os.path.join(workdir, filename))
	    except StandardError:
		pass
	try:
	    job_output_dir = os.path.join(workdir, self.JOB_OUTPUT_DIRNAME)
	    if os.path.exists(job_output_dir) and os.listdir(job_output_dir):
		last_dir = job_output_dir + '.LAST'
		shutil.rmtree(last_dir, ignore_errors = True)
		os.rename(job_output_dir, last_dir)
	except StandardError, xc:
	    self.log.warn('Error clearing %s: %s'%(job_output_dir, xc))

    def run_arc_cmd(self, prog, *args, **kwargs):
        """Run the ARC command `prog` with the arguments `args`.

        The command is looked up in `self._arc_bindir` if that is set.
        Arguments are converted to strings and quoted for the shell.
        Keyword arguments are accepted but not used.

        Returns a pair ``(rc, output)`` where `output` is the combined
        standard output and standard error of the command, and `rc` is 0 on
        success, the exit code of the command if it exited with an error,
        or the non-zero raw wait status if it terminated in another way.
        """
        if self._arc_bindir:
            prog = os.path.join(self._arc_bindir, prog)
        cmd = prog + ' ' + ' '.join([pipes.quote(str(arg)) for arg in args])
        self.log.debug('Exec: %s'%cmd)
        fh = os.popen(cmd + ' 2>&1')
        output = fh.read()
        status = fh.close()
        if not status:
            return 0, output
        # close() reports failures in the encoding of wait(); decode it so
        # that callers can log the value as the exit code of the command.
        if os.WIFEXITED(status):
            return os.WEXITSTATUS(status), output
        return status, output

    def require_voms_proxy(self):
        """Require a VOMS proxy through the VOMS mix-in and initialize the
        credentials of the ARC user configuration, using the proxy path if
        the mix-in returned one."""

        proxy_path = vomsutils.NagiosPluginVomsMixin.require_voms_proxy(self)
        user_config = self._user_config
        if proxy_path:
            user_config.ProxyPath(proxy_path)
        try:
            # Old API, which takes no argument.
            user_config.InitializeCredentials()
        except TypeError:
            cred_type = arc.initializeCredentialsType(
                    arc.initializeCredentialsType.RequireCredentials)
            user_config.InitializeCredentials(cred_type)

    def load_active_job(self, host, job_tag):
	"""Load information about the current job on `host : str` tagged with
	`job_tag : str`, or `None` if no information is found."""

	workdir = self.workdir_for(host, job_tag)
	ajf = os.path.join(workdir, self.ACTIVE_JOB_FILENAME)
	if os.path.exists(ajf):
	    self.log.debug('Loading job info from %s.'%ajf)
	    # FIXME: Lock.
	    try:
		jobinfo = JobInfo()
		jobinfo.persistent_load(ajf)
		return jobinfo
	    except ParseError, xc:
		try:
		    jobinfo = compat_load_job_info(ajf)
		    jobinfo.persistent_save(ajf) # Update to new format.
		    return jobinfo
		except Exception:
		    self.log.error('Ignoring invalid job info %s: %s'%(ajf, xc))

    def save_active_job(self, jobinfo, host, job_tag):
        """Store `jobinfo` as the current job of `host : str` tagged with
        `job_tag : str`, in the corresponding working directory."""

        ajf = os.path.join(self.workdir_for(host, job_tag),
                           self.ACTIVE_JOB_FILENAME)
        self.log.debug('Saving active job info.')
        # FIXME: Lock.
        jobinfo.persistent_save(ajf)

    def discard_job(self, jobinfo):
        """Get rid of the job described by `jobinfo : JobInfo`: try to kill
        it, and if that fails try to clean it.  Failures are only logged."""

        job_id = jobinfo.job_id
        self.log.debug('Cancelling old job %s' % job_id)
        attempts = [
            ('arckill', 'Failed to kill old job, trying to clean instead.'),
            ('arcclean', 'Failed to clean old job, carrying on.'),
        ]
        for prog, failure_msg in attempts:
            rc, _output = self.run_arc_cmd(prog, job_id)
            if rc == 0:
                return
            self.log.warning(failure_msg)

    def check(self):
        """Dispatch to the ``check_<metric>`` method of the sub-command
        selected on the command line and return its result.  This method
        should be called from the base-class."""

        # COMPAT 2011-11-22: Change of default working directory.
        old_spooldir = '/var/spool/nagios/plugins/arc-ce'
        new_spooldir = '/var/spool/nagios/plugins/arcce'
        if self.opts.top_workdir is None \
                and not os.path.exists(new_spooldir) \
                and os.path.exists(old_spooldir):
            os.rename(old_spooldir, new_spooldir)

        metric_method = getattr(self, 'check_' + self.opts.metric_name)
        return metric_method()

    def load_jobtest(self, jobtest_name, **env):
        """Instantiate the job-test plugin configured for `jobtest_name`.

        The configuration section ``arcce.<jobtest_name>``, or the
        deprecated ``arc-ce.<jobtest_name>``, names the plugin in its
        ``jobplugin`` variable.  The plugin class is instantiated with the
        plugin name, the configuration, the section name and `env`.

        Raises ServiceUNKNOWN if neither section exists.
        """
        section = 'arcce.%s'%jobtest_name
        if not self.config.has_section(section):
            deprecated_section = 'arc-ce.%s'%jobtest_name
            if not self.config.has_section(deprecated_section):
                raise ServiceUNKNOWN('Missing configuration section %s for '
                                     'job-plugin test.' % section)
            self.log.warn('The section arc-ce.%s is deprecated, please use %s.'
                          % (jobtest_name, section))
            section = deprecated_section
        plugin_name = self.config.get(section, 'jobplugin')
        plugin_cls = load_jobplugin(plugin_name)
        return plugin_cls(plugin_name, self.config, section, env)

    def check_submit(self):
	"""Submit a job to a CE."""

	self.require_voms_proxy()

	workdir = self.workdir_for(self.opts.host, self.opts.job_tag)

	jobid_file = os.path.join(workdir, self.JOBID_FILENAME)
	jobinfo = self.load_active_job(self.opts.host, self.opts.job_tag)
	if not jobinfo is None:
	    t_sub = jobinfo.submission_time
	    job_state = jobinfo.job_state

	    if job_state not in terminal_job_states:
		s_sub = time.strftime('%FT%T', time.localtime(t_sub))
		self.log.info('Last job was submitted %s.'%s_sub)
		t_dis = t_sub + self.opts.job_discard_timeout
		if int(time.time()) >= t_dis:
		    self.log.warning('Discarding last job due to timeout.')
		    self.discard_job(jobinfo)
		else:
		    s_dis = time.strftime('%FT%T', time.localtime(t_dis))
		    self.log.info('Job will be discarded %s.'%s_dis)
		    status = self.opts.prev_status or 0
		    self.log.info('Keeping previous status %d.'%status)
		    return ServiceReport(status, 'Job not finished.')
	    else:
		self.log.debug('Job in terminal state %s.\n'%job_state)
		try:
		    os.unlink(jobid_file)
		except StandardError:
		    pass

	# Prepare the working directory for a new job.
	self._cleanup_job_state(self.opts.host, self.opts.job_tag)
	job_output_dir = os.path.join(workdir, self.JOB_OUTPUT_DIRNAME)
	if not os.path.exists(job_output_dir):
	    try:
		os.makedirs(job_output_dir)
	    except OSError, e:
		msg = 'Failed to create working directory: %s'%e
		return ServiceUNKNOWN(msg)

	self.log.debug('Submitting new job.')
	job_script_file = os.path.join(workdir, self.JOB_SCRIPT_FILENAME)

	# Create job script.
	fh = open(job_script_file, 'w')
	fh.write('#! /bin/sh\n\n'
		 'echo "Job started `date -Is`."\n')
	for filename, _0, _1 in self.staged_inputs:
	    fh.write('test -e %(fname)s || error "Missing file "%(fname)s\n'
		     % {'fname': pipes.quote(filename)})
	for filename, _0, _1 in self.staged_outputs:
	    fh.write('hostname >%s\n'%pipes.quote(filename))
	runtime_environments = set(self.opts.runtime_environments)
	for test_name in self.opts.tests:
	    test = self.load_jobtest(test_name, hostname = self.opts.host)
	    test.write_script(fh)

	    def adjust_staged(spec):
		if isinstance(spec, tuple):
		    filename, spec, urloptions = spec
		else:
		    if ';' in spec:
			xs = spec.split(';')
			spec, urloptions = xs[0], xs[1:]
		    else:
			urloptions = []
		    filename = os.path.basename(spec)
		if ':/' in spec:
		    url = spec
		elif os.path.isabs(spec):
		    url = 'file:' + spec
		else:
		    url = 'file:' + os.path.join(workdir, spec)
		return filename, url, urloptions
	    for stagespec in test.staged_inputs():
		self.staged_inputs.append(adjust_staged(stagespec))
	    for stagespec in test.staged_outputs():
		self.staged_outputs.append(adjust_staged(stagespec))
	    runtime_environments.update(test.runtime_environments())
	fh.write('echo "Present files before termination:"\n'
		 'ls -l\n'
		 'echo "Job finished `date -Is`, status = $status."\n'
		 'exit $status\n')
	fh.close()

	# Create JSDL file.
	if self.opts.job_description:
	    jsdl_file = self.opts.job_description
	else:
	    jsdl_file = os.path.join(workdir, self.JSDL_FILENAME)
	    write_jsdl(
		    jsdl_file,
		    job_script_file,
		    application_name = 'ARCCE-probe',
		    job_name = self.opts.termination_service,
		    output = 'stdout.txt',
		    error = 'stderr.txt',
		    staged_inputs = self.staged_inputs,
		    staged_outputs = self.staged_outputs,
		    runtime_environments = runtime_environments,
		    queue_name = self.opts.queue)

	# Submit the job.
	if self.opts.ce:
	    connection_url = self.opts.ce
	elif self.config.has_option('arcce.connection_urls', self.opts.host):
	    connection_url = self.config.get('arcce.connection_urls',
					     self.opts.host)
	# COMPAT 2011-11-22.
	elif self.config.has_option('arc-ce.connection_urls', self.opts.host):
	    self.log.warn('The section name arc-ce.connection_urls is '
			  'deprecated, please use arcce.connection_urls.')
	    connection_url = self.config.get('arc-ce.connection_urls',
					     self.opts.host)
	else:
	    if self.opts.port:
		connection_url = self.opts.host + ':' + str(self.opts.port)
	    else:
		connection_url = self.opts.host
	rc, output = \
	    self.run_arc_cmd('arcsub',
		'-c', connection_url,
		'-o', jobid_file,
		'-t', self.opts.job_submit_timeout,
		jsdl_file)
	if rc != 0:
	    self._cleanup_job_state(self.opts.host, self.opts.job_tag)
	    self.log.error('arcsub exited with code %d:\n%s'%(rc, output))
	    return ServiceCRITICAL('Job submission failed.')

	try:
	    fh = open(jobid_file)
	    job_id = fh.readline().strip()
	    fh.close()
	except StandardError:
	    self.log.info('The job ID should have been saved to %s.'%jobid_file)
	    if output:
		self.log.error('Output from arcsub:\n%s'%output)
	    # In ARC 1.1.0, arcsub exits with 0 even if no targets were found,
	    # so we'll make this CRITICAL rather than UNKNOWN for now.
	    return ServiceCRITICAL('Failed to submit job.')
	    #return ServiceUNKNOWN('Could not read job ID for submitted job.')

	t_now = int(time.time())
	jobinfo = JobInfo(
		submission_time = t_now,
		host = self.opts.host,
		job_tag = self.opts.job_tag,
		termination_service = self.opts.termination_service,
		job_id = job_id,
		job_state = 'SUBMITTED',
		check_time = t_now,
		stored_urls = [url for _0, url, _1 in self.staged_outputs],
		tests = self.opts.tests)
	self.save_active_job(jobinfo, self.opts.host, self.opts.job_tag)

	return ServiceOK('Job submitted.')

    def fetch_job(self, jd, job_state, job_error = None):
        """Fetch the output of the terminated job `jd` and report on it.

        A job in state FINISHED is downloaded with arcget; for any other
        state the job's error log is read with arccat.  The outcome is
        submitted as a passive result for the termination service of the
        job.  After a successful download, the check and cleanup methods of
        the job tests recorded for the job are run.

        Returns a pair ``(did_fetch, ok_check)``: whether the output was
        downloaded, and whether the downloaded output was found where the
        job tests expect it.
        """

        if job_error:
            self.log.error(job_error)

        details = None
        did_fetch = False
        if jd.job_state != 'FINISHED':
            arccat_rc, arccat_out = self.run_arc_cmd('arccat', '-e', jd.job_id)
            msg = 'Job terminated in state %s'%jd.job_state
            # If job_error is provided, then it not surprising if arccat
            # failed, so don't litter the output.
            if arccat_out.strip() and (arccat_rc == 0 or not job_error):
                details = 'Output from arccat:\n%s'%arccat_out
                self.log.error('Errors from %s:\n%s'%(jd.host, arccat_out))
            status = nagutils.CRITICAL
        else:
            msg = 'Job finished successfully.'
            status = nagutils.OK

            self.log.info('Fetching job %s in terminal state %s.'
                          %(jd.job_id, job_state))
            job_output_dir = os.path.join(
                    self.workdir_for(jd.host, jd.job_tag),
                    self.JOB_OUTPUT_DIRNAME)
            rc, output = self.run_arc_cmd('arcget', '-D', job_output_dir,
                                          jd.job_id)
            did_fetch = rc == 0
            self.log.debug('Output from arcget: %d "%s"'%(rc, output))
            if not did_fetch:
                msg = 'Failed to fetch job.'
                # The status is still OK at this point, so downgrade it.
                status = nagutils.UNKNOWN
                self.log.error('Failed to fetch %s.'%jd.job_id)
                if output:
                    details = 'Output from arcget:\n%s'%output
                    self.log.error(details)
                clean_rc, _ = self.run_arc_cmd('arcclean', jd.job_id)
                if clean_rc != 0:
                    self.log.warn('Also arcclean failed.')

        if job_error:
            if details:
                details = job_error + '\n' + details
            else:
                details = job_error
        service_name = jd.termination_service or self.opts.termination_service
        self.submit_passive_service_result(jd.host, service_name, status, msg,
                details = details)

        # Run check and cleanup methods for job plugin tests.
        ok_check = True
        if did_fetch:
            # arcget stores the output in a subdirectory of the specified
            # directory.  Locate it.
            subdir = None
            for subdir in os.listdir(job_output_dir):
                if subdir not in ['.', '..']:
                    break
            if subdir is None:
                self.log.error('Subdirectory from arcget not found, it '
                               'should have been under %s.'%job_output_dir)
                ok_check = False
            else:
                # Run check.
                job_output_dir = os.path.join(job_output_dir, subdir)
                for test_name in jd.tests:
                    test = self.load_jobtest(test_name, hostname = jd.host)
                    if test.service_description:
                        report = self.nagios_report_for(
                                jd.host, test.service_description)
                    else:
                        report = self.nagios_report
                    test.check(report, job_output_dir, jd.stored_urls)

            # Run cleanup.
            # NOTE(review): this only runs after a successful fetch, so
            # cleanup() always receives True -- confirm that skipping the
            # cleanup for jobs which were not fetched is intended.
            for test_name in jd.tests:
                test = self.load_jobtest(test_name, hostname = jd.host)
                test.cleanup(did_fetch)

        return did_fetch, ok_check

    def remove_staged_outputs(self, jd):
        """Remove the staged output files of `jd` with arcrm.

        URLs starting with ``file:`` are skipped.  Afterwards
        `jd.stored_urls` holds only the URLs which could not be removed.
        Returns true if no removal failed.
        """
        remaining_urls = []
        for url in jd.stored_urls:
            if url.startswith('file:'):
                continue
            rc, _output = self.run_arc_cmd('arcrm', url)
            if rc == 0:
                self.log.info('Removed test file %s.'%url)
            else:
                self.log.warn('Failed to remove %s.'%url)
                remaining_urls.append(url)
        jd.stored_urls = remaining_urls
        return not remaining_urls

    def check_monitor(self):
	"""Monitor submitted jobs."""

	if not os.path.exists(self.top_workdir):
	    self.log.info('The work directory is %s.'%self.top_workdir)
	    return ServiceOK('No jobs to monitor since the working directory '
			     'has not yet been created.')
	self.require_voms_proxy()

	error_count = 0
	jd_of_jobid = {}
	dirs = self.opts.ces
	if not dirs:
	    dirs = [dir for dir in os.listdir(self.top_workdir)
		    if os.path.isdir(os.path.join(self.top_workdir, dir))]
	for dir in dirs:
	    if '#' in dir:
		host, job_tag = dir.split('#', 1)
	    else:
		host, job_tag = dir, None
	    workdir = self.workdir_for(host, job_tag)
	    ajf = os.path.join(workdir, self.ACTIVE_JOB_FILENAME)
	    if not os.path.exists(ajf):
		self.log.debug('Missing active job info for %s.'%host)
	    else:
		try:
		    jd = self.load_active_job(host, job_tag)
		    jd.host = host
		    jd.job_tag = job_tag
		    jd_of_jobid[jd.job_id] = jd
		except Exception, xc:
		    self.log.error('Cannot load job file %s: %s'%(ajf, xc))

	query_jobids = [jd.job_id
			    for jd in jd_of_jobid.itervalues()
			    if jd.job_state in nonterminal_job_states or \
			       jd.fetch_attempts]
	if query_jobids == []:
	    msg = 'No jobs to query, found %d in terminal states.' \
		%len(jd_of_jobid)
	    return ServiceOK(msg)
	self.log.debug('Querying job IDs %s'%', '.join(query_jobids))
	try:
	    jobstats = arcstat(*query_jobids)
	    self.log.info('Queried %d jobs, found %d.'
			  % (len(query_jobids), len(jobstats)))
	    for jobid in query_jobids:
		jd = jd_of_jobid[jobid]
		if not jobid in jobstats:
		    # This covers two cases, a) job not visible just after
		    # submission and b) job has been lost somehow.  We can't
		    # clean up, in case a is correct.  On the hopefully rare
		    # event of b, the timeout in the submit metric should take
		    # care of it.
		    self.log.info('Job %s of kind %s on %s not found.' \
			    % (jobid, jd.job_tag, jd.host))
		    continue
		jobstat = jobstats[jobid]
		self.log.debug('Checking job on %s.'%jd.host)
		job_state = jobstat['State']
		jd.job_state = job_state
		jd.check_time = str(int(time.time()))
		if job_state in terminal_job_states:
		    did_fetch, ok_check = \
			self.fetch_job(jd, job_state,
				       jobstat.get('Job Error', None))
		    if did_fetch:
			jd.fetch_attempts = 0
		    else:
			jd.fetch_attempts = (jd.fetch_attempts or 0) + 1
		    if not ok_check:
			error_count += 1
		    self.remove_staged_outputs(jd)
		self.save_active_job(jd, jd.host, jd.job_tag)
	except ParseError, xc:
	    return ServiceUNKNOWN('%s'%xc)
	for jd in jd_of_jobid.itervalues():
	    self.log.info('Host %s is in state %s.'%(jd.host, jd.job_state))
	if error_count == 0:
	    return ServiceOK('Checked %d jobs.'%len(jobstats))
	else:
	    return ServiceCRITICAL('Checked %d jobs, got %d error(s).'
		    %(len(jobstats), error_count))

    def check_clean(self):
        """Prune old jobs with `arcprune`.

        Jobs older than --max-age are cleaned, using --timeout as the
        timeout.  Returns OK if nothing failed (or if the working directory
        does not exist yet), and a WARNING report if some jobs could not be
        cleaned.
        """
        if not os.path.exists(self.top_workdir):
            self.log.info('The work directory is %s.'%self.top_workdir)
            return ServiceOK('No jobs to clean since the working directory '
                             'has not yet been created.')
        self.require_voms_proxy()
        total_count, pruned_count, failed_count = \
            arcprune(max_age = self.opts.max_age, timeout = self.opts.timeout,
                     log = self.log)
        if failed_count > 0:
            # No ServiceWARNING is imported into this module, so build the
            # report from the Nagios status code, as check_submit does with
            # --prev-status.
            # NOTE(review): use nagutils' own WARNING constant or report
            # class instead, if it exports one.
            nagios_warning = 1
            return ServiceReport(nagios_warning,
                    'Cleaned %d and failed to clean %d of %d jobs'
                    %(pruned_count, failed_count, total_count))
        else:
            return ServiceOK('Cleaned %d of %d jobs'%(pruned_count,total_count))
