import os, time
from arcnagios import arcutils, jobutils, nagutils
from arcnagios.nagutils import ServiceOK, ServiceCRITICAL, ServiceUNKNOWN
from arcnagios.arcutils import ParseError

class Check_arcce_monitor(jobutils.JobNagiosPlugin):
    """Nagios plugin which monitors previously submitted ARC CE jobs.

    For each active job found under the top working directory, the job
    status is queried.  Jobs which have reached a terminal state are
    fetched, a passive service result is submitted for them, the job test
    plugins are given a chance to check the output, and staged output
    files are removed.
    """

    def __init__(self):
        """Set up the plugin and register the monitor-specific options."""

        jobutils.JobNagiosPlugin.__init__(self)
        ap = self.argparser
        ap.add_argument('--ce', dest='ces',
                        default=[], action='append',
                        metavar='CE',
                        help='Pass one or more times to restrict monitoring '
                             'to the given CEs.')
        ap.add_argument('--termination-service', dest='termination_service',
                        default='ARCCE Job Termination',
                        help='Default service to submit result to if not '
                             'specified '
                             'when submitting the job.')

    def parse_args(self, args):
        """Parse ARCCE-specific command-line options."""

        jobutils.JobNagiosPlugin.parse_args(self, args)

    def fetch_job(self, jd, job_state, job_error=None):
        """Fetch a terminated job and report its outcome.

        A passive service result reflecting the outcome of the job is
        submitted to `jd.termination_service`, falling back to the
        `--termination-service` option.  If the job output was fetched,
        the check and cleanup methods of the job tests listed in
        `jd.tests` are run on it.

        :param jd: Job description object of the job to fetch.
        :param job_state: The state reported for the job; only used for
            logging here, the branch taken depends on `jd.job_state`.
        :param job_error: Optional error message reported for the job.  It
            is logged and prepended to the details of the passive result.
        :returns: A pair `(did_fetch, ok_check)` where `did_fetch` tells
            whether `arcget` succeeded, and `ok_check` is false if the
            output subdirectory created by `arcget` could not be located.
        """

        if job_error:
            self.log.error(job_error)
        details = None

        did_fetch = False
        if jd.job_state == 'FINISHED':
            msg = 'Job finished successfully.'
            status = nagutils.OK

            self.log.info('Fetching job %s in terminal state %s.'
                          % (jd.job_id, job_state))
            workdir = self.workdir_for(jd.host, jd.job_tag)
            job_output_dir = os.path.join(workdir, self.JOB_OUTPUT_DIRNAME)
            rc, output = self.run_arc_cmd('arcget', '-D', job_output_dir,
                                          jd.job_id)
            did_fetch = rc == 0
            self.log.debug('Output from arcget: %d "%s"' % (rc, output))
            if not did_fetch:
                # The job itself finished, so a failing fetch is reported
                # as UNKNOWN rather than CRITICAL.
                msg = 'Failed to fetch job.'
                status = nagutils.UNKNOWN
                self.log.error('Failed to fetch %s.' % jd.job_id)
                if output:
                    details = 'Output from arcget:\n%s' % output
                    self.log.error(details)
                clean_rc, _ = self.run_arc_cmd('arcclean', jd.job_id)
                if clean_rc != 0:
                    self.log.warn('Also arcclean failed.')
        else:
            arccat_rc, arccat_out = self.run_arc_cmd('arccat', '-e',
                                                     jd.job_id)
            msg = 'Job terminated in state %s' % jd.job_state
            # If job_error is provided, then it is not surprising if arccat
            # failed, so don't litter the output.
            if arccat_out.strip() and (arccat_rc == 0 or not job_error):
                details = 'Output from arccat:\n%s' % arccat_out
                self.log.error('Errors from %s:\n%s' % (jd.host, arccat_out))
            status = nagutils.CRITICAL

        if job_error:
            if details:
                details = job_error + '\n' + details
            else:
                details = job_error
        service_name = jd.termination_service or self.opts.termination_service
        self.submit_passive_service_result(jd.host, service_name, status, msg,
                                           details=details)

        # Run check and cleanup methods for job plugin tests.
        ok_check = True
        if did_fetch:
            # arcget stores the output in a subdirectory of the specified
            # directory.  Locate it by taking the first real entry, or None
            # if there is no such entry.
            entries = (entry for entry in os.listdir(job_output_dir)
                       if entry not in ('.', '..'))
            subdir = next(entries, None)
            if subdir is None:
                self.log.error('Subdirectory from arcget not found, it '
                               'should have been under %s.' % job_output_dir)
                ok_check = False
            else:
                # Run check.
                job_output_dir = os.path.join(job_output_dir, subdir)
                for test_name in jd.tests:
                    test = self.load_jobtest(test_name, hostname=jd.host)
                    if test.service_description:
                        report = self.nagios_report_for(
                            jd.host, test.service_description)
                    else:
                        report = self.nagios_report
                    test.check(report, job_output_dir, jd.stored_urls)

            # Run cleanup.
            for test_name in jd.tests:
                test = self.load_jobtest(test_name, hostname=jd.host)
                test.cleanup(did_fetch)

        return did_fetch, ok_check

    def remove_staged_outputs(self, jd):
        """Remove the staged output files of a job using `arcrm`.

        URLs starting with `file:` are skipped.  Afterwards `jd.stored_urls`
        is replaced by the list of URLs which `arcrm` failed to remove.

        :param jd: Job description object providing `stored_urls`.
        :returns: True if no removal failed, otherwise false.
        """

        ok = True
        remaining_urls = []
        for stored_url in jd.stored_urls:
            if stored_url.startswith('file:'):
                continue
            rc, _ = self.run_arc_cmd('arcrm', stored_url)
            if rc != 0:
                self.log.warn('Failed to remove %s.' % stored_url)
                remaining_urls.append(stored_url)
                ok = False
            else:
                self.log.info('Removed test file %s.' % stored_url)
        jd.stored_urls = remaining_urls
        return ok

    def check(self):
        """Monitor submitted jobs.

        Loads the active job of each CE working directory (all directories
        under the top working directory, or those given by `--ce`), queries
        the jobs which are in a non-terminal state or have pending fetch
        attempts, and fetches those which have reached a terminal state.
        The updated job information is saved back for each queried job
        which was found.

        :returns: `ServiceOK` if there is nothing to monitor or no check
            error occurred, `ServiceCRITICAL` if the output of some job
            could not be checked, or `ServiceUNKNOWN` on a `ParseError`.
        """

        if not os.path.exists(self.top_workdir):
            self.log.info('The work directory is %s.' % self.top_workdir)
            return ServiceOK('No jobs to monitor since the working directory '
                             'has not yet been created.')
        self.require_voms_proxy()

        error_count = 0
        jd_of_jobid = {}
        dirnames = self.opts.ces
        if not dirnames:
            dirnames = [
                name for name in os.listdir(self.top_workdir)
                if os.path.isdir(os.path.join(self.top_workdir, name))]
        for dirname in dirnames:
            # A directory name is either "HOST" or "HOST#JOB_TAG".
            if '#' in dirname:
                host, job_tag = dirname.split('#', 1)
            else:
                host, job_tag = dirname, None
            workdir = self.workdir_for(host, job_tag)
            ajf = os.path.join(workdir, self.ACTIVE_JOB_FILENAME)
            if not os.path.exists(ajf):
                self.log.debug('Missing active job info for %s.' % host)
            else:
                try:
                    jd = self.load_active_job(host, job_tag)
                    jd.host = host
                    jd.job_tag = job_tag
                    jd_of_jobid[jd.job_id] = jd
                except Exception as xc:
                    # A broken job file should not prevent monitoring of
                    # the other jobs, so log the problem and carry on.
                    self.log.error('Cannot load job file %s: %s' % (ajf, xc))

        query_jobids = [
            jd.job_id for jd in jd_of_jobid.values()
            if jd.job_state in arcutils.nonterminal_job_states
            or jd.fetch_attempts]
        if not query_jobids:
            msg = 'No jobs to query, found %d in terminal states.' \
                % len(jd_of_jobid)
            return ServiceOK(msg)
        self.log.debug('Querying job IDs %s' % ', '.join(query_jobids))
        try:
            jobstats = arcutils.arcstat(*query_jobids)
            self.log.info('Queried %d jobs, found %d.'
                          % (len(query_jobids), len(jobstats)))
            for jobid in query_jobids:
                jd = jd_of_jobid[jobid]
                if jobid not in jobstats:
                    # This covers two cases, a) job not visible just after
                    # submission and b) job has been lost somehow.  We can't
                    # clean up, in case a is correct.  On the hopefully rare
                    # event of b, the timeout in the submit metric should
                    # take care of it.
                    self.log.info('Job %s of kind %s on %s not found.'
                                  % (jobid, jd.job_tag, jd.host))
                    continue
                jobstat = jobstats[jobid]
                self.log.debug('Checking job on %s.' % jd.host)
                job_state = jobstat['State']
                jd.job_state = job_state
                jd.check_time = str(int(time.time()))
                if job_state in arcutils.terminal_job_states:
                    did_fetch, ok_check = \
                        self.fetch_job(jd, job_state,
                                       jobstat.get('Job Error', None))
                    # A non-zero fetch_attempts keeps the job in the set of
                    # jobs to query on subsequent runs.
                    if did_fetch:
                        jd.fetch_attempts = 0
                    else:
                        jd.fetch_attempts = (jd.fetch_attempts or 0) + 1
                    if not ok_check:
                        error_count += 1
                    self.remove_staged_outputs(jd)
                self.save_active_job(jd, jd.host, jd.job_tag)
        except ParseError as xc:
            return ServiceUNKNOWN('%s' % xc)
        for jd in jd_of_jobid.values():
            self.log.info('Host %s is in state %s.' % (jd.host, jd.job_state))
        if error_count == 0:
            return ServiceOK('Checked %d jobs.' % len(jobstats))
        else:
            return ServiceCRITICAL('Checked %d jobs, got %d error(s).'
                                   % (len(jobstats), error_count))
