import os, re, time
from arcnagios import arcutils, jobutils, nagutils, utils
from arcnagios.nagutils import ServiceOK, ServiceCRITICAL, ServiceUNKNOWN
from arcnagios.arcutils import ParseError
from arcnagios.utils import nth

# Lines of arcget output which do not indicate a problem: empty lines, the
# note about where the results were stored, and the warning that some jobs
# were not removed.  The pattern is applied with match(), so each alternative
# is anchored at the start of the (stripped) line.
_arcget_harmless_re = re.compile('|'.join([
    r'$',
    r'Results stored at:.*',
    r'Warning: Some jobs were not removed.*',
]))

def check_arcget_output(output):
    """Classify the textual output of an arcget invocation.

    Each line is stripped and inspected:

      * a line starting with 'Use arcclean to remove' requests a separate
        arcclean run,
      * a line matching `_arcget_harmless_re` is ignored,
      * any other line marks the output as not harmless.

    All lines are inspected, so the arcclean hint is detected even when it
    follows a line which is not harmless.  A missing output (`None`) is
    treated like empty output.

    Returns a tuple `(is_harmless, need_arcclean)` of booleans.
    """
    is_harmless = True
    need_arcclean = False
    for ln in (output or '').split('\n'):
        ln = ln.strip()
        if ln.startswith('Use arcclean to remove'):
            need_arcclean = True
        elif not _arcget_harmless_re.match(ln):
            # Keep scanning; a later line may still request arcclean.
            is_harmless = False
    return is_harmless, need_arcclean

class Check_arcce_monitor(jobutils.JobNagiosPlugin):
    def __init__(self):
        """Initialise the generic job plugin and register the command-line
        options specific to job monitoring.

        The numeric limits are declared with an explicit `type`, so that
        values given on the command line are converted to numbers just like
        the defaults.  Without the conversion they would arrive as strings
        and the comparisons against counters and time spans done while
        monitoring would not be numeric.
        """
        jobutils.JobNagiosPlugin.__init__(self)
        ap = self.argparser
        ap.add_argument('--ce', dest = 'ces',
                default = [], action = 'append',
                metavar = 'CE',
                help = 'Pass one or more times to restrict monitoring '
                       'to the given CEs.')
        ap.add_argument('--termination-service', dest = 'termination_service',
                default = 'ARCCE Job Termination',
                help = 'Default service to submit result to if not specified '
                       'when submitting the job. '
                       'Deprecated: Should be passed on submission.')
        # Note that the option is spelled "sysinfo" while the destination
        # is spelled "infosys"; both are kept for compatibility.
        ap.add_argument('--max-sysinfo-lag', dest = 'max_infosys_lag',
                default = 3600.0, metavar = 'T', type = float,
                help = 'The maximum time to wait for a job to turn up in '
                       'the arcstat listing before abandoning it.')
        ap.add_argument('--max-check-attempts', dest = 'max_check_attempts',
                default = 12, metavar = 'N', type = int,
                help = 'The maximum number of consecutive times a job in '
                       'post-SUBMITTED state is absent from arcstat listing '
                       'before it is abandoned.')
        ap.add_argument('--max-fetch-attempts', dest = 'max_fetch_attempts',
                default = 8, metavar = 'N', type = int,
                help = 'The maximum number of attempts to fetch a job before '
                       'abandoning it.')
        ap.add_argument('--keep-failed-jobdata', dest = 'keep_failed_jobdata',
                action = 'store_true', default = False,
                help = 'Keep the job descriptions and output directories for '
                       'failed jobs. These will not be removed automatically.')
        ap.add_argument('--keep-all-jobdata', dest = 'keep_all_jobdata',
                action = 'store_true', default = False,
                help = 'Keep the job descriptions and output directories for '
                       'all jobs. These will not be removed automatically.')

    def parse_args(self, args):
        """Parse the command line `args`.

        All the work is delegated to the generic job plugin; nothing is
        post-processed here and nothing is returned.
        """
        base = jobutils.JobNagiosPlugin
        base.parse_args(self, args)

    def _clean_output_dir(self, dir):
        """Move aside whatever is left in the job output directory `dir`.

        Every entry is renamed by appending a '.conflict-<timestamp>' suffix,
        where the timestamp is the current time in whole seconds, and a
        warning is logged for each renamed entry.  Entries which already
        carry a '.conflict-' marker were moved away by an earlier run and are
        left alone, so that suffixes do not pile up.
        """
        conflict = '.conflict-%d' % int(time.time())
        for entry in os.listdir(dir):
            if '.conflict-' in entry:
                continue
            subdir = os.path.join(dir, entry)
            self.log.warning('Moving away partially fetched output %s.'
                             % subdir)
            os.rename(subdir, subdir + conflict)

    def prepare_top_output_dir(self, jd):
        """Return the path of the top-level output directory for the job
        `jd`, i.e. the `JOB_OUTPUT_DIRNAME` entry under the work directory of
        the job's host and tag.

        If the path already exists, its current content is first moved out of
        the way with `_clean_output_dir`.  The directory is not created here.
        """
        output_top = os.path.join(self.workdir_for(jd.host, jd.job_tag),
                                  self.JOB_OUTPUT_DIRNAME)
        if not os.path.exists(output_top):
            return output_top
        self._clean_output_dir(output_top)
        return output_top

    def locate_output_dir(self, top_output_dir):
        """Return the path of the first entry under `top_output_dir` which
        has not been moved aside, or `None` if there is no such entry.

        Entries whose names contain '.conflict-' are leftovers renamed by
        `_clean_output_dir` and are skipped, as are '.' and '..'.  `None` is
        also returned if `top_output_dir` is not an existing directory.
        """
        if not os.path.isdir(top_output_dir):
            return None
        for subdir in os.listdir(top_output_dir):
            if subdir in ('.', '..') or '.conflict-' in subdir:
                continue
            return os.path.join(top_output_dir, subdir)
        return None

    def fetch_job(self, jd, job_error = None):
        """Fetch the job described by `jd : JobInfo`, submit passive results,
        and return a tuple `(did_fetch, check_ok, status_code)`, where

            `did_fetch`   indicates whether the job was fetched,
            `check_ok`    indicates whether checking went well, and
            `status_code` is the overall Nagios status reported to the passive
                          services for this job.

        If `job_error` is given, it is logged both to the plugin log and to
        the termination report.

        The outcomes are:

            `(True, False, ...)`  arcget exited with 0 but no output
                                  directory was found,
            `(False, True, ...)`  arcget exited non-zero and no output
                                  directory was found,
            `(True, True, ...)`   an output directory was found.

        The job tests are only run if the job state is `J_FINISHED`.
        """

        service_name = jd.termination_service or self.opts.termination_service
        termination_report = self.nagios_report_for(jd.host, service_name)
        termination_report.update_status(nagutils.OK, 'Job succeeded.')

        # Report the final job state if the job failed.
        if jd.job_state != arcutils.J_FINISHED:
            termination_report.update_status(nagutils.CRITICAL,
                    'Job terminated as %s.'%jd.job_state)

        if job_error:
            self.log.error(job_error)
            termination_report.log.error(job_error)

        # Try to fetch the job. Exit if no files were fetched.
        self.log.info('Fetching job %s in terminal state %s.'
                      %(jd.job_id, jd.job_state))
        top_output_dir = self.prepare_top_output_dir(jd)
        arcget_rc, arcget_out = \
            self.run_arc_cmd('arcget', '-D', top_output_dir, jd.job_id)
        job_output_dir = self.locate_output_dir(top_output_dir)
        if job_output_dir is None:
            if arcget_rc == 0:
                # job_output_dir is None here, so report the directory which
                # was searched.
                self.log.error('Subdirectory from arcget not found, it '
                               'should have been under %s.'%top_output_dir)
                # Do not mask a failure status already set above.
                if termination_report.status_code == nagutils.OK:
                    termination_report.update_status(nagutils.UNKNOWN,
                            'Output directory from arcget not found.')
                termination_report.log.error('JID: %s'%jd.job_id)
                return True, False, termination_report.status_code
            else:
                self.log.error('Failed to fetch %s.'%jd.job_id)
                termination_report.update_status(nagutils.WARNING,
                        'Failed to fetch job.')
                if arcget_out:
                    details = 'Output from arcget:\n%s'%arcget_out
                    self.log.error(details)
                    termination_report.log.error(details)
                termination_report.log.error('JID: %s'%jd.job_id)
                return False, True, termination_report.status_code

        # Check if arcget returned non-zero despite having fetched something.
        if arcget_rc != 0:
            is_harmless, need_arcclean = check_arcget_output(arcget_out)
            if need_arcclean:
                termination_report.log.warning('Separate arcclean needed.')
                self.cleaner.call('arcclean', jd.job_id)
            if not is_harmless:
                termination_report.update_status(nagutils.WARNING,
                    'arcget %s %s: %s'
                    % (jd.job_id, utils.exited_with(arcget_rc), arcget_out))

        # For a failed job, publish its stderr.txt if present and skip the
        # job tests.
        if jd.job_state != arcutils.J_FINISHED:
            errors = \
                utils.file_contents(os.path.join(job_output_dir, 'stderr.txt'))
            if errors is not None:
                details = 'Errors:\n%s'%errors
                self.log.error(details)
                termination_report.log.error(details)
            elif not job_error:
                self.log.error('No stderr.txt found for %s.' % jd.job_id)
            termination_report.log.error('JID: %s'%jd.job_id)
            return True, True, termination_report.status_code

        # Run check and publish results from job tests.
        termination_report.log.info('JID: %s'%jd.job_id)
        status_code = termination_report.status_code
        for test_name in jd.tests:
            test = self.load_jobtest(test_name, hostname = jd.host)
            if test.service_description:
                report = self.nagios_report_for(jd.host,
                                                test.service_description)
            else:
                report = self.nagios_report
            test.check(report, job_output_dir, jd.stored_urls)
            # The overall status is the highest status code of any report.
            if report.status_code > status_code:
                status_code = report.status_code

        if status_code != nagutils.OK:
            termination_report.log.error('JID: %s'%jd.job_id)
        return True, True, status_code

    def check_job_state_timeout(self, jd, jobstat):
        """Report to the progress service of `jd` whether the job has stayed
        too long in its current state.

        Nothing is done if `jd.job_state_time` or `jd.progress_service` is
        `None`.  Otherwise the thresholds are looked up in the configuration
        section 'arcce.job-states', first under the specific job state and
        then under the name of the generic job state; the first option found
        is used.  Its value is a whitespace-separated list of `key:value`
        items, of which `c` and `w` are used as integer limits on the age of
        the state (in the units of `time.time()`) for a critical and a
        warning status, respectively.

        A passive result is only submitted when the status differs from
        `jd.job_state_alert`, which is then updated.  The `jobstat` argument
        is currently not used.
        """
        if jd.job_state_time is None or jd.progress_service is None:
            return
        attrs = {}
        for ck_state in [jd.job_specific_state, jd.job_state.name]:
            if ck_state and \
                    self.config.has_option('arcce.job-states', str(ck_state)):
                specs = self.config.get('arcce.job-states', str(ck_state))
                # Items without a colon are silently ignored.
                attrs = dict(kv.split(':', 1) for kv in specs.split()
                                              if ':' in kv)
                break
        job_state_age = time.time() - jd.job_state_time
        # The specific state is stored as job_specific_state on the job
        # record, the same attribute consulted for the lookup above.
        stuck_msg = 'Stuck in state %s (%s).' \
                % (jd.job_specific_state, jd.job_state.name)
        if 'c' in attrs and job_state_age > int(attrs['c']):
            status = nagutils.CRITICAL
            msg = stuck_msg
        elif 'w' in attrs and job_state_age > int(attrs['w']):
            status = nagutils.WARNING
            msg = stuck_msg
        else:
            status = nagutils.OK
            msg = 'Normal progress.'
        # This also triggers in the initial case when jd.job_state_alert is
        # None, to clear any lingering alerts.
        if status != jd.job_state_alert:
            report = self.nagios_report_for(jd.host, jd.progress_service)
            report.update_status(status, msg)
            jd.job_state_alert = status

    def check(self):
        """Monitor submitted jobs.

        The active job files under the top work directory (restricted to the
        directories named by `--ce`, if given) are loaded and the jobs are
        queried with arcstat.  For each job:

          * If it is missing from the arcstat result, it is given more time,
            rechecked a limited number of times, or finally discarded.
          * Otherwise its recorded state is updated and checked for
            timeouts.  A job in a final state is fetched and then cleaned
            up, retried later, or discarded once the fetch attempts are
            used up.

        Returns `ServiceOK` if there is nothing to monitor or no check
        failed, `ServiceCRITICAL` if at least one fetch reported a failed
        check, and `ServiceUNKNOWN` if a `ParseError` was raised while
        querying or processing the jobs.
        """

        if not os.path.exists(self.top_workdir):
            self.log.info('The work directory is %s.'%self.top_workdir)
            return ServiceOK('No jobs to monitor since the working directory '
                             'has not yet been created.')
        self.require_voms_proxy()

        error_count = 0
        jd_of_jobid = {}
        dirs = self.opts.ces
        if not dirs:
            dirs = [name for name in os.listdir(self.top_workdir)
                    if os.path.isdir(os.path.join(self.top_workdir, name))]
        for dirname in dirs:
            # A work directory is named either HOST or HOST#JOB_TAG.
            if '#' in dirname:
                host, job_tag = dirname.split('#', 1)
            else:
                host, job_tag = dirname, None
            workdir = self.workdir_for(host, job_tag)
            ajf = os.path.join(workdir, self.ACTIVE_JOB_FILENAME)
            if not os.path.exists(ajf):
                self.log.debug('No active job info for %s.'%host)
            else:
                # Best effort: a job file which cannot be loaded is logged
                # and skipped, so that the remaining jobs are still checked.
                try:
                    jd = self.load_active_job(host, job_tag)
                    jd.host = host
                    jd.job_tag = job_tag
                    jd_of_jobid[jd.job_id] = jd
                except Exception as xc:
                    self.log.error('Cannot load job file %s: %s'%(ajf, xc))

        query_jobids = [jd.job_id for jd in jd_of_jobid.values()]
        if not query_jobids:
            msg = 'No jobs to query, found %d in terminal states.' \
                %len(jd_of_jobid)
            return ServiceOK(msg)
        self.log.debug('Querying job IDs %s'%', '.join(query_jobids))
        try:
            jobstats = arcutils.arcstat(query_jobids, log = self.log)
            self.log.info('Queried %d jobs, found %d.'
                          % (len(query_jobids), len(jobstats)))
            for jobid in query_jobids:
                jd = jd_of_jobid[jobid]
                if not jobid in jobstats:
                    # Job missing from arcstat output can happen
                    #   a) right after submission before it becomes available,
                    #   b) temporarily if the CE infosys is unavailable, or
                    #   c) if the job has been permanently removed.
                    jd.check_attempts = jd.check_attempts or 0
                    if jd.job_state == arcutils.J_NOT_SEEN \
                            and time.time() - jd.submission_time \
                              < self.opts.max_infosys_lag:
                        # We hope it's case a and give it more time.
                        self.log.info('Job %s of kind %s on %s not found yet.'
                                % (jobid, jd.job_tag, jd.host))
                    elif jd.check_attempts < self.opts.max_check_attempts:
                        # We hope it's case a or b and make a fixed number of
                        # attempts.
                        jd.check_attempts = jd.check_attempts + 1
                        self.log.info('Job %s of kind %s on %s missing for '
                                      'the %s time in state %s, still checking.'
                                % (jobid, jd.job_tag, jd.host,
                                   nth(jd.check_attempts), jd.job_state))
                        self.save_active_job(jd, jd.host, jd.job_tag)
                    else:
                        # We give up, assuming c) the job has been removed,
                        # but discard_job schedules repeated attempts to
                        # remove the job and any staged files while new jobs
                        # are run.
                        self.log.info('Job %s of kind %s on %s disappeared in '
                                      'state %s, removing active job info.' \
                                % (jobid, jd.job_tag, jd.host, jd.job_state))
                        # A disappeared job counts as failed; its data is
                        # also kept when all job data is to be kept.
                        self.discard_job(jd,
                                archive = self.opts.keep_failed_jobdata
                                       or self.opts.keep_all_jobdata)
                    continue
                jd.check_attempts = 0
                jobstat = jobstats[jobid]
                self.log.debug('Checking job on %s.'%jd.host)

                # Restart the state timer whenever the state changes.
                if jd.job_state != jobstat.state \
                        or jd.job_specific_state != jobstat.specific_state:
                    jd.job_state = jobstat.state
                    jd.job_specific_state = jobstat.specific_state
                    jd.job_state_time = int(time.time())
                self.check_job_state_timeout(jd, jobstat)

                jd.check_time = str(int(time.time()))
                if jd.job_state.is_final():
                    did_fetch, ok_check, status_code = \
                        self.fetch_job(jd, jobstat.job_error)
                    if not ok_check:
                        error_count += 1
                    archive = (self.opts.keep_failed_jobdata
                                and status_code != nagutils.OK) \
                           or self.opts.keep_all_jobdata
                    # The counter may be unset on a job which has not been
                    # retried yet.
                    fetch_attempts = jd.fetch_attempts or 0
                    if did_fetch:
                        self.cleanup_job(jd, archive = archive)
                    elif fetch_attempts >= self.opts.max_fetch_attempts:
                        self.log.warning('Giving up on fetching %s.' % jobid)
                        self.discard_job(jd, archive = archive)
                    else:
                        jd.fetch_attempts = fetch_attempts + 1
                        self.log.info('Will retry fetching %s.' % jobid)
                        self.save_active_job(jd, jd.host, jd.job_tag)
                else:
                    self.save_active_job(jd, jd.host, jd.job_tag)
        except ParseError as xc:
            return ServiceUNKNOWN('%s'%xc)
        self.cleaner # Trigger self.cleaner.run at exit
        for jd in jd_of_jobid.values():
            self.log.info('Host %s is in state %s.'%(jd.host, jd.job_state))
        if error_count == 0:
            return ServiceOK('Checked %d jobs.'%len(jobstats))
        else:
            return ServiceCRITICAL('Checked %d jobs, got %d error(s).'
                    %(len(jobstats), error_count))
