Commit a0e86541 authored by André Anjos's avatar André Anjos 💬

Initial version

parents
*.py?
*~
*.egg-info
*.swp
#!/idiap/group/torch5spro/nightlies/externals/v2/linux-x86_64/pyenv/bin/python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 24 Aug 2011 17:21:28 CEST

# Self-locating launcher: prepends the installation directory to the module
# search path so the 'gridtk' package can be imported, then hands control to
# the gridtk command-line interface.

import os
import sys

# sys.argv[0] is the path to this script; resolve symlinks so the real
# installation directory (where the package lives) ends up on sys.path
install_dir = os.path.realpath(os.path.dirname(sys.argv[0]))
sys.path.append(install_dir)

from gridtk.scripts.grid import main
main()
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 24 Aug 2011 13:06:25 CEST
"""Defines the job manager which can help you managing submitted grid jobs.
"""
import os
import time
import logging
import anydbm
from cPickle import dumps, loads
from .tools import qsub, qstat, qdel
from .setshell import environ
class Job:
"""The job class describes a job"""
def __init__(self, data, args, kwargs):
self.data = data
self.data['user_args'] = args
self.data['user_kwargs'] = kwargs
def id(self):
"""Returns my own numerical id"""
return int(self.data['job_number'])
def age(self, short=True):
"""Returns a string representation indicating, approximately, how much time
has ellapsed since the job was submitted. The input argument must be a
string as defined in the filed 'submission_time' """
translate = {
's': 'second',
'm': 'minute',
'h': 'hour',
'd': 'day',
'w': 'week',
}
s = time.mktime(time.strptime(self.data['submission_time']))
diff = time.time() - s
unit = 's'
if diff > 60: # more than a minute
unit = 'm'
diff /= 60.
if diff > 60: # more than an hour
unit = 'h'
diff /= 60.
if diff > 24: # more than a day
diff /= 24.
unit = 'd'
if diff > 7: # more than a week
diff /= 7.
unit = 'w'
value = int(round(diff))
if short:
return "%d%s" % (value, unit)
else:
plural = "" if value == 1 else "s"
return "%d %s%s" % (value, translate[unit], plural)
def stdout_filename(self):
"""Returns the stdout filename for this job, with the full path"""
base_dir = self.data['sge_o_home']
if self.data.has_key('cwd'): base_dir = self.data['cwd']
# add-on outor directory
if self.data.has_key('stdout_path_list'):
p = self.data['stdout_path_list'].split(':')[2]
if p[0] == os.sep: base_dir = p
else: base_dir = os.path.join(base_dir, p)
return os.path.join(base_dir, self.data['job_name'] +
'.o%s' % self.data['job_number'])
def stderr_filename(self):
"""Returns the stderr filename for this job, with the full path"""
base_dir = self.data['sge_o_home']
if self.data.has_key('cwd'): base_dir = self.data['cwd']
# add-on error directory
if self.data.has_key('stderr_path_list'):
p = self.data['stderr_path_list'].split(':')[2]
if p[0] == os.sep: base_dir = p
else: base_dir = os.path.join(base_dir, p)
return os.path.join(base_dir, self.data['job_name'] +
'.e%s' % self.data['job_number'])
def check(self):
"""Checks if the job was detected to be completed"""
err_file = self.stderr_filename()
try:
if os.stat(err_file).st_size != 0:
logging.debug("Job %s has a stderr file with size != 0" % \
self.data['job_number'])
return False
except OSError, e:
logging.warn("Could not find error file '%s'" % err_file)
logging.debug("Zero size error log at '%s'" % err_file)
return True
def __str__(self):
"""Returns a string containing a short job description"""
return "%s @%s (%s ago) %s" % (self.data['job_number'],
self.data['hard'].split('=')[1], self.age(short=False),
' '.join(self.data['user_args'][0]))
def row(self, fmt):
"""Returns a string containing the job description suitable for a table"""
return fmt % (self.data['job_number'],
self.data['hard'].split('=')[1], self.age(),
' '.join(self.data['user_args'][0]))
def stderr(self):
"""Returns a string with the contents of the stderr file"""
err_file = self.stderr_filename()
try:
return open(err_file, 'rt').read()
except OSError, e:
logging.warn("Could not find error file '%s'" % err_file)
return ""
def stdout(self):
"""Returns a string with the contents of the stdout file"""
out_file = self.stdout_filename()
try:
return open(out_file, 'rt').read()
except OSError, e:
logging.warn("Could not find output file '%s'" % output_file)
return ""
def has_key(self, key):
return self.data.has_key(key)
def keys(self):
return self.data.keys()
def values(self):
return self.data.values()
def __getitem__(self, key):
return self.data[key]
def __setitem__(self, key, value):
self.data[key] = value
def __delitem__(self, key):
del self.data[key]
class JobManager:
"""The JobManager will submit and control the status of submitted jobs"""
def __init__(self, statefile='.jobmanager.db', context='grid'):
"""Intializes this object with a state file and a method for qsub'bing.
Keyword parameters:
statefile
The file containing a valid status database for the manager. If the file
does not exist it is initialized. If it exists, it is loaded.
context
The context to provide when setting up the environment to call the SGE
utilities such as qsub, qstat and qdel (normally 'grid', which also
happens to be default)
"""
self.state_file = statefile
self.state_db = anydbm.open(self.state_file, 'c')
self.job = {}
logging.debug("Loading previous state...")
for k in self.state_db.keys():
ki = loads(k)
self.job[ki] = loads(self.state_db[k])
logging.debug("Job %d loaded" % ki)
self.context = environ(context)
def __del__(self):
"""Safely terminates the JobManager"""
db = anydbm.open(self.state_file, 'n') # erase previously recorded jobs
for k in sorted(self.job.keys()): db[dumps(k)] = dumps(self.job[k])
if not self.job:
logging.debug("Removing file %s because there are no more jobs to store" \
% self.state_file)
os.unlink(self.state_file)
def submit(self, *args, **kwargs):
"""Calls the configure qsub method and registers the job"""
kwargs['context'] = self.context
jobid = qsub(*args, **kwargs)
del kwargs['context']
self.job[jobid] = Job(qstat(jobid, context=self.context), args, kwargs)
return self.job[jobid]
def resubmit(self, job, dependencies=[]):
"""Re-submit jobs automatically"""
if dependencies: job['user_kwargs']['deps'] = dependencies
return self.submit(job['user_args'][0], **job['user_kwargs'])
def keys(self):
return self.job.keys()
def __getitem__(self, key):
return self.job[key]
def __delitem__(self, key):
if not self.job.has_key(key): raise KeyError, key
qdel(key, context=self.context)
del self.job[key]
def __str__(self):
"""Returns the status of each job still being tracked"""
# configuration
fields = ("job-id", "queue", "age", "arguments")
lengths = (8, 5, 3, 55)
marker = '='
# work
fmt = "%%%ds %%%ds %%%ds %%-%ds" % lengths
delimiter = fmt % tuple([k*marker for k in lengths])
header = [fields[k].center(lengths[k]) for k in range(len(lengths))]
header = ' '.join(header)
return '\n'.join([header] + [delimiter] + \
[self[k].row(fmt) for k in self.job])
def clear(self):
"""Clear the whole job queue"""
for k in self.keys(): del self[k]
def describe(self, key):
"""Returns a string explaining a certain job"""
return str(self[key])
def stdout(self, key):
"""Gets the output of a certain job"""
return self[key].stdout()
def stderr(self, key):
"""Gets the error output of a certain job"""
return self[key].stderr()
def refresh(self):
"""Conducts a qstat over all jobs in the cache. If the job is not present
anymore check the logs directory for output and error files. If the size of
the error file is different than zero, warn the user.
Returns two lists: jobs that work and jobs that require attention
(error file does not have size 0).
"""
success = []
error = []
for k in sorted(self.job.keys()):
d = qstat(k, context=self.context)
if not d: #job has finished. check
status = self.job[k].check()
if status:
success.append(self.job[k])
del self.job[k]
logging.debug("Job %d completed successfuly" % k)
else:
error.append(self.job[k])
del self.job[k]
logging.debug("Job %d probably did not complete successfuly" % k)
return success, error
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 27 Jul 2011 14:36:06 CEST
"""Executes a given command within the context of a shell script that has its
enviroment set like Idiap's 'SETSHELL grid' does."""
import os
import sys
def main():
if len(sys.argv) < 2:
print __doc__
print "usage: %s <command> [arg [arg ...]]" % \
os.path.basename(sys.argv[0])
sys.exit(1)
from ..setshell import replace
replace('grid', sys.argv[1:])
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 24 Aug 2011 16:13:31 CEST
"""A logging Idiap/SGE job manager
"""
__epilog__ = """ For a list of available commands:
>>> %(prog)s --help
For a list of options for a particular command:
>>> %(prog)s <command> --help
"""
import os
import sys
import anydbm
from cPickle import dumps
import argparse
from ..manager import JobManager
from ..tools import make_python_wrapper, make_torch_wrapper
def setup(args):
  """Builds and returns the JobManager, configuring logging verbosity."""
  options = {'statefile': args.db} if args.db else {}
  manager = JobManager(**options)

  # set-up logging
  if args.verbose:
    import logging
    logging.basicConfig(level=logging.DEBUG)

  return manager
def ls(args):
"""List action"""
jm = setup(args)
print jm
def save_jobs(j, name):
  """Stores the given list of jobs into a pickled dbm database."""
  db = anydbm.open(name, 'c')
  for job in j:
    # keys are the pickled integer job ids, values the pickled jobs
    jid = int(job['job_number'])
    db[dumps(jid)] = dumps(job)
def refresh(args):
"""Refresh action"""
jm = setup(args)
(good, bad) = jm.refresh()
if good:
print "These jobs finished well:"
for k in good: print k
if args.successdb: save_jobs(good, args.successdb)
if bad:
print "These jobs require attention:"
for k in bad: print k
if args.faildb: save_jobs(bad, args.faildb)
def remove(f):
"""Remove files in a safe way"""
if os.path.exists(f):
os.unlink(f)
print " removed `%s'" % f
def delete(args):
jm = setup(args)
for k in args.jobid:
if jm.has_key(k):
J = jm[k]
del jm[k]
print "Deleted job %s" % descr
if args.also_logs:
remove(J.stdout_filename())
remove(J.stderr_filename())
else:
print "Ignored job %d (not found on manager)" % k
def submit(args):
jm = setup(args)
kwargs = {
'queue': args.qname,
'cwd': True,
'name': args.name,
'deps': args.deps,
'stdout': args.stdout,
'stderr': args.stderr,
'env': args.env,
}
jobid = jm.submit(args.job, **kwargs)
print 'Submitted', jm.describe(jobid)
def wsubmit(args):
jm = setup(args)
kwargs = {
'queue': args.qname,
'cwd': True,
'name': args.name,
'deps': args.deps,
'stdout': args.stdout,
'stderr': args.stderr,
'env': args.env,
}
command, kwargs = make_python_wrapper(args.wrapper, args.job, kwargs)
job = jm.submit(command, **kwargs)
job = jm.submit(args.wrapper, args.job, **kwargs)
print 'Submitted (wrapped)', job
def tsubmit(args):
jm = setup(args)
kwargs = {
'queue': args.qname,
'cwd': True,
'name': args.name,
'deps': args.deps,
'stdout': args.stdout,
'stderr': args.stderr,
'env': args.env,
}
command, kwargs = make_torch_wrapper(args.torch, args.torch_debug,
args.job, kwargs)
job = jm.submit(command, **kwargs)
print 'Submitted (torch\'d)', job
def explain(args):
"""Explain action"""
jm = setup(args)
jobs = jm.keys()
if args.jobid: jobs = args.jobid
first_time = True
for k in jobs:
if not first_time: print 79*'-'
first_time = False
J = jm[k]
print "Job", J
print "Command line:", J['user_args'], J['user_kwargs']
print
print "%d stdout (%s)" % (k, J.stdout_filename())
print J.stdout()
print
print "%d stderr (%s)" % (k, J.stderr_filename())
print J.stderr()
def cleanup(args):
"""Cleanup action"""
jm = setup(args)
jobs = jm.keys()
if args.jobid: jobs = args.jobid
for k in jobs:
J = jm[k]
print 'Cleaning-up logs for job', J
remove(J.stdout_filename())
remove(J.stderr_filename())
if args.remove_job:
del jm[k]
print ' deleted job %s from database' % J['job_number']
def resubmit(args):
jm = setup(args)
fromjm = JobManager(args.fromdb)
jobs = fromjm.keys()
if args.jobid: jobs = args.jobid
for k in jobs:
O = fromjm[k]
J = jm.resubmit(O, args.deps)
print 'Re-submitted job', J
if args.cleanup:
remove(O.stdout_filename())
remove(O.stderr_filename())
del fromjm[k]
print ' deleted job %s from database' % O['job_number']
def add_submission_options(parser):
  """Registers on ``parser`` the command-line options shared by every job
  submission command (submit, wsubmit, tsubmit)."""

  # stdout/stderr directories share the same explanation, modulo one word
  log_help = ('Set the standard %s of the job to be placed in the given '
      'directory - relative paths are interpreted according to the currently '
      'working directory or the home directory if the option --cwd was not '
      'given')

  parser.add_argument('-q', '--queue', metavar='QNAME', dest='qname',
      default='all.q',
      help='the name of the SGE queue to submit the job to (defaults to %(default)s)')
  # --cwd is always ON since it helps job management; the original optional
  # flag is kept here for reference only
  #parser.add_argument('-c', '--cwd', default=False, action='store_true',
  #  dest='cwd', help='Makes SGE switch to the current working directory before executing the job')
  parser.add_argument('-n', '--name', dest='name', help='Sets the jobname')
  parser.add_argument('-x', '--dependencies', '--deps', dest='deps',
      type=int, default=[], metavar='ID', nargs='*',
      help='set job dependencies by giving this option an a list of job identifiers separated by spaces')
  parser.add_argument('-o', '--stdout', '--out', metavar='DIR',
      dest='stdout', default='logs', help=log_help % 'output')
  parser.add_argument('-e', '--stderr', '--err', metavar='DIR',
      dest='stderr', default='logs', help=log_help % 'error')
  parser.add_argument('-s', '--environment', '--env', metavar='KEY=VALUE',
      dest='env', nargs='*', default=[],
      help='Passes specific environment variables to the job')
class AliasedSubParsersAction(argparse._SubParsersAction):
  """Hack taken from https://gist.github.com/471779 to allow aliases in
  argparse for python 2.x (this has been implemented on python 3.2)
  """

  class _AliasedPseudoAction(argparse.Action):
    # Fake action whose only purpose is to render 'name (alias1,alias2)'
    # in the subcommand help listing; it is never invoked for parsing.

    def __init__(self, name, aliases, help):
      dest = name
      if aliases:
        # shown to the user as, e.g., 'list (ls)'
        dest += ' (%s)' % ','.join(aliases)
      sup = super(AliasedSubParsersAction._AliasedPseudoAction, self)
      sup.__init__(option_strings=[], dest=dest, help=help)

  def add_parser(self, name, **kwargs):
    """Like the parent's add_parser(), but accepts an extra 'aliases'
    keyword: a list of alternative names mapping to the same subparser."""
    # 'aliases' is not understood by argparse._SubParsersAction on python 2:
    # extract it before delegating
    if 'aliases' in kwargs:
      aliases = kwargs['aliases']
      del kwargs['aliases']
    else:
      aliases = []

    parser = super(AliasedSubParsersAction, self).add_parser(name, **kwargs)

    # Make the aliases work.
    # NOTE(review): relies on the private argparse attributes
    # _name_parser_map and _choices_actions -- tied to the 2.x implementation
    for alias in aliases:
      self._name_parser_map[alias] = parser

    # Make the help text reflect them, first removing old help entry.
    if 'help' in kwargs:
      help = kwargs.pop('help')
      self._choices_actions.pop()
      pseudo_action = self._AliasedPseudoAction(name, aliases, help)
      self._choices_actions.append(pseudo_action)

    return parser
def main():
  """Parses the command line and dispatches to the selected subcommand.

  Each subparser installs its implementation function as the 'func' default,
  which is invoked with the parsed arguments at the end.
  """

  parser = argparse.ArgumentParser(description=__doc__, epilog=__epilog__,
      formatter_class=argparse.RawDescriptionHelpFormatter)

  # part of the hack to support aliases in subparsers
  parser.register('action', 'parsers', AliasedSubParsersAction)

  # general options
  parser.add_argument('-d', '--database', metavar='FILE', dest='db', help='replace the default database by one provided by you; this option is only required if you are running outside the directory where you originally submitted the jobs from or if you have altered manually the location of the JobManager database')
  parser.add_argument('-v', '--verbose', dest='verbose', default=False,
      action='store_true', help='increase verbosity for this script')
  cmdparser = parser.add_subparsers(title='commands', help='commands accepted by %(prog)s')

  # subcommand 'list'
  lsparser = cmdparser.add_parser('list', aliases=['ls'],
      help='lists jobs stored in the database')
  lsparser.add_argument('-f', '--full', dest='full', default=False,
      action='store_true', help='increases information on job lists')
  lsparser.set_defaults(func=ls)

  # subcommand 'refresh'
  # NOTE(review): both flags below use action='store_false' with a filename
  # default, so passing the flag DISABLES saving (sets the value to False);
  # the flag takes no file argument despite what the help text suggests
  refparser = cmdparser.add_parser('refresh', aliases=['ref'],
      help='refreshes the current list of executing jobs by querying SGE, updates the databases of currently executing jobs. If you wish, it may optionally save jobs that executed successfuly and/or failed execution')
  refparser.add_argument('-s', '--no-success-db', default='success.db', action='store_false', dest='successdb', help='if you provide a name of a file, jobs that have succeeded will be saved on this file')
  refparser.add_argument('-f', '--no-fail-db', dest='faildb', default='failure.db', action='store_false',
      help='if you provide a name of a file, jobs that have failed will be saved on this file')
  refparser.set_defaults(func=refresh)

  # subcommand 'explain'
  # the positional 'db' lands on args.db, which setup() uses as the statefile
  exparser = cmdparser.add_parser('explain', aliases=['why'],
      help='explains why jobs failed in a database')
  exparser.add_argument('db', metavar='FILE',
      help='the name of the database to explain the jobs from')
  exparser.add_argument('jobid', metavar='ID', nargs='*', type=int,
      default=[], help='by default I\'ll explain all jobs, unless you limit giving job identifiers')
  exparser.set_defaults(func=explain)

  # subcommand 'cleanup'
  cleanparser = cmdparser.add_parser('cleanup', aliases=['clean', 'mrproper'],
      help='remove all logging traces of a job - this action only makes sense for completed jobs')
  cleanparser.add_argument('db', metavar='FILE',
      help='the name of the database to cleanup the jobs from')
  cleanparser.add_argument('jobid', metavar='ID', nargs='*', type=int,
      default=[], help='by default I\'ll clean-up all jobs, unless you limit giving job identifiers')
  cleanparser.add_argument('-r', '--remove-job', dest='remove_job', default=False, action='store_true', help='if set I\'ll also remove the job reference from the database')
  cleanparser.set_defaults(func=cleanup)

  # subcommand 'delete'
  delparser = cmdparser.add_parser('delete', aliases=['del', 'rm', 'remove'],
      help='removes jobs from the database; if jobs are running or are still scheduled in SGE, the jobs are also removed from the SGE queue')
  delparser.add_argument('jobid', metavar='ID', nargs='+', type=int,
      help='the SGE job identifiers as provided by the list command (first field)')
  delparser.add_argument('-r', '--remove-logs', dest='also_logs', default=False, action='store_true', help='if set I\'ll also remove the logs if they exist')
  delparser.set_defaults(func=delete)

  # subcommand 'submit'
  subparser = cmdparser.add_parser('submit', aliases=['sub'],
      help='submits self-contained jobs to the SGE queue and logs them in a private database')
  add_submission_options(subparser)
  subparser.set_defaults(func=submit)
  subparser.add_argument('job', metavar='command', nargs='+')

  # subcommand 'wsubmit'
  wsubparser = cmdparser.add_parser('wsubmit', aliases=['wsub', 'wrapper'],
      help='submits a job that will be executed inside the context of a python wrapper script - note the wrapper script will be considered the SGE job and the actual prefixed command just an option; the wrapper script must be able to run and self-configure using stock components available in the OS')
  add_submission_options(wsubparser)
  wsubparser.set_defaults(func=wsubmit)
  wsubparser.add_argument('-w', '--wrapper', metavar='WRAPPER', dest='wrapper',
      help='the python wrapper that will bracket the script execution and options')
  wsubparser.add_argument('job', metavar='command', nargs='+')

  # subcommand 'torch'
  tsubparser = cmdparser.add_parser('tsubmit', aliases=['tsub', 'torch'],
      help='submits a job that will be executed inside the context of a torch release')
  add_submission_options(tsubparser)
  tsubparser.set_defaults(func=tsubmit)
  tsubparser.add_argument('-t', '--torch', '--torch-root', metavar='DIR',
      default='/idiap/group/torch5spro/nightlies/last', help='the root directory of a valid torch installation (defaults to %(default)s)')
  tsubparser.add_argument('-D', '--torch-debug', dest='torch_debug', default=False, action='store_true', help='if set I\'ll setup the torch environment in debug mode')
  tsubparser.add_argument('job', metavar='command', nargs='+')

  # subcommand 'resubmit'
  resubparser = cmdparser.add_parser('resubmit', aliases=['resub', 're'],
      help='resubmits all jobs in a given database, exactly like they were submitted the first time')
  resubparser.add_argument('fromdb', metavar='FILE',
      help='the name of the database to re-submit the jobs from')
  resubparser.add_argument('jobid', metavar='ID', nargs='*', type=int,
      default=[], help='by default I\'ll re-submit all jobs, unless you limit giving job identifiers')
  resubparser.add_argument('-r', '--cleanup', dest='cleanup', default=False, action='store_true', help='if set I\'ll also remove the old logs if they exist and the re-submitted job from the re-submission database')
  resubparser.add_argument('-x', '--dependencies', '--deps', dest='deps', type=int, default=[], metavar='ID', nargs='*', help='when you re-submit jobs, dependencies are reset; if you need dependencies, add them using this variable')
  resubparser.set_defaults(func=resubmit)

  args = parser.parse_args()

  # run the selected action and terminate cleanly
  args.func(args)

  sys.exit(0)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 24 Aug 2011 09:20:40 CEST
"""Wrappers for Idiap's SETSHELL functionality
"""
import os
import sys
import signal
import subprocess
import logging
def environ(context):
"""Retrieves the environment for a particular SETSHELL context"""
BASEDIRSETSHELL = os.environ['BASEDIRSETSHELL']
dosetshell = '%s/setshell/bin/dosetshell' % BASEDIRSETSHELL
command = [dosetshell, '-s', 'sh', context]
# First things first, we get the path to the temp file created by dosetshell
try:
logging.debug("Executing: '%s'", ' '.join(command))
p = subprocess.Popen(command, stdout = subprocess.PIPE)
except OSError as e:
# occurs when the file is not executable or not found
raise OSError, "Error executing '%s': %s (%d)" % \
(' '.join(command), e.strerror, e.errno)