Commit def4cfd0 authored by Manuel Günther's avatar Manuel Günther
Browse files

Implemented resubmission of jobs to another queue; enabled jman's --job-ids...

Implemented resubmission of jobs to another queue; enabled jman's --job-ids parameter to be more sophisticated; listed command line is now wrapped with '' e.g. to allow directories with spaces; local jobs can now be run with priority; added warning when queue combination is not valid (and jobs will never execute); fixed small bugs.
parent b40364d1
......@@ -154,6 +154,11 @@ Usually, it is a good idea to combine the ``-a`` option with ``-j``, which will
$ bin/jman -vv list -a -j [job_id_1] [job_id_2]
Note that the ``-j`` option is in general relatively smart.
You can use it to select a range of job ids, e.g., ``-j 1-4 6-8``.
In this case, please assert that there are no spaces between job ids and the ``-`` separator.
If any job id is specified that is not available in the database, it will simply be ignored, including job ids that lie inside the given ranges.
Inspecting log files
--------------------
......@@ -170,6 +175,7 @@ E.g.:
will print the contents of the output and error log file from the job with the desired ID (and only the array job with the given ID).
To report only the output or only the error logs, you can use the ``-o`` or ``-e`` option, respectively.
When some (array-)jobs are still running, use the ``-u`` option to list their current output and/or error logs.
Hopefully, that helps in debugging the problem!
......@@ -198,9 +204,9 @@ E.g. use:
.. code-block:: sh
$ bin/jman -vv delete -s success
$ bin/jman -vv delete -s success -j 10-20
to delete all jobs and the logs of all successfully finished jobs from the database.
to delete all jobs and the logs of all successfully finished jobs with job ids from 10 to 20 from the database.
Other command line tools
......
......@@ -41,6 +41,9 @@ class JobManagerLocal(JobManager):
def submit(self, command_line, name = None, array = None, dependencies = [], log_dir = None, dry_run = False, stop_on_failure = False, **kwargs):
"""Submits a job that will be executed on the local machine during a call to "run".
All kwargs will simply be ignored."""
# remove duplicate dependencies
dependencies = sorted(list(set(dependencies)))
# add job to database
self.lock()
job = add_job(self.session, command_line=command_line, name=name, dependencies=dependencies, array=array, log_dir=log_dir, stop_on_failure=stop_on_failure)
......@@ -59,12 +62,12 @@ class JobManagerLocal(JobManager):
return job_id
def resubmit(self, job_ids = None, failed_only = False, running_jobs = False):
def resubmit(self, job_ids = None, also_success = False, running_jobs = False, **kwargs):
"""Re-submit jobs automatically"""
self.lock()
# iterate over all jobs
jobs = self.get_jobs(job_ids)
accepted_old_status = ('failure',) if failed_only else ('success', 'failure')
accepted_old_status = ('success', 'failure') if also_success else ('failure',)
for job in jobs:
# check if this job needs re-submission
if running_jobs or job.status in accepted_old_status:
......@@ -82,7 +85,7 @@ class JobManagerLocal(JobManager):
jobs = self.get_jobs(job_ids)
for job in jobs:
if job.status in ('executing', 'queued', 'waiting'):
if job.status in ('executing', 'queued', 'waiting') and job.queue_name == 'local':
logger.info("Reset job '%s' in the database" % job.name)
job.submit()
......@@ -115,7 +118,7 @@ class JobManagerLocal(JobManager):
#####################################################################
###### Methods to run the jobs in parallel on the local machine #####
def _run_parallel_job(self, job_id, array_id = None, no_log = False):
def _run_parallel_job(self, job_id, array_id = None, no_log = False, nice = None):
"""Executes the code for this job on the local machine."""
environ = copy.deepcopy(os.environ)
environ['JOB_ID'] = str(job_id)
......@@ -127,6 +130,9 @@ class JobManagerLocal(JobManager):
# generate call to the wrapper script
command = [self.wrapper_script, '-ld', self._database, 'run-job']
if nice is not None:
command = ['nice', '-n%d'%nice] + command
job, array_job = self._job_and_array(job_id, array_id)
logger.info("Starting execution of Job '%s': '%s'" % (self._format_log(job_id, array_id, len(job.array)), job.name))
# create log files
......@@ -152,7 +158,7 @@ class JobManagerLocal(JobManager):
def _format_log(self, job_id, array_id = None, array_count = 0):
return ("%d (%d/%d)" % (job_id, array_id, array_count)) if array_id is not None and array_count else ("%d (%d)" % (job_id, array_id)) if array_id is not None else ("%d" % job_id)
def run_scheduler(self, parallel_jobs = 1, job_ids = None, sleep_time = 0.1, die_when_finished = False, no_log = False):
def run_scheduler(self, parallel_jobs = 1, job_ids = None, sleep_time = 0.1, die_when_finished = False, no_log = False, nice = None):
"""Starts the scheduler, which is constantly checking for jobs that should be ran."""
running_tasks = []
try:
......@@ -178,6 +184,7 @@ class JobManagerLocal(JobManager):
logger.info("Job '%s' finished execution with result %s" % (self._format_log(job_id, array_id), result))
# in any case, remove the job from the list
del running_tasks[task_index]
# SECOND, check if new jobs can be submitted; THIS NEEDS TO LOCK THE DATABASE
if len(running_tasks) < parallel_jobs:
# get all unfinished jobs:
......@@ -185,7 +192,7 @@ class JobManagerLocal(JobManager):
jobs = self.get_jobs(job_ids)
# put all new jobs into the queue
for job in jobs:
if job.status == 'submitted':
if job.status == 'submitted' and job.queue_name == 'local':
job.queue()
# get all unfinished jobs that are submitted to the local queue
......@@ -202,7 +209,7 @@ class JobManagerLocal(JobManager):
for i in range(min(parallel_jobs - len(running_tasks), len(queued_array_jobs))):
array_job = queued_array_jobs[i]
# start a new job from the array
process = self._run_parallel_job(job.id, array_job.id, no_log=no_log)
process = self._run_parallel_job(job.id, array_job.id, no_log=no_log, nice=nice)
if process is None:
continue
running_tasks.append((process, job.id, array_job.id))
......@@ -215,7 +222,7 @@ class JobManagerLocal(JobManager):
else:
if job.status == 'queued':
# start a new job
process = self._run_parallel_job(job.id, no_log=no_log)
process = self._run_parallel_job(job.id, no_log=no_log, nice=nice)
if process is None:
continue
running_tasks.append((process, job.id))
......@@ -245,5 +252,5 @@ class JobManagerLocal(JobManager):
logger.warn("Killing job '%s' that was still running." % self._format_log(task[1], task[2] if len(task) > 2 else None))
task[0].kill()
self.stop_job(task[1])
# stopp all jobs that are currently running or queued
# stop all jobs that are currently running or queued
self.stop_jobs()
......@@ -209,7 +209,8 @@ class JobManager:
if print_array_jobs and job.array:
print(array_delimiter)
for array_job in job.array:
print(array_job.format(array_format))
if array_job.status in status:
print(array_job.format(array_format))
print(array_delimiter)
self.unlock()
......
......@@ -214,6 +214,7 @@ class Job(Base):
# In python 2, the command line is unicode, which needs to be converted to string before pickling;
# In python 3, the command line is bytes, which can be pickled directly
args = loads(self.grid_arguments)['kwargs'] if isinstance(self.grid_arguments, bytes) else loads(str(self.grid_arguments))['kwargs']
# in any case, the commands have to be converted to str
retval = {}
if 'pe_opt' in args:
retval['pe_opt'] = args['pe_opt']
......@@ -226,8 +227,14 @@ class Job(Base):
if 'io_big' in args and args['io_big']:
retval['io_big'] = True
# also add the queue
if self.queue_name is not None:
retval['queue'] = str(self.queue_name)
return retval
def set_arguments(self, **kwargs):
self.grid_arguments = dumps(kwargs)
def get_jobs_we_wait_for(self):
return [j.waited_for_job for j in self.jobs_we_have_to_wait_for if j.waited_for_job is not None]
......@@ -243,6 +250,16 @@ class Job(Base):
return os.path.join(self.log_dir, (self.name if self.name else 'job') + ".e" + str(self.id)) if self.log_dir else None
def _cmdline(self):
cmdline = self.get_command_line()
c = ""
for cmd in cmdline:
if cmd[0] == '-':
c += "%s " % cmd
else:
c += "'%s' " % cmd
return c
def __str__(self):
id = "%d" % self.id
if self.machine_name: m = "%s - %s" % (self.queue_name, self.machine_name)
......@@ -253,11 +270,11 @@ class Job(Base):
else: n = "<Job: %s>" % id
if self.result is not None: r = "%s (%d)" % (self.status, self.result)
else: r = "%s" % self.status
return "%s | %s : %s -- %s" % (n, m, r, " ".join(self.get_command_line()))
return "%s | %s : %s -- %s" % (n, m, r, self._cmdline())
def format(self, format, dependencies = 0, limit_command_line = None):
"""Formats the current job into a nicer string to fit into a table."""
command_line = " ".join(self.get_command_line())
command_line = self._cmdline()
if limit_command_line is not None and len(command_line) > limit_command_line:
command_line = command_line[:limit_command_line-3] + '...'
......@@ -271,7 +288,7 @@ class Job(Base):
command_line = "<" + ",".join(["%s=%s" % (key,value) for key,value in grid_opt.iteritems()]) + ">: " + command_line
if dependencies:
deps = str([dep.id for dep in self.get_jobs_we_wait_for()])
deps = str(sorted(list(set([dep.id for dep in self.get_jobs_we_wait_for()]))))
if dependencies < len(deps):
deps = deps[:dependencies-3] + '...'
return format.format(job_id, queue, status, self.name, deps, command_line)
......
......@@ -73,6 +73,23 @@ def get_array(array):
return (a,b,c)
def get_ids(jobs):
    """Converts a list of job id strings into a flat list of integer ids.

    Each entry is either a single id (e.g. ``"7"``) or an inclusive range
    ``"first-last"`` (e.g. ``"1-4"``), which is expanded into all ids it
    covers. Returns ``None`` when ``jobs`` is ``None`` (i.e., no ids were
    specified on the command line).
    """
    if jobs is None:
        return None
    ids = []
    for spec in jobs:
        first, dash, last = spec.partition('-')
        if dash:
            # inclusive range "first-last"
            ids.extend(range(int(first), int(last) + 1))
        else:
            ids.append(int(spec))
    return ids
def submit(args):
"""Submission command"""
......@@ -91,7 +108,7 @@ def submit(args):
'memfree': args.memory,
'hvmem': args.memory,
'io_big': args.io_big,
}
}
if args.array is not None: kwargs['array'] = get_array(args.array)
if args.log_dir is not None: kwargs['log_dir'] = args.log_dir
......@@ -111,7 +128,25 @@ def resubmit(args):
jm = setup(args)
if not args.keep_logs:
jm.delete(job_ids=args.job_ids, delete_jobs=False)
jm.resubmit(args.job_ids, args.failed_only, args.running_jobs)
kwargs = {
'cwd': True
}
if args.qname is not None:
kwargs['queue'] = args.qname
if args.memory is not None:
kwargs['memfree'] = args.memory
kwargs['hvmem'] = args.memory
if args.parallel is not None:
kwargs['pe_opt'] = "pe_mth %d" % args.parallel
kwargs['memfree'] = "%d%s" % (int(args.memory.rstrip(string.ascii_letters)) * args.parallel, args.memory.lstrip(string.digits))
if args.io_big:
kwargs['io_big'] = True
if args.no_io_big:
kwargs['io_big'] = False
jm.resubmit(get_ids(args.job_ids), args.also_success, args.running_jobs, **kwargs)
def run_scheduler(args):
......@@ -119,13 +154,13 @@ def run_scheduler(args):
if not args.local:
raise ValueError("The execute command can only be used with the '--local' command line option")
jm = setup(args)
jm.run_scheduler(parallel_jobs=args.parallel, job_ids=args.job_ids, sleep_time=args.sleep_time, die_when_finished=args.die_when_finished, no_log=args.no_log_files)
jm.run_scheduler(parallel_jobs=args.parallel, job_ids=get_ids(args.job_ids), sleep_time=args.sleep_time, die_when_finished=args.die_when_finished, no_log=args.no_log_files, nice=args.nice)
def list(args):
"""Lists the jobs in the given database."""
jm = setup(args)
jm.list(job_ids=args.job_ids, print_array_jobs=args.print_array_jobs, print_dependencies=args.print_dependencies, status=args.status, long=args.verbose > 1 or args.long, ids_only=args.ids_only)
jm.list(job_ids=get_ids(args.job_ids), print_array_jobs=args.print_array_jobs, print_dependencies=args.print_dependencies, status=args.status, long=args.verbose > 1 or args.long, ids_only=args.ids_only)
def communicate(args):
......@@ -133,13 +168,13 @@ def communicate(args):
if args.local:
raise ValueError("The communicate command can only be used without the '--local' command line option")
jm = setup(args)
jm.communicate(job_ids=args.job_ids)
jm.communicate(job_ids=get_ids(args.job_ids))
def report(args):
"""Reports the results of the finished (and unfinished) jobs."""
jm = setup(args)
jm.report(job_ids=args.job_ids, array_ids=args.array_ids, unfinished=args.unfinished_also, output=not args.errors_only, error=not args.output_only)
jm.report(job_ids=get_ids(args.job_ids), array_ids=get_ids(args.array_ids), unfinished=args.unfinished_also, output=not args.errors_only, error=not args.output_only)
def stop(args):
......@@ -147,7 +182,7 @@ def stop(args):
if args.local:
raise ValueError("Stopping commands locally is not supported (please kill them yourself)")
jm = setup(args)
jm.stop_jobs(args.job_ids)
jm.stop_jobs(get_ids(args.job_ids))
def delete(args):
......@@ -157,7 +192,7 @@ def delete(args):
if not args.local and 'executing' in args.status:
stop(args)
# then, delete them from the database
jm.delete(job_ids=args.job_ids, array_ids=args.array_ids, delete_logs=not args.keep_logs, delete_log_dir=not args.keep_log_dir, status=args.status)
jm.delete(job_ids=get_ids(args.job_ids), array_ids=get_ids(args.array_ids), delete_logs=not args.keep_logs, delete_log_dir=not args.keep_log_dir, status=args.status)
def run_job(args):
......@@ -227,7 +262,7 @@ def main(command_line_options = None):
# subcommand 'submit'
submit_parser = cmdparser.add_parser('submit', aliases=['sub'], formatter_class=formatter, help='Submits jobs to the SGE queue or to the local job scheduler and logs them in a database.')
submit_parser.add_argument('-q', '--queue', metavar='QNAME', dest='qname', default='all.q', choices=('q1d', 'q1w', 'q1m', 'q1dm', 'q1wm'), help='the name of the SGE queue to submit the job to')
submit_parser.add_argument('-q', '--queue', metavar='QNAME', dest='qname', default='all.q', choices=('all.q', 'q1d', 'q1w', 'q1m', 'q1dm', 'q1wm'), help='the name of the SGE queue to submit the job to')
submit_parser.add_argument('-m', '--memory', help='Sets both the h_vmem and the mem_free parameters when submitting the job to the specified value, e.g. 8G to set the memory requirements to 8 gigabytes')
submit_parser.add_argument('-p', '--parallel', '--pe_mth', type=int, help='Sets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.')
submit_parser.add_argument('-n', '--name', dest='name', help='Gives the job a name')
......@@ -237,26 +272,31 @@ def main(command_line_options = None):
submit_parser.add_argument('-s', '--environment', metavar='KEY=VALUE', dest='env', nargs='*', default=[], help='Passes specific environment variables to the job.')
submit_parser.add_argument('-t', '--array', '--parametric', metavar='(first-)last(:step)', help="Creates a parametric (array) job. You must specify the 'last' value, but 'first' (default=1) and 'step' (default=1) can be specified as well (when specifying 'step', 'first' has to be given, too).")
submit_parser.add_argument('-z', '--dry-run', action='store_true', help='Do not really submit anything, just print out what would submit in this case')
submit_parser.add_argument('-I', '--io-big', action='store_true', help='Sets "io_big" on the submitted jobs so it limits the machines in which the job is submitted to those that can do high-throughput.')
submit_parser.add_argument('-i', '--io-big', action='store_true', help='Sets "io_big" on the submitted jobs so it limits the machines in which the job is submitted to those that can do high-throughput.')
submit_parser.add_argument('job', metavar='command', nargs=argparse.REMAINDER, help = "The job that should be executed. Sometimes a -- is required to separate the job from other command line options.")
submit_parser.set_defaults(func=submit)
# subcommand 're-submit'
resubmit_parser = cmdparser.add_parser('resubmit', aliases=['reset', 'requeue', 're'], formatter_class=formatter, help='Re-submits a list of jobs.')
resubmit_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).')
resubmit_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).')
resubmit_parser.add_argument('-q', '--queue', metavar='QNAME', dest='qname', choices=('all.q', 'q1d', 'q1w', 'q1m', 'q1dm', 'q1wm'), help='Reset the SGE queue to submit the job to')
resubmit_parser.add_argument('-m', '--memory', help='Resets both the h_vmem and the mem_free parameters when submitting the job to the specified value, e.g. 8G to set the memory requirements to 8 gigabytes')
resubmit_parser.add_argument('-p', '--parallel', '--pe_mth', type=int, help='Resets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.')
resubmit_parser.add_argument('-i', '--io-big', action='store_true', help='Resubmits the job to the "io_big" queue.')
resubmit_parser.add_argument('-I', '--no-io-big', action='store_true', help='Resubmits the job NOT to the "io_big" queue.')
resubmit_parser.add_argument('-k', '--keep-logs', action='store_true', help='Do not clean the log files of the old job before re-submitting.')
resubmit_parser.add_argument('-f', '--failed-only', action='store_true', help='Re-submit only jobs that have failed.')
resubmit_parser.add_argument('-s', '--also-success', action='store_true', help='Re-submit also jobs that have finished successfully.')
resubmit_parser.add_argument('-a', '--running-jobs', action='store_true', help='Re-submit even jobs that are running or waiting (use this flag with care).')
resubmit_parser.set_defaults(func=resubmit)
# subcommand 'stop'
stop_parser = cmdparser.add_parser('stop', formatter_class=formatter, help='Stops the execution of jobs in the grid.')
stop_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='Stop only the jobs with the given ids (by default, all jobs are stopped).')
stop_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Stop only the jobs with the given ids (by default, all jobs are stopped).')
stop_parser.set_defaults(func=stop)
# subcommand 'list'
list_parser = cmdparser.add_parser('list', aliases=['ls'], formatter_class=formatter, help='Lists jobs stored in the database. Use the -vv option to get a long listing.')
list_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='List only the jobs with the given ids (by default, all jobs are listed)')
list_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='List only the jobs with the given ids (by default, all jobs are listed)')
list_parser.add_argument('-a', '--print-array-jobs', action='store_true', help='Also list the array ids.')
list_parser.add_argument('-l', '--long', action='store_true', help='Prints additional information about the submitted job.')
list_parser.add_argument('-x', '--print-dependencies', action='store_true', help='Print the dependencies of the jobs as well.')
......@@ -266,7 +306,7 @@ def main(command_line_options = None):
# subcommand 'communicate'
stop_parser = cmdparser.add_parser('communicate', aliases = ['com'], formatter_class=formatter, help='Communicates with the grid to see if there were unexpected errors (e.g. a timeout) during the job execution.')
stop_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='Check only the jobs with the given ids (by default, all jobs are checked)')
stop_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Check only the jobs with the given ids (by default, all jobs are checked)')
stop_parser.set_defaults(func=communicate)
......@@ -275,14 +315,14 @@ def main(command_line_options = None):
report_parser.add_argument('-e', '--errors-only', action='store_true', help='Only report the error logs (by default, both logs are reported).')
report_parser.add_argument('-o', '--output-only', action='store_true', help='Only report the output logs (by default, both logs are reported).')
report_parser.add_argument('-u', '--unfinished-also', action='store_true', help='Report also the unfinished jobs; use this option also to check error files for jobs with success status.')
report_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='Report only the jobs with the given ids (by default, all finished jobs are reported)')
report_parser.add_argument('-a', '--array-ids', metavar='ID', nargs='*', type=int, help='Report only the jobs with the given array ids. If specified, a single job-id must be given as well.')
report_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Report only the jobs with the given ids (by default, all finished jobs are reported)')
report_parser.add_argument('-a', '--array-ids', metavar='ID', nargs='+', help='Report only the jobs with the given array ids. If specified, a single job-id must be given as well.')
report_parser.set_defaults(func=report)
# subcommand 'delete'
delete_parser = cmdparser.add_parser('delete', aliases=['del', 'rm', 'remove'], formatter_class=formatter, help='Removes jobs from the database; if jobs are running or are still scheduled in SGE, the jobs are also removed from the SGE queue.')
delete_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='Delete only the jobs with the given ids (by default, all jobs are deleted).')
delete_parser.add_argument('-a', '--array-ids', metavar='ID', nargs='*', type=int, help='Delete only the jobs with the given array ids. If specified, a single job-id must be given as well. Note that the whole job including all array jobs will be removed from the SGE queue.')
delete_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Delete only the jobs with the given ids (by default, all jobs are deleted).')
delete_parser.add_argument('-a', '--array-ids', metavar='ID', nargs='+', help='Delete only the jobs with the given array ids. If specified, a single job-id must be given as well. Note that the whole job including all array jobs will be removed from the SGE queue.')
delete_parser.add_argument('-r', '--keep-logs', action='store_true', help='If set, the log files will NOT be removed.')
delete_parser.add_argument('-R', '--keep-log-dir', action='store_true', help='When removing the logs, keep the log directory.')
delete_parser.add_argument('-s', '--status', nargs='+', choices = Status, default = Status, help='Delete only jobs that have the given statuses; by default all jobs are deleted.')
......@@ -291,10 +331,11 @@ def main(command_line_options = None):
# subcommand 'run_scheduler'
scheduler_parser = cmdparser.add_parser('run-scheduler', aliases=['sched', 'x'], formatter_class=formatter, help='Runs the scheduler on the local machine. To stop the scheduler safely, please use Ctrl-C; only valid in combination with the \'--local\' option.')
scheduler_parser.add_argument('-p', '--parallel', type=int, default=1, help='Select the number of parallel jobs that you want to execute locally')
scheduler_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='Select the job ids that should be run (be default, all submitted and queued jobs are run).')
scheduler_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Select the job ids that should be run (be default, all submitted and queued jobs are run).')
scheduler_parser.add_argument('-s', '--sleep-time', type=float, default=0.1, help='Set the sleep time between for the scheduler in seconds.')
scheduler_parser.add_argument('-x', '--die-when-finished', action='store_true', help='Let the job manager die when it has finished all jobs of the database.')
scheduler_parser.add_argument('-l', '--no-log-files', action='store_true', help='Overwrites the log file setup to print the results to the console.')
scheduler_parser.add_argument('-n', '--nice', type=int, help='Jobs will be run with the given priority (can only be positive, i.e., to have lower priority')
scheduler_parser.set_defaults(func=run_scheduler)
......
......@@ -42,10 +42,10 @@ class JobManagerSGE(JobManager):
process it we have to split it twice (',' and then on '='), create a
dictionary and extract just the qname"""
if not 'hard resource_list' in kwargs: return 'all.q'
d = dict([reversed(k.split('=')) for k in kwargs['hard resource_list'].split(',')])
if not 'TRUE' in d: return 'all.q'
return d['TRUE']
d = dict([k.split('=') for k in kwargs['hard resource_list'].split(',')])
for k in d:
if k[0] == 'q' and d[k] == 'TRUE': return k
return 'all.q'
def _submit_to_grid(self, job, name, array, dependencies, log_dir, **kwargs):
......@@ -54,6 +54,9 @@ class JobManagerSGE(JobManager):
jman = self.wrapper_script
python = sys.executable
# remove duplicate dependencies
dependencies = sorted(list(set(dependencies)))
# generate call to the wrapper script
command = make_shell(python, [jman, '-d', self._database, 'run-job'])
q_array = "%d-%d:%d" % array if array else None
......@@ -67,6 +70,11 @@ class JobManagerSGE(JobManager):
logger.info("Submitted job '%s' to the SGE grid." % job)
if 'io_big' in kwargs and kwargs['io_big'] and ('queue' not in kwargs or kwargs['queue'] == 'all.q'):
logger.warn("This job will never be executed since the 'io_big' flag is not available for the 'all.q'.")
if 'pe_opt' in kwargs and ('queue' not in kwargs or kwargs['queue'] not in ('q1dm', 'q_1day_mth', 'q1wm', 'q_1week_mth')):
logger.warn("This job will never be executed since the queue '%s' does not support multi-threading (pe_mth) -- use 'q1dm' or 'q1wm' instead." % kwargs['queue'] if 'queue' in kwargs else 'all.q')
assert job.id == grid_id
return grid_id
......@@ -112,22 +120,31 @@ class JobManagerSGE(JobManager):
self.unlock()
def resubmit(self, job_ids = None, failed_only = False, running_jobs = False):
def resubmit(self, job_ids = None, also_success = False, running_jobs = False, **kwargs):
"""Re-submit jobs automatically"""
self.lock()
# iterate over all jobs
jobs = self.get_jobs(job_ids)
accepted_old_status = ('failure',) if failed_only else ('success', 'failure')
accepted_old_status = ('success', 'failure') if also_success else ('failure',)
for job in jobs:
# check if this job needs re-submission
if running_jobs or job.status in accepted_old_status:
grid_status = qstat(job.id, context=self.context)
if len(grid_status) != 0:
logger.warn("Deleting job '%d' since it was still running in the grid." % job.id)
qdel(job.id, context=self.context)
# re-submit job to the grid
if job.queue_name == 'local':
arguments = job.get_arguments()
arguments.update(**kwargs)
job.set_arguments(kwargs=arguments)
# delete old status and result of the job
job.submit()
if job.queue_name == 'local' and 'queue' not in arguments:
logger.warn("Re-submitting job '%s' locally (since no queue name is specified)." % job)
else:
logger.debug("Re-submitting job '%s' to the grid." % job)
self._submit_to_grid(job, job.name, job.get_array(), [dep.id for dep in job.get_jobs_we_wait_for()], job.log_dir, **job.get_arguments())
job.submit()
deps = [dep.id for dep in job.get_jobs_we_wait_for()]
logger.debug("Re-submitting job '%s' with dependencies '%s' to the grid." % (job, deps))
self._submit_to_grid(job, job.name, job.get_array(), deps, job.log_dir, **arguments)
self.session.commit()
self.unlock()
......@@ -143,6 +160,10 @@ class JobManagerSGE(JobManager):
qdel(job.id, context=self.context)
logger.info("Stopped job '%s' in the SGE grid." % job)
job.status = 'submitted'
for array_job in job.array:
if array_job.status in ('executing', 'queued', 'waiting'):
array_job.status = 'submitted'
self.session.commit()
self.unlock()
......@@ -194,7 +194,7 @@ class GridTKTest(unittest.TestCase):
jman.main(['./bin/jman', '--database', self.database, 'report'])
# clean-up
jman.main(['./bin/jman', '--local', '--database', self.database, 'delete'])
jman.main(['./bin/jman', '--local', '--database', self.database, 'delete', '--job-ids', '1-2'])
# check that the database and the log files are gone
self.assertEqual(len(os.listdir(self.temp_dir)), 0)
......
......@@ -9,7 +9,7 @@ if sys.version_info[:2] < (2, 7) or ((3,0) <= sys.version_info[:2] < (3,2)):
setup(
name='gridtk',
version='1.1.1a0',
version='1.1.3a0',
description='SGE Grid and Local Submission and Monitoring Tools for Idiap',
url='http://github.com/idiap/gridtk',
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment