From 5b40fbe56b5738d5ffe71bc9f52f7f6531fcefdc Mon Sep 17 00:00:00 2001 From: Manuel Gunther <siebenkopf@googlemail.com> Date: Tue, 15 Mar 2016 20:00:35 -0600 Subject: [PATCH] Added option to run submitted job in a given directory (not tested in grid mode) --- gridtk/local.py | 13 ++++++++----- gridtk/manager.py | 3 ++- gridtk/models.py | 17 ++++++++++++++--- gridtk/script/jman.py | 3 +++ gridtk/sge.py | 4 ++-- gridtk/tests/__init__.py | 33 +++++++++++++++++++++++---------- version.txt | 2 +- 7 files changed, 53 insertions(+), 22 deletions(-) diff --git a/gridtk/local.py b/gridtk/local.py index d6dea66..017d23b 100644 --- a/gridtk/local.py +++ b/gridtk/local.py @@ -38,7 +38,7 @@ class JobManagerLocal(JobManager): JobManager.__init__(self, **kwargs) - def submit(self, command_line, name = None, array = None, dependencies = [], log_dir = None, dry_run = False, stop_on_failure = False, **kwargs): + def submit(self, command_line, name = None, array = None, dependencies = [], exec_dir = None, log_dir = None, dry_run = False, stop_on_failure = False, **kwargs): """Submits a job that will be executed on the local machine during a call to "run". 
All kwargs will simply be ignored.""" # remove duplicate dependencies @@ -46,7 +46,7 @@ class JobManagerLocal(JobManager): # add job to database self.lock() - job = add_job(self.session, command_line=command_line, name=name, dependencies=dependencies, array=array, log_dir=log_dir, stop_on_failure=stop_on_failure) + job = add_job(self.session, command_line=command_line, name=name, dependencies=dependencies, array=array, exec_dir=exec_dir, log_dir=log_dir, stop_on_failure=stop_on_failure) logger.info("Added job '%s' to the database", job) if dry_run: @@ -158,7 +158,7 @@ class JobManagerLocal(JobManager): try: return subprocess.Popen(command, env=environ, stdout=out, stderr=err, bufsize=1) except OSError as e: - logger.error("Could not execute job '%s' (%s) locally\n- reason:\t%s\n- command line:\t%s\n- command:\t%s", job.name, self._format_log(job_id, array_id, len(job.array)), e, " ".join(job.get_command_line()), " ".join(command)) + logger.error("Could not execute job '%s' (%s) locally\n- reason:\t%s\n- command line:\t%s\n- directory:\t%s\n- command:\t%s", job.name, self._format_log(job_id, array_id, len(job.array)), e, " ".join(job.get_command_line()), "." if job.exec_dir is None else job.exec_dir, " ".join(command)) job.finish(117, array_id) # ASCII 'O' return None @@ -262,8 +262,11 @@ class JobManagerLocal(JobManager): self.unlock() logger.info("Stopping task scheduler due to user interrupt.") for task in running_tasks: - logger.warn("Killing job '%s' that was still running." 
% self._format_log(task[1], task[2] if len(task) > 2 else None)) - task[0].kill() + logger.warn("Killing job '%s' that was still running.", self._format_log(task[1], task[2] if len(task) > 2 else None)) + try: + task[0].kill() + except OSError as e: + logger.error("Killing job '%s' was not successful: '%s'", self._format_log(task[1], task[2] if len(task) > 2 else None), e) self.stop_job(task[1]) # stop all jobs that are currently running or queued self.stop_jobs(job_ids) diff --git a/gridtk/manager.py b/gridtk/manager.py index eadb491..7223162 100644 --- a/gridtk/manager.py +++ b/gridtk/manager.py @@ -147,11 +147,12 @@ class JobManager: self.lock() job = self.get_jobs((job_id,))[0] command_line = job.get_command_line() + exec_dir = job.get_exec_dir() self.unlock() # execute the command line of the job, and wait until it has finished try: - result = subprocess.call(command_line) + result = subprocess.call(command_line, cwd=exec_dir) except Exception as e: print("ERROR: The job with id '%d' could not be executed: %s" % (job_id, e), file=sys.stderr) result = 69 # ASCII: 'E' diff --git a/gridtk/models.py b/gridtk/models.py index 92822fa..600a67e 100644 --- a/gridtk/models.py +++ b/gridtk/models.py @@ -71,6 +71,7 @@ class Job(Base): machine_name = Column(String(10)) # The name of the machine in which the job is run grid_arguments = Column(String(255)) # The kwargs arguments for the job submission (e.g. 
in the grid) id = Column(Integer) # The ID of the job as given from the grid + exec_dir = Column(String(255)) # The directory in which the command should be executed log_dir = Column(String(255)) # The directory where the log files will be put to array_string = Column(String(255)) # The array string (only needed for re-submission) stop_on_failure = Column(Boolean) # An indicator whether to stop depending jobs when this job finishes with an error @@ -78,13 +79,14 @@ class Job(Base): status = Column(Enum(*Status)) result = Column(Integer) - def __init__(self, command_line, name = None, log_dir = None, array_string = None, queue_name = 'local', machine_name = None, stop_on_failure = False, **kwargs): + def __init__(self, command_line, name = None, exec_dir = None, log_dir = None, array_string = None, queue_name = 'local', machine_name = None, stop_on_failure = False, **kwargs): """Constructs a Job object without an ID (needs to be set later).""" self.command_line = dumps(command_line) self.name = name self.queue_name = queue_name # will be set during the queue command later self.machine_name = machine_name # will be set during the execute command later self.grid_arguments = dumps(kwargs) + self.exec_dir = exec_dir self.log_dir = log_dir self.stop_on_failure = stop_on_failure self.array_string = dumps(array_string) @@ -207,6 +209,13 @@ class Job(Base): """Sets / overwrites the command line for the job.""" self.command_line = dumps(command_line) + def get_exec_dir(self): + """Returns the directory in which the job should be executed, or None if no directory was set.""" + # The stored directory is resolved with os.path.realpath and converted to str; + # None is passed through unchanged, so the result can be handed on directly as the 'cwd' of a subprocess call + return str(os.path.realpath(self.exec_dir)) if self.exec_dir is not None else None + + def get_array(self): """Returns the array arguments for the job; usually a string.""" @@ -292,6 +301,8 @@ class Job(Base): if grid_opt: # add additional information about the job at the end 
command_line = "<" + ",".join(["%s=%s" % (key,value) for key,value in grid_opt.iteritems()]) + ">: " + command_line + if self.exec_dir is not None: + command_line += "; [Executed in directory: '%s']" % self.exec_dir if dependencies: deps = str(sorted(list(set([dep.unique for dep in self.get_jobs_we_wait_for()])))) @@ -321,9 +332,9 @@ class JobDependence(Base): -def add_job(session, command_line, name = 'job', dependencies = [], array = None, log_dir = None, stop_on_failure = False, **kwargs): +def add_job(session, command_line, name = 'job', dependencies = [], array = None, exec_dir=None, log_dir = None, stop_on_failure = False, **kwargs): """Helper function to create a job, add the dependencies and the array jobs.""" - job = Job(command_line=command_line, name=name, log_dir=log_dir, array_string=array, stop_on_failure=stop_on_failure, kwargs=kwargs) + job = Job(command_line=command_line, name=name, exec_dir=exec_dir, log_dir=log_dir, array_string=array, stop_on_failure=stop_on_failure, kwargs=kwargs) session.add(job) session.flush() diff --git a/gridtk/script/jman.py b/gridtk/script/jman.py index 5734b24..11608b3 100644 --- a/gridtk/script/jman.py +++ b/gridtk/script/jman.py @@ -120,6 +120,7 @@ def submit(args): } if args.array is not None: kwargs['array'] = get_array(args.array) + if args.exec_dir is not None: kwargs['exec_dir'] = args.exec_dir if args.log_dir is not None: kwargs['log_dir'] = args.log_dir if args.dependencies is not None: kwargs['dependencies'] = args.dependencies if args.qname != 'all.q': kwargs['hvmem'] = args.memory @@ -130,6 +131,7 @@ def submit(args): kwargs['dry_run'] = args.dry_run kwargs['stop_on_failure'] = args.stop_on_failure + # submit the job job_id = jm.submit(args.job, **kwargs) @@ -283,6 +285,7 @@ def main(command_line_options = None): submit_parser.add_argument('-n', '--name', dest='name', help='Gives the job a name') submit_parser.add_argument('-x', '--dependencies', type=int, default=[], metavar='ID', nargs='*', help='Set job 
dependencies to the list of job identifiers separated by spaces') submit_parser.add_argument('-k', '--stop-on-failure', action='store_true', help='Stop depending jobs when this job finished with an error.') + submit_parser.add_argument('-d', '--exec-dir', metavar='DIR', help='Sets the executing directory, where the script should be executed. If not given, jobs will be executed in the current directory') submit_parser.add_argument('-l', '--log-dir', metavar='DIR', help='Sets the log directory. By default, "logs" is selected for the SGE. If the jobs are executed locally, by default the result is written to console.') submit_parser.add_argument('-s', '--environment', metavar='KEY=VALUE', dest='env', nargs='*', default=[], help='Passes specific environment variables to the job.') submit_parser.add_argument('-t', '--array', '--parametric', metavar='(first-)last(:step)', help="Creates a parametric (array) job. You must specify the 'last' value, but 'first' (default=1) and 'step' (default=1) can be specified as well (when specifying 'step', 'first' has to be given, too).") diff --git a/gridtk/sge.py b/gridtk/sge.py index be08706..ab6515b 100644 --- a/gridtk/sge.py +++ b/gridtk/sge.py @@ -80,11 +80,11 @@ class JobManagerSGE(JobManager): return job.unique - def submit(self, command_line, name = None, array = None, dependencies = [], log_dir = "logs", dry_run = False, stop_on_failure = False, **kwargs): + def submit(self, command_line, name = None, array = None, dependencies = [], exec_dir = None, log_dir = "logs", dry_run = False, stop_on_failure = False, **kwargs): """Submits a job that will be executed in the grid.""" # add job to database self.lock() - job = add_job(self.session, command_line, name, dependencies, array, log_dir=log_dir, stop_on_failure=stop_on_failure, context=self.context, **kwargs) + job = add_job(self.session, command_line, name, dependencies, array, exec_dir=exec_dir, log_dir=log_dir, stop_on_failure=stop_on_failure, context=self.context, **kwargs) 
logger.info("Added job '%s' to the database." % job) if dry_run: print("Would have added the Job") diff --git a/gridtk/tests/__init__.py b/gridtk/tests/__init__.py index aa6149e..e0e3e54 100644 --- a/gridtk/tests/__init__.py +++ b/gridtk/tests/__init__.py @@ -47,10 +47,12 @@ class GridTKTest(unittest.TestCase): # first, add some commands to the database script_1 = pkg_resources.resource_filename('gridtk.tests', 'test_script.sh') script_2 = pkg_resources.resource_filename('gridtk.tests', 'test_array.sh') + rdir = pkg_resources.resource_filename('gridtk', 'tests') from gridtk.script import jman # add a simple script that will write some information to the jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_1', bash, script_1]) jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_2', '--dependencies', '1', '--parametric', '1-7:2', bash, script_2]) + jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_3', '--dependencies', '1', '2', '--exec-dir', rdir, bash, "test_array.sh"]) # check that the database was created successfully self.assertTrue(os.path.exists(self.database)) @@ -64,20 +66,24 @@ class GridTKTest(unittest.TestCase): job_manager = gridtk.local.JobManagerLocal(database=self.database) session = job_manager.lock() jobs = list(session.query(Job)) - self.assertEqual(len(jobs), 2) + self.assertEqual(len(jobs), 3) self.assertEqual(jobs[0].id, 1) self.assertEqual(jobs[1].id, 2) + self.assertEqual(jobs[2].id, 3) self.assertEqual(len(jobs[1].array), 4) self.assertEqual(jobs[0].status, 'submitted') self.assertEqual(jobs[1].status, 'submitted') + self.assertEqual(jobs[2].status, 'submitted') # check that the job dependencies are correct waiting = jobs[0].get_jobs_waiting_for_us() - self.assertEqual(len(waiting), 1) + self.assertEqual(len(waiting), 2) 
self.assertEqual(waiting[0].id, 2) - waited = jobs[1].get_jobs_we_wait_for() - self.assertEqual(len(waited), 1) + self.assertEqual(waiting[1].id, 3) + waited = jobs[2].get_jobs_we_wait_for() + self.assertEqual(len(waited), 2) self.assertEqual(waited[0].id, 1) + self.assertEqual(waited[1].id, 2) job_manager.unlock() @@ -93,13 +99,14 @@ class GridTKTest(unittest.TestCase): # now, the first job needs to have status failure, and the second needs to be queued session = job_manager.lock() jobs = list(session.query(Job)) - self.assertEqual(len(jobs), 2) + self.assertEqual(len(jobs), 3) if jobs[0].status in ('submitted', 'queued', 'executing'): # on slow machines, we don0t want the tests to fail, so we just skip job_manager.unlock() raise nose.plugins.skip.SkipTest("This machine seems to be quite slow in processing parallel jobs.") self.assertEqual(jobs[0].status, 'failure') self.assertEqual(jobs[1].status, 'queued') + self.assertEqual(jobs[2].status, 'waiting') # the result files should already be there self.assertTrue(os.path.exists(jobs[0].std_out_file())) self.assertTrue(os.path.exists(jobs[0].std_err_file())) @@ -121,7 +128,7 @@ class GridTKTest(unittest.TestCase): # Job 1 and two array jobs of job two should be finished now, the other two still need to be queued session = job_manager.lock() jobs = list(session.query(Job)) - self.assertEqual(len(jobs), 2) + self.assertEqual(len(jobs), 3) if jobs[0].status in ('queued', 'executing') or jobs[1].status == 'queued': # on slow machines, we don0t want the tests to fail, so we just skip job_manager.unlock() @@ -169,7 +176,7 @@ class GridTKTest(unittest.TestCase): # check that exactly four output and four error files have been created files = os.listdir(self.log_dir) - self.assertEqual(len(files), 10) + self.assertEqual(len(files), 12) for i in range(1,8,2): self.assertTrue('test_2.o2.%d'%i in files) self.assertTrue('test_2.e2.%d'%i in files) @@ -177,13 +184,15 @@ class GridTKTest(unittest.TestCase): # check that all array 
jobs are finished now session = job_manager.lock() jobs = list(session.query(Job)) - self.assertEqual(len(jobs), 2) + self.assertEqual(len(jobs), 3) self.assertEqual(jobs[1].status, 'failure') self.assertEqual(jobs[1].array[0].status, 'failure') self.assertEqual(jobs[1].array[0].result, 1) for i in range(1,4): self.assertEqual(jobs[1].array[i].status, 'success') self.assertEqual(jobs[1].array[i].result, 0) + self.assertEqual(jobs[2].status, 'success') + self.assertEqual(jobs[2].result, 0) job_manager.unlock() print() @@ -195,7 +204,7 @@ class GridTKTest(unittest.TestCase): jman.main(['./bin/jman', '--database', self.database, 'report']) # clean-up - jman.main(['./bin/jman', '--local', '--database', self.database, 'delete', '--job-ids', '1-2']) + jman.main(['./bin/jman', '--local', '--database', self.database, 'delete', '--job-ids', '1-3']) # check that the database and the log files are gone self.assertEqual(len(os.listdir(self.temp_dir)), 0) @@ -203,6 +212,7 @@ class GridTKTest(unittest.TestCase): # add the scripts again, but this time with the --stop-on-failure option jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_1', '--stop-on-failure', bash, script_1]) jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_2', '--dependencies', '1', '--parametric', '1-7:2', '--stop-on-failure', bash, script_2]) + jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_3', '--dependencies', '1', '2', '--exec-dir', rdir, '--stop-on-failure', bash, "test_array.sh"]) # and execute them, but without writing the log files self.scheduler_job = subprocess.Popen(['./bin/jman', '--local', '--database', self.database, 'run-scheduler', '--sleep-time', '0.1', '--parallel', '2', '--die-when-finished', '--no-log-files']) @@ -218,15 +228,18 @@ class GridTKTest(unittest.TestCase): # check that all 
array jobs are finished now session = job_manager.lock() jobs = list(session.query(Job)) - self.assertEqual(len(jobs), 2) + self.assertEqual(len(jobs), 3) self.assertEqual(jobs[0].status, 'failure') self.assertEqual(jobs[0].result, 255) self.assertEqual(jobs[1].status, 'failure') self.assertTrue(jobs[1].result is None) + self.assertEqual(jobs[2].status, 'failure') + self.assertTrue(jobs[2].result is None) job_manager.unlock() # and clean up again jman.main(['./bin/jman', '--local', '--database', self.database, 'delete']) + self.assertEqual(len(os.listdir(self.temp_dir)), 0) except KeyboardInterrupt: # make sure that the keyboard interrupt is captured and the mess is cleaned up (i.e. by calling tearDown) diff --git a/version.txt b/version.txt index b118efb..e447d7d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.2.5b0 \ No newline at end of file +1.3.0b0 -- GitLab