diff --git a/README.rst b/README.rst
index bd3398c52e6c2ce4c1e8976f214d6265ecd9f6e7..365bb513b506be012fef4daba3391e5ad3ec18d6 100644
--- a/README.rst
+++ b/README.rst
@@ -1,14 +1,18 @@
-=================
- SGE Job Manager
-=================
+======================
+ Parallel Job Manager
+======================
 
 The Job Manager is python wrapper around SGE utilities like ``qsub``, ``qstat``
 and ``qdel``. It interacts with these tools to submit and manage grid jobs
 making up a complete workflow ecosystem.
 
-Everytime you interact with the Job Manager, a local database file (normally
-named ``submitted.db``) is read or written so it preserves its state during
-decoupled calls. The database contains all informations about jobs that is
+Since version 1.0, a local submission system is also included. Instead
+of sending jobs to the SGE grid, it executes them in parallel processes on the
+local machine.
+
+Every time you interact with the Job Manager, a local database file (normally
+named ``submitted.sql3``) is read or written so it preserves its state during
+decoupled calls. The database contains all information about jobs that is
 required for the Job Manager to:
 
 * submit jobs (includes wrapped python jobs or Torch5spro specific jobs)
@@ -19,11 +23,12 @@ required for the Job Manager to:
 * easily re-submit jobs if problems occur
 * support for parametric (array) jobs
 
-Many of these features are also achieveable using the stock SGE utilities, the
+Many of these features are also achievable using the stock SGE utilities, the
 Job Manager only makes it dead simple.
 
-Submitting a job
-----------------
+
+Submitting jobs to the SGE grid
++++++++++++++++++++++++++++++++
 
 To interact with the Job Manager we use the ``jman`` utility. Make sure to have
 your shell environment setup to reach it w/o requiring to type-in the full
@@ -34,18 +39,19 @@ path. The first task you may need to pursue is to submit jobs. Here is how::
 
 .. note::
 
-  The command `submit` of the Job Manager will submit a job that will run in
+  The command ``submit`` of the Job Manager will submit a job that will run in
   a python environment. It is not the only way to submit a job using the Job
   Manager. You can also use `submit`, that considers the command as a self
   sufficient application. Read the full help message of ``jman`` for details
   and instructions.
 
+
 Submitting a parametric job
 ---------------------------
 
 Parametric or array jobs are jobs that execute the same way, except for the
 environment variable ``SGE_TASK_ID``, which changes for every job. This way,
-your program controls which bit of the full job has to be executed in each
+your program controls which part of the full job is executed in each
 (parallel) instance. It is great for forking thousands of jobs into the grid.
 
 The next example sends 10 copies of the ``myscript.py`` job to the grid with
@@ -58,6 +64,7 @@ The ``-t`` option in ``jman`` accepts different kinds of job array
 descriptions. Have a look at the help documentation for details with ``jman
 --help``.
 
+
 Probing for jobs
 ----------------
 
@@ -66,41 +73,36 @@ called ``submitted.db``) has been created in the current working directory.
 It contains the information for the job you just submitted::
 
   $ jman list
-  job-id  queue  age  arguments
+  job-id   queue  age                         arguments
   ======== ===== === =======================================================
   6151645 all.q 2m -S /usr/bin/python myscript.py --help
 
 From this dump you can see the SGE job identifier, the queue the job has been
 submitted to and the command that was given to ``qsub``. The ``list`` command
-from ``jman`` only lists the contents of the database, it does **not** update
-it.
+from ``jman`` will show the current status of the job, which is updated
+automatically as soon as the grid job finishes.
 
-Refreshing the list
--------------------
-
-You may instruct the job manager to probe SGE and update the status of the jobs
-it is monitoring. Finished jobs will be reported to the screen and removed from
-the job manager database and placed on a second database (actually two)
-containing jobs that failed and jobs that succeeded::
+
+Submitting dependent jobs
+-------------------------
 
-  $ jman refresh
-  These jobs require attention:
-  6151645 @all.q (30 minutes ago) -S /usr/bin/python myscript.py --help
+Sometimes, the execution of one job might depend on the execution of another
+job. The JobManager can take care of this simply by adding the id of the
+job that we have to wait for::
 
-.. note::
+  $ jman submit --dependencies 6151645 myscript.py --help
+  Submitted 6151646 @all.q (0 seconds ago) -S /usr/bin/python myscript.py --help
+
+Now, the new job will only be run after the first one has finished.
 
-  Detection of success or failure is based on the length of the standard error
-  output of the job. If it is greater than zero, it is considered a failure.
 
 Inspecting log files
 --------------------
 
-As can be seen the job we submitted just failed. The job manager says it
-requires attention. If jobs fail, they are moved to a database named
-``failure.db`` in the current directory. Otherwise, they are moved to
-``success.db``. You can inspect the job log files like this::
+When jobs finish, the result of the executed job will be shown. In case it is
+non-zero, you might want to inspect the log files as follows::
 
-  $ jman explain failure.db
+  $ jman report --errors-only
   Job 6151645 @all.q (34 minutes ago) -S /usr/bin/python myscript.py --help
   Command line: (['-S', '/usr/bin/python', '--', 'myscript.py', '--help'],) {'deps': [], 'stderr': 'logs', 'stdout': 'logs', 'queue': 'all.q', 'cwd': True, 'name': None}
 
@@ -113,6 +115,7 @@ requires attention. If jobs fail, they are moved to a database named
 
 Hopefully, that helps in debugging the problem!
 
+
 Re-submitting the job
 ---------------------
 
@@ -120,15 +123,27 @@ If you are convinced the job did not work because of external conditions (e.g.
 temporary network outage), you may re-submit it, *exactly* like it was
 submitted the first time::
 
-  $ jman resubmit --clean failure.db
+  $ jman resubmit --clean
   Re-submitted job 6151663 @all.q (1 second ago) -S /usr/bin/python myscript.py --help
     removed `logs/myscript.py.o6151645'
     removed `logs/myscript.py.e6151645'
   deleted job 6151645 from database
 
-The ``--clean`` flag tells the job manager to clean-up the old failure and the
-log files as it re-submits the new job. Notice the new job identifier has
-changed as expected.
+The ``--clean`` flag tells the job manager to clean-up the old log files as it
+re-submits the new job. Notice the new job identifier has changed as expected.
+
+
+Stopping a grid job
+-------------------
+
+In case you find an error in the code of a grid job that is currently
+executing, you might want to kill the job in the grid. For this purpose, you
+can use the command::
+
+  $ jman stop
+
+The job is removed from the grid, but all log files are still available. A
+common use case is to stop the grid job, fix the bugs, and re-submit it.
 
 Cleaning-up
 -----------
@@ -137,11 +152,51 @@ If the job in question will not work no matter how many times we re-submit it,
 you may just want to clean it up and do something else. The job manager is here
 for you again::
 
-  $ jman cleanup --remove-job failure.db
+  $ jman delete
   Cleaning-up logs for job 6151663 @all.q (5 minutes ago) -S /usr/bin/python myscript.py --help
     removed `logs/myscript.py.o6151663'
     removed `logs/myscript.py.e6151663'
   deleted job 6151663 from database
 
-Inspection on the current directory will now show you everything concerning the
-said job is gone.
+In case jobs are still running in the grid, they will be stopped before they
+are removed from the database. Inspection of the current directory will now
+show that everything concerning the jobs is gone.
+
+
+Running jobs on the local machine
++++++++++++++++++++++++++++++++++
+
+The JobManager is designed to support mostly the same infrastructure
+when submitting jobs locally or to the SGE grid. To submit jobs locally, just
+add the ``--local`` option to the jman command::
+
+  $ jman --local submit myscript.py --help
+
+
+Differences between local and grid execution
+--------------------------------------------
+
+One important difference from grid submission is that the jobs that are
+submitted to the local machine **do not run immediately**, but are only
+collected in the ``submitted.sql3`` database. To run the collected jobs using 4
+parallel processes, simply use::
+
+  $ jman --local execute --parallel 4
+
+and all jobs that have not run yet are executed, keeping an eye on the
+dependencies.
+
+Another difference is that by default, the jobs write their results to the
+command line and not into log files. If you want the log file behavior back,
+specify the log directory during the submission::
+
+  $ jman --local submit --log-dir logs myscript.py --help
+
+Of course, you can choose a different log directory (also for the SGE
+submission).
+
+Furthermore, the job identifiers during local submission usually start from 1
+and increase. Also, during local re-submission, the job ID does not change, and
+jobs cannot be stopped using the ``stop`` command (you have to kill the
+``jman --local execute`` job first, and then all running jobs).
+
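The README changes above describe everything in terms of the ``jman`` command line; the test changes further down drive the very same commands programmatically through ``gridtk.script.jman.main``. The following is a minimal sketch of that pattern for the local-submission mode. The sub-commands and options are the ones shown in the README and in this patch; the leading program-name argument ``./bin/jman`` and the script ``myscript.py`` are placeholders copied from those examples, and any other combination should be checked against ``jman --help``::

  # Sketch: drive the documented jman workflow from Python, the same way
  # gridtk/tests/__init__.py calls jman.main() below.  Assumes gridtk is
  # installed, ./myscript.py exists and the default database file is used.
  from gridtk.script import jman

  # collect a local job in the database (it does not run yet)
  jman.main(['./bin/jman', '--local', 'submit', '--log-dir', 'logs',
             'myscript.py', '--help'])

  # run all collected jobs with 4 parallel processes, honouring dependencies
  jman.main(['./bin/jman', '--local', 'execute', '--parallel', '4'])

  # show the logs of failed jobs, then re-submit them with cleaned log files
  jman.main(['./bin/jman', '--local', 'report', '--errors-only'])
  jman.main(['./bin/jman', '--local', 'resubmit', '--failed-only', '--clean'])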
diff --git a/gridtk/manager.py b/gridtk/manager.py
index a266cc84f066ecfff01ab1bc50b5a2675248e7a8..8a866d953e8f3c0090a60d308ab2daa664ffa9c7 100644
--- a/gridtk/manager.py
+++ b/gridtk/manager.py
@@ -180,7 +180,7 @@ class JobManager:
 
     self.unlock()
 
-  def delete(self, job_ids, array_ids = None, delete_logs = True, delete_log_dir = False):
+  def delete(self, job_ids, array_ids = None, delete_logs = True, delete_log_dir = False, delete_jobs = True):
     """Deletes the jobs with the given ids from the database."""
     def _delete_dir_if_empty(log_dir):
       if log_dir and delete_log_dir and os.path.isdir(log_dir) and not os.listdir(log_dir):
@@ -194,7 +194,8 @@
         if err_file and os.path.exists(err_file): os.remove(err_file)
       if try_to_delete_dir:
         _delete_dir_if_empty(job.log_dir)
-      self.session.delete(job)
+      if delete_jobs:
+        self.session.delete(job)
 
     self.lock()
 
diff --git a/gridtk/script/jman.py b/gridtk/script/jman.py
index 0036c09c84849bc7b5a71d2664748d9fd9318e43..521105c6770630385cb74b98956dc3a5f9999aaa 100644
--- a/gridtk/script/jman.py
+++ b/gridtk/script/jman.py
@@ -109,6 +109,8 @@ def submit(args):
 def resubmit(args):
   """Re-submits the jobs with the given ids."""
   jm = setup(args)
+  if args.clean:
+    jm.delete(job_ids=args.job_ids, delete_jobs = False)
   jm.resubmit(args.job_ids, args.failed_only, args.running_jobs)
 
 
@@ -244,6 +246,7 @@ def main(command_line_options = None):
       help='Re-submits a list of jobs')
   resubmit_parser.add_argument('-d', '--db', '--database', metavar='DATABASE', help='replace the default database to be used by one provided by you; this option is only required if you are running outside the directory where you originally submitted the jobs from or if you have altered manually the location of the JobManager database')
   resubmit_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='*', type=int, help='List only the jobs with the given ids (by default, all jobs are listed)')
+  resubmit_parser.add_argument('-c', '--clean', action='store_true', help='Clean the log files of the old job before re-submitting')
   resubmit_parser.add_argument('-f', '--failed-only', action='store_true', help='Re-submit only jobs that have failed')
   resubmit_parser.add_argument('-a', '--running-jobs', action='store_true', help='Re-submit even jobs that are running or waiting')
   resubmit_parser.set_defaults(func=resubmit)
@@ -263,6 +266,7 @@
   list_parser.add_argument('-x', '--print-dependencies', action='store_true', help='Print the dependencies of the jobs as well.')
   list_parser.set_defaults(func=list)
 
+  # report parser
   report_parser = cmdparser.add_parser('report', aliases=['ref', 'r'], help='Iterates through the result and error log files and prints out the logs')
   report_parser.add_argument('-d', '--db', metavar='DATABASE', help='replace the default database to be reported by one provided by you', nargs='?')
 
@@ -292,7 +296,7 @@
 
   execute_parser.set_defaults(func=execute)
 
-  # subcommand 'run.job'; this is not seen on the command line since it is the actual wrapper script
+  # subcommand 'run-job'; this is not seen on the command line since it is actually a wrapper script
   run_parser = cmdparser.add_parser('run-job', help=argparse.SUPPRESS)
   run_parser.add_argument('db', metavar='DATABASE', nargs='?', help=argparse.SUPPRESS)
   run_parser.set_defaults(func=run_job)
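Taken together, the two changes above make ``jman resubmit --clean`` first call ``JobManager.delete`` with the new ``delete_jobs=False`` flag and then re-submit: only the old log files are removed (``delete_logs`` defaults to ``True``) while the job entries stay in the database. A rough sketch of the same sequence when using the API directly; only the two method calls and their signatures come from the patch, the helper name and example arguments are ours::

  # ``jm`` is a gridtk.manager.JobManager instance, e.g. the object that
  # jman's setup() returns; the function name and example ids are made up.
  def clean_and_resubmit(jm, job_ids):
    """Rough mirror of what ``jman resubmit --clean`` does in this patch."""
    # new in this patch: delete_jobs=False removes the old log files but
    # keeps the database entries, so they can be re-submitted afterwards
    jm.delete(job_ids=job_ids, delete_jobs=False)
    # same positional signature as used in gridtk/script/jman.py:
    # resubmit(job_ids, failed_only, running_jobs)
    jm.resubmit(job_ids, False, False)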
diff --git a/gridtk/tests/__init__.py b/gridtk/tests/__init__.py
index c1e46f49abcbedeb3ff4ef77dc2b9e8315b79dd2..eba4ad7f059ea1f7a0efff9f7835ac46f161687e 100644
--- a/gridtk/tests/__init__.py
+++ b/gridtk/tests/__init__.py
@@ -83,7 +83,10 @@ class DatabaseTest(unittest.TestCase):
     job_manager.unlock()
 
     # reset the job 1
-    jman.main(['./bin/jman', '--local', 'resubmit', '--db', self.db, '--job-id', '1'])
+    jman.main(['./bin/jman', '--local', 'resubmit', '--db', self.db, '--job-id', '1', '--clean'])
+    # check that the log files are gone, but the log dir is not
+    assert os.path.exists(self.log_dir)
+    assert len(os.listdir(self.log_dir)) == 0
 
     # assert that job 2 still can't run
     nose.tools.assert_raises(RuntimeError, jman.main, ['./bin/jman', '--local', 'execute', '--db', self.db, '--job-id', '2'])
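The two assertions added to the test pin down the on-disk state expected after ``resubmit --clean``: because ``delete_log_dir`` keeps its default ``False``, the log directory itself survives while the old log files inside it are deleted. Restated as a small stand-alone helper (the function name is ours, the checks are the ones from the test)::

  import os

  def assert_logs_cleaned(log_dir):
    """State the new test expects after ``jman resubmit --clean``: the log
    directory itself survives, but the old log files inside it are gone."""
    assert os.path.exists(log_dir), "log directory should not be removed"
    assert len(os.listdir(log_dir)) == 0, "old log files should have been deleted"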