Commit 5bbf0cd9 authored by Manuel Günther's avatar Manuel Günther

Implemented the scheduler independently of job submission; added logging output; improved job listing.

parent d751b3fe
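The central change decouples the local scheduler from job submission: jobs are first recorded in the database with status 'submitted', and a separate run-scheduler command executes whatever becomes ready. A minimal sketch of that flow, pieced together from the CLI calls in the updated test below (the database path, script name and timing values are placeholders, not fixed defaults):

from gridtk.script import jman

db = 'submitted.sql3'   # placeholder database file
# record a job in the database (status 'submitted'); 'job.sh' is a stand-in script
jman.main(['./bin/jman', '--local', '--database', db, 'submit', '--name', 'test_1', 'job.sh'])
# let the local scheduler execute pending jobs, two in parallel;
# it keeps polling until it is interrupted (e.g. SIGINT / Ctrl-C)
jman.main(['./bin/jman', '--local', '--database', db, 'run-scheduler', '--sleep-time', '1', '--parallel', '2'])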
......@@ -11,7 +11,7 @@ from .tools import logger
Base = declarative_base()
Status = ('waiting', 'executing', 'finished')
Status = ('submitted', 'queued', 'waiting', 'executing', 'success', 'failure')
class ArrayJob(Base):
"""This class defines one element of an array job."""
......@@ -43,6 +43,14 @@ class ArrayJob(Base):
else: r = "%s" % self.status
return "%s : %s" % (n, r)
def format(self, format):
"""Formats the current job into a nicer string to fit into a table."""
job_id = "%d - %d" % (self.job.id, self.id)
status = "%s" % self.status + (" (%d)" % self.result if self.result is not None else "" )
return format.format(job_id, self.job.queue_name, status)
class Job(Base):
"""This class defines one Job that was submitted to the Job Manager."""
......@@ -51,7 +59,8 @@ class Job(Base):
unique = Column(Integer, primary_key = True) # The unique ID of the job (not corresponding to the grid ID)
command_line = Column(String(255)) # The command line to execute, converted to one string
name = Column(String(20)) # A hand-chosen name for the task
arguments = Column(String(255)) # The kwargs arguments for the job submission (e.g. in the grid)
queue_name = Column(String(20)) # The name of the queue
grid_arguments = Column(String(255)) # The kwargs arguments for the job submission (e.g. in the grid)
id = Column(Integer, unique = True) # The ID of the job as given from the grid
log_dir = Column(String(255)) # The directory where the log files will be put to
array_string = Column(String(255)) # The array string (only needed for re-submission)
......@@ -59,15 +68,89 @@ class Job(Base):
status = Column(Enum(*Status))
result = Column(Integer)
def __init__(self, command_line, name = None, log_dir = None, array_string = None, **kwargs):
def __init__(self, command_line, name = None, log_dir = None, array_string = None, queue_name = 'local', **kwargs):
"""Constructs a Job object without an ID (needs to be set later)."""
self.command_line = dumps(command_line)
self.name = name
self.status = Status[0]
self.result = None
self.queue_name = queue_name # will be set during the queue command later
self.grid_arguments = dumps(kwargs)
self.log_dir = log_dir
self.array_string = dumps(array_string)
self.arguments = dumps(kwargs)
self.submit()
def submit(self):
"""Sets the status of this job to 'submitted'."""
self.status = 'submitted'
self.result = None
for array_job in self.array:
array_job.status = 'submitted'
array_job.result = None
def queue(self, new_job_id = None, new_job_name = None, queue_name = None):
"""Sets the status of this job to 'queued' or 'waiting'."""
# update the job id (i.e., when the job is executed in the grid)
if new_job_id is not None:
self.id = new_job_id
if new_job_name is not None:
self.name = new_job_name
if queue_name is not None:
self.queue_name = queue_name
new_status = 'queued'
self.result = None
# check if we have to wait for another job to finish
for job in self.get_jobs_we_wait_for():
if job is not None and job.status not in Status[-2:]:
new_status = 'waiting'
# reset the queued jobs that depend on us to waiting status
for job in self.get_jobs_waiting_for_us():
if job is not None and job.status == 'queued':
job.status = 'waiting'
self.status = new_status
for array_job in self.array:
array_job.status = new_status
def execute(self, array_id = None):
"""Sets the status of this job to 'executing'."""
self.status = 'executing'
if array_id is not None:
for array_job in self.array:
if array_job.id == array_id:
array_job.status = 'executing'
def finish(self, result, array_id = None):
"""Sets the status of this job to 'success' or 'failure'."""
# check if there is any array job still running
new_status = 'success' if result == 0 else 'failure'
new_result = result
finished = True
if array_id is not None:
for array_job in self.array:
if array_job.id == array_id:
array_job.status = new_status
array_job.result = result
if array_job.status not in Status[-2:]:
finished = False
elif new_result == 0:
new_result = array_job.result
if finished:
# There was no array job, or all array jobs finished
self.status = 'success' if new_result == 0 else 'failure'
self.result = new_result
# update all waiting jobs
for job in self.get_jobs_waiting_for_us():
if job.status == 'waiting':
job.queue()
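Taken together, submit(), queue(), execute() and finish() implement the new status lifecycle (submitted -> queued/waiting -> executing -> success/failure), with queue() holding a job in 'waiting' until every job it depends on has reached a final state. A stripped-down, self-contained sketch of those transition rules (a hypothetical ToyJob with a single dependency, not the gridtk model above):

class ToyJob(object):
  """Hypothetical miniature of the status lifecycle; no database, one dependency at most."""
  def __init__(self, depends_on = None):
    self.depends_on = depends_on   # another ToyJob, or None
    self.status = 'submitted'
    self.result = None

  def queue(self):
    # stay in 'waiting' while the dependency has not reached a final state
    blocked = self.depends_on is not None and self.depends_on.status not in ('success', 'failure')
    self.status = 'waiting' if blocked else 'queued'

  def execute(self):
    self.status = 'executing'

  def finish(self, result):
    self.result = result
    self.status = 'success' if result == 0 else 'failure'

first = ToyJob()
second = ToyJob(depends_on = first)
first.queue(); second.queue()
assert second.status == 'waiting'      # blocked by 'first'
first.execute(); first.finish(0)
second.queue()                         # dependency succeeded, so it becomes 'queued'
assert second.status == 'queued'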
def get_command_line(self):
return loads(str(self.command_line))
......@@ -75,13 +158,8 @@ class Job(Base):
def get_array(self):
return loads(str(self.array_string))
def set_arguments(self, **kwargs):
previous = self.get_arguments()
previous.update(kwargs)
self.arguments = dumps(previous)
def get_arguments(self):
return loads(str(self.arguments))
return loads(str(self.grid_arguments))
def get_jobs_we_wait_for(self):
return [j.waited_for_job for j in self.jobs_we_have_to_wait_for if j.waited_for_job is not None]
......@@ -99,14 +177,27 @@ class Job(Base):
def __str__(self):
id = "%d" % self.id
if self.array: j = "%s (%d-%d)" % (id, self.array[0].id, self.array[-1].id)
else: j = "%s" % id
if self.name is not None: n = "<Job: %s - '%s'>" % (j, self.name)
else: n = "<Job: %s>" % j
if self.array: a = "[%d-%d:%d]" % self.get_array()
else: a = ""
if self.name is not None: n = "<Job: %s %s - '%s'>" % (id, a, self.name)
else: n = "<Job: %s>" % id
if self.result is not None: r = "%s (%d)" % (self.status, self.result)
else: r = "%s" % self.status
return "%s : %s -- %s" % (n, r, " ".join(self.get_command_line()))
def format(self, format, add_dependencies = False, limit_command_line = None):
"""Formats the current job into a nicer string to fit into a table."""
command_line = " ".join(self.get_command_line())
if add_dependencies:
command_line = str([dep.id for dep in self.get_jobs_we_wait_for()]) + " -- " + command_line
if limit_command_line is not None:
command_line = command_line[:limit_command_line-3] + '...'
job_id = "%d" % self.id + (" [%d-%d:%d]" % self.get_array() if self.array else "")
status = "%s" % self.status + (" (%d)" % self.result if self.result is not None else "" )
return format.format(job_id, self.queue_name, status, self.name, command_line)
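Both format() methods fill a caller-supplied str.format template, one placeholder per column (job id, queue, status, name and command line for Job; job id, queue and status for ArrayJob). A possible table template, with column widths chosen purely for illustration:

# Hypothetical column widths; the real templates are built by the caller (e.g. the jman list command).
row_format = "{:>14}  {:<10}  {:<16}  {:<20}  {:<}"
print(row_format.format("job-id", "queue", "status", "job-name", "command line"))
# for job in jobs:   # e.g. the result of session.query(Job)
#   print(job.format(row_format, add_dependencies = True, limit_command_line = 60))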
class JobDependence(Base):
......@@ -145,7 +236,6 @@ def add_job(session, command_line, name = 'job', dependencies = [], array = None
else:
logger.warn("Could not find dependent job with id %d in database" % d)
if array:
(start, stop, step) = array
# add array jobs
......
......@@ -9,14 +9,14 @@
from .manager import JobManager
from .setshell import environ
from .models import add_job
from .tools import qsub, qstat, qdel, make_shell
from .tools import logger, qsub, qstat, qdel, make_shell
import os, sys
class JobManagerSGE(JobManager):
"""The JobManager will submit and control the status of submitted jobs"""
def __init__(self, database='submitted.sql3', context='grid', wrapper_script = './bin/jman'):
def __init__(self, context='grid', **kwargs):
"""Initializes this object with a state file and a method for qsub'bing.
Keyword parameters:
......@@ -32,16 +32,28 @@ class JobManagerSGE(JobManager):
"""
self.context = environ(context)
JobManager.__init__(self, database, wrapper_script)
JobManager.__init__(self, **kwargs)
def _queue(self, kwargs):
"""The hard resource_list comes like this: '<qname>=TRUE,mem=128M'. To
process it we have to split it twice (spaces and then on '='), create a
dictionary and extract just the qname"""
if not 'hard resource_list' in kwargs: return 'all.q'
d = dict([reversed(k.split('=')) for k in kwargs['hard resource_list'].split(',')])
if not 'TRUE' in d: return 'all.q'
return d['TRUE']
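Applied to the docstring's example value, the parsing above returns the queue whose resource is set to TRUE, falling back to 'all.q'. A standalone sketch of the same logic ('q1d' is just an assumed queue name):

def queue_of(hard_resource_list, default = 'all.q'):
  # split on ',' and then on '=', keyed by the value so that d['TRUE'] is the queue name
  d = dict(reversed(item.split('=')) for item in hard_resource_list.split(','))
  return d.get('TRUE', default)

assert queue_of('q1d=TRUE,mem=128M') == 'q1d'
assert queue_of('mem=128M') == 'all.q'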
def _submit_to_grid(self, job, name, array, dependencies, log_dir, **kwargs):
# ... what we will actually submit to the grid is a wrapper script that will call the desired command...
# get the name of the file that was called originally
jman = self.wrapper_script
python = jman.replace('jman', 'python')
python = sys.executable
# generate call to the wrapper script
command = make_shell(python, [jman, 'run-job', self._database])
command = make_shell(python, [jman, '-d', self._database, 'run-job'])
q_array = "%d-%d:%d" % array if array else None
grid_id = qsub(command, context=self.context, name=name, deps=dependencies, array=q_array, stdout=log_dir, stderr=log_dir, **kwargs)
......@@ -49,9 +61,12 @@ class JobManagerSGE(JobManager):
status = qstat(grid_id, context=self.context)
# set the grid id of the job
job.id = int(status['job_number'])
job.queue(new_job_id = int(status['job_number']), new_job_name = status['job_name'], queue_name = self._queue(status))
logger.info("Submitted job '%s' to the SGE grid." % job)
assert job.id == grid_id
job.name = status['job_name']
return grid_id
def submit(self, command_line, name = None, array = None, dependencies = [], log_dir = "logs", **kwargs):
......@@ -59,12 +74,11 @@ class JobManagerSGE(JobManager):
# add job to database
self.lock()
job = add_job(self.session, command_line, name, dependencies, array, log_dir=log_dir, context=self.context, **kwargs)
logger.debug("Added job '%s' to the database." % job)
self._submit_to_grid(job, name, array, dependencies, log_dir, **kwargs)
self.session.commit()
job_id = self._submit_to_grid(job, name, array, dependencies, log_dir, **kwargs)
# return the job id
job_id = job.id
self.session.commit()
self.unlock()
return job_id
......@@ -75,20 +89,17 @@ class JobManagerSGE(JobManager):
self.lock()
# iterate over all jobs
jobs = self.get_jobs(job_ids)
accepted_old_status = ('failure',) if failed_only else ('success', 'failure')
for job in jobs:
# check if this job needs re-submission
if running_jobs or job.status == 'finished':
if not failed_only or job.result != 0:
# resubmit
if job.array:
# get the array as before
array = job.get_array()
else:
array = None
job.status = 'waiting'
job.result = None
# re-submit job to the grid
self._submit_to_grid(job, job.name, array, [dep.id for dep in job.dependent_jobs], job.log_dir)
if running_jobs or job.status in accepted_old_status:
# re-submit job to the grid
if job.queue_name == 'local':
logger.warn("Re-submitting job '%s' locally (since no queue name is specified)." % job)
job.submit()
else:
logger.debug("Re-submitting job '%s' to the grid." % job)
self._submit_to_grid(job, job.name, job.get_array(), [dep.id for dep in job.get_jobs_we_wait_for()], job.log_dir)
self.session.commit()
self.unlock()
......@@ -100,9 +111,10 @@ class JobManagerSGE(JobManager):
jobs = self.get_jobs(job_ids)
for job in jobs:
qdel(job.id, context=self.context)
if job.status == 'executing':
job.status = 'waiting'
qdel(job.id, context=self.context)
logger.info("Stopped job '%s' in the SGE grid." % job)
job.status = 'submitted'
self.session.commit()
self.unlock()
......@@ -5,6 +5,8 @@ import os
import pkg_resources
import gridtk
import subprocess, signal
import time
from gridtk.models import Job
......@@ -16,10 +18,14 @@ class DatabaseTest(unittest.TestCase):
import tempfile
self.temp_dir = tempfile.mkdtemp(prefix='gridtk_test')
self.log_dir = os.path.join(self.temp_dir, 'logs')
self.db = os.path.join(self.temp_dir, 'database.sql3')
self.database = os.path.join(self.temp_dir, 'database.sql3')
self.scheduler_job = None
def tearDown(self):
# make sure that all scheduler jobs are stopped after exiting
if self.scheduler_job:
self.scheduler_job.send_signal(signal.SIGINT)
# Clean up the mess that we created
import shutil
shutil.rmtree(self.temp_dir)
......@@ -32,24 +38,27 @@ class DatabaseTest(unittest.TestCase):
script_2 = pkg_resources.resource_filename('gridtk.tests', 'test_array.sh')
from gridtk.script import jman
# add a simple script that will write some information to std-out and std-err
jman.main(['./bin/jman', '--local', 'submit', '--db', self.db, '--log-dir', self.log_dir, '--name', 'test_1', script_1])
jman.main(['./bin/jman', '--local', 'submit', '--db', self.db, '--log-dir', self.log_dir, '--name', 'test_2', '--dependencies', '1', '--parametric', '1-7:2', script_2])
jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_1', script_1])
jman.main(['./bin/jman', '--local', '--database', self.database, 'submit', '--log-dir', self.log_dir, '--name', 'test_2', '--dependencies', '1', '--parametric', '1-7:2', script_2])
# check that the database was created successfully
assert os.path.exists(self.db)
assert os.path.exists(self.database)
print
# test that the list command works (should also work with the "default" grid manager)
jman.main(['./bin/jman', 'list', '--db', self.db, '--job-ids', '1'])
jman.main(['./bin/jman', 'list', '--db', self.db, '--job-ids', '2', '--print-array-jobs', '--print-dependencies'])
jman.main(['./bin/jman', '--database', self.database, 'list', '--job-ids', '1'])
jman.main(['./bin/jman', '--database', self.database, 'list', '--job-ids', '2', '--print-array-jobs', '--print-dependencies'])
# get insight into the database
job_manager = gridtk.local.JobManagerLocal(self.db)
job_manager = gridtk.local.JobManagerLocal(database=self.database)
session = job_manager.lock()
jobs = list(session.query(Job))
assert len(jobs) == 2
assert jobs[0].id == 1
assert jobs[1].id == 2
assert len(jobs[1].array) == 4
assert jobs[0].status == 'submitted'
assert jobs[1].status == 'submitted'
# check that the job dependencies are correct
waiting = jobs[0].get_jobs_waiting_for_us()
......@@ -61,63 +70,115 @@ class DatabaseTest(unittest.TestCase):
job_manager.unlock()
# try to run the job 2 first (should fail since it depends on job 1)
nose.tools.assert_raises(RuntimeError, jman.main, ['./bin/jman', '--local', 'execute', '--db', self.db, '--job-id', '2'])
# now, start the local job scheduler in a separate (parallel) process
self.scheduler_job = subprocess.Popen(['./bin/jman', '--local', '--database', self.database, 'run-scheduler', '--sleep-time', '5', '--parallel', '2'])
# execute job 1
jman.main(['./bin/jman', '--local', 'execute', '--db', self.db, '--job-id', '1'])
# sleep some time to ensure that the scheduler was able to start the first job
time.sleep(2)
# ... and kill the scheduler
self.scheduler_job.send_signal(signal.SIGINT)
self.scheduler_job = None
# check that the output is actually there
out_file = os.path.join(self.log_dir, 'test_1.o1')
err_file = os.path.join(self.log_dir, 'test_1.e1')
# now, the first job needs to have status failure, and the second needs to be queued
session = job_manager.lock()
jobs = list(session.query(Job))
assert len(jobs) == 2
assert jobs[0].status == 'failure'
assert jobs[1].status == 'queued'
# the result files should not be there yet
assert not os.path.exists(jobs[0].std_out_file())
assert not os.path.exists(jobs[0].std_err_file())
job_manager.unlock()
# reset the job 1
jman.main(['./bin/jman', '--local', '--database', self.database, 'resubmit', '--job-id', '1', '--running-jobs'])
# now, start the local job scheduler in a separate (parallel) process
self.scheduler_job = subprocess.Popen(['./bin/jman', '--local', '--database', self.database, 'run-scheduler', '--sleep-time', '4', '--parallel', '2'])
# sleep some time to ensure that the scheduler was able to finish the first job and start the second one
time.sleep(6)
# ... and kill the scheduler
self.scheduler_job.send_signal(signal.SIGINT)
self.scheduler_job = None
# Job 1 and two array jobs of job 2 should be finished by now; the other two array jobs should still be queued
session = job_manager.lock()
jobs = list(session.query(Job))
assert len(jobs) == 2
assert jobs[0].status == 'failure'
assert jobs[1].status == 'executing'
assert jobs[1].array[0].status == 'failure'
assert jobs[1].array[0].result == 1
assert jobs[1].array[1].status == 'success'
assert jobs[1].array[1].result == 0
assert len([a for a in jobs[1].array if a.status == 'queued']) == 2
out_file = jobs[0].std_out_file()
err_file = jobs[0].std_err_file()
job_manager.unlock()
# the result files of the first job should now be there
assert os.path.isfile(out_file)
assert os.path.isfile(err_file)
assert open(out_file).read().rstrip() == 'This is a text message to std-out'
assert open(err_file).read().rstrip() == 'This is a text message to std-err'
# check the status and the result of job 1
session = job_manager.lock()
job = list(session.query(Job).filter(Job.id == 1))[0]
assert job.status == 'finished'
assert job.result == 255
job_manager.unlock()
# reset the job 1
jman.main(['./bin/jman', '--local', 'resubmit', '--db', self.db, '--job-id', '1', '--clean'])
# check that the log files are gone, but the log dir is not
# resubmit all jobs
jman.main(['./bin/jman', '--local', '--database', self.database, 'resubmit', '--running-jobs'])
# check that the log files have been cleaned
assert not os.path.exists(out_file)
assert not os.path.exists(err_file)
# ... but the log dir still exists
assert os.path.exists(self.log_dir)
assert len(os.listdir(self.log_dir)) == 0
# assert that job 2 still can't run
nose.tools.assert_raises(RuntimeError, jman.main, ['./bin/jman', '--local', 'execute', '--db', self.db, '--job-id', '2'])
# delete job 1 from the database
jman.main(['./bin/jman', '--local', 'delete', '--db', self.db, '--job-id', '1'])
# check that the clean-up was successful
assert not os.path.exists(self.log_dir)
# now, let the scheduler run all jobs
self.scheduler_job = subprocess.Popen(['./bin/jman', '--local', '--database', self.database, 'run-scheduler', '--sleep-time', '0.1', '--parallel', '2'])
# ... and kill the scheduler
time.sleep(3)
self.scheduler_job.send_signal(signal.SIGINT)
self.scheduler_job = None
# now, execute job 2 with 2 parallel jobs (this might not work during the nightlies...)
jman.main(['./bin/jman', '--local', 'execute', '--db', self.db, '--job-id', '2', '--parallel', '2'])
# check that all output files are generated again
assert os.path.isfile(out_file)
assert os.path.isfile(err_file)
assert open(out_file).read().rstrip() == 'This is a text message to std-out'
assert open(err_file).read().rstrip() == 'This is a text message to std-err'
# check that exactly four output and four error files have been created
files = os.listdir(self.log_dir)
assert len(files) == 8
assert len(files) == 10
for i in range(1,8,2):
assert 'test_2.o2.%d'%i in files
assert 'test_2.e2.%d'%i in files
# test the result of the experiments
# check that all array jobs are finished now
session = job_manager.lock()
job = list(session.query(Job).filter(Job.id == 2))[0]
assert job.status == 'finished'
assert job.result == 1
for i in range(4):
assert job.array[i].id == 2*i+1
assert job.array[i].result == (0 if i else 1)
assert job.array[i].status == 'finished'
jobs = list(session.query(Job))
assert len(jobs) == 2
assert jobs[1].status == 'failure'
assert jobs[1].array[0].status == 'failure'
assert jobs[1].array[0].result == 1
for i in range(1,4):
assert jobs[1].array[i].status == 'success'
assert jobs[1].array[i].result == 0
job_manager.unlock()
print
# test that the list command still works
jman.main(['./bin/jman', '--database', self.database, 'list', '--print-array-jobs'])
print
# test that the report command works as well
jman.main(['./bin/jman', '--database', self.database, 'report'])
# clean-up
jman.main(['./bin/jman', '--local', 'delete', '--db', self.db])
jman.main(['./bin/jman', '--local', '--database', self.database, 'delete'])
# check that the db and the log files are gone
# check that the database and the log files are gone
assert len(os.listdir(self.temp_dir)) == 0
def test02_grid(self):
# Tests the functionality of the grid toolkit in the grid
raise nose.plugins.skip.SkipTest("This test is not yet implemented. If you find a proper way to test the grid functionality, please go ahead and implement the test.")