Commit b17ae274 authored by Pavel KORSHUNOV

Memory argument sets the gpumem parameter for GPU queues

parent 122ef226
Pipeline #21059 passed with stage
in 3 minutes and 12 seconds
......@@ -28,6 +28,9 @@ def add_arguments(parser):
'q_1month', 'q1m',
'q_1day_mth', 'q1dm',
'q_1week_mth', 'q1wm',
'q_gpu', 'gpu',
'q_long_gpu', 'lgpu',
'q_short_gpu', 'sgpu',
)
parser.add_argument('--queue-name', metavar='QUEUE', type=str,
......@@ -75,7 +78,7 @@ class DryRunJob(object):
# distributed as jobs are "submitted"
current_id = 0
def __init__(self, cmd, cwd, queue, hostname, memfree, hvmem, pe_opt, stdout, stderr, name, array, deps):
def __init__(self, cmd, cwd, queue, hostname, memfree, hvmem, gpumem, pe_opt, stdout, stderr, name, array, deps):
self.myid = DryRunJob.current_id
DryRunJob.current_id += 1
......@@ -86,6 +89,7 @@ class DryRunJob(object):
self.hostname = hostname
self.memfree = memfree
self.hvmem = hvmem
self.gpumem = gpumem
self.pe_opt = pe_opt
self.stdout = stdout
self.stderr = stderr
......@@ -103,6 +107,7 @@ class DryRunJob(object):
hostname : %s
memfree : %s
hvmem : %s
gpumem : %s
pe_opt : %s
stdout : %s
stderr : %s
......@@ -116,6 +121,7 @@ class DryRunJob(object):
self.hostname,
self.memfree,
self.hvmem,
self.gpumem,
self.pe_opt,
self.stdout,
self.stderr,
......@@ -140,13 +146,13 @@ def submit(jman, command, arguments, deps=[], array=None):
if arguments.dryrun:
return DryRunJob(cmd, cwd=arguments.cwd, queue=arguments.queue,
hostname=arguments.hostname, memfree=arguments.memfree,
hvmem=arguments.hvmem, pe_opt=arguments.pe_opt,
hvmem=arguments.hvmem, gpumem=arguments.gpumem, pe_opt=arguments.pe_opt,
stdout=logdir, stderr=logdir, name=jobname, deps=deps,
array=array)
# really submit
return jman.submit(cmd, cwd=arguments.cwd, queue=arguments.queue,
hostname=arguments.hostname, memfree=arguments.memfree,
hvmem=arguments.hvmem, pe_opt=arguments.pe_opt,
hvmem=arguments.hvmem, gpumem=arguments.gpumem, pe_opt=arguments.pe_opt,
stdout=logdir, stderr=logdir, name=jobname, deps=deps,
array=array)
......@@ -259,6 +259,8 @@ class Job(Base):
retval['memfree'] = args['memfree']
if 'hvmem' in args and args['hvmem'] is not None:
retval['hvmem'] = args['hvmem']
if 'gpumem' in args and args['gpumem'] is not None:
retval['gpumem'] = args['gpumem']
if 'env' in args and len(args['env']) > 0:
retval['env'] = args['env']
if 'io_big' in args and args['io_big']:
......
......@@ -24,7 +24,7 @@ from ..tools import make_shell, logger
from .. import local, sge
from ..models import Status
QUEUES = ['all.q', 'q1d', 'q1w', 'q1m', 'q1dm', 'q1wm','gpu', 'lgpu', 'sgpu']
QUEUES = ['all.q', 'q1d', 'q1w', 'q1m', 'q1dm', 'q1wm', 'gpu', 'lgpu', 'sgpu']
def setup(args):
"""Returns the JobManager and sets up the basic infrastructure"""
......@@ -125,6 +125,9 @@ def submit(args):
if args.log_dir is not None: kwargs['log_dir'] = args.log_dir
if args.dependencies is not None: kwargs['dependencies'] = args.dependencies
if args.qname != 'all.q': kwargs['hvmem'] = args.memory
# if this is a GPU queue, we set gpumem flag
# remove 'G' last character from the args.memory string
if args.qname in ('gpu', 'lgpu', 'sgpu'): kwargs['gpumem'] = args.memory[:-1]
if args.parallel is not None:
kwargs['pe_opt'] = "pe_mth %d" % args.parallel
if args.memory is not None:
......@@ -159,6 +162,8 @@ def resubmit(args):
kwargs['memfree'] = args.memory
if args.qname not in (None, 'all.q'):
kwargs['hvmem'] = args.memory
if args.queue in ('gpu', 'lgpu', 'sgpu'):
kwargs['gpumem'] = args.memory
if args.parallel is not None:
kwargs['pe_opt'] = "pe_mth %d" % args.parallel
kwargs['memfree'] = get_memfree(args.memory, args.parallel)
......@@ -285,7 +290,10 @@ def main(command_line_options = None):
# subcommand 'submit'
submit_parser = cmdparser.add_parser('submit', aliases=['sub'], formatter_class=formatter, help='Submits jobs to the SGE queue or to the local job scheduler and logs them in a database.')
submit_parser.add_argument('-q', '--queue', metavar='QNAME', dest='qname', default='all.q', choices=QUEUES, help='the name of the SGE queue to submit the job to')
submit_parser.add_argument('-m', '--memory', help='Sets both the h_vmem and the mem_free parameters when submitting the job to the specified value, e.g. 8G to set the memory requirements to 8 gigabytes')
submit_parser.add_argument('-m', '--memory', help='Sets both the h_vmem and the mem_free parameters when submitting '
'the job to a non-GPU queue, e.g., 8G to set the memory '
'requirements to 8 gigabytes. Sets gpumem parameter when '
'submitting the job to a GPU-based queue.')
submit_parser.add_argument('-p', '--parallel', '--pe_mth', type=int, help='Sets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.')
submit_parser.add_argument('-n', '--name', dest='name', help='Gives the job a name')
submit_parser.add_argument('-x', '--dependencies', type=int, default=[], metavar='ID', nargs='*', help='Set job dependencies to the list of job identifiers separated by spaces')
......@@ -305,7 +313,10 @@ def main(command_line_options = None):
resubmit_parser = cmdparser.add_parser('resubmit', aliases=['reset', 'requeue', 're'], formatter_class=formatter, help='Re-submits a list of jobs.')
resubmit_parser.add_argument('-j', '--job-ids', metavar='ID', nargs='+', help='Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).')
resubmit_parser.add_argument('-q', '--queue', metavar='QNAME', dest='qname', choices=QUEUES, help='Reset the SGE queue to submit the job to')
resubmit_parser.add_argument('-m', '--memory', help='Resets both the h_vmem and the mem_free parameters when submitting the job to the specified value, e.g. 8G to set the memory requirements to 8 gigabytes')
resubmit_parser.add_argument('-m', '--memory', help='Resets both the h_vmem and the mem_free parameters when '
'submitting the job to a non-GPU queue, e.g., 8G '
'to set the memory requirements to 8 gigabytes. Resets gpumem '
'parameter when submitting the job to a GPU-based queue.')
resubmit_parser.add_argument('-p', '--parallel', '--pe_mth', type=int, help='Resets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.')
resubmit_parser.add_argument('-i', '--io-big', action='store_true', help='Resubmits the job to the "io_big" queue.')
resubmit_parser.add_argument('-I', '--no-io-big', action='store_true', help='Resubmits the job NOT to the "io_big" queue.')
......
......@@ -79,6 +79,8 @@ class JobManagerSGE(JobManager):
logger.warn("This job will never be executed since the 'io_big' flag is not available for the 'all.q'.")
if 'pe_opt' in kwargs and ('queue' not in kwargs or kwargs['queue'] not in ('q1dm', 'q_1day_mth', 'q1wm', 'q_1week_mth')):
logger.warn("This job will never be executed since the queue '%s' does not support multi-threading (pe_mth) -- use 'q1dm' or 'q1wm' instead." % kwargs['queue'] if 'queue' in kwargs else 'all.q')
if 'gpumem' in kwargs and 'queue' in kwargs and kwargs['queue'] in ('gpu', 'lgpu', 'sgpu') and int(kwargs['gpumem']) > 24:
logger.warn("This job will never be executed since the GPU queue '%s' cannot have more than 24GB of memory." % kwargs['queue'])
assert job.id == grid_id
return job.unique
......
......@@ -96,7 +96,7 @@ def str_(name):
def qsub(command, queue=None, cwd=True, name=None, deps=[], stdout='',
stderr='', env=[], array=None, context='grid', hostname=None,
memfree=None, hvmem=None, pe_opt=None, io_big=False):
memfree=None, hvmem=None, gpumem=None, pe_opt=None, io_big=False):
"""Submits a shell job to a given grid queue
Keyword parameters:
......@@ -164,6 +164,11 @@ def qsub(command, queue=None, cwd=True, name=None, deps=[], stdout='',
Used only if mem is not set
(cf. qsub -l h_vmem=<...>)
gpumem
Applicable only for GPU-based queues. If set, it asks for the GPU queue
with a minimum amount of memory. The amount should not be more than 24.
(cf. qsub -l gpumem=<...>)
hostname
If set, it asks the queue to use only a subset of the available nodes
Symbols: | for OR, & for AND, ! for NOT, etc.
......@@ -188,6 +193,8 @@ def qsub(command, queue=None, cwd=True, name=None, deps=[], stdout='',
if memfree: scmd += ['-l', 'mem_free=%s' % memfree]
if hvmem: scmd += ['-l', 'h_vmem=%s' % hvmem]
if gpumem: scmd += ['-l', 'gpumem=%s' % gpumem]
if io_big: scmd += ['-l', 'io_big']
if hostname: scmd += ['-l', 'hostname=%s' % hostname]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment