Skip to content
Snippets Groups Projects
Commit 57bd3218 authored by Tiago de Freitas Pereira's avatar Tiago de Freitas Pereira
Browse files

Merge branch 'add-gpu-queue' into 'master'

Added a GPU queue that defaults to short_gpu

See merge request !41
parents e34ce246 608a0eff
Branches
Tags
1 merge request !41: Added a GPU queue that defaults to short_gpu
Pipeline #44930 failed
......@@ -2,5 +2,5 @@ from dask.distributed import Client
from bob.pipelines.distributed.sge import SGEMultipleQueuesCluster
cluster = SGEMultipleQueuesCluster(min_jobs=20)
cluster = SGEMultipleQueuesCluster(min_jobs=1)
dask_client = Client(cluster)
from dask.distributed import Client
from bob.pipelines.distributed.sge import SGEMultipleQueuesCluster
from bob.pipelines.distributed.sge_queues import QUEUE_LIGHT
from bob.pipelines.distributed.sge_queues import QUEUE_GPU
cluster = SGEMultipleQueuesCluster(min_jobs=20, sge_job_spec=QUEUE_LIGHT)
cluster = SGEMultipleQueuesCluster(min_jobs=1, sge_job_spec=QUEUE_GPU)
dask_client = Client(cluster)
......@@ -109,7 +109,7 @@ def get_max_jobs(queue_dict):
class SGEMultipleQueuesCluster(JobQueueCluster):
"""Launch Dask jobs in the SGE cluster allowing the request of multiple
queus.
queues.
Parameters
----------
......@@ -127,7 +127,7 @@ class SGEMultipleQueuesCluster(JobQueueCluster):
sge_job_spec: dict
Dictionary containing a minimum specification for the qsub command.
It cosists of:
It consists of:
queue: SGE queue
memory: Memory requirement in GB (e.g. 4GB)
......@@ -358,7 +358,7 @@ class AdaptiveMultipleQueue(Adaptive):
This custom implementation extends the `Adaptive.recommendations` by looking
at the `distributed.scheduler.TaskState.resource_restrictions`.
The heristics is:
The heuristics is:
.. note ::
If a certain task has the status `no-worker` and it has resource_restrictions, the scheduler should
......@@ -446,7 +446,7 @@ class SchedulerResourceRestriction(Scheduler):
def __init__(self, *args, **kwargs):
super(SchedulerResourceRestriction, self).__init__(
idle_timeout=300
idle_timeout=3600
if rc.get("bob.pipelines.sge.idle_timeout") is None
else rc.get("bob.pipelines.sge.idle_timeout"),
allowed_failures=100
......
......@@ -50,14 +50,14 @@ default
"""
QUEUE_LIGHT = {
QUEUE_GPU = {
"default": {
"queue": "q_1day",
"memory": "4GB",
"queue": "q_short_gpu",
"memory": "30GB",
"io_big": False,
"resource_spec": "",
"max_jobs": 96,
"resources": "",
"max_jobs": 45,
"resources": {"q_short_gpu": 1},
},
"q_1day": {
"queue": "q_1day",
......@@ -98,6 +98,5 @@ QUEUE_LIGHT = {
},
}
"""
This queue setup has a light arrangement.
For CPU jobs, it prioritizes all.q and not io_big
This queue setup uses the q_short_gpu queue of the SGE.
"""
......@@ -44,9 +44,9 @@ setup(
# entry_points defines which scripts will be inside the 'bin' directory
entry_points = {
'dask.client': [
'local-parallel = bob.pipelines.config.distributed.local_parallel',
'sge = bob.pipelines.config.distributed.sge_default',
'sge-light = bob.pipelines.config.distributed.sge_light',
'local-parallel = bob.pipelines.config.distributed.local_parallel:dask_client',
'sge = bob.pipelines.config.distributed.sge_default:dask_client',
'sge-gpu = bob.pipelines.config.distributed.sge_gpu:dask_client',
],
},
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment