From 30f06f3da28949c961ba6462a79c60c667b10f9a Mon Sep 17 00:00:00 2001
From: Yannick DAYER <yannick.dayer@idiap.ch>
Date: Thu, 21 Mar 2024 16:29:05 +0100
Subject: [PATCH] feat(distributed): add a Slurm default config file

---
 .../config/distributed/slurm_cpu_default.py | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 src/bob/pipelines/config/distributed/slurm_cpu_default.py

diff --git a/src/bob/pipelines/config/distributed/slurm_cpu_default.py b/src/bob/pipelines/config/distributed/slurm_cpu_default.py
new file mode 100644
index 0000000..74105ca
--- /dev/null
+++ b/src/bob/pipelines/config/distributed/slurm_cpu_default.py
@@ -0,0 +1,85 @@
+"""This config creates a Dask Client configured to use Slurm workers.
+
+A Dask ``SLURMCluster`` spins up a scheduler locally and submits Dask workers
+as jobs on the Slurm grid.
+
+The Client can then send work to the scheduler, which dispatches it to the
+workers and scales their number accordingly.
+
+The Slurm account name must be stored in ``~/.config/bobrc.toml``
+(``slurm.account`` entry). Set it with:
+```
+bob config set slurm.account your-project-name
+```
+
+You can specify your conda **base** path with the ``conda.base_path`` entry in
+``~/.config/bobrc.toml``; otherwise, it defaults to ``~/miniconda3``.
+
+You can specify the conda environment to use in the Dask workers with the
+``conda.slurm_prefix`` entry in ``~/.config/bobrc.toml``; otherwise, the
+workers will try to activate the environment that is currently active
+**locally** (or do nothing if no conda environment is active).
+"""
+
+import os
+
+from pathlib import Path
+
+from clapper.rc import UserDefaults
+from dask.distributed import Client
+from dask_jobqueue import SLURMCluster
+
+rc = UserDefaults(path="bobrc.toml")
+
+# Pick the conda environment to activate on the workers, in this order:
+# 1. the conda env specified in bobrc.toml (conda.slurm_prefix);
+# 2. the conda env in which this script is running;
+# 3. no conda env at all.
+conda_base_path = Path(rc.get("conda.base_path", "~/miniconda3")).expanduser()
+conda_setup_script = conda_base_path / "etc" / "profile.d" / "conda.sh"
+conda_current_prefix = rc.get(
+    "conda.slurm_prefix", default=os.environ.get("CONDA_PREFIX", "")
+)
+
+job_script_prologue = []
+if conda_current_prefix != "":
+    job_script_prologue.extend(
+        [
+            f"source {conda_setup_script}",
+            f"conda activate {conda_current_prefix}",
+        ]
+    )
+
+if "slurm.account" not in rc:
+    raise RuntimeError(
+        f"Could not retrieve slurm.account from config ({rc.path}). "
+        "Please set the account / project name with: "
+        "bob config set slurm.account your-project-name"
+    )
+
+cluster = SLURMCluster(
+    n_workers=1,
+    queue="cpu",  # Slurm partition name
+    account=rc.get("slurm.account"),  # Billing project
+    cores=1,  # per job
+    memory="8 GB",  # per job
+    walltime="00:30:00",
+    local_directory="/tmp/dask",  # Fast but ephemeral NVMe storage
+    log_directory="./logs",
+    job_script_prologue=job_script_prologue,
+    protocol="tcp://",
+    scheduler_options={
+        "protocol": "tcp://",
+        "port": 8786,  # Workers will connect to the scheduler on this port.
+    },
+)
+
+cluster.adapt(
+    minimum=1,  # Always keep at least one worker job alive.
+    maximum=128,  # Never request more than 128 worker jobs.
+    wait_count=5,  # Scale down only after 5 consecutive recommendations.
+    interval=10,  # Re-evaluate the target number of workers every 10 seconds.
+    target_duration="10s",  # Scale up to finish the backlog in about 10s.
+)
+
+dask_client = Client(cluster)
-- 
GitLab
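
As a sanity check of this config, one can preview the sbatch script that
dask_jobqueue generates from these settings; ``job_script()`` is standard
``SLURMCluster`` API. A minimal sketch, assuming the module above has been
evaluated (for instance through clapper's configuration loading) so that
``cluster`` is in scope:

```
# Print the generated sbatch script. The job_script_prologue lines built
# above (sourcing conda.sh, then `conda activate`) should appear before
# the `dask worker` command.
print(cluster.job_script())
```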
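
Once worker jobs can start, ``dask_client`` behaves like any
``dask.distributed`` client; ``Client.map`` and ``Client.gather`` below are
stock API, and the workload itself is only a hypothetical smoke test:

```
# Fan a trivial workload out to the Slurm workers; the adaptive policy
# scales the number of worker jobs between the configured bounds (1-128).
futures = dask_client.map(lambda x: x * x, range(100))
print(sum(dask_client.gather(futures)))  # 328350
```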