Commit 76c3520c authored by André Anjos

[engine,script] Support for multi-GPU machines, improved random seed settings and reproducibility

parent 13aa82ad
Merge request !15: Support for multi-GPU machines, improved random seed settings and reproducibility
Pipeline #42304 passed
@@ -116,8 +116,8 @@ def run(model, data_loader, name, device, output_folder, overlayed_folder):
         the local name of this dataset (e.g. ``train``, or ``test``), to be
         used when saving measures files.

-    device : str
-        device to use ``cpu`` or ``cuda:0``
+    device : :py:class:`torch.device`
+        device to use

     output_folder : str
         folder where to store output prediction maps (HDF5 files) and model
......
@@ -100,8 +100,8 @@ def run(
         save a checkpoint every ``n`` epochs. If set to ``0`` (zero), then do
         not save intermediary checkpoints

-    device : str
-        device to use ``'cpu'`` or ``cuda:0``
+    device : :py:class:`torch.device`
+        device to use

     arguments : dict
         start and end epochs
@@ -113,11 +113,10 @@ def run(
     start_epoch = arguments["epoch"]
     max_epoch = arguments["max_epoch"]

-    if device != "cpu":
+    if device.type == "cuda":
         # asserts we do have a GPU
         assert bool(gpu_constants()), (
-            f"Device set to '{device}', but cannot "
-            f"find a GPU (maybe nvidia-smi is not installed?)"
+            f"Device set to '{device}', but nvidia-smi is not installed"
         )

     os.makedirs(output_folder, exist_ok=True)
@@ -139,7 +138,7 @@ def run(
         shutil.move(static_logfile_name, backup)
     with open(static_logfile_name, "w", newline="") as f:
         logdata = cpu_constants()
-        if device != "cpu":
+        if device == "cuda":
             logdata += gpu_constants()
         logdata += (("model_size", n),)
         logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata])
@@ -166,7 +165,7 @@ def run(
     if valid_loader is not None:
         logfile_fields += ("validation_average_loss", "validation_median_loss")
     logfile_fields += tuple([k[0] for k in cpu_log()])
-    if device != "cpu":
+    if device.type == "cuda":
         logfile_fields += tuple([k[0] for k in gpu_log()])

     # the lowest validation loss obtained so far - this value is updated only
@@ -308,7 +307,7 @@ def run(
                 ("validation_median_loss", f"{valid_losses.median:.6f}"),
             )
         logdata += cpu_log()
-        if device != "cpu":
+        if device.type == "cuda":
             logdata += gpu_log()

         logwriter.writerow(dict(k for k in logdata))
......
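Why these checks changed: after this commit, ``run()`` receives a pre-configured :py:class:`torch.device` (built by ``setup_pytorch_device()``, added further below) instead of a plain string, so the GPU-only branches now inspect ``device.type``. A minimal illustrative sketch, not part of the commit, showing how a ``torch.device`` decomposes a device name:

    import torch

    d = torch.device("cuda:1")
    print(d.type)   # "cuda" -- the accelerator family tested by the new code
    print(d.index)  # 1      -- the GPU ordinal

    c = torch.device("cpu")
    print(c.type)   # "cpu"
    print(c.index)  # None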
@@ -7,6 +7,7 @@ import os
 import re
 import sys
 import time
+import random
 import tempfile
 import urllib.request
@@ -15,12 +16,100 @@ import click
 from click_plugins import with_plugins
 from tqdm import tqdm

 import numpy
+import torch

 from bob.extension.scripts.click_helper import AliasedGroup

 import logging
 logger = logging.getLogger(__name__)


+def setup_pytorch_device(name):
+    """Sets-up the pytorch device to use
+
+    Parameters
+    ----------
+
+    name : str
+        The device name (``cpu``, ``cuda:0``, ``cuda:1``, and so on)
+
+    Returns
+    -------
+
+    device : :py:class:`torch.device`
+        The pytorch device to use, pre-configured (and checked)
+
+    """
+
+    if name.startswith("cuda"):
+        # In case one has multiple devices, we must first set the one
+        # we would like to use so pytorch can find it.
+        os.environ['CUDA_VISIBLE_DEVICES'] = name.split(":", 1)[1]
+        if not torch.cuda.is_available():
+            raise RuntimeError(f"CUDA is not currently available, but "
+                               f"you set device to '{name}'")
+        # Let pytorch auto-select from environment variable
+        return torch.device("cuda")
+
+    # cpu
+    return torch.device(name)
+
+
+def set_seeds(value, all_gpus):
+    """Sets up all relevant random seeds (numpy, python, cuda)
+
+    If running with multiple GPUs **at the same time**, set ``all_gpus`` to
+    ``True`` to force all GPU seeds to be initialized.
+
+    Reference: `PyTorch page for reproducibility
+    <https://pytorch.org/docs/stable/notes/randomness.html>`_.
+
+    Parameters
+    ----------
+
+    value : int
+        The random seed value to use
+
+    all_gpus : :py:class:`bool`, Optional
+        If set, then reset the seed on all GPUs available at once.  This is
+        normally **not** what you want if running on a single GPU
+
+    """
+
+    random.seed(value)
+    numpy.random.seed(value)
+    torch.manual_seed(value)
+    torch.cuda.manual_seed(value)  # noop if cuda not available
+
+    # set seeds for all gpus
+    if all_gpus:
+        torch.cuda.manual_seed_all(value)  # noop if cuda not available
+
+
+def set_reproducible_cuda():
+    """Turns-off all CUDA optimizations that would affect reproducibility
+
+    For full reproducibility, also ensure not to use multiple (parallel) data
+    loaders.  That is, set ``num_workers=0``.
+
+    Reference: `PyTorch page for reproducibility
+    <https://pytorch.org/docs/stable/notes/randomness.html>`_.
+
+    """
+
+    # ensure to use only optimization algos for cuda that are known to have
+    # a deterministic effect (not random)
+    torch.backends.cudnn.deterministic = True
+
+    # turns off any optimization tricks
+    torch.backends.cudnn.benchmark = False
+
+
 def escape_name(v):
     """Escapes a name so it contains filesystem friendly characters only
......
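The three helpers added above are consumed by the ``predict`` and ``train`` commands patched below. A hypothetical usage sketch, not part of the commit, assuming a CUDA-capable machine and that the helpers are importable (e.g. ``from .binseg import setup_pytorch_device, set_seeds, set_reproducible_cuda``):

    import torch

    requested_device = "cuda:1"  # hypothetical value of a --device option
    seed = 42                    # hypothetical value of a --seed option

    # pin pytorch to the requested GPU and get back a checked torch.device
    device = setup_pytorch_device(requested_device)

    # seed python's random, numpy and pytorch in one go (single-GPU run)
    set_seeds(seed, all_gpus=False)

    # optionally trade cuDNN auto-tuning for deterministic behaviour
    set_reproducible_cuda()

    model = torch.nn.Linear(10, 1).to(device)  # any model, moved to the selected device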
@@ -17,7 +17,7 @@ from bob.extension.scripts.click_helper import (
 from ..engine.predictor import run
 from ..utils.checkpointer import Checkpointer

-from .binseg import download_to_tempfile
+from .binseg import download_to_tempfile, setup_pytorch_device

 import logging
 logger = logging.getLogger(__name__)
@@ -115,6 +115,8 @@ def predict(output_folder, model, dataset, batch_size, device, weight,
         overlayed, **kwargs):
     """Predicts vessel map (probabilities) on input images"""

+    device = setup_pytorch_device(device)
+
     dataset = dataset if isinstance(dataset, dict) else dict(test=dataset)

     if weight.startswith("http"):
......
@@ -14,6 +14,7 @@ from bob.extension.scripts.click_helper import (
 )

 from ..utils.checkpointer import Checkpointer
+from .binseg import setup_pytorch_device, set_seeds

 import logging
 logger = logging.getLogger(__name__)
@@ -216,7 +217,9 @@ def train(
     abruptly.
     """

-    torch.manual_seed(seed)
+    device = setup_pytorch_device(device)
+
+    set_seeds(seed, all_gpus=False)

     use_dataset = dataset
     validation_dataset = None
......
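For context on why ``torch.manual_seed(seed)`` alone was replaced: it only seeds pytorch's own generators, leaving ``random`` and ``numpy.random`` (often used by augmentation code) unseeded between runs. A small illustrative sketch, not part of the commit:

    import random
    import numpy
    import torch

    def draw():
        # one sample from each generator family a pipeline might use
        return random.random(), numpy.random.rand(), torch.rand(1).item()

    torch.manual_seed(42)
    a = draw()  # only the torch value is pinned down by this seed

    # seeding all three generators (what set_seeds() does) makes draws repeatable
    random.seed(42); numpy.random.seed(42); torch.manual_seed(42)
    b = draw()
    random.seed(42); numpy.random.seed(42); torch.manual_seed(42)
    c = draw()
    assert b == c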
No preview for this file type
@@ -2,6 +2,7 @@
 # Project: extras
 # Version: stable
 # The remainder of this file is compressed using zlib.
+torch.device py:class 1 https://pytorch.org/docs/master/tensor_attributes.html#torch.torch.device -
 torch.optim.optimizer.Optimizer py:class 1 https://pytorch.org/docs/stable/optim.html#torch.optim.Optimizer -
 torch.nn.Module py:class 1 https://pytorch.org/docs/stable/nn.html?highlight=module#torch.nn.Module -
 torch.nn.modules.module.Module py:class 1 https://pytorch.org/docs/stable/nn.html?highlight=module#torch.nn.Module -
......