Commit 6f25dfab authored by André Anjos

[engine.trainer] Implement thorough resource monitoring; Do not plot in the end - writing a CSV is fine already
parent 9bbf8918
1 merge request: !12 Streamlining
Pipeline #39215 failed
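Since this commit removes the automatic end-of-training PDF plot, the loss curve can still be produced offline from the CSV log. A minimal sketch, assuming a log file named trainlog.csv containing the epoch, average-loss and median-loss columns set up in logfile_fields (the file name and exact column set are assumptions, not shown in this diff):

# offline plotting sketch, not part of this commit
import pandas

df = pandas.read_csv("trainlog.csv")  # assumed file name
ax = df.plot(x="epoch", y=["average-loss", "median-loss"])
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.get_figure().savefig("trainlog.pdf")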
bob/ip/binseg/engine/trainer.py
@@ -8,16 +8,16 @@ import datetime
import distutils.version
import torch
import pandas
from tqdm import tqdm
from bob.ip.binseg.utils.metric import SmoothedValue
from bob.ip.binseg.utils.plot import loss_curve
from ..utils.metric import SmoothedValue
from ..utils.resources import gpu_info, cpu_info
import logging
logger = logging.getLogger(__name__)
PYTORCH_GE_110 = (distutils.version.StrictVersion(torch.__version__) >= "1.1.0")
PYTORCH_GE_110 = distutils.version.StrictVersion(torch.__version__) >= "1.1.0"
def run(
@@ -93,8 +93,14 @@ def run(
"average-loss",
"median-loss",
"learning-rate",
"gpu-memory-megabytes",
)
cpu_data = cpu_info()
logfile_fields += tuple([k[0] for k in cpu_data])
gpu_data = gpu_info()
if gpu_data is not None: # CUDA is available on this platform
logfile_fields += tuple([k[0] for k in gpu_data])
gpu_is_available = bool(gpu_data)
with open(logfile_name, "a+", newline="") as logfile:
logwriter = csv.DictWriter(logfile, fieldnames=logfile_fields)
@@ -109,8 +115,14 @@ def run(
# Total training timer
start_training_time = time.time()
for epoch in range(start_epoch, max_epoch):
if not PYTORCH_GE_110: scheduler.step()
for epoch in tqdm(
range(start_epoch, max_epoch),
desc="epoch",
leave=False,
disable=None,
):
if not PYTORCH_GE_110:
scheduler.step()
losses = SmoothedValue(len(data_loader))
epoch = epoch + 1
arguments["epoch"] = epoch
@@ -120,7 +132,7 @@ def run(
# progress bar only on interactive jobs
for samples in tqdm(
data_loader, desc="batches", leave=False, disable=None,
data_loader, desc="batch", leave=False, disable=None,
):
# data forwarding on the existing network
@@ -141,7 +153,8 @@ def run(
losses.update(loss)
logger.debug(f"batch loss: {loss.item()}")
if PYTORCH_GE_110: scheduler.step()
if PYTORCH_GE_110:
scheduler.step()
if checkpoint_period and (epoch % checkpoint_period == 0):
checkpointer.save(f"model_{epoch:03d}", **arguments)
@@ -165,25 +178,14 @@
("average-loss", f"{losses.avg:.6f}"),
("median-loss", f"{losses.median:.6f}"),
("learning-rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
(
"gpu-memory-megabytes",
f"{torch.cuda.max_memory_allocated()/(1024.0*1024.0)}"
if torch.cuda.is_available()
else "0.0",
),
)
) + cpu_info()
if gpu_is_available:
logdata += gpu_info()
logwriter.writerow(dict(logdata))
logger.info("|".join([f"{k}: {v}" for (k, v) in logdata]))
tqdm.write("|".join([f"{k}: {v}" for (k, v) in logdata]))
total_training_time = time.time() - start_training_time
logger.info(
f"Total training time: {datetime.timedelta(seconds=total_training_time)} ({(total_training_time/max_epoch):.4f}s in average per epoch)"
)
# plots a version of the CSV trainlog into a PDF
logdf = pandas.read_csv(logfile_name, header=0, names=logfile_fields)
fig = loss_curve(logdf)
figurefile_name = os.path.join(output_folder, "trainlog.pdf")
logger.info(f"Saving {figurefile_name}")
fig.savefig(figurefile_name)
bob/ip/binseg/utils/resources.py (new file)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
"""Tools for interacting with the running computer or GPU"""
import os
import re
import subprocess
import shutil
import psutil
import logging
logger = logging.getLogger(__name__)
_nvidia_smi = shutil.which("nvidia-smi")
"""Location of the nvidia-smi program, if one exists"""
_nvidia_query = (
# obtain possible values with ``nvidia-smi --help-query-gpu``
"gpu_name",
"memory.total",
"memory.used",
"utilization.gpu",
)
"""Query parameters for nvidia-smi"""
GB = float(2 ** 30)
"""The number of bytes in a gigabyte"""
def gpu_info(query=_nvidia_query):
"""Returns GPU information using nvidia-smi
For a comprehensive list of options and help, execute ``nvidia-smi
--help-query-gpu`` on a host with a GPU
Parameters
----------
query : list
A list of query strings as defined by ``nvidia-smi --help-query-gpu``
Returns
-------
data : tuple
A tuple of 2-tuples ``(name, value)`` containing the queried
parameters. If ``nvidia-smi`` is not available, returns ``None``
instead. Dots and underscores in the original NVIDIA naming
convention are normalized to dashes.
"""
if _nvidia_smi is not None:
values = subprocess.getoutput(
"%s --query-gpu=%s --format=csv,noheader"
% (_nvidia_smi, ",".join(query))
)
values = [k.strip() for k in values.split(",")]
regexp = re.compile(r"(\.|_)")
fieldnames = [regexp.sub("-", k) for k in query]
return tuple(zip(fieldnames, values))
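A brief usage sketch (not part of the module): nvidia-smi reports its values as text, e.g. "1024 MiB" or "37 %", so callers that need numbers must parse the strings themselves.

info = gpu_info()
if info is None:
    print("nvidia-smi not found - GPU monitoring disabled")
else:
    data = dict(info)  # keys: gpu-name, memory-total, memory-used, utilization-gpu
    print("%s: %s of %s in use" % (data["gpu-name"], data["memory-used"], data["memory-total"]))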
_CLUSTER = []
"""List of processes currently being monitored"""
def cpu_info():
"""Returns process (+child) information using ``psutil``.
This call examines the current process plus any spawned children and
returns the combined resource-usage summary for the process group.
Returns
-------
data : tuple
An ordered dictionary (organized as 2-tuples) containing these entries:
0. ``system-memory-total`` (:py:class:`float`): total memory available,
in gigabytes
1. ``system-memory-used`` (:py:class:`float`): total memory used from
the system, in gigabytes
2. ``system-cpu-count`` (:py:class:`int`): number of logical CPUs
available
3. ``rss`` (:py:class:`float`): RAM currently used by
process and children, in gigabytes
4. ``vms`` (:py:class:`float`): total memory (RAM + swap) currently
used by process and children, in gigabytes
5. ``cpu-percent`` (:py:class:`float`): percentage of the total CPU
used by this process and children (recursively) since last call
(first time called should be ignored). This number depends on the
number of CPUs in the system and can be greater than 100%
6. ``processes`` (:py:class:`int`): total number of processes including
self and children (recursively)
7. ``open-files`` (:py:class:`int`): total number of open files by
self and children
"""
global _CLUSTER
if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()): #initialization
this = psutil.Process()
_CLUSTER = [this] + this.children(recursive=True)
# touch cpu_percent() at least once for all
[k.cpu_percent(interval=None) for k in _CLUSTER]
else:
# check all cluster components and update process list
# done so we can keep the cpu_percent() initialization
stored_children = set(_CLUSTER[1:])
current_children = set(_CLUSTER[0].children(recursive=True))
# keep children that are still alive, add the ones not seen before
keep_children = stored_children & current_children
new_children = current_children - stored_children
# touch cpu_percent() once for the new children only
[k.cpu_percent(interval=None) for k in new_children]
_CLUSTER = _CLUSTER[:1] + list(keep_children) + list(new_children)
memory_info = [k.memory_info() for k in _CLUSTER]
return (
("system-memory-total", psutil.virtual_memory().total / GB),
("system-memory-used", psutil.virtual_memory().used / GB),
("system-cpu-count", psutil.cpu_count(logical=True)),
("rss", sum([k.rss for k in memory_info]) / GB),
("vms", sum([k.vms for k in memory_info]) / GB),
("cpu-percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
("processes", len(_CLUSTER)),
("open-files", sum(len(k.open_files()) for k in _CLUSTER)),
)
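For reference, this is roughly how the trainer above consumes the two helpers when assembling each CSV row; a condensed sketch of the diff, with "train.csv" as a placeholder file name:

import csv

# module path inferred from the relative import in trainer.py
from bob.ip.binseg.utils.resources import cpu_info, gpu_info

cpu_data = cpu_info()  # first call initializes the cpu_percent() counters
gpu_data = gpu_info()  # None if nvidia-smi is not installed

fieldnames = tuple(k[0] for k in cpu_data)
if gpu_data is not None:
    fieldnames += tuple(k[0] for k in gpu_data)

with open("train.csv", "a+", newline="") as logfile:
    logwriter = csv.DictWriter(logfile, fieldnames=fieldnames)
    logwriter.writeheader()
    row = cpu_info() + (gpu_info() if gpu_data is not None else ())
    logwriter.writerow(dict(row))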