From 6f25dfabfd3deb9b0da8d53a8e79803302208ec8 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Sat, 25 Apr 2020 12:08:14 +0200
Subject: [PATCH] [engine.trainer] Implement thorough resource monitoring; Do
 not plot in the end - writing a CSV is fine already

---
 bob/ip/binseg/engine/trainer.py  |  50 ++++++------
 bob/ip/binseg/utils/resources.py | 135 +++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+), 24 deletions(-)
 create mode 100644 bob/ip/binseg/utils/resources.py

diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index 783d5dcb..689b3cca 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -8,16 +8,16 @@ import datetime
 import distutils.version

 import torch
-import pandas
 from tqdm import tqdm

-from bob.ip.binseg.utils.metric import SmoothedValue
-from bob.ip.binseg.utils.plot import loss_curve
+from ..utils.metric import SmoothedValue
+from ..utils.resources import gpu_info, cpu_info

 import logging
+
 logger = logging.getLogger(__name__)

-PYTORCH_GE_110 = (distutils.version.StrictVersion(torch.__version__) >= "1.1.0")
+PYTORCH_GE_110 = distutils.version.StrictVersion(torch.__version__) >= "1.1.0"


 def run(
@@ -93,8 +93,14 @@ def run(
         "average-loss",
         "median-loss",
         "learning-rate",
-        "gpu-memory-megabytes",
     )
+    cpu_data = cpu_info()
+    logfile_fields += tuple([k[0] for k in cpu_data])
+    gpu_data = gpu_info()
+    if gpu_data is not None:  # CUDA is available on this platform
+        logfile_fields += tuple([k[0] for k in gpu_data])
+    gpu_is_available = bool(gpu_data)
+
     with open(logfile_name, "a+", newline="") as logfile:
         logwriter = csv.DictWriter(logfile, fieldnames=logfile_fields)

@@ -109,8 +115,14 @@ def run(
         # Total training timer
         start_training_time = time.time()

-        for epoch in range(start_epoch, max_epoch):
-            if not PYTORCH_GE_110: scheduler.step()
+        for epoch in tqdm(
+            range(start_epoch, max_epoch),
+            desc="epoch",
+            leave=False,
+            disable=None,
+        ):
+            if not PYTORCH_GE_110:
+                scheduler.step()
             losses = SmoothedValue(len(data_loader))
             epoch = epoch + 1
             arguments["epoch"] = epoch
@@ -120,7 +132,7 @@ def run(

             # progress bar only on interactive jobs
             for samples in tqdm(
-                data_loader, desc="batches", leave=False, disable=None,
+                data_loader, desc="batch", leave=False, disable=None,
             ):

                 # data forwarding on the existing network
@@ -141,7 +153,8 @@ def run(
                 losses.update(loss)
                 logger.debug(f"batch loss: {loss.item()}")

-            if PYTORCH_GE_110: scheduler.step()
+            if PYTORCH_GE_110:
+                scheduler.step()

             if checkpoint_period and (epoch % checkpoint_period == 0):
                 checkpointer.save(f"model_{epoch:03d}", **arguments)
@@ -165,25 +178,14 @@ def run(
                 ("average-loss", f"{losses.avg:.6f}"),
                 ("median-loss", f"{losses.median:.6f}"),
                 ("learning-rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
-                (
-                    "gpu-memory-megabytes",
-                    f"{torch.cuda.max_memory_allocated()/(1024.0*1024.0)}"
-                    if torch.cuda.is_available()
-                    else "0.0",
-                ),
-            )
+            ) + cpu_info()
+            if gpu_is_available:
+                logdata += gpu_info()
             logwriter.writerow(dict(k for k in logdata))
-            logger.info("|".join([f"{k}: {v}" for (k, v) in logdata]))
+            tqdm.write("|".join([f"{k}: {v}" for (k, v) in logdata]))

         total_training_time = time.time() - start_training_time
         logger.info(
             f"Total training time: {datetime.timedelta(seconds=total_training_time)} ({(total_training_time/max_epoch):.4f}s in average per epoch)"
         )

-
-    # plots a version of the CSV trainlog into a PDF
-    logdf = pandas.read_csv(logfile_name, header=0, names=logfile_fields)
-    fig = loss_curve(logdf)
-    figurefile_name = os.path.join(output_folder, "trainlog.pdf")
-    logger.info(f"Saving {figurefile_name}")
-    fig.savefig(figurefile_name)
diff --git a/bob/ip/binseg/utils/resources.py b/bob/ip/binseg/utils/resources.py
new file mode 100644
index 00000000..2d230322
--- /dev/null
+++ b/bob/ip/binseg/utils/resources.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+"""Tools for interacting with the running computer or GPU"""
+
+import os
+import re
+import subprocess
+import shutil
+
+import psutil
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+_nvidia_smi = shutil.which("nvidia-smi")
+"""Location of the nvidia-smi program, if one exists"""
+
+
+_nvidia_query = (
+    # obtain possible values with ``nvidia-smi --help-query-gpu``
+    "gpu_name",
+    "memory.total",
+    "memory.used",
+    "utilization.gpu",
+)
+"""Query parameters for nvidia-smi"""
+
+GB = float(2 ** 30)
+"""The number of bytes in a gigabyte"""
+
+
+def gpu_info(query=_nvidia_query):
+    """Returns GPU information using nvidia-smi
+
+    For a comprehensive list of options and help, execute ``nvidia-smi
+    --help-query-gpu`` on a host with a GPU
+
+
+    Parameters
+    ----------
+
+    query : list
+        A list of query strings as defined by ``nvidia-smi --help-query-gpu``
+
+
+    Returns
+    -------
+
+    data : tuple
+        An ordered dictionary (organized as 2-tuples) containing the queried
+        parameters.  If ``nvidia-smi`` is not available, returns ``None``.
+        Dots and underscores in the original NVIDIA naming convention are
+        normalized with dashes.
+
+    """
+
+    if _nvidia_smi is not None:
+        values = subprocess.getoutput(
+            "%s --query-gpu=%s --format=csv,noheader"
+            % (_nvidia_smi, ",".join(query))
+        )
+        values = [k.strip() for k in values.split(",")]
+        regexp = re.compile(r"(\.|_)")
+        fieldnames = [regexp.sub("-", k) for k in query]
+        return tuple(zip(fieldnames, values))
+
+
+_CLUSTER = []
+"""List of processes currently being monitored"""
+
+
+def cpu_info():
+    """Returns process (+child) information using ``psutil``.
+
+    This call examines the current process plus any spawned children and
+    returns the combined resource usage summary for the process group.
+
+
+    Returns
+    -------
+
+    data : tuple
+        An ordered dictionary (organized as 2-tuples) containing these entries:
+
+        0. ``system-memory-total`` (:py:class:`float`): total memory available,
+           in gigabytes
+        1. ``system-memory-used`` (:py:class:`float`): total memory used from
+           the system, in gigabytes
+        2. ``system-cpu-count`` (:py:class:`int`): number of logical CPUs
+           available
+        3. ``rss`` (:py:class:`float`): RAM currently used by
+           process and children, in gigabytes
+        4. ``vms`` (:py:class:`float`): total memory (RAM + swap) currently
+           used by process and children, in gigabytes
+        5. ``cpu-percent`` (:py:class:`float`): percentage of the total CPU
+           used by this process and children (recursively) since last call
+           (first time called should be ignored).  This number depends on the
+           number of CPUs in the system and can be greater than 100%
+        6. ``processes`` (:py:class:`int`): total number of processes including
+           self and children (recursively)
+        7. ``open-files`` (:py:class:`int`): total number of open files by
+           self and children
+
+    """
+
+    global _CLUSTER
+    if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()):  # initialization
+        this = psutil.Process()
+        _CLUSTER = [this] + this.children(recursive=True)
+        # touch cpu_percent() at least once for all
+        [k.cpu_percent(interval=None) for k in _CLUSTER]
+    else:
+        # check all cluster components and update process list
+        # done so we can keep the cpu_percent() initialization
+        children = _CLUSTER[0].children()
+        stored_children = set(_CLUSTER[1:])
+        current_children = set(_CLUSTER[0].children())
+        keep_children = stored_children - current_children
+        new_children = current_children - stored_children
+        [k.cpu_percent(interval=None) for k in new_children]
+        _CLUSTER = _CLUSTER[:1] + list(keep_children) + list(new_children)
+
+    memory_info = [k.memory_info() for k in _CLUSTER]
+
+    return (
+        ("system-memory-total", psutil.virtual_memory().total / GB),
+        ("system-memory-used", psutil.virtual_memory().used / GB),
+        ("system-cpu-count", psutil.cpu_count(logical=True)),
+        ("rss", sum([k.rss for k in memory_info]) / GB),
+        ("vms", sum([k.vms for k in memory_info]) / GB),
+        ("cpu-percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
+        ("processes", len(_CLUSTER)),
+        ("open-files", sum(len(k.open_files()) for k in _CLUSTER)),
+    )
-- 
GitLab
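
For reference, a minimal standalone sketch (not part of the patch above) of how the two new helpers are meant to be consumed: their (name, value) 2-tuples concatenate with ``+`` and feed ``csv.DictWriter`` directly, which is how ``trainer.py`` now assembles its per-epoch log row. The ``monitor.csv`` filename and the fixed three-sample loop are made up for illustration only.

    #!/usr/bin/env python
    # Standalone sketch: sample the helpers introduced in
    # bob/ip/binseg/utils/resources.py and append one CSV row per sample,
    # mirroring what trainer.py does once per epoch.

    import csv
    import time

    from bob.ip.binseg.utils.resources import cpu_info, gpu_info

    # Build the CSV header from whatever the helpers report on this machine;
    # gpu_info() returns None when nvidia-smi is not installed.
    fields = tuple(k[0] for k in cpu_info())
    gpu_data = gpu_info()
    if gpu_data is not None:
        fields += tuple(k[0] for k in gpu_data)

    with open("monitor.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for _ in range(3):
            row = cpu_info()  # tuple of (name, value) pairs
            if gpu_data is not None:
                row += gpu_info()
            writer.writerow(dict(row))  # 2-tuples convert directly into a dict
            # per the docstring, the first cpu-percent reading should be ignored
            time.sleep(1.0)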