Commit 6f25dfab authored by André Anjos

[engine.trainer] Implement thorough resource monitoring; Do not plot in the end - writing a CSV is fine already
parent 9bbf8918
1 merge request: !12 Streamlining
Pipeline #39215 failed
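Since this commit removes the automatic end-of-training PDF plot, the loss curve can still be produced offline from the CSV log. A minimal sketch, assuming a log file named trainlog.csv containing the epoch, average-loss and median-loss columns set up in logfile_fields (the file name and exact column set are assumptions, not shown in this diff):

# offline plotting sketch, not part of this commit
import pandas

df = pandas.read_csv("trainlog.csv")  # assumed file name
ax = df.plot(x="epoch", y=["average-loss", "median-loss"])
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.get_figure().savefig("trainlog.pdf")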
bob/ip/binseg/engine/trainer.py
@@ -8,16 +8,16 @@ import datetime
import distutils.version
import torch
import pandas
from tqdm import tqdm
from bob.ip.binseg.utils.metric import SmoothedValue
from bob.ip.binseg.utils.plot import loss_curve
from ..utils.metric import SmoothedValue
from ..utils.resources import gpu_info, cpu_info
import logging
logger = logging.getLogger(__name__)
PYTORCH_GE_110 = (distutils.version.StrictVersion(torch.__version__) >= "1.1.0")
PYTORCH_GE_110 = distutils.version.StrictVersion(torch.__version__) >= "1.1.0"
def run(
@@ -93,8 +93,14 @@ def run(
"average-loss",
"median-loss",
"learning-rate",
"gpu-memory-megabytes",
)
cpu_data = cpu_info()
logfile_fields += tuple([k[0] for k in cpu_data])
gpu_data = gpu_info()
if gpu_data is not None: # CUDA is available on this platform
logfile_fields += tuple([k[0] for k in gpu_data])
gpu_is_available = bool(gpu_data)
with open(logfile_name, "a+", newline="") as logfile:
logwriter = csv.DictWriter(logfile, fieldnames=logfile_fields)
@@ -109,8 +115,14 @@ def run(
# Total training timer
start_training_time = time.time()
for epoch in range(start_epoch, max_epoch):
if not PYTORCH_GE_110: scheduler.step()
for epoch in tqdm(
range(start_epoch, max_epoch),
desc="epoch",
leave=False,
disable=None,
):
if not PYTORCH_GE_110:
scheduler.step()
losses = SmoothedValue(len(data_loader))
epoch = epoch + 1
arguments["epoch"] = epoch
@@ -120,7 +132,7 @@ def run(
# progress bar only on interactive jobs
for samples in tqdm(
data_loader, desc="batches", leave=False, disable=None,
data_loader, desc="batch", leave=False, disable=None,
):
# data forwarding on the existing network
@@ -141,7 +153,8 @@ def run(
losses.update(loss)
logger.debug(f"batch loss: {loss.item()}")
if PYTORCH_GE_110: scheduler.step()
if PYTORCH_GE_110:
scheduler.step()
if checkpoint_period and (epoch % checkpoint_period == 0):
checkpointer.save(f"model_{epoch:03d}", **arguments)
@@ -165,25 +178,14 @@
("average-loss", f"{losses.avg:.6f}"),
("median-loss", f"{losses.median:.6f}"),
("learning-rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
(
"gpu-memory-megabytes",
f"{torch.cuda.max_memory_allocated()/(1024.0*1024.0)}"
if torch.cuda.is_available()
else "0.0",
),
)
) + cpu_info()
if gpu_is_available:
logdata += gpu_info()
logwriter.writerow(dict(logdata))
logger.info("|".join([f"{k}: {v}" for (k, v) in logdata]))
tqdm.write("|".join([f"{k}: {v}" for (k, v) in logdata]))
total_training_time = time.time() - start_training_time
logger.info(
f"Total training time: {datetime.timedelta(seconds=total_training_time)} ({(total_training_time/max_epoch):.4f}s in average per epoch)"
)
# plots a version of the CSV trainlog into a PDF
logdf = pandas.read_csv(logfile_name, header=0, names=logfile_fields)
fig = loss_curve(logdf)
figurefile_name = os.path.join(output_folder, "trainlog.pdf")
logger.info(f"Saving {figurefile_name}")
fig.savefig(figurefile_name)
bob/ip/binseg/utils/resources.py (new file)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
"""Tools for interacting with the running computer or GPU"""
import os
import re
import subprocess
import shutil
import psutil
import logging
logger = logging.getLogger(__name__)
_nvidia_smi = shutil.which("nvidia-smi")
"""Location of the nvidia-smi program, if one exists"""
_nvidia_query = (
# obtain possible values with ``nvidia-smi --help-query-gpu``
"gpu_name",
"memory.total",
"memory.used",
"utilization.gpu",
)
"""Query parameters for nvidia-smi"""
GB = float(2 ** 30)
"""The number of bytes in a gigabyte"""
def gpu_info(query=_nvidia_query):
"""Returns GPU information using nvidia-smi
For a comprehensive list of options and help, execute ``nvidia-smi
--help-query-gpu`` on a host with a GPU
Parameters
----------
query : list
A list of query strings as defined by ``nvidia-smi --help-query-gpu``
Returns
-------
data : tuple
A tuple of 2-tuples ``(name, value)`` containing the queried
parameters. If ``nvidia-smi`` is not available, returns ``None``
instead. Dots and underscores in the original NVIDIA naming
convention are normalized to dashes.
"""
if _nvidia_smi is not None:
values = subprocess.getoutput(
"%s --query-gpu=%s --format=csv,noheader"
% (_nvidia_smi, ",".join(query))
)
values = [k.strip() for k in values.split(",")]
regexp = re.compile(r"(\.|_)")
fieldnames = [regexp.sub("-", k) for k in query]
return tuple(zip(fieldnames, values))
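A brief usage sketch (not part of the module): nvidia-smi reports its values as text, e.g. "1024 MiB" or "37 %", so callers that need numbers must parse the strings themselves.

info = gpu_info()
if info is None:
    print("nvidia-smi not found - GPU monitoring disabled")
else:
    data = dict(info)  # keys: gpu-name, memory-total, memory-used, utilization-gpu
    print("%s: %s of %s in use" % (data["gpu-name"], data["memory-used"], data["memory-total"]))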
_CLUSTER = []
"""List of processes currently being monitored"""
def cpu_info():
"""Returns process (+child) information using ``psutil``.
This call examines the current process plus any spawned children and
returns the combined resource-usage summary for the process group.
Returns
-------
data : tuple
An ordered dictionary (organized as 2-tuples) containing these entries:
0. ``system-memory-total`` (:py:class:`float`): total memory available,
in gigabytes
1. ``system-memory-used`` (:py:class:`float`): total memory used from
the system, in gigabytes
2. ``system-cpu-count`` (:py:class:`int`): number of logical CPUs
available
3. ``rss`` (:py:class:`float`): RAM currently used by
process and children, in gigabytes
4. ``vms`` (:py:class:`float`): total memory (RAM + swap) currently
used by process and children, in gigabytes
5. ``cpu-percent`` (:py:class:`float`): percentage of the total CPU
used by this process and children (recursively) since last call
(first time called should be ignored). This number depends on the
number of CPUs in the system and can be greater than 100%
6. ``processes`` (:py:class:`int`): total number of processes including
self and children (recursively)
7. ``open-files`` (:py:class:`int`): total number of open files by
self and children
"""
global _CLUSTER
if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()): #initialization
this = psutil.Process()
_CLUSTER = [this] + this.children(recursive=True)
# touch cpu_percent() at least once for all
[k.cpu_percent(interval=None) for k in _CLUSTER]
else:
# check all cluster components and update process list
# done so we can keep the cpu_percent() initialization
stored_children = set(_CLUSTER[1:])
current_children = set(_CLUSTER[0].children(recursive=True))
# keep children that are still alive, add the ones not seen before
keep_children = stored_children & current_children
new_children = current_children - stored_children
# touch cpu_percent() once for the new children only
[k.cpu_percent(interval=None) for k in new_children]
_CLUSTER = _CLUSTER[:1] + list(keep_children) + list(new_children)
memory_info = [k.memory_info() for k in _CLUSTER]
return (
("system-memory-total", psutil.virtual_memory().total / GB),
("system-memory-used", psutil.virtual_memory().used / GB),
("system-cpu-count", psutil.cpu_count(logical=True)),
("rss", sum([k.rss for k in memory_info]) / GB),
("vms", sum([k.vms for k in memory_info]) / GB),
("cpu-percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
("processes", len(_CLUSTER)),
("open-files", sum(len(k.open_files()) for k in _CLUSTER)),
)
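For reference, this is roughly how the trainer above consumes the two helpers when assembling each CSV row; a condensed sketch of the diff, with "train.csv" as a placeholder file name:

import csv

# module path inferred from the relative import in trainer.py
from bob.ip.binseg.utils.resources import cpu_info, gpu_info

cpu_data = cpu_info()  # first call initializes the cpu_percent() counters
gpu_data = gpu_info()  # None if nvidia-smi is not installed

fieldnames = tuple(k[0] for k in cpu_data)
if gpu_data is not None:
    fieldnames += tuple(k[0] for k in gpu_data)

with open("train.csv", "a+", newline="") as logfile:
    logwriter = csv.DictWriter(logfile, fieldnames=fieldnames)
    logwriter.writeheader()
    row = cpu_info() + (gpu_info() if gpu_data is not None else ())
    logwriter.writerow(dict(row))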