From 6ad254bf1cc7fe1261eb062d8e8bd44986cbb2e5 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.anjos@idiap.ch>
Date: Mon, 27 Apr 2020 11:10:15 +0200
Subject: [PATCH] [engine.*] Move model summarization to training; Separate
 static information from dynamic information logging
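
Static information (the model summary and host/GPU constants) is now written
once, at the start of training, to "model_summary.txt" and "constants.csv" in
the training output folder, while the per-epoch measurements produced by the
new cpu_log()/gpu_log() helpers keep going to "trainlog.csv".

As a rough, illustrative sketch (not part of this patch; the "results/model"
output path is only an example), both CSV files can be read back with the
standard library:

    import csv

    # "constants.csv" holds a single row of static values
    # (model size, total memory, CPU count, GPU name, ...)
    with open("results/model/constants.csv", newline="") as f:
        constants = next(csv.DictReader(f))

    # "trainlog.csv" holds one row per epoch with losses and dynamic
    # resource usage (the cpu_* and gpu_* columns)
    with open("results/model/trainlog.csv", newline="") as f:
        trainlog = list(csv.DictReader(f))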

---
 bob/ip/binseg/engine/predictor.py |   9 ---
 bob/ip/binseg/engine/trainer.py   |  76 ++++++++++++++------
 bob/ip/binseg/test/test_cli.py    |   6 +-
 bob/ip/binseg/utils/resources.py  | 115 +++++++++++++++++++++++-------
 4 files changed, 148 insertions(+), 58 deletions(-)

diff --git a/bob/ip/binseg/engine/predictor.py b/bob/ip/binseg/engine/predictor.py
index a7ab078d..f4fe00d9 100644
--- a/bob/ip/binseg/engine/predictor.py
+++ b/bob/ip/binseg/engine/predictor.py
@@ -14,7 +14,6 @@ import torchvision.transforms.functional as VF
 
 import h5py
 
-from ..utils.summary import summary
 from ..data.utils import overlayed_image
 
 import logging
@@ -182,11 +181,3 @@ def run(model, data_loader, device, output_folder, overlayed_folder):
 
     average_image_time = numpy.sum(numpy.array(times) * len_samples) / float(sum(len_samples))
     logger.info(f"Average image time: {average_image_time:g}s")
-
-    # Save model summary
-    summary_path = os.path.join(output_folder, "model-info.txt")
-    logger.info(f"Saving model summary at {summary_path}...")
-    with open(summary_path, "wt") as f:
-        r, n = summary(model)
-        logger.info(f"Model has {n} parameters...")
-        f.write(r)
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index 47f45253..1bc5bff3 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -4,6 +4,7 @@
 import os
 import csv
 import time
+import shutil
 import datetime
 import distutils.version
 
@@ -11,7 +12,8 @@ import torch
 from tqdm import tqdm
 
 from ..utils.metric import SmoothedValue
-from ..utils.resources import gpu_info, cpu_info
+from ..utils.summary import summary
+from ..utils.resources import cpu_info, gpu_info, cpu_log, gpu_log
 
 import logging
 
@@ -75,31 +77,61 @@ def run(
     start_epoch = arguments["epoch"]
     max_epoch = arguments["max_epoch"]
 
+    if device != "cpu":
+        # asserts we do have a GPU
+        assert bool(gpu_info()), (
+            f"Device set to '{device}', but cannot "
+            f"find a GPU (maybe nvidia-smi is not installed?)"
+        )
+
     if not os.path.exists(output_folder):
         logger.debug(f"Creating output directory '{output_folder}'...")
         os.makedirs(output_folder)
 
-    # Log to file
+    # Save model summary
+    summary_path = os.path.join(output_folder, "model_summary.txt")
+    logger.info(f"Saving model summary at {summary_path}...")
+    with open(summary_path, "wt") as f:
+        r, n = summary(model)
+        logger.info(f"Model has {n} parameters...")
+        f.write(r)
+
+    # write static information to a CSV file
+    static_logfile_name = os.path.join(output_folder, "constants.csv")
+    if os.path.exists(static_logfile_name):
+        backup = static_logfile_name + "~"
+        if os.path.exists(backup):
+            os.unlink(backup)
+        shutil.move(static_logfile_name, backup)
+    with open(static_logfile_name, "w", newline="") as f:
+        logdata = cpu_info()
+        if device != "cpu":
+            logdata += gpu_info()
+        logdata = (("model_size", n),) + logdata
+        logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata])
+        logwriter.writeheader()
+        logwriter.writerow(dict(k for k in logdata))
+
+    # Log continuous information to (another) file
     logfile_name = os.path.join(output_folder, "trainlog.csv")
 
     if arguments["epoch"] == 0 and os.path.exists(logfile_name):
-        logger.info(f"Truncating {logfile_name} - training is restarting...")
-        os.unlink(logfile_name)
+        backup = logfile_name + "~"
+        if os.path.exists(backup):
+            os.unlink(backup)
+        shutil.move(logfile_name, backup)
 
     logfile_fields = (
         "epoch",
-        "total-time",
+        "total_time",
         "eta",
-        "average-loss",
-        "median-loss",
-        "learning-rate",
+        "average_loss",
+        "median_loss",
+        "learning_rate",
     )
-    cpu_data = cpu_info()
-    logfile_fields += tuple([k[0] for k in cpu_data])
-    gpu_data = gpu_info()
-    if gpu_data is not None:  # CUDA is available on this platform
-        logfile_fields += tuple([k[0] for k in gpu_data])
-    gpu_is_available = bool(gpu_data)
+    logfile_fields += tuple([k[0] for k in cpu_log()])
+    if device != "cpu":
+        logfile_fields += tuple([k[0] for k in gpu_log()])
 
     with open(logfile_name, "a+", newline="") as logfile:
         logwriter = csv.DictWriter(logfile, fieldnames=logfile_fields)
@@ -132,7 +164,7 @@ def run(
 
             # progress bar only on interactive jobs
             for samples in tqdm(
-                data_loader, desc="batch", leave=False, disable=None,
+                data_loader, desc="batch", leave=False, disable=None
             ):
 
                 # data forwarding on the existing network
@@ -171,16 +203,16 @@ def run(
             logdata = (
                 ("epoch", f"{epoch}"),
                 (
-                    "total-time",
+                    "total_time",
                     f"{datetime.timedelta(seconds=int(current_time))}",
                 ),
                 ("eta", f"{datetime.timedelta(seconds=int(eta_seconds))}"),
-                ("average-loss", f"{losses.avg:.6f}"),
-                ("median-loss", f"{losses.median:.6f}"),
-                ("learning-rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
-            ) + cpu_info()
-            if gpu_is_available:
-                logdata += gpu_info()
+                ("average_loss", f"{losses.avg:.6f}"),
+                ("median_loss", f"{losses.median:.6f}"),
+                ("learning_rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
+            ) + cpu_log()
+            if device != "cpu":
+                logdata += gpu_log()
 
             logwriter.writerow(dict(k for k in logdata))
             logfile.flush()
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 79eddbdb..d0d25c11 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -107,11 +107,12 @@ def _check_experiment_stare(overlay):
         train_folder = os.path.join(output_folder, "model")
         assert os.path.exists(os.path.join(train_folder, "model_final.pth"))
         assert os.path.exists(os.path.join(train_folder, "last_checkpoint"))
+        assert os.path.exists(os.path.join(train_folder, "constants.csv"))
         assert os.path.exists(os.path.join(train_folder, "trainlog.csv"))
+        assert os.path.exists(os.path.join(train_folder, "model_summary.txt"))
 
         # check predictions are there
         predict_folder = os.path.join(output_folder, "predictions")
-        assert os.path.exists(os.path.join(predict_folder, "model-info.txt"))
         basedir = os.path.join(predict_folder, "stare-images")
         assert os.path.exists(basedir)
         nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 20)
@@ -237,7 +238,9 @@ def _check_train(runner):
 
         assert os.path.exists(os.path.join(output_folder, "model_final.pth"))
         assert os.path.exists(os.path.join(output_folder, "last_checkpoint"))
+        assert os.path.exists(os.path.join(output_folder, "constants.csv"))
         assert os.path.exists(os.path.join(output_folder, "trainlog.csv"))
+        assert os.path.exists(os.path.join(output_folder, "model_summary.txt"))
 
         keywords = {
             r"^Continuing from epoch 0$": 1,
@@ -293,7 +296,6 @@ def _check_predict(runner):
         _assert_exit_0(result)
 
         # check predictions are there
-        assert os.path.exists(os.path.join(output_folder, "model-info.txt"))
         basedir = os.path.join(output_folder, "stare-images")
         assert os.path.exists(basedir)
         nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 10)
diff --git a/bob/ip/binseg/utils/resources.py b/bob/ip/binseg/utils/resources.py
index 8a3c3612..27906fb8 100644
--- a/bob/ip/binseg/utils/resources.py
+++ b/bob/ip/binseg/utils/resources.py
@@ -17,21 +17,29 @@ logger = logging.getLogger(__name__)
 _nvidia_smi = shutil.which("nvidia-smi")
 """Location of the nvidia-smi program, if one exists"""
 
-_nvidia_query = (
+_nvidia_starter_query = (
     # obtain possible values with ``nvidia-smi --help-query-gpu``
     "gpu_name",
+    "driver_version",
     "memory.total",
+)
+"""Query parameters for logging static GPU information"""
+
+_nvidia_log_query = (
+    # obtain possible values with ``nvidia-smi --help-query-gpu``
     "memory.used",
+    "memory.free",
+    "utilization.memory",
     "utilization.gpu",
 )
-"""Query parameters for nvidia-smi"""
+"""Query parameters for logging performance of GPU"""
 
 GB = float(2 ** 30)
 """The number of bytes in a gigabyte"""
 
 
-def gpu_info(query=_nvidia_query):
-    """Returns GPU information using nvidia-smi
+def gpu_info(query=_nvidia_starter_query):
+    """Returns GPU (static) information using nvidia-smi
 
     For a comprehensive list of options and help, execute ``nvidia-smi
     --help-query-gpu`` on a host with a GPU
@@ -61,8 +69,49 @@ def gpu_info(query=_nvidia_query):
             % (_nvidia_smi, ",".join(query))
         )
         values = [k.strip() for k in values.split(",")]
-        regexp = re.compile(r"(\.|_)")
-        fieldnames = [regexp.sub("-", k) for k in query]
+        regexp = re.compile(r"(\.|-)")
+        fieldnames = [regexp.sub("_", k) for k in query]
+        return tuple(zip(fieldnames, values))
+
+
+def gpu_log(query=_nvidia_log_query):
+    """Returns GPU information about current non-static status using nvidia-smi
+
+    For a comprehensive list of options and help, execute ``nvidia-smi
+    --help-query-gpu`` on a host with a GPU
+
+
+    Parameters
+    ----------
+
+    query : list
+        A list of query strings as defined by ``nvidia-smi --help-query-gpu``
+
+
+    Returns
+    -------
+
+    data : tuple
+        An ordered dictionary (organized as 2-tuples) containing the queried
+        parameters.  If ``nvidia-smi`` is not available, returns ``None``.
+        Dots and dashes in the original NVIDIA naming convention are replaced
+        by underscores.  Percentage values are reported without their ``%``
+        suffix; memory amounts are converted to gigabytes.
+
+    """
+
+    if _nvidia_smi is not None:
+        values = subprocess.getoutput(
+            "%s --query-gpu=%s --format=csv,noheader"
+            % (_nvidia_smi, ",".join(query))
+        )
+        values = [k.strip() for k in values.split(",")]
+        # strip "%" suffixes and convert "MiB" amounts to gigabytes
+        values = [float(k[:-3].strip()) / 1024 if k.endswith("MiB")
+                  else (k[:-1].strip() if k.endswith("%") else k)
+                  for k in values]
+        regexp = re.compile(r"(\.|-)")
+        fieldnames = [regexp.sub("_", k) for k in query]
         return tuple(zip(fieldnames, values))
 
 
@@ -71,6 +120,28 @@ _CLUSTER = []
 
 
 def cpu_info():
+    """Returns static CPU information about the current system.
+
+
+    Returns
+    -------
+
+    data : tuple
+        An ordered dictionary (organized as 2-tuples) containing these entries:
+
+        0. ``cpu_memory_total`` (:py:class:`float`): total memory available,
+           in gigabytes
+        1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available
+
+    """
+
+    return (
+        ("cpu_memory_total", psutil.virtual_memory().total / GB),
+        ("cpu_count", psutil.cpu_count(logical=True)),
+    )
+
+
+def cpu_log():
     """Returns process (+child) information using ``psutil``.
 
-    This call examines the current process plus any spawn child and returns the
+    This call examines the current process plus any spawned children and returns the
@@ -83,23 +154,19 @@ def cpu_info():
     data : tuple
         An ordered dictionary (organized as 2-tuples) containing these entries:
 
-        0. ``system-memory-total`` (:py:class:`float`): total memory available,
-           in gigabytes
-        1. ``system-memory-used`` (:py:class:`float`): total memory used from
+        0. ``cpu_memory_used`` (:py:class:`float`): total memory used from
            the system, in gigabytes
-        2. ``system-cpu-count`` (:py:class:`int`): number of logical CPUs
-           available
-        3. ``rss`` (:py:class:`float`):  RAM currently used by
+        1. ``cpu_rss`` (:py:class:`float`):  RAM currently used by
            process and children, in gigabytes
-        3. ``vms`` (:py:class:`float`):  total memory (RAM + swap) currently
+        2. ``cpu_vms`` (:py:class:`float`):  total memory (RAM + swap) currently
            used by process and children, in gigabytes
-        4. ``cpu-percent`` (:py:class:`float`): percentage of the total CPU
+        3. ``cpu_percent`` (:py:class:`float`): percentage of the total CPU
            used by this process and children (recursively) since last call
            (first time called should be ignored).  This number depends on the
            number of CPUs in the system and can be greater than 100%
-        5. ``processes`` (:py:class:`int`): total number of processes including
-           self and children (recursively)
-        6. ``open-files`` (:py:class:`int`): total number of open files by
+        4. ``cpu_processes`` (:py:class:`int`): total number of processes
+           including self and children (recursively)
+        5. ``cpu_open_files`` (:py:class:`int`): total number of open files by
            self and children
 
     """
@@ -124,12 +191,10 @@ def cpu_info():
     memory_info = [k.memory_info() for k in _CLUSTER]
 
     return (
-        ("system-memory-total", psutil.virtual_memory().total / GB),
-        ("system-memory-used", psutil.virtual_memory().used / GB),
-        ("system-cpu-count", psutil.cpu_count(logical=True)),
-        ("rss", sum([k.rss for k in memory_info]) / GB),
-        ("vms", sum([k.vms for k in memory_info]) / GB),
-        ("cpu-percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
-        ("processes", len(_CLUSTER)),
-        ("open-files", sum(len(k.open_files()) for k in _CLUSTER)),
+        ("cpu_memory_used", psutil.virtual_memory().used / GB),
+        ("cpu_rss", sum([k.rss for k in memory_info]) / GB),
+        ("cpu_vms", sum([k.vms for k in memory_info]) / GB),
+        ("cpu_percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
+        ("cpu_processes", len(_CLUSTER)),
+        ("cpu_open_files", sum(len(k.open_files()) for k in _CLUSTER)),
     )
-- 
GitLab