From 6ad254bf1cc7fe1261eb062d8e8bd44986cbb2e5 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.anjos@idiap.ch>
Date: Mon, 27 Apr 2020 11:10:15 +0200
Subject: [PATCH] [engine.*] Move model summarization to training; Separate
 static information from dynamic information logging

---
 bob/ip/binseg/engine/predictor.py |   9 ---
 bob/ip/binseg/engine/trainer.py   |  76 ++++++++++++++------
 bob/ip/binseg/test/test_cli.py    |   6 +-
 bob/ip/binseg/utils/resources.py  | 115 +++++++++++++++++++++++-------
 4 files changed, 148 insertions(+), 58 deletions(-)

diff --git a/bob/ip/binseg/engine/predictor.py b/bob/ip/binseg/engine/predictor.py
index a7ab078d..f4fe00d9 100644
--- a/bob/ip/binseg/engine/predictor.py
+++ b/bob/ip/binseg/engine/predictor.py
@@ -14,7 +14,6 @@ import torchvision.transforms.functional as VF
 
 import h5py
 
-from ..utils.summary import summary
 from ..data.utils import overlayed_image
 
 import logging
@@ -182,11 +181,3 @@ def run(model, data_loader, device, output_folder, overlayed_folder):
     average_image_time = numpy.sum(numpy.array(times) * len_samples) / float(
         sum(len_samples))
     logger.info(f"Average image time: {average_image_time:g}s")
-
-    # Save model summary
-    summary_path = os.path.join(output_folder, "model-info.txt")
-    logger.info(f"Saving model summary at {summary_path}...")
-    with open(summary_path, "wt") as f:
-        r, n = summary(model)
-        logger.info(f"Model has {n} parameters...")
-        f.write(r)
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index 47f45253..1bc5bff3 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -4,6 +4,7 @@
 import os
 import csv
 import time
+import shutil
 import datetime
 import distutils.version
 
@@ -11,7 +12,8 @@ import torch
 from tqdm import tqdm
 
 from ..utils.metric import SmoothedValue
-from ..utils.resources import gpu_info, cpu_info
+from ..utils.summary import summary
+from ..utils.resources import cpu_info, gpu_info, cpu_log, gpu_log
 
 import logging
 
@@ -75,31 +77,61 @@
     start_epoch = arguments["epoch"]
     max_epoch = arguments["max_epoch"]
 
+    if device != "cpu":
+        # asserts we do have a GPU
+        assert bool(gpu_info()), (
+            f"Device set to '{device}', but cannot "
+            f"find a GPU (maybe nvidia-smi is not installed?)"
+        )
+
     if not os.path.exists(output_folder):
         logger.debug(f"Creating output directory '{output_folder}'...")
         os.makedirs(output_folder)
 
-    # Log to file
+    # Save model summary
+    summary_path = os.path.join(output_folder, "model_summary.txt")
+    logger.info(f"Saving model summary at {summary_path}...")
+    with open(summary_path, "wt") as f:
+        r, n = summary(model)
+        logger.info(f"Model has {n} parameters...")
+        f.write(r)
+
+    # Write static information to a CSV file
+    static_logfile_name = os.path.join(output_folder, "constants.csv")
+    if os.path.exists(static_logfile_name):
+        backup = static_logfile_name + "~"
+        if os.path.exists(backup):
+            os.unlink(backup)
+        shutil.move(static_logfile_name, backup)
+    with open(static_logfile_name, "w", newline="") as f:
+        logdata = cpu_info()
+        if device != "cpu":
+            logdata += gpu_info()
+        logdata = (("model_size", n),) + logdata
+        logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata])
+        logwriter.writeheader()
+        logwriter.writerow(dict(k for k in logdata))
+
+    # Log continuous information to (another) file
     logfile_name = os.path.join(output_folder, "trainlog.csv")
 
     if arguments["epoch"] == 0 and os.path.exists(logfile_name):
-        logger.info(f"Truncating {logfile_name} - training is restarting...")
-        os.unlink(logfile_name)
+        backup = logfile_name + "~"
+        if os.path.exists(backup):
+            os.unlink(backup)
+        shutil.move(logfile_name, backup)
 
     logfile_fields = (
         "epoch",
-        "total-time",
+        "total_time",
         "eta",
-        "average-loss",
-        "median-loss",
-        "learning-rate",
+        "average_loss",
+        "median_loss",
+        "learning_rate",
     )
-    cpu_data = cpu_info()
-    logfile_fields += tuple([k[0] for k in cpu_data])
-    gpu_data = gpu_info()
-    if gpu_data is not None:  # CUDA is available on this platform
-        logfile_fields += tuple([k[0] for k in gpu_data])
-    gpu_is_available = bool(gpu_data)
+    logfile_fields += tuple([k[0] for k in cpu_log()])
+    if device != "cpu":
+        logfile_fields += tuple([k[0] for k in gpu_log()])
 
     with open(logfile_name, "a+", newline="") as logfile:
         logwriter = csv.DictWriter(logfile, fieldnames=logfile_fields)
@@ -132,7 +164,7 @@ def run(
 
             # progress bar only on interactive jobs
             for samples in tqdm(
-                data_loader, desc="batch", leave=False, disable=None,
+                data_loader, desc="batch", leave=False, disable=None
             ):
 
                 # data forwarding on the existing network
@@ -171,16 +203,16 @@ def run(
             logdata = (
                 ("epoch", f"{epoch}"),
                 (
-                    "total-time",
+                    "total_time",
                     f"{datetime.timedelta(seconds=int(current_time))}",
                 ),
                 ("eta", f"{datetime.timedelta(seconds=int(eta_seconds))}"),
-                ("average-loss", f"{losses.avg:.6f}"),
-                ("median-loss", f"{losses.median:.6f}"),
-                ("learning-rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
-            ) + cpu_info()
-            if gpu_is_available:
-                logdata += gpu_info()
+                ("average_loss", f"{losses.avg:.6f}"),
+                ("median_loss", f"{losses.median:.6f}"),
+                ("learning_rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
+            ) + cpu_log()
+            if device != "cpu":
+                logdata += gpu_log()
             logwriter.writerow(dict(k for k in logdata))
             logfile.flush()
 
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 79eddbdb..d0d25c11 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -107,11 +107,12 @@ def _check_experiment_stare(overlay):
         train_folder = os.path.join(output_folder, "model")
         assert os.path.exists(os.path.join(train_folder, "model_final.pth"))
         assert os.path.exists(os.path.join(train_folder, "last_checkpoint"))
+        assert os.path.exists(os.path.join(train_folder, "constants.csv"))
         assert os.path.exists(os.path.join(train_folder, "trainlog.csv"))
+        assert os.path.exists(os.path.join(train_folder, "model_summary.txt"))
 
         # check predictions are there
         predict_folder = os.path.join(output_folder, "predictions")
-        assert os.path.exists(os.path.join(predict_folder, "model-info.txt"))
         basedir = os.path.join(predict_folder, "stare-images")
         assert os.path.exists(basedir)
         nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 20)
@@ -237,7 +238,9 @@ def _check_train(runner):
 
         assert os.path.exists(os.path.join(output_folder, "model_final.pth"))
         assert os.path.exists(os.path.join(output_folder, "last_checkpoint"))
+        assert os.path.exists(os.path.join(output_folder, "constants.csv"))
         assert os.path.exists(os.path.join(output_folder, "trainlog.csv"))
+        assert os.path.exists(os.path.join(output_folder, "model_summary.txt"))
 
         keywords = {
             r"^Continuing from epoch 0$": 1,
@@ -293,7 +296,6 @@ def _check_predict(runner):
         _assert_exit_0(result)
 
         # check predictions are there
-        assert os.path.exists(os.path.join(output_folder, "model-info.txt"))
         basedir = os.path.join(output_folder, "stare-images")
         assert os.path.exists(basedir)
         nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 10)
diff --git a/bob/ip/binseg/utils/resources.py b/bob/ip/binseg/utils/resources.py
index 8a3c3612..27906fb8 100644
--- a/bob/ip/binseg/utils/resources.py
+++ b/bob/ip/binseg/utils/resources.py
@@ -17,21 +17,29 @@ logger = logging.getLogger(__name__)
 _nvidia_smi = shutil.which("nvidia-smi")
 """Location of the nvidia-smi program, if one exists"""
 
-_nvidia_query = (
+_nvidia_starter_query = (
     # obtain possible values with ``nvidia-smi --help-query-gpu``
     "gpu_name",
+    "driver_version",
    "memory.total",
+)
+"""Query parameters for logging static GPU information"""
+
+_nvidia_log_query = (
+    # obtain possible values with ``nvidia-smi --help-query-gpu``
     "memory.used",
+    "memory.free",
+    "utilization.memory",
     "utilization.gpu",
 )
-"""Query parameters for nvidia-smi"""
+"""Query parameters for logging GPU performance"""
 
 GB = float(2 ** 30)
 """The number of bytes in a gigabyte"""
 
 
-def gpu_info(query=_nvidia_query):
-    """Returns GPU information using nvidia-smi
+def gpu_info(query=_nvidia_starter_query):
+    """Returns GPU (static) information using nvidia-smi
 
     For a comprehensive list of options and help, execute ``nvidia-smi
     --help-query-gpu`` on a host with a GPU
@@ -61,8 +69,49 @@
             % (_nvidia_smi, ",".join(query))
         )
         values = [k.strip() for k in values.split(",")]
-        regexp = re.compile(r"(\.|_)")
-        fieldnames = [regexp.sub("-", k) for k in query]
+        regexp = re.compile(r"(\.|-)")
+        fieldnames = [regexp.sub("_", k) for k in query]
+        return tuple(zip(fieldnames, values))
+
+
+def gpu_log(query=_nvidia_log_query):
+    """Returns GPU (non-static) status information using nvidia-smi
+
+    For a comprehensive list of options and help, execute ``nvidia-smi
+    --help-query-gpu`` on a host with a GPU
+
+
+    Parameters
+    ----------
+
+    query : list
+        A list of query strings as defined by ``nvidia-smi --help-query-gpu``
+
+
+    Returns
+    -------
+
+    data : tuple
+        An ordered dictionary (organized as 2-tuples) containing the queried
+        parameters.  If ``nvidia-smi`` is not available, returns ``None``.
+        Dots and dashes in the original NVIDIA naming convention are replaced
+        with underscores.  Percentage information is left alone, memory
+        information is transformed into gigabytes.
+
+    """
+
+    if _nvidia_smi is not None:
+        values = subprocess.getoutput(
+            "%s --query-gpu=%s --format=csv,noheader"
+            % (_nvidia_smi, ",".join(query))
+        )
+        values = [k.strip() for k in values.split(",")]
+        # normalize values: strip "%" suffixes, convert MiB figures to gigabytes
+        for i, k in enumerate(values):
+            if k.endswith("%"): values[i] = k[:-1].strip()
+            elif k.endswith("MiB"): values[i] = float(k[:-3].strip()) / 1024
+        regexp = re.compile(r"(\.|-)")
+        fieldnames = [regexp.sub("_", k) for k in query]
         return tuple(zip(fieldnames, values))
 
 
@@ -71,6 +120,28 @@ _CLUSTER = []
 
 
 def cpu_info():
+    """Returns static CPU information about the current system.
+
+
+    Returns
+    -------
+
+    data : tuple
+        An ordered dictionary (organized as 2-tuples) containing these entries:
+
+        0. ``cpu_memory_total`` (:py:class:`float`): total memory available,
+           in gigabytes
+        1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available
+
+    """
+
+    return (
+        ("cpu_memory_total", psutil.virtual_memory().total / GB),
+        ("cpu_count", psutil.cpu_count(logical=True)),
+    )
+
+
+def cpu_log():
     """Returns process (+child) information using ``psutil``.
 
     This call examines the current process plus any spawn child and returns the
@@ -83,23 +154,19 @@
     data : tuple
         An ordered dictionary (organized as 2-tuples) containing these entries:
 
-        0. ``system-memory-total`` (:py:class:`float`): total memory available,
-           in gigabytes
-        1. ``system-memory-used`` (:py:class:`float`): total memory used from
+        0. ``cpu_memory_used`` (:py:class:`float`): total memory used from
            the system, in gigabytes
-        2. ``system-cpu-count`` (:py:class:`int`): number of logical CPUs
-           available
-        3. ``rss`` (:py:class:`float`): RAM currently used by
+        1. ``cpu_rss`` (:py:class:`float`): RAM currently used by
           process and children, in gigabytes
-        3. ``vms`` (:py:class:`float`): total memory (RAM + swap) currently
+        2. ``cpu_vms`` (:py:class:`float`): total memory (RAM + swap) currently
           used by process and children, in gigabytes
-        4. ``cpu-percent`` (:py:class:`float`): percentage of the total CPU
+        3. ``cpu_percent`` (:py:class:`float`): percentage of the total CPU
           used by this process and children (recursively) since last call
           (first time called should be ignored).  This number depends on the
           number of CPUs in the system and can be greater than 100%
-        5. ``processes`` (:py:class:`int`): total number of processes including
-           self and children (recursively)
-        6. ``open-files`` (:py:class:`int`): total number of open files by
+        4. ``cpu_processes`` (:py:class:`int`): total number of processes
+           including self and children (recursively)
+        5. ``cpu_open_files`` (:py:class:`int`): total number of open files by
           self and children
 
     """
@@ -124,12 +191,10 @@
     memory_info = [k.memory_info() for k in _CLUSTER]
 
     return (
-        ("system-memory-total", psutil.virtual_memory().total / GB),
-        ("system-memory-used", psutil.virtual_memory().used / GB),
-        ("system-cpu-count", psutil.cpu_count(logical=True)),
-        ("rss", sum([k.rss for k in memory_info]) / GB),
-        ("vms", sum([k.vms for k in memory_info]) / GB),
-        ("cpu-percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
-        ("processes", len(_CLUSTER)),
-        ("open-files", sum(len(k.open_files()) for k in _CLUSTER)),
+        ("cpu_memory_used", psutil.virtual_memory().used / GB),
+        ("cpu_rss", sum([k.rss for k in memory_info]) / GB),
+        ("cpu_vms", sum([k.vms for k in memory_info]) / GB),
+        ("cpu_percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
+        ("cpu_processes", len(_CLUSTER)),
+        ("cpu_open_files", sum(len(k.open_files()) for k in _CLUSTER)),
     )
--
GitLab
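
After this change, the training output folder contains two CSV files: `constants.csv` (one row of static information from `cpu_info()`/`gpu_info()` plus the model size) and `trainlog.csv` (one row per epoch from `cpu_log()`/`gpu_log()`). The minimal sketch below, which is not part of the patch and uses a hypothetical output folder path, shows how those files can be read back with the standard library, using the renamed column names introduced above:

```python
# Illustrative only -- not part of the patch.  Reads back the two CSV files
# the trainer now writes, assuming the column names introduced above.
import csv
import os

output_folder = "results/model"  # hypothetical training output folder

# "constants.csv" holds a single row of static information (model_size,
# cpu_memory_total, cpu_count and, when a GPU was used, gpu_name, etc.)
with open(os.path.join(output_folder, "constants.csv"), newline="") as f:
    constants = next(csv.DictReader(f))
print("model parameters:", constants["model_size"])

# "trainlog.csv" holds one row per epoch with the dynamic quantities
with open(os.path.join(output_folder, "trainlog.csv"), newline="") as f:
    for row in csv.DictReader(f):
        print(row["epoch"], row["average_loss"], row["cpu_memory_used"])
```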