From c6b0ad723d3e7990469fa2c644fe907384409cd3 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.anjos@idiap.ch>
Date: Mon, 27 Apr 2020 12:35:11 +0200
Subject: [PATCH] [utils.resources] Fix gpu logging

---
 bob/ip/binseg/engine/trainer.py  |  10 +--
 bob/ip/binseg/utils/resources.py | 134 ++++++++++++++++++------------
 2 files changed, 82 insertions(+), 62 deletions(-)

diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index 1bc5bff3..775470ac 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -13,7 +13,7 @@ from tqdm import tqdm
 
 from ..utils.metric import SmoothedValue
 from ..utils.summary import summary
-from ..utils.resources import cpu_info, gpu_info, cpu_log, gpu_log
+from ..utils.resources import cpu_constants, gpu_constants, cpu_log, gpu_log
 
 import logging
 
@@ -79,7 +79,7 @@ def run(
 
     if device != "cpu":
         # asserts we do have a GPU
-        assert bool(gpu_info()), (
+        assert bool(gpu_constants()), (
             f"Device set to '{device}', but cannot "
             f"find a GPU (maybe nvidia-smi is not installed?)"
         )
@@ -104,10 +104,10 @@ def run(
             os.unlink(backup)
         shutil.move(static_logfile_name, backup)
     with open(static_logfile_name, "w", newline="") as f:
-        logdata = cpu_info()
+        logdata = cpu_constants()
         if device != "cpu":
-            logdata += gpu_info()
-        logdata = (("model_size", n),) + logdata
+            logdata += gpu_constants()
+        logdata += (("model_size", n),)
         logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata])
         logwriter.writeheader()
         logwriter.writerow(dict(k for k in logdata))
diff --git a/bob/ip/binseg/utils/resources.py b/bob/ip/binseg/utils/resources.py
index 27906fb8..ea64657c 100644
--- a/bob/ip/binseg/utils/resources.py
+++ b/bob/ip/binseg/utils/resources.py
@@ -4,7 +4,6 @@
 """Tools for interacting with the running computer or GPU"""
 
 import os
-import re
 import subprocess
 import shutil
 
@@ -17,29 +16,13 @@ logger = logging.getLogger(__name__)
 
 _nvidia_smi = shutil.which("nvidia-smi")
 """Location of the nvidia-smi program, if one exists"""
 
-_nvidia_starter_query = (
-    # obtain possible values with ``nvidia-smi --help-query-gpu``
-    "gpu_name",
-    "driver_version",
-    "memory.total",
-)
-"""Query parameters for logging static GPU information"""
-
-_nvidia_log_query = (
-    # obtain possible values with ``nvidia-smi --help-query-gpu``
-    "memory.used",
-    "memory.free",
-    "utilization.memory",
-    "utilization.gpu",
-)
-"""Query parameters for logging performance of GPU"""
 
 GB = float(2 ** 30)
 """The number of bytes in a gigabyte"""
 
 
-def gpu_info(query=_nvidia_starter_query):
-    """Returns GPU (static) information using nvidia-smi
+def run_nvidia_smi(query, rename=None):
+    """Returns GPU information from query
 
     For a comprehensive list of options and help, execute ``nvidia-smi
     --help-query-gpu`` on a host with a GPU
@@ -51,75 +34,112 @@ def gpu_info(query=_nvidia_starter_query):
 
     Parameters
    ----------
 
     query : list
         A list of query strings as defined by ``nvidia-smi --help-query-gpu``
 
+    rename : :py:class:`list`, Optional
+        A list of keys to yield in the return value for each entry above.  It
+        gives you the opportunity to rewrite some key names for convenience.
+        This list, if provided, must be of the same length as ``query``.
+
     Returns
     -------
 
-    data : tuple
+    data : :py:class:`tuple`, None
         An ordered dictionary (organized as 2-tuples) containing the queried
-        parameters.  If ``nvidia-smi`` is not available, returns a list of
-        ``None`` objects.  Dots and underscores in the original NVIDIA naming
-        convention are normalized with dashes.
+        parameters (``rename`` versions).
+        If ``nvidia-smi`` is not available, returns ``None``.  Percentage
+        information is left alone, memory information is transformed to
+        gigabytes (floating-point).
 
-    """
+    """
+
     if _nvidia_smi is not None:
+
+        if rename is None:
+            rename = query
+        else:
+            assert len(rename) == len(query)
+
         values = subprocess.getoutput(
             "%s --query-gpu=%s --format=csv,noheader"
             % (_nvidia_smi, ",".join(query))
         )
         values = [k.strip() for k in values.split(",")]
-        regexp = re.compile(r"(\.|-)")
-        fieldnames = [regexp.sub("_", k) for k in query]
-        return tuple(zip(fieldnames, values))
+        t_values = []
+        for k in values:
+            if k.endswith("%"):
+                t_values.append(float(k[:-1].strip()))
+            elif k.endswith("MiB"):
+                t_values.append(float(k[:-3].strip()) / 1024)  # MiB -> gigabytes
+            else:
+                t_values.append(k)  # unchanged
+        return tuple(zip(rename, t_values))
 
 
-def gpu_log(query=_nvidia_log_query):
-    """Returns GPU information about current non-static status using nvidia-smi
+def gpu_constants():
+    """Returns GPU (static) information using nvidia-smi
 
-    For a comprehensive list of options and help, execute ``nvidia-smi
-    --help-query-gpu`` on a host with a GPU
+    See :py:func:`run_nvidia_smi` for operational details.
 
+    Returns
+    -------
 
-    Parameters
-    ----------
+    data : :py:class:`tuple`, None
+        If ``nvidia-smi`` is not available, returns ``None``; otherwise,
+        returns an ordered dictionary (organized as 2-tuples) containing the
+        following ``nvidia-smi`` query information:
 
-    query : list
-        A list of query strings as defined by ``nvidia-smi --help-query-gpu``
+    * ``gpu_name``, as ``gpu_name`` (:py:class:`str`)
+    * ``driver_version``, as ``gpu_driver_version`` (:py:class:`str`)
+    * ``memory.total``, as ``gpu_memory_total`` (transformed to gigabytes,
+      :py:class:`float`)
+
+    """
+
+    return run_nvidia_smi(
+        ("gpu_name", "driver_version", "memory.total"),
+        ("gpu_name", "gpu_driver_version", "gpu_memory_total"),
+    )
+
+
+def gpu_log():
+    """Returns GPU information about current non-static status using nvidia-smi
 
+    See :py:func:`run_nvidia_smi` for operational details.
 
     Returns
     -------
 
-    data : tuple
-        An ordered dictionary (organized as 2-tuples) containing the queried
-        parameters.  If ``nvidia-smi`` is not available, returns a list of
-        ``None`` objects.  Dots and underscores in the original NVIDIA naming
-        convention are normalized with dashes.  Percentage information is left
-        alone, memory information is transformed in to gigabytes.
+    data : :py:class:`tuple`, None
+        If ``nvidia-smi`` is not available, returns ``None``; otherwise,
+        returns an ordered dictionary (organized as 2-tuples) containing the
+        following ``nvidia-smi`` query information:
 
-    """
+    * ``memory.used``, as ``gpu_memory_used`` (transformed to gigabytes,
+      :py:class:`float`)
+    * ``memory.free``, as ``gpu_memory_free`` (transformed to gigabytes,
+      :py:class:`float`)
+    * ``utilization.memory``, as ``gpu_memory_percent``
+      (:py:class:`float`, in percent)
+    * ``utilization.gpu``, as ``gpu_percent``
+      (:py:class:`float`, in percent)
 
-    if _nvidia_smi is not None:
-        values = subprocess.getoutput(
-            "%s --query-gpu=%s --format=csv,noheader"
-            % (_nvidia_smi, ",".join(query))
-        )
-        values = [k.strip() for k in values.split(",")]
-        t_values = []
-        for k in values:
-            if k.endswith('%'): t_values.append(k[:-1].strip())
-            elif k.endswith('MiB'): t_values.append(float(k[:-3].strip())/1024)
-        regexp = re.compile(r"(\.|-)")
-        fieldnames = [regexp.sub("_", k) for k in query]
-        return tuple(zip(fieldnames, values))
+    """
+
+    return run_nvidia_smi(
+        ("memory.used", "memory.free", "utilization.memory", "utilization.gpu"),
+        (
+            "gpu_memory_used",
+            "gpu_memory_free",
+            "gpu_memory_percent",
+            "gpu_percent",
+        ),
+    )
 
 
 _CLUSTER = []
 """List of processes currently being monitored"""
 
 
-def cpu_info():
+def cpu_constants():
     """Returns static CPU information about the current system.
 
@@ -172,7 +192,7 @@ def cpu_log():
     """
 
     global _CLUSTER
-    if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()): #initialization
+    if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()):  # initialization
         this = psutil.Process()
         _CLUSTER = [this] + this.children(recursive=True)
         # touch cpu_percent() at least once for all
-- 
GitLab
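
A minimal usage sketch of the renamed helpers, mirroring the ``trainer.py``
changes above.  This is not part of the patch; it assumes the patched
``bob.ip.binseg.utils.resources`` module is importable and that ``cpu_log()``
also yields a sequence of 2-tuples, as ``cpu_constants()`` does.  Actual
values depend on the local GPU and driver:

    from bob.ip.binseg.utils.resources import (
        cpu_constants,
        cpu_log,
        gpu_constants,
        gpu_log,
    )

    # static (one-shot) values: a tuple of (name, value) pairs, so sequences
    # can be concatenated and fed to dict() for CSV headers, as trainer.py does
    logdata = cpu_constants()
    gpu_data = gpu_constants()  # None when nvidia-smi is not installed
    if gpu_data is not None:
        logdata += gpu_data
    print(dict(logdata))  # e.g. gpu_name, gpu_driver_version, gpu_memory_total

    # dynamic values, to be sampled at every logging interval
    sample = cpu_log()
    gpu_sample = gpu_log()  # None when nvidia-smi is not installed
    if gpu_sample is not None:
        sample += gpu_sample
    print(dict(sample))  # e.g. gpu_memory_used, gpu_memory_percent, gpu_percent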