Skip to content
Snippets Groups Projects
Commit c6b0ad72 authored by André Anjos's avatar André Anjos :speech_balloon:
Browse files

[utils.resources] Fix gpu logging

parent 6ad254bf
No related branches found
No related tags found
1 merge request!12Streamlining
Pipeline #39251 failed
...@@ -13,7 +13,7 @@ from tqdm import tqdm ...@@ -13,7 +13,7 @@ from tqdm import tqdm
from ..utils.metric import SmoothedValue from ..utils.metric import SmoothedValue
from ..utils.summary import summary from ..utils.summary import summary
from ..utils.resources import cpu_info, gpu_info, cpu_log, gpu_log from ..utils.resources import cpu_constants, gpu_constants, cpu_log, gpu_log
import logging import logging
...@@ -79,7 +79,7 @@ def run( ...@@ -79,7 +79,7 @@ def run(
if device != "cpu": if device != "cpu":
# asserts we do have a GPU # asserts we do have a GPU
assert bool(gpu_info()), ( assert bool(gpu_constants()), (
f"Device set to '{device}', but cannot " f"Device set to '{device}', but cannot "
f"find a GPU (maybe nvidia-smi is not installed?)" f"find a GPU (maybe nvidia-smi is not installed?)"
) )
...@@ -104,10 +104,10 @@ def run( ...@@ -104,10 +104,10 @@ def run(
os.unlink(backup) os.unlink(backup)
shutil.move(static_logfile_name, backup) shutil.move(static_logfile_name, backup)
with open(static_logfile_name, "w", newline="") as f: with open(static_logfile_name, "w", newline="") as f:
logdata = cpu_info() logdata = cpu_constants()
if device != "cpu": if device != "cpu":
logdata += gpu_info() logdata += gpu_constants()
logdata = (("model_size", n),) + logdata logdata += (("model_size", n),)
logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata]) logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata])
logwriter.writeheader() logwriter.writeheader()
logwriter.writerow(dict(k for k in logdata)) logwriter.writerow(dict(k for k in logdata))
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
"""Tools for interacting with the running computer or GPU""" """Tools for interacting with the running computer or GPU"""
import os import os
import re
import subprocess import subprocess
import shutil import shutil
...@@ -17,29 +16,13 @@ logger = logging.getLogger(__name__) ...@@ -17,29 +16,13 @@ logger = logging.getLogger(__name__)
_nvidia_smi = shutil.which("nvidia-smi") _nvidia_smi = shutil.which("nvidia-smi")
"""Location of the nvidia-smi program, if one exists""" """Location of the nvidia-smi program, if one exists"""
_nvidia_starter_query = (
# obtain possible values with ``nvidia-smi --help-query-gpu``
"gpu_name",
"driver_version",
"memory.total",
)
"""Query parameters for logging static GPU information"""
_nvidia_log_query = (
# obtain possible values with ``nvidia-smi --help-query-gpu``
"memory.used",
"memory.free",
"utilization.memory",
"utilization.gpu",
)
"""Query parameters for logging performance of GPU"""
GB = float(2 ** 30) GB = float(2 ** 30)
"""The number of bytes in a gigabyte""" """The number of bytes in a gigabyte"""
def _convert_nvidia_smi_value(text):
    """Converts a single ``nvidia-smi`` CSV field to its logged form

    Fields ending in ``%`` are returned as floats (percent sign removed),
    fields ending in ``MiB`` are divided by 1024 and returned as floats
    (gigabytes).  Any other field is returned unchanged.
    """
    if text.endswith("%"):
        return float(text[:-1].strip())
    if text.endswith("MiB"):
        return float(text[:-3].strip()) / 1024
    return text  # unchanged


def run_nvidia_smi(query, rename=None):
    """Returns GPU information from query

    For a comprehensive list of options and help, execute ``nvidia-smi
    --help-query-gpu`` on a host with a GPU


    Parameters
    ----------

    query : list
        A list of query strings as defined by ``nvidia-smi --help-query-gpu``

    rename : :py:class:`list`, Optional
        A list of keys to yield in the return value for each entry above.  It
        gives you the opportunity to rewrite some key names for convenience.
        This list, if provided, must be of the same length as ``query``.


    Returns
    -------

    data : :py:class:`tuple`, None
        An ordered dictionary (organized as 2-tuples) containing the queried
        parameters (``rename`` versions).  If ``nvidia-smi`` is not available,
        cannot be executed, exits with an error or prints nothing, returns
        ``None``.  Percentage information is left alone, memory information
        is transformed to gigabytes (floating-point).  When ``nvidia-smi``
        reports several GPUs (one output line each), only the first line is
        used.


    Raises
    ------

    ValueError
        If ``rename`` is provided and its length differs from ``query``

    """

    if _nvidia_smi is None:
        return None

    if rename is None:
        rename = query
    elif len(rename) != len(query):
        # explicit check instead of ``assert``: it survives ``python -O``
        raise ValueError(
            "`rename' has %d entries, but `query' has %d"
            % (len(rename), len(query))
        )

    # an argument list (no shell) avoids quoting problems, and lets us look
    # at the exit status instead of parsing an error message as if it was data
    try:
        completed = subprocess.run(
            [
                _nvidia_smi,
                "--query-gpu=%s" % ",".join(query),
                "--format=csv,noheader",
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            universal_newlines=True,
        )
    except OSError:
        # the program disappeared or is not executable anymore
        return None

    lines = [k for k in completed.stdout.splitlines() if k.strip()]
    if completed.returncode != 0 or not lines:
        return None

    # one line per GPU: splitting the whole output on commas would glue the
    # last value of a GPU to the first value of the next one
    values = [k.strip() for k in lines[0].split(",")]
    return tuple(zip(rename, [_convert_nvidia_smi_value(k) for k in values]))
def gpu_constants():
    """Returns GPU (static) information using nvidia-smi

    This is a thin wrapper around :py:func:`run_nvidia_smi`, which is called
    with a fixed set of queries and the key names to use for each of them.


    Returns
    -------

    data : :py:class:`tuple`, None
        Whatever :py:func:`run_nvidia_smi` returns for the queries below
        (``None`` if ``nvidia-smi`` is not available), i.e., an ordered
        dictionary (organized as 2-tuples) with the following entries:

        * ``gpu_name``, as ``gpu_name`` (:py:class:`str`)
        * ``driver_version``, as ``gpu_driver_version`` (:py:class:`str`)
        * ``memory.total``, as ``gpu_memory_total`` (transformed to gigabytes,
          :py:class:`float`)

    """

    nvidia_fields = ("gpu_name", "driver_version", "memory.total")
    output_keys = ("gpu_name", "gpu_driver_version", "gpu_memory_total")
    return run_nvidia_smi(nvidia_fields, output_keys)
def gpu_log():
    """Returns GPU information about current non-static status using nvidia-smi

    See :py:func:`run_nvidia_smi` for operational details.


    Returns
    -------

    data : :py:class:`tuple`, None
        If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
        return an ordered dictionary (organized as 2-tuples) containing the
        following ``nvidia-smi`` query information:

        * ``memory.used``, as ``gpu_memory_used`` (transformed to gigabytes,
          :py:class:`float`)
        * ``memory.free``, as ``gpu_memory_free`` (transformed to gigabytes,
          :py:class:`float`)
        * ``utilization.memory``, as ``gpu_memory_percent``,
          (:py:class:`float`, in percent)
        * ``utilization.gpu``, as ``gpu_percent``,
          (:py:class:`float`, in percent)

    """

    return run_nvidia_smi(
        ("memory.used", "memory.free", "utilization.memory", "utilization.gpu"),
        (
            "gpu_memory_used",
            "gpu_memory_free",
            "gpu_memory_percent",
            # key actually emitted for ``utilization.gpu``; the documentation
            # above must name this key, as log consumers rely on it
            "gpu_percent",
        ),
    )
_CLUSTER = [] _CLUSTER = []
"""List of processes currently being monitored""" """List of processes currently being monitored"""
def cpu_info(): def cpu_constants():
"""Returns static CPU information about the current system. """Returns static CPU information about the current system.
...@@ -172,7 +192,7 @@ def cpu_log(): ...@@ -172,7 +192,7 @@ def cpu_log():
""" """
global _CLUSTER global _CLUSTER
if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()): #initialization if (not _CLUSTER) or (_CLUSTER[0] != psutil.Process()): # initialization
this = psutil.Process() this = psutil.Process()
_CLUSTER = [this] + this.children(recursive=True) _CLUSTER = [this] + this.children(recursive=True)
# touch cpu_percent() at least once for all # touch cpu_percent() at least once for all
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment