From 356aaa90bd859dde5b1cd7da6cc6eac8d2f29cf4 Mon Sep 17 00:00:00 2001 From: Andre Anjos <andre.dos.anjos@gmail.com> Date: Wed, 24 Jan 2024 17:17:06 +0100 Subject: [PATCH] [engine.trainer] Save more informative constants when using an MPS compute engine --- src/mednet/engine/trainer.py | 13 ++- src/mednet/utils/resources.py | 150 ++++++++++++++++++++++++---------- 2 files changed, 116 insertions(+), 47 deletions(-) diff --git a/src/mednet/engine/trainer.py b/src/mednet/engine/trainer.py index 1b9bfb0b..d4b39c4b 100644 --- a/src/mednet/engine/trainer.py +++ b/src/mednet/engine/trainer.py @@ -14,7 +14,12 @@ import lightning.pytorch.loggers import torch.nn from ..utils.checkpointer import CHECKPOINT_ALIASES -from ..utils.resources import ResourceMonitor, cpu_constants, cuda_constants +from ..utils.resources import ( + ResourceMonitor, + cpu_constants, + cuda_constants, + mps_constants, +) from .callbacks import LoggingCallback from .device import DeviceManager, SupportedPytorchDevice @@ -96,11 +101,13 @@ def static_information_to_csv( if results is not None: logdata.update(results) case "mps": - pass + results = mps_constants() + if results is not None: + logdata.update(results) case _: pass - logdata["model_size"] = model_size + logdata["number-of-model-parameters"] = model_size logwriter = csv.DictWriter(f, fieldnames=logdata.keys()) logwriter.writeheader() logwriter.writerow(logdata) diff --git a/src/mednet/utils/resources.py b/src/mednet/utils/resources.py index 7babefde..01995cac 100644 --- a/src/mednet/utils/resources.py +++ b/src/mednet/utils/resources.py @@ -77,6 +77,73 @@ def run_nvidia_smi( return retval +def run_powermetrics( + time_window_ms: int = 500, key: str | None = None +) -> dict[str, typing.Any] | None: + """Returns GPU information from the system. + + For a comprehensive list of options and help, execute ``man powermetrics`` + on a Mac computer with Apple silicon. 
+ + + Parameters + ---------- + + time_window_ms + The amount of time, in milliseconds, to collect usage information on + the GPU. + + key + If specified, returns only that sub-key of the dictionary. + + + Returns + ------- + + data + The GPU information as a dictionary, or ``None`` if the command fails. + """ + + cmd = [ + "sudo", + "-n", + "/usr/bin/powermetrics", + "--samplers", + "gpu_power", + f"-i{time_window_ms}", + "-n1", + "-fplist", + ] + + try: + raw_bytes = subprocess.check_output(cmd) + data = plistlib.loads(raw_bytes) + return data[key] if key is not None else data + + except subprocess.CalledProcessError: + import inspect + import warnings + + warnings.warn( + inspect.cleandoc( + f"""Cannot run `sudo powermetrics` without a password. Probably, + you did not setup sudo to execute the macOS CLI application + `powermetrics` passwordlessly and therefore this warning is + being issued. This does not affect the running of your model + training, only the ability of the resource monitor of gathering + GPU usage information on macOS while using the MPS compute + backend. To configure this properly and get rid of this + warning, execute `sudo visudo` and add the following line where + suitable: `yourusername ALL=(ALL) NOPASSWD:SETENV: + /usr/bin/powermetrics`. Replace `yourusername` by your actual + username on the machine. Test the setup running the command + `{' '.join(cmd)}` by hand.""" + ) + ) + + return None + + def cuda_constants() -> dict[str, str | int | float] | None: """Returns GPU (static) information using nvidia-smi. 
@@ -85,7 +152,6 @@ def cuda_constants() -> dict[str, str | int | float] | None: Returns ------- - data : :py:class:`tuple`, None If ``nvidia-smi`` is not available, returns ``None``, otherwise, we return a dictionary containing the following ``nvidia-smi`` query information, in this order: @@ -105,6 +171,37 @@ def cuda_constants() -> dict[str, str | int | float] | None: return retval +def mps_constants() -> dict[str, str | int | float] | None: + """Returns GPU (static) information using `/usr/sbin/system_profiler`. + + Returns + ------- + + data : :py:class:`dict` + A dictionary built from the ``SPDisplaysDataType`` report of + ``system_profiler``, containing the following entries, in this + order: + + * ``apple-processor-model``: the ``_name`` entry of the report + * ``number-cpu-cores``: :py:func:`multiprocessing.cpu_count` + * ``number-gpu-cores``: the ``sppci_cores`` entry of the report, + as :py:class:`int` + """ + + raw_bytes = subprocess.check_output( + ["/usr/sbin/system_profiler", "-xml", "SPDisplaysDataType"] + ) + data = plistlib.loads(raw_bytes) + name = data[0]["_items"][0]["_name"] + no_gpu_cores = int(data[0]["_items"][0]["sppci_cores"]) + + return { + "apple-processor-model": name, + "number-cpu-cores": multiprocessing.cpu_count(), + "number-gpu-cores": no_gpu_cores, + } + + def cuda_log() -> dict[str, float] | None: """Returns GPU information about current non-static status using nvidia- smi. 
@@ -114,7 +211,6 @@ def cuda_log() -> dict[str, float] | None: Returns ------- - data If ``nvidia-smi`` is not available, returns ``None``, otherwise, we return a dictionary containing the following ``nvidia-smi`` query information, in this order: @@ -153,7 +249,6 @@ def mps_log() -> dict[str, float] | None: Returns ------- - data If ``sudo powermetrics`` is not executable (or is not configured for passwordless execution), returns ``None``, otherwise, we return a dictionary containing the following query information, in this order: @@ -163,47 +258,15 @@ def mps_log() -> dict[str, float] | None: (:py:class:`float`, in percent) """ - cmd = [ - "sudo", - "-n", - "/usr/bin/powermetrics", - "--samplers", - "gpu_power", - "-i500", - "-n1", - "-fplist", - ] + result = run_powermetrics(500, key="gpu") - try: - raw_bytes = subprocess.check_output(cmd) - data = plistlib.loads(raw_bytes) - return { - "frequency-MHz/gpu": float(data["gpu"]["freq_hz"]), - "percent-usage/gpu": 100 * (1 - data["gpu"]["idle_ratio"]), - } - - except subprocess.CalledProcessError: - import inspect - import warnings - - warnings.warn( - inspect.cleandoc( - f"""Cannot run `sudo powermetrics` without a password. Probably, - you did not setup sudo to execute the macOS CLI application - `powermetrics` passwordlessly and therefore this warning is - being issued. This does not affect the running of your model - training, only the ability of the resource monitor of gathering - GPU usage information on macOS while using the MPS compute - backend. To configure this properly and get rid of this - warning, execute `sudo visudo` and add the following line where - suitable: `yourusername ALL=(ALL) NOPASSWD:SETENV: - /usr/bin/powermetrics`. Replace `yourusername` by your actual - username on the machine. 
Test the setup running the command - `{' '.join(cmd)}` by hand.""" - ) - ) + if result is None: + return result - return None + return { + "frequency-MHz/gpu": float(result["freq_hz"]), + "percent-usage/gpu": 100 * (1 - result["idle_ratio"]), + } def cpu_constants() -> dict[str, int | float]: @@ -212,8 +275,7 @@ def cpu_constants() -> dict[str, int | float]: Returns ------- - data - An ordered dictionary (organized as 2-tuples) containing these entries: + A dictionary containing these entries: 0. ``cpu_memory_total`` (:py:class:`float`): total memory available, in gigabytes -- GitLab