Skip to content
Snippets Groups Projects
Commit 356aaa90 authored by André Anjos's avatar André Anjos :speech_balloon:
Browse files

[engine.trainer] Save more informative constants when using an MPS compute engine

parent dc86f848
No related branches found
No related tags found
No related merge requests found
......@@ -14,7 +14,12 @@ import lightning.pytorch.loggers
import torch.nn
from ..utils.checkpointer import CHECKPOINT_ALIASES
from ..utils.resources import ResourceMonitor, cpu_constants, cuda_constants
from ..utils.resources import (
ResourceMonitor,
cpu_constants,
cuda_constants,
mps_constants,
)
from .callbacks import LoggingCallback
from .device import DeviceManager, SupportedPytorchDevice
......@@ -96,11 +101,13 @@ def static_information_to_csv(
if results is not None:
logdata.update(results)
case "mps":
pass
results = mps_constants()
if results is not None:
logdata.update(results)
case _:
pass
logdata["model_size"] = model_size
logdata["number-of-model-parameters"] = model_size
logwriter = csv.DictWriter(f, fieldnames=logdata.keys())
logwriter.writeheader()
logwriter.writerow(logdata)
......
......@@ -77,6 +77,73 @@ def run_nvidia_smi(
return retval
def run_powermetrics(
time_window_ms: int = 500, key: str | None = None
) -> dict[str, typing.Any] | None:
"""Returns GPU information from the system.
For a comprehensive list of options and help, execute ``man powermetrics``
on a Mac computer with Apple silicon.
Parameters
----------
time_window_ms
The amount of time, in milliseconds, to collect usage information on
the GPU.
key
If specified returns only a sub-key of the dictionary.
Returns
-------
data
A dictionary containing the GPU information.
"""
cmd = [
"sudo",
"-n",
"/usr/bin/powermetrics",
"--samplers",
"gpu_power",
f"-i{time_window_ms}",
"-n1",
"-fplist",
]
try:
raw_bytes = subprocess.check_output(cmd)
data = plistlib.loads(raw_bytes)
return data[key] if key is not None else data
except subprocess.CalledProcessError:
import inspect
import warnings
warnings.warn(
inspect.cleandoc(
f"""Cannot run `sudo powermetrics` without a password. Probably,
you did not setup sudo to execute the macOS CLI application
`powermetrics` passwordlessly and therefore this warning is
being issued. This does not affect the running of your model
training, only the ability of the resource monitor of gathering
GPU usage information on macOS while using the MPS compute
backend. To configure this properly and get rid of this
warning, execute `sudo visudo` and add the following line where
suitable: `yourusername ALL=(ALL) NOPASSWD:SETENV:
/usr/bin/powermetrics`. Replace `yourusername` by your actual
username on the machine. Test the setup running the command
`{' '.join(cmd)}` by hand."""
)
)
return None
def cuda_constants() -> dict[str, str | int | float] | None:
"""Returns GPU (static) information using nvidia-smi.
......@@ -85,7 +152,6 @@ def cuda_constants() -> dict[str, str | int | float] | None:
Returns
-------
data : :py:class:`tuple`, None
If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
return a dictionary containing the following ``nvidia-smi`` query
information, in this order:
......@@ -105,6 +171,37 @@ def cuda_constants() -> dict[str, str | int | float] | None:
return retval
def mps_constants() -> dict[str, str | int | float] | None:
"""Returns GPU (static) information using `/usr/bin/powermetrics`.
Returns
-------
data : :py:class:`tuple`, None
If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
return a dictionary containing the following ``nvidia-smi`` query
information, in this order:
* ``gpu_name``, as ``gpu_name`` (:py:class:`str`)
* ``driver_version``, as ``gpu_driver_version`` (:py:class:`str`)
* ``memory.total``, as ``gpu_memory_total`` (transformed to gigabytes,
:py:class:`float`)
"""
raw_bytes = subprocess.check_output(
["/usr/sbin/system_profiler", "-xml", "SPDisplaysDataType"]
)
data = plistlib.loads(raw_bytes)
name = data[0]["_items"][0]["_name"]
no_gpu_cores = int(data[0]["_items"][0]["sppci_cores"])
return {
"apple-processor-model": name,
"number-cpu-cores": multiprocessing.cpu_count(),
"number-gpu-cores": no_gpu_cores,
}
def cuda_log() -> dict[str, float] | None:
"""Returns GPU information about current non-static status using nvidia-
smi.
......@@ -114,7 +211,6 @@ def cuda_log() -> dict[str, float] | None:
Returns
-------
data
If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
return a dictionary containing the following ``nvidia-smi`` query
information, in this order:
......@@ -153,7 +249,6 @@ def mps_log() -> dict[str, float] | None:
Returns
-------
data
If ``sudo powermetrics`` is not executable (or is not configured for
passwordless execution), returns ``None``, otherwise, we return a
dictionary containing the following query information, in this order:
......@@ -163,47 +258,15 @@ def mps_log() -> dict[str, float] | None:
(:py:class:`float`, in percent)
"""
cmd = [
"sudo",
"-n",
"/usr/bin/powermetrics",
"--samplers",
"gpu_power",
"-i500",
"-n1",
"-fplist",
]
result = run_powermetrics(500, key="gpu")
try:
raw_bytes = subprocess.check_output(cmd)
data = plistlib.loads(raw_bytes)
return {
"frequency-MHz/gpu": float(data["gpu"]["freq_hz"]),
"percent-usage/gpu": 100 * (1 - data["gpu"]["idle_ratio"]),
}
except subprocess.CalledProcessError:
import inspect
import warnings
warnings.warn(
inspect.cleandoc(
f"""Cannot run `sudo powermetrics` without a password. Probably,
you did not setup sudo to execute the macOS CLI application
`powermetrics` passwordlessly and therefore this warning is
being issued. This does not affect the running of your model
training, only the ability of the resource monitor of gathering
GPU usage information on macOS while using the MPS compute
backend. To configure this properly and get rid of this
warning, execute `sudo visudo` and add the following line where
suitable: `yourusername ALL=(ALL) NOPASSWD:SETENV:
/usr/bin/powermetrics`. Replace `yourusername` by your actual
username on the machine. Test the setup running the command
`{' '.join(cmd)}` by hand."""
)
)
if result is None:
return result
return None
return {
"frequency-MHz/gpu": float(result["freq_hz"]),
"percent-usage/gpu": 100 * (1 - result["idle_ratio"]),
}
def cpu_constants() -> dict[str, int | float]:
......@@ -212,8 +275,7 @@ def cpu_constants() -> dict[str, int | float]:
Returns
-------
data
An ordered dictionary (organized as 2-tuples) containing these entries:
A dictionary containing these entries:
0. ``cpu_memory_total`` (:py:class:`float`): total memory available,
in gigabytes
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment