Skip to content
Snippets Groups Projects
Commit 356aaa90 authored by André Anjos's avatar André Anjos :speech_balloon:
Browse files

[engine.trainer] Save more informative constants when using an MPS compute engine

parent dc86f848
No related branches found
No related tags found
No related merge requests found
...@@ -14,7 +14,12 @@ import lightning.pytorch.loggers ...@@ -14,7 +14,12 @@ import lightning.pytorch.loggers
import torch.nn import torch.nn
from ..utils.checkpointer import CHECKPOINT_ALIASES from ..utils.checkpointer import CHECKPOINT_ALIASES
from ..utils.resources import ResourceMonitor, cpu_constants, cuda_constants from ..utils.resources import (
ResourceMonitor,
cpu_constants,
cuda_constants,
mps_constants,
)
from .callbacks import LoggingCallback from .callbacks import LoggingCallback
from .device import DeviceManager, SupportedPytorchDevice from .device import DeviceManager, SupportedPytorchDevice
...@@ -96,11 +101,13 @@ def static_information_to_csv( ...@@ -96,11 +101,13 @@ def static_information_to_csv(
if results is not None: if results is not None:
logdata.update(results) logdata.update(results)
case "mps": case "mps":
pass results = mps_constants()
if results is not None:
logdata.update(results)
case _: case _:
pass pass
logdata["model_size"] = model_size logdata["number-of-model-parameters"] = model_size
logwriter = csv.DictWriter(f, fieldnames=logdata.keys()) logwriter = csv.DictWriter(f, fieldnames=logdata.keys())
logwriter.writeheader() logwriter.writeheader()
logwriter.writerow(logdata) logwriter.writerow(logdata)
......
...@@ -77,6 +77,73 @@ def run_nvidia_smi( ...@@ -77,6 +77,73 @@ def run_nvidia_smi(
return retval return retval
def run_powermetrics(
    time_window_ms: int = 500, key: str | None = None
) -> dict[str, typing.Any] | None:
    """Sample GPU metrics once through the macOS ``powermetrics`` tool.

    The command ``sudo -n /usr/bin/powermetrics --samplers gpu_power`` is
    executed for a single sample and its property-list output is decoded.
    For a comprehensive list of options and help, execute ``man
    powermetrics`` on a Mac computer with Apple silicon.


    Parameters
    ----------
    time_window_ms
        The amount of time, in milliseconds, to collect usage information on
        the GPU.
    key
        If specified, only the entry stored under this key of the decoded
        dictionary is returned.


    Returns
    -------
        The decoded dictionary (or its ``key`` entry, if ``key`` was given).
        If the command exits with an error, a warning explaining how to
        setup passwordless ``sudo`` is issued and ``None`` is returned.
    """

    cmd = [
        "sudo",
        "-n",
        "/usr/bin/powermetrics",
        "--samplers",
        "gpu_power",
        f"-i{time_window_ms}",
        "-n1",
        "-fplist",
    ]

    try:
        output = subprocess.check_output(cmd)
    except subprocess.CalledProcessError:
        # imported lazily: only needed on this (hopefully rare) failure path
        import inspect
        import warnings

        message = inspect.cleandoc(
            f"""Cannot run `sudo powermetrics` without a password. Probably,
            you did not setup sudo to execute the macOS CLI application
            `powermetrics` passwordlessly and therefore this warning is
            being issued. This does not affect the running of your model
            training, only the ability of the resource monitor of gathering
            GPU usage information on macOS while using the MPS compute
            backend. To configure this properly and get rid of this
            warning, execute `sudo visudo` and add the following line where
            suitable: `yourusername ALL=(ALL) NOPASSWD:SETENV:
            /usr/bin/powermetrics`. Replace `yourusername` by your actual
            username on the machine. Test the setup running the command
            `{' '.join(cmd)}` by hand."""
        )
        warnings.warn(message)
        return None

    report = plistlib.loads(output)
    if key is None:
        return report
    return report[key]
def cuda_constants() -> dict[str, str | int | float] | None: def cuda_constants() -> dict[str, str | int | float] | None:
"""Returns GPU (static) information using nvidia-smi. """Returns GPU (static) information using nvidia-smi.
...@@ -85,7 +152,6 @@ def cuda_constants() -> dict[str, str | int | float] | None: ...@@ -85,7 +152,6 @@ def cuda_constants() -> dict[str, str | int | float] | None:
Returns Returns
------- -------
data : :py:class:`tuple`, None
If ``nvidia-smi`` is not available, returns ``None``, otherwise, we If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
return a dictionary containing the following ``nvidia-smi`` query return a dictionary containing the following ``nvidia-smi`` query
information, in this order: information, in this order:
...@@ -105,6 +171,37 @@ def cuda_constants() -> dict[str, str | int | float] | None: ...@@ -105,6 +171,37 @@ def cuda_constants() -> dict[str, str | int | float] | None:
return retval return retval
def mps_constants() -> dict[str, str | int | float] | None:
"""Returns GPU (static) information using `/usr/bin/powermetrics`.
Returns
-------
data : :py:class:`tuple`, None
If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
return a dictionary containing the following ``nvidia-smi`` query
information, in this order:
* ``gpu_name``, as ``gpu_name`` (:py:class:`str`)
* ``driver_version``, as ``gpu_driver_version`` (:py:class:`str`)
* ``memory.total``, as ``gpu_memory_total`` (transformed to gigabytes,
:py:class:`float`)
"""
raw_bytes = subprocess.check_output(
["/usr/sbin/system_profiler", "-xml", "SPDisplaysDataType"]
)
data = plistlib.loads(raw_bytes)
name = data[0]["_items"][0]["_name"]
no_gpu_cores = int(data[0]["_items"][0]["sppci_cores"])
return {
"apple-processor-model": name,
"number-cpu-cores": multiprocessing.cpu_count(),
"number-gpu-cores": no_gpu_cores,
}
def cuda_log() -> dict[str, float] | None: def cuda_log() -> dict[str, float] | None:
"""Returns GPU information about current non-static status using nvidia- """Returns GPU information about current non-static status using nvidia-
smi. smi.
...@@ -114,7 +211,6 @@ def cuda_log() -> dict[str, float] | None: ...@@ -114,7 +211,6 @@ def cuda_log() -> dict[str, float] | None:
Returns Returns
------- -------
data
If ``nvidia-smi`` is not available, returns ``None``, otherwise, we If ``nvidia-smi`` is not available, returns ``None``, otherwise, we
return a dictionary containing the following ``nvidia-smi`` query return a dictionary containing the following ``nvidia-smi`` query
information, in this order: information, in this order:
...@@ -153,7 +249,6 @@ def mps_log() -> dict[str, float] | None: ...@@ -153,7 +249,6 @@ def mps_log() -> dict[str, float] | None:
Returns Returns
------- -------
data
If ``sudo powermetrics`` is not executable (or is not configured for If ``sudo powermetrics`` is not executable (or is not configured for
passwordless execution), returns ``None``, otherwise, we return a passwordless execution), returns ``None``, otherwise, we return a
dictionary containing the following query information, in this order: dictionary containing the following query information, in this order:
...@@ -163,47 +258,15 @@ def mps_log() -> dict[str, float] | None: ...@@ -163,47 +258,15 @@ def mps_log() -> dict[str, float] | None:
(:py:class:`float`, in percent) (:py:class:`float`, in percent)
""" """
cmd = [ result = run_powermetrics(500, key="gpu")
"sudo",
"-n",
"/usr/bin/powermetrics",
"--samplers",
"gpu_power",
"-i500",
"-n1",
"-fplist",
]
try: if result is None:
raw_bytes = subprocess.check_output(cmd) return result
data = plistlib.loads(raw_bytes)
return {
"frequency-MHz/gpu": float(data["gpu"]["freq_hz"]),
"percent-usage/gpu": 100 * (1 - data["gpu"]["idle_ratio"]),
}
except subprocess.CalledProcessError:
import inspect
import warnings
warnings.warn(
inspect.cleandoc(
f"""Cannot run `sudo powermetrics` without a password. Probably,
you did not setup sudo to execute the macOS CLI application
`powermetrics` passwordlessly and therefore this warning is
being issued. This does not affect the running of your model
training, only the ability of the resource monitor of gathering
GPU usage information on macOS while using the MPS compute
backend. To configure this properly and get rid of this
warning, execute `sudo visudo` and add the following line where
suitable: `yourusername ALL=(ALL) NOPASSWD:SETENV:
/usr/bin/powermetrics`. Replace `yourusername` by your actual
username on the machine. Test the setup running the command
`{' '.join(cmd)}` by hand."""
)
)
return None return {
"frequency-MHz/gpu": float(result["freq_hz"]),
"percent-usage/gpu": 100 * (1 - result["idle_ratio"]),
}
def cpu_constants() -> dict[str, int | float]: def cpu_constants() -> dict[str, int | float]:
...@@ -212,8 +275,7 @@ def cpu_constants() -> dict[str, int | float]: ...@@ -212,8 +275,7 @@ def cpu_constants() -> dict[str, int | float]:
Returns Returns
------- -------
data A dictionary containing these entries:
An ordered dictionary (organized as 2-tuples) containing these entries:
0. ``cpu_memory_total`` (:py:class:`float`): total memory available, 0. ``cpu_memory_total`` (:py:class:`float`): total memory available,
in gigabytes in gigabytes
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment