Commit 6ad254bf authored by André Anjos


[engine.*] Move model summarization to training; Separate static information from dynamic information logging
parent 8a9f2ce4
1 merge request: !12 Streamlining
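In short: static facts (the model size plus CPU/GPU hardware details) are now written once to ``constants.csv`` when training starts, while per-epoch measurements keep being appended to ``trainlog.csv``. Below is a minimal, self-contained sketch of that two-file pattern; file and column names come from the diff further down, all values are purely illustrative, and the actual trainer's header/restart handling is simplified here:

    import csv
    import os

    # Static facts: collected once at start-up and written as a single CSV row.
    constants = (("model_size", 123456), ("cpu_memory_total", 62.8), ("cpu_count", 16))  # illustrative values
    with open("constants.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[k for k, _ in constants])
        writer.writeheader()
        writer.writerow(dict(constants))

    # Dynamic measurements: one CSV row appended at the end of every epoch.
    fields = ("epoch", "total_time", "eta", "average_loss", "median_loss", "learning_rate")
    write_header = not os.path.exists("trainlog.csv")
    with open("trainlog.csv", "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        if write_header:
            writer.writeheader()
        writer.writerow({"epoch": 0, "total_time": "0:01:23", "eta": "1:22:37",
                         "average_loss": 0.532100, "median_loss": 0.528700,
                         "learning_rate": 0.001000})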
@@ -14,7 +14,6 @@ import torchvision.transforms.functional as VF
import h5py
- from ..utils.summary import summary
from ..data.utils import overlayed_image
import logging
@@ -182,11 +181,3 @@ def run(model, data_loader, device, output_folder, overlayed_folder):
average_image_time = numpy.sum(numpy.array(times) * len_samples) / float(sum(len_samples))
logger.info(f"Average image time: {average_image_time:g}s")
- # Save model summary
- summary_path = os.path.join(output_folder, "model-info.txt")
- logger.info(f"Saving model summary at {summary_path}...")
- with open(summary_path, "wt") as f:
- r, n = summary(model)
- logger.info(f"Model has {n} parameters...")
- f.write(r)
@@ -4,6 +4,7 @@
import os
import csv
import time
+ import shutil
import datetime
import distutils.version
@@ -11,7 +12,8 @@ import torch
from tqdm import tqdm
from ..utils.metric import SmoothedValue
- from ..utils.resources import gpu_info, cpu_info
+ from ..utils.summary import summary
+ from ..utils.resources import cpu_info, gpu_info, cpu_log, gpu_log
import logging
@@ -75,31 +77,61 @@ def run(
start_epoch = arguments["epoch"]
max_epoch = arguments["max_epoch"]
if device != "cpu":
# asserts we do have a GPU
assert bool(gpu_info()), (
f"Device set to '{device}', but cannot "
f"find a GPU (maybe nvidia-smi is not installed?)"
)
if not os.path.exists(output_folder):
logger.debug(f"Creating output directory '{output_folder}'...")
os.makedirs(output_folder)
- # Log to file
+ # Save model summary
+ summary_path = os.path.join(output_folder, "model_summary.txt")
+ logger.info(f"Saving model summary at {summary_path}...")
+ with open(summary_path, "wt") as f:
+ r, n = summary(model)
+ logger.info(f"Model has {n} parameters...")
+ f.write(r)
+ # write static information to a CSV file
+ static_logfile_name = os.path.join(output_folder, "constants.csv")
+ if os.path.exists(static_logfile_name):
+ backup = static_logfile_name + "~"
+ if os.path.exists(backup):
+ os.unlink(backup)
+ shutil.move(static_logfile_name, backup)
+ with open(static_logfile_name, "w", newline="") as f:
+ logdata = cpu_info()
+ if device != "cpu":
+ logdata += gpu_info()
+ logdata = (("model_size", n),) + logdata
+ logwriter = csv.DictWriter(f, fieldnames=[k[0] for k in logdata])
+ logwriter.writeheader()
+ logwriter.writerow(dict(k for k in logdata))
+ # Log continuous information to (another) file
logfile_name = os.path.join(output_folder, "trainlog.csv")
if arguments["epoch"] == 0 and os.path.exists(logfile_name):
logger.info(f"Truncating {logfile_name} - training is restarting...")
os.unlink(logfile_name)
backup = logfile_name + "~"
if os.path.exists(backup):
os.unlink(backup)
shutil.move(logfile_name, backup)
logfile_fields = (
"epoch",
"total-time",
"total_time",
"eta",
"average-loss",
"median-loss",
"learning-rate",
"average_loss",
"median_loss",
"learning_rate",
)
cpu_data = cpu_info()
logfile_fields += tuple([k[0] for k in cpu_data])
gpu_data = gpu_info()
if gpu_data is not None: # CUDA is available on this platform
logfile_fields += tuple([k[0] for k in gpu_data])
gpu_is_available = bool(gpu_data)
logfile_fields += tuple([k[0] for k in cpu_log()])
if device != "cpu":
logfile_fields += tuple([k[0] for k in gpu_log()])
with open(logfile_name, "a+", newline="") as logfile:
logwriter = csv.DictWriter(logfile, fieldnames=logfile_fields)
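Both CSV files above reuse the same backup idiom: an existing file is moved aside to a ``~``-suffixed copy before a fresh one is written. A hypothetical helper distilling that pattern (not part of this commit; the name is made up) could look like:

    import os
    import shutil

    def rotate_to_backup(path):
        # If ``path`` exists, keep its contents as ``path~``, replacing any previous backup.
        if os.path.exists(path):
            backup = path + "~"
            if os.path.exists(backup):
                os.unlink(backup)
            shutil.move(path, backup)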
@@ -132,7 +164,7 @@ def run(
# progress bar only on interactive jobs
for samples in tqdm(
data_loader, desc="batch", leave=False, disable=None,
data_loader, desc="batch", leave=False, disable=None
):
# data forwarding on the existing network
@@ -171,16 +203,16 @@ def run(
logdata = (
("epoch", f"{epoch}"),
(
"total-time",
"total_time",
f"{datetime.timedelta(seconds=int(current_time))}",
),
("eta", f"{datetime.timedelta(seconds=int(eta_seconds))}"),
("average-loss", f"{losses.avg:.6f}"),
("median-loss", f"{losses.median:.6f}"),
("learning-rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
) + cpu_info()
if gpu_is_available:
logdata += gpu_info()
("average_loss", f"{losses.avg:.6f}"),
("median_loss", f"{losses.median:.6f}"),
("learning_rate", f"{optimizer.param_groups[0]['lr']:.6f}"),
) + cpu_log()
if device != 'cpu':
logdata += gpu_log()
logwriter.writerow(dict(k for k in logdata))
logfile.flush()
@@ -107,11 +107,12 @@ def _check_experiment_stare(overlay):
train_folder = os.path.join(output_folder, "model")
assert os.path.exists(os.path.join(train_folder, "model_final.pth"))
assert os.path.exists(os.path.join(train_folder, "last_checkpoint"))
+ assert os.path.exists(os.path.join(train_folder, "constants.csv"))
assert os.path.exists(os.path.join(train_folder, "trainlog.csv"))
+ assert os.path.exists(os.path.join(train_folder, "model_summary.txt"))
# check predictions are there
predict_folder = os.path.join(output_folder, "predictions")
- assert os.path.exists(os.path.join(predict_folder, "model-info.txt"))
basedir = os.path.join(predict_folder, "stare-images")
assert os.path.exists(basedir)
nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 20)
@@ -237,7 +238,9 @@ def _check_train(runner):
assert os.path.exists(os.path.join(output_folder, "model_final.pth"))
assert os.path.exists(os.path.join(output_folder, "last_checkpoint"))
+ assert os.path.exists(os.path.join(output_folder, "constants.csv"))
assert os.path.exists(os.path.join(output_folder, "trainlog.csv"))
+ assert os.path.exists(os.path.join(output_folder, "model_summary.txt"))
keywords = {
r"^Continuing from epoch 0$": 1,
@@ -293,7 +296,6 @@ def _check_predict(runner):
_assert_exit_0(result)
# check predictions are there
- assert os.path.exists(os.path.join(output_folder, "model-info.txt"))
basedir = os.path.join(output_folder, "stare-images")
assert os.path.exists(basedir)
nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 10)
@@ -17,21 +17,29 @@ logger = logging.getLogger(__name__)
_nvidia_smi = shutil.which("nvidia-smi")
"""Location of the nvidia-smi program, if one exists"""
- _nvidia_query = (
+ _nvidia_starter_query = (
# obtain possible values with ``nvidia-smi --help-query-gpu``
"gpu_name",
"driver_version",
"memory.total",
+ )
+ """Query parameters for logging static GPU information"""
+ _nvidia_log_query = (
+ # obtain possible values with ``nvidia-smi --help-query-gpu``
"memory.used",
"memory.free",
"utilization.memory",
"utilization.gpu",
)
"""Query parameters for nvidia-smi"""
"""Query parameters for logging performance of GPU"""
GB = float(2 ** 30)
"""The number of bytes in a gigabyte"""
- def gpu_info(query=_nvidia_query):
- """Returns GPU information using nvidia-smi
+ def gpu_info(query=_nvidia_starter_query):
+ """Returns GPU (static) information using nvidia-smi
For a comprehensive list of options and help, execute ``nvidia-smi
--help-query-gpu`` on a host with a GPU
@@ -61,8 +69,49 @@ def gpu_info(query=_nvidia_query):
% (_nvidia_smi, ",".join(query))
)
values = [k.strip() for k in values.split(",")]
- regexp = re.compile(r"(\.|_)")
- fieldnames = [regexp.sub("-", k) for k in query]
+ regexp = re.compile(r"(\.|-)")
+ fieldnames = [regexp.sub("_", k) for k in query]
return tuple(zip(fieldnames, values))
+ def gpu_log(query=_nvidia_log_query):
+ """Returns GPU information about current non-static status using nvidia-smi
+ For a comprehensive list of options and help, execute ``nvidia-smi
+ --help-query-gpu`` on a host with a GPU
+ Parameters
+ ----------
+ query : list
+ A list of query strings as defined by ``nvidia-smi --help-query-gpu``
+ Returns
+ -------
+ data : tuple
+ An ordered dictionary (organized as 2-tuples) containing the queried
+ parameters. If ``nvidia-smi`` is not available, returns ``None``.
+ Dots and dashes in the original NVIDIA naming convention are normalized
+ with underscores. Percentage values are stripped of their ``%`` sign;
+ memory values are converted from MiB to gigabytes.
+ """
+ if _nvidia_smi is not None:
+ values = subprocess.getoutput(
+ "%s --query-gpu=%s --format=csv,noheader"
+ % (_nvidia_smi, ",".join(query))
+ )
+ values = [k.strip() for k in values.split(",")]
+ t_values = []
+ for k in values:
+ if k.endswith("%"):
+ t_values.append(k[:-1].strip())
+ elif k.endswith("MiB"):
+ t_values.append(float(k[:-3].strip()) / 1024)
+ else:
+ t_values.append(k)  # pass through anything else unchanged
+ regexp = re.compile(r"(\.|-)")
+ fieldnames = [regexp.sub("_", k) for k in query]
+ return tuple(zip(fieldnames, t_values))
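A usage sketch for the two GPU helpers, assuming the module above is saved locally as ``resources.py`` (the import path is an assumption, not part of the diff) and that ``nvidia-smi`` is available on the host; the printed keys follow the underscore normalization above and the values are illustrative:

    import resources  # assumption: the module shown above, saved locally as resources.py

    static = resources.gpu_info()      # queried once, e.g. when training starts
    print(dict(static))                # -> {'gpu_name': ..., 'driver_version': ..., 'memory_total': ...}

    for epoch in range(2):
        dynamic = resources.gpu_log()  # queried repeatedly, e.g. once per epoch
        print(epoch, dict(dynamic))    # -> {'memory_used': ..., 'memory_free': ...,
                                       #     'utilization_memory': ..., 'utilization_gpu': ...}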
@@ -71,6 +120,28 @@ _CLUSTER = []
def cpu_info():
"""Returns static CPU information about the current system.
Returns
-------
data : tuple
An ordered dictionary (organized as 2-tuples) containing these entries:
0. ``cpu_memory_total`` (:py:class:`float`): total memory available,
in gigabytes
1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available
"""
return (
("cpu_memory_total", psutil.virtual_memory().total / GB),
("cpu_count", psutil.cpu_count(logical=True)),
)
def cpu_log():
"""Returns process (+child) information using ``psutil``.
This call examines the current process plus any spawn child and returns the
@@ -83,23 +154,19 @@ def cpu_info():
data : tuple
An ordered dictionary (organized as 2-tuples) containing these entries:
- 0. ``system-memory-total`` (:py:class:`float`): total memory available,
- in gigabytes
- 1. ``system-memory-used`` (:py:class:`float`): total memory used from
+ 0. ``cpu_memory_used`` (:py:class:`float`): total memory used from
the system, in gigabytes
- 2. ``system-cpu-count`` (:py:class:`int`): number of logical CPUs
- available
- 3. ``rss`` (:py:class:`float`): RAM currently used by
+ 1. ``cpu_rss`` (:py:class:`float`): RAM currently used by
process and children, in gigabytes
- 3. ``vms`` (:py:class:`float`): total memory (RAM + swap) currently
+ 2. ``cpu_vms`` (:py:class:`float`): total memory (RAM + swap) currently
used by process and children, in gigabytes
- 4. ``cpu-percent`` (:py:class:`float`): percentage of the total CPU
+ 3. ``cpu_percent`` (:py:class:`float`): percentage of the total CPU
used by this process and children (recursively) since last call
(first time called should be ignored). This number depends on the
number of CPUs in the system and can be greater than 100%
- 5. ``processes`` (:py:class:`int`): total number of processes including
- self and children (recursively)
- 6. ``open-files`` (:py:class:`int`): total number of open files by
+ 4. ``cpu_processes`` (:py:class:`int`): total number of processes
+ including self and children (recursively)
+ 5. ``cpu_open_files`` (:py:class:`int`): total number of open files by
self and children
"""
@@ -124,12 +191,10 @@ def cpu_info():
memory_info = [k.memory_info() for k in _CLUSTER]
return (
("system-memory-total", psutil.virtual_memory().total / GB),
("system-memory-used", psutil.virtual_memory().used / GB),
("system-cpu-count", psutil.cpu_count(logical=True)),
("rss", sum([k.rss for k in memory_info]) / GB),
("vms", sum([k.vms for k in memory_info]) / GB),
("cpu-percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
("processes", len(_CLUSTER)),
("open-files", sum(len(k.open_files()) for k in _CLUSTER)),
("cpu_memory_used", psutil.virtual_memory().used / GB),
("cpu_rss", sum([k.rss for k in memory_info]) / GB),
("cpu_vms", sum([k.vms for k in memory_info]) / GB),
("cpu_percent", sum(k.cpu_percent(interval=None) for k in _CLUSTER)),
("cpu_processes", len(_CLUSTER)),
("cpu_open_files", sum(len(k.open_files()) for k in _CLUSTER)),
)
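And the CPU counterpart, under the same ``resources.py`` import assumption (``psutil`` required). As the docstring notes, the first ``cpu_log()`` call only primes the ``cpu_percent`` counters and its value should be discarded:

    import time
    import resources  # assumption: the module shown above, saved locally as resources.py

    print(dict(resources.cpu_info()))   # -> {'cpu_memory_total': ..., 'cpu_count': ...}

    resources.cpu_log()                 # first call: establishes the cpu_percent baseline
    time.sleep(1.0)
    print(dict(resources.cpu_log()))    # -> {'cpu_memory_used': ..., 'cpu_rss': ..., 'cpu_vms': ...,
                                        #     'cpu_percent': ..., 'cpu_processes': ..., 'cpu_open_files': ...}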