Commit 64fda27e authored by André Anjos

[engine.callbacks] Fix tensorboard logging; Remove CSV logging (closes #10)

parent 4fc45f2a
1 merge request: !6 Making use of LightningDataModule and simplification of data loading
@@ -139,18 +139,18 @@ class LoggingCallback(lightning.pytorch.Callback):
         # We disconsider accumulate_grad_batches and assume they were all of
         # the same size. This way, the average of averages is the overall
         # average.
-        self._to_log["train_loss"] = torch.mean(
+        self._to_log["loss/train"] = torch.mean(
             torch.tensor(self._training_epoch_loss[0])
             * torch.tensor(self._training_epoch_loss[1])
         ).item()
-        self._to_log["train_epoch_time"] = epoch_time
-        self._to_log["learning_rate"] = pl_module.optimizers().defaults["lr"]
+        self._to_log["epoch-duration-seconds/train"] = epoch_time
+        self._to_log["learning-rate"] = pl_module.optimizers().defaults["lr"]
         metrics = self._resource_monitor.data
         if metrics is not None:
             for metric_name, metric_value in metrics.items():
-                self._to_log[f"train_{metric_name}"] = float(metric_value)
+                self._to_log[f"{metric_name}/train"] = float(metric_value)
         else:
             logger.warning(
                 "Unable to fetch monitoring information from "
@@ -261,12 +261,12 @@ class LoggingCallback(lightning.pytorch.Callback):
         self._resource_monitor.checkpoint()
         epoch_time = time.time() - self._start_validation_epoch_time
-        self._to_log["validation_epoch_time"] = epoch_time
+        self._to_log["epoch-duration-seconds/validation"] = epoch_time
         metrics = self._resource_monitor.data
         if metrics is not None:
             for metric_name, metric_value in metrics.items():
-                self._to_log[f"validation_{metric_name}"] = float(metric_value)
+                self._to_log[f"{metric_name}/validation"] = float(metric_value)
         else:
             logger.warning(
                 "Unable to fetch monitoring information from "
@@ -280,9 +280,9 @@ class LoggingCallback(lightning.pytorch.Callback):
         # overall average.
         for key in sorted(self._validation_epoch_loss.keys()):
             if key == 0:
-                name = "validation_loss"
+                name = "loss/validation"
             else:
-                name = f"validation_loss_{key}"
+                name = f"loss/validation-{key}"
             self._to_log[name] = torch.mean(
                 torch.tensor(self._validation_epoch_loss[key][0])
@@ -365,16 +365,20 @@ class LoggingCallback(lightning.pytorch.Callback):
         # Note: logging should happen at on_validation_end(), but
         # apparently you can't log from there
         overall_cycle_time = time.time() - self._start_training_epoch_time
-        self._to_log["train_cycle_time"] = overall_cycle_time
-        self._to_log["total_time"] = time.time() - self._start_training_time
-        self._to_log["eta"] = overall_cycle_time * (
+        self._to_log["cycle-time-seconds/train"] = overall_cycle_time
+        self._to_log["total-execution-time-seconds"] = (
+            time.time() - self._start_training_time
+        )
+        self._to_log["eta-seconds"] = overall_cycle_time * (
             trainer.max_epochs - trainer.current_epoch  # type: ignore
         )
         # Do not log during sanity check as results are not relevant
         if not trainer.sanity_checking:
             for k in sorted(self._to_log.keys()):
-                pl_module.log(k, self._to_log[k])
+                pl_module.log_dict(
+                    {k: self._to_log[k], "step": float(trainer.current_epoch)}
+                )
             self._to_log = {}
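The slash-separated metric names are what make the TensorBoard dashboard readable: TensorBoard groups scalar tags into sections by the prefix before the first "/", so "loss/train" and "loss/validation" appear as two curves under a single "loss" section. As I understand Lightning's logger connector, the extra metric literally named "step" passed to log_dict() is used as the x-axis value for the other entries, so curves are indexed by epoch rather than by optimizer step. A minimal sketch, independent of this repository, showing the grouping effect directly with torch's SummaryWriter (the log directory is hypothetical):

# Sketch only: tags sharing a prefix before the first "/" land in the same
# section of TensorBoard's scalars dashboard.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("results/logs/version_0")  # hypothetical path
for epoch in range(3):
    writer.add_scalar("loss/train", 0.9 / (epoch + 1), global_step=epoch)
    writer.add_scalar("loss/validation", 1.1 / (epoch + 1), global_step=epoch)
    writer.add_scalar("epoch-duration-seconds/train", 42.0, global_step=epoch)
writer.close()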
@@ -156,9 +156,15 @@ def run(
     # Save model summary
     _, no_of_parameters = save_model_summary(output_folder, model)
-    csv_logger = lightning.pytorch.loggers.CSVLogger(output_folder, "logs_csv")
+    log_dir = "logs"
     tensorboard_logger = lightning.pytorch.loggers.TensorBoardLogger(
-        output_folder, "logs_tensorboard"
+        output_folder,
+        log_dir,
     )
+    logger.info(
+        f"Monitor experiment with `tensorboard serve "
+        f"--logdir={output_folder}/{log_dir}/version_*/`. "
+        f"Then, open a browser on the printed address."
+    )
     resource_monitor = ResourceMonitor(
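The version_*/ glob in the logged hint comes from how Lightning's TensorBoardLogger lays out its output: given a save directory and a name, each run writes into a fresh version_N sub-directory. A small sketch, assuming output_folder is "results":

# Sketch only: TensorBoardLogger("results", "logs") writes event files under
# results/logs/version_0/, results/logs/version_1/, ... (one per run), which
# is what `tensorboard serve --logdir=results/logs/version_*/` points at.
from lightning.pytorch.loggers import TensorBoardLogger

tb = TensorBoardLogger("results", "logs")
print(tb.log_dir)  # e.g. results/logs/version_0 (next free version number)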
@@ -172,7 +178,7 @@ def run(
         output_folder,
         "model_lowest_valid_loss",
         save_last=True,
-        monitor="validation_loss",
+        monitor="loss/validation",
         mode="min",
         save_on_train_epoch_end=True,
         every_n_epochs=checkpoint_period,
@@ -195,7 +201,7 @@ def run(
         devices=devices,
         max_epochs=max_epochs,
         accumulate_grad_batches=batch_chunk_count,
-        logger=[csv_logger, tensorboard_logger],
+        logger=tensorboard_logger,
         check_val_every_n_epoch=1,
         callbacks=[LoggingCallback(resource_monitor), checkpoint_callback],
     )
@@ -98,8 +98,8 @@ def gpu_constants() -> dict[str, str | int | float] | None:
         return retval
     # else, just update with more generic names
-    retval["gpu_driver_version"] = retval.pop("driver_version")
-    retval["gpu_memory_used_GB"] = retval.pop("memory.total")
+    retval["driver-version/gpu"] = retval.pop("driver_version")
+    retval["total-memory-GB/gpu"] = retval.pop("memory.total")
     return retval
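The "driver_version" and "memory.total" fields being renamed here are nvidia-smi query fields. The sketch below shows one way such constants can be collected; it is not necessarily this repository's implementation, and it assumes nvidia-smi is on the PATH and reports memory in MiB:

# Hedged sketch: query static GPU properties with nvidia-smi and expose them
# under the new "<metric>/gpu" naming convention.
import subprocess

output = subprocess.run(
    [
        "nvidia-smi",
        "--query-gpu=driver_version,memory.total",
        "--format=csv,noheader,nounits",
    ],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()
driver_version, memory_total_mib = (v.strip() for v in output.split(","))
constants = {
    "driver-version/gpu": driver_version,
    "total-memory-GB/gpu": float(memory_total_mib) / 1024,  # MiB -> GiB
}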
@@ -135,12 +135,12 @@ def gpu_log() -> dict[str, float] | None:
         return result
     return {
-        "gpu_memory_used_GB": float(result["memory.used"]),
-        "gpu_memory_free_GB": float(result["memory.free"]),
-        "gpu_memory_percent": 100
+        "memory-used-GB/gpu": float(result["memory.used"]),
+        "memory-free-GB/gpu": float(result["memory.free"]),
+        "memory-percent/gpu": 100
         * float(result["memory.used"])
         / float(result["memory.total"]),
-        "gpu_percent": float(result["utilization.gpu"]),
+        "percent-usage/gpu": float(result["utilization.gpu"]),
     }
@@ -158,8 +158,8 @@ def cpu_constants() -> dict[str, int | float]:
     1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available
     """
     return {
-        "cpu_memory_total_GB": psutil.virtual_memory().total / GB,
-        "cpu_count": psutil.cpu_count(logical=True),
+        "memory-total-GB/cpu": psutil.virtual_memory().total / GB,
+        "count/cpu": psutil.cpu_count(logical=True),
     }
@@ -238,12 +238,12 @@ class CPULogger:
                 # at this point, but ensures to update counts later on
                 gone.add(k)
         return {
-            "cpu_memory_used_GB": psutil.virtual_memory().used / GB,
-            "cpu_rss_GB": sum([k.rss for k in memory_info]) / GB,
-            "cpu_vms_GB": sum([k.vms for k in memory_info]) / GB,
-            "cpu_percent": sum(cpu_percent),
-            "cpu_processes": len(self.cluster) - len(gone),
-            "cpu_open_files": sum(open_files),
+            "memory-used-GB/cpu": psutil.virtual_memory().used / GB,
+            "rss-GB/cpu": sum([k.rss for k in memory_info]) / GB,
+            "vms-GB/cpu": sum([k.vms for k in memory_info]) / GB,
+            "percent-usage/cpu": sum(cpu_percent),
+            "num-processes/cpu": len(self.cluster) - len(gone),
+            "num-open-files/cpu": sum(open_files),
         }
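For reference, the sketch below reproduces the kind of psutil aggregation that feeds these keys, assuming the logger tracks the current process and its children; the process list and GB constant are stand-ins, not the repository's exact code:

# Hedged sketch: aggregate memory and CPU statistics over a process tree and
# report them under the new "<metric>/cpu" key names.
import os

import psutil

GB = float(2**30)
procs = [psutil.Process(os.getpid())]
procs += procs[0].children(recursive=True)
memory_info = [p.memory_info() for p in procs]
data = {
    "memory-used-GB/cpu": psutil.virtual_memory().used / GB,
    "rss-GB/cpu": sum(k.rss for k in memory_info) / GB,
    "vms-GB/cpu": sum(k.vms for k in memory_info) / GB,
    "percent-usage/cpu": sum(p.cpu_percent(interval=None) for p in procs),
    "num-processes/cpu": len(procs),
    "num-open-files/cpu": sum(len(p.open_files()) for p in procs),
}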
@@ -342,10 +342,8 @@ def _monitor_worker(
         ra.acc()  # guarantees at least an entry will be available
         if summary_event.is_set():
-            summary = ra.summary().copy()
-            queue.put(summary)
+            queue.put(ra.summary().copy())
             ra.clear()
-            print(queue.get())
             summary_event.clear()
         time.sleep(interval)
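Dropping print(queue.get()) matters because the worker would otherwise consume the very summary the requesting process is waiting for. A self-contained sketch of the event/queue handshake (names and timings are illustrative, not the repository's code):

# Hedged sketch: the worker publishes a summary only when asked, and the
# requester is the one that get()s it; a stray queue.get() inside the worker
# would steal that item and leave the requester hanging.
import multiprocessing
import time

def worker(interval, summary_event, queue):
    samples = []
    while True:
        samples.append(time.time())       # stand-in for ra.acc()
        if summary_event.is_set():
            queue.put(list(samples))      # stand-in for ra.summary().copy()
            samples.clear()               # stand-in for ra.clear()
            summary_event.clear()
        time.sleep(interval)

if __name__ == "__main__":
    event = multiprocessing.Event()
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=worker, args=(0.1, event, q), daemon=True)
    p.start()
    time.sleep(0.5)
    event.set()                # request a summary...
    print(q.get(timeout=2.0))  # ...and consume it here, not in the worker
    p.terminate()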