diff --git a/src/ptbench/engine/callbacks.py b/src/ptbench/engine/callbacks.py
index 8774f9c45c248414618a24592cab7ee687e74dbf..5adada7be7ce7d7a194b351448a9b3a4dd65c2fe 100644
--- a/src/ptbench/engine/callbacks.py
+++ b/src/ptbench/engine/callbacks.py
@@ -139,18 +139,18 @@ class LoggingCallback(lightning.pytorch.Callback):
         # We disconsider accumulate_grad_batches and assume they were all of
         # the same size. This way, the average of averages is the overall
         # average.
-        self._to_log["train_loss"] = torch.mean(
+        self._to_log["loss/train"] = torch.mean(
             torch.tensor(self._training_epoch_loss[0])
             * torch.tensor(self._training_epoch_loss[1])
         ).item()
-        self._to_log["train_epoch_time"] = epoch_time
-        self._to_log["learning_rate"] = pl_module.optimizers().defaults["lr"]
+        self._to_log["epoch-duration-seconds/train"] = epoch_time
+        self._to_log["learning-rate"] = pl_module.optimizers().defaults["lr"]
 
         metrics = self._resource_monitor.data
         if metrics is not None:
             for metric_name, metric_value in metrics.items():
-                self._to_log[f"train_{metric_name}"] = float(metric_value)
+                self._to_log[f"{metric_name}/train"] = float(metric_value)
         else:
             logger.warning(
                 "Unable to fetch monitoring information from "
@@ -261,12 +261,12 @@ class LoggingCallback(lightning.pytorch.Callback):
         self._resource_monitor.checkpoint()
 
         epoch_time = time.time() - self._start_validation_epoch_time
-        self._to_log["validation_epoch_time"] = epoch_time
+        self._to_log["epoch-duration-seconds/validation"] = epoch_time
 
         metrics = self._resource_monitor.data
         if metrics is not None:
             for metric_name, metric_value in metrics.items():
-                self._to_log[f"validation_{metric_name}"] = float(metric_value)
+                self._to_log[f"{metric_name}/validation"] = float(metric_value)
         else:
             logger.warning(
                 "Unable to fetch monitoring information from "
@@ -280,9 +280,9 @@ class LoggingCallback(lightning.pytorch.Callback):
         # overall average.
         for key in sorted(self._validation_epoch_loss.keys()):
             if key == 0:
-                name = "validation_loss"
+                name = "loss/validation"
             else:
-                name = f"validation_loss_{key}"
+                name = f"loss/validation-{key}"
 
             self._to_log[name] = torch.mean(
                 torch.tensor(self._validation_epoch_loss[key][0])
@@ -365,16 +365,20 @@ class LoggingCallback(lightning.pytorch.Callback):
         # Note: logging should happen at on_validation_end(), but
         # apparently you can't log from there
         overall_cycle_time = time.time() - self._start_training_epoch_time
-        self._to_log["train_cycle_time"] = overall_cycle_time
-        self._to_log["total_time"] = time.time() - self._start_training_time
-        self._to_log["eta"] = overall_cycle_time * (
+        self._to_log["cycle-time-seconds/train"] = overall_cycle_time
+        self._to_log["total-execution-time-seconds"] = (
+            time.time() - self._start_training_time
+        )
+        self._to_log["eta-seconds"] = overall_cycle_time * (
             trainer.max_epochs - trainer.current_epoch  # type: ignore
         )
 
         # Do not log during sanity check as results are not relevant
         if not trainer.sanity_checking:
             for k in sorted(self._to_log.keys()):
-                pl_module.log(k, self._to_log[k])
+                pl_module.log_dict(
+                    {k: self._to_log[k], "step": float(trainer.current_epoch)}
+                )
 
             self._to_log = {}
 
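Reviewer note on the two logging conventions the renames above rely on: TensorBoard groups scalar charts into collapsible sections keyed on the leading path component of each tag, so "loss/train" and "loss/validation" now render side by side under a single "loss" section instead of being scattered alphabetically. Separately, Lightning treats a "step" entry inside a dictionary passed to log_dict() specially, removing it and using it as the x-axis value, which is what pins these per-epoch aggregates to the epoch number rather than the global batch counter. A minimal sketch of that second mechanism (the module and metric value are illustrative, not part of this patch):

```python
import lightning.pytorch


class IllustrativeModule(lightning.pytorch.LightningModule):
    def on_train_epoch_end(self) -> None:
        # "step" is removed from the dict by the logger and used as the
        # x-axis, so this point is plotted at x = current_epoch instead of
        # the global batch counter.
        self.log_dict({"loss/train": 0.42, "step": float(self.current_epoch)})
```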
diff --git a/src/ptbench/engine/trainer.py b/src/ptbench/engine/trainer.py
index 052bb885549087bf64c02e5be05ee1debaabaaca..0236aca773cbe21f40e7f997408c0406ae9cdd28 100644
--- a/src/ptbench/engine/trainer.py
+++ b/src/ptbench/engine/trainer.py
@@ -156,9 +156,15 @@ def run(
     # Save model summary
     _, no_of_parameters = save_model_summary(output_folder, model)
 
-    csv_logger = lightning.pytorch.loggers.CSVLogger(output_folder, "logs_csv")
+    log_dir = "logs"
     tensorboard_logger = lightning.pytorch.loggers.TensorBoardLogger(
-        output_folder, "logs_tensorboard"
+        output_folder,
+        log_dir,
+    )
+    logger.info(
+        f"Monitor experiment with `tensorboard serve "
+        f"--logdir={output_folder}/{log_dir}/version_*/`. "
+        f"Then, open a browser on the printed address."
     )
 
     resource_monitor = ResourceMonitor(
@@ -172,7 +178,7 @@ def run(
         output_folder,
         "model_lowest_valid_loss",
         save_last=True,
-        monitor="validation_loss",
+        monitor="loss/validation",
         mode="min",
         save_on_train_epoch_end=True,
         every_n_epochs=checkpoint_period,
@@ -195,7 +201,7 @@ def run(
         devices=devices,
         max_epochs=max_epochs,
         accumulate_grad_batches=batch_chunk_count,
-        logger=[csv_logger, tensorboard_logger],
+        logger=tensorboard_logger,
         check_val_every_n_epoch=1,
         callbacks=[LoggingCallback(resource_monitor), checkpoint_callback],
     )
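One coupling to keep in mind when reviewing: ModelCheckpoint resolves its monitor argument against the exact keys produced by LoggingCallback, so the "validation_loss" -> "loss/validation" rename has to land in both files in the same commit, or checkpoint selection silently stops working. A hedged sketch of the constraint (the dirpath is illustrative):

```python
import lightning.pytorch

# `monitor` must match, byte for byte, a key logged via log_dict() by
# LoggingCallback; "loss/validation" mirrors the rename in callbacks.py.
checkpoint = lightning.pytorch.callbacks.ModelCheckpoint(
    dirpath="results",  # illustrative output location
    monitor="loss/validation",
    mode="min",  # keep the checkpoint with the lowest validation loss
    save_last=True,
)
```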
diff --git a/src/ptbench/utils/resources.py b/src/ptbench/utils/resources.py
index bda05f203bdf80c73d92fa72df0c7f637244028f..d6ed02196ca73286ac19520e6f327b8d4ebf8021 100644
--- a/src/ptbench/utils/resources.py
+++ b/src/ptbench/utils/resources.py
@@ -98,8 +98,8 @@ def gpu_constants() -> dict[str, str | int | float] | None:
         return retval
 
     # else, just update with more generic names
-    retval["gpu_driver_version"] = retval.pop("driver_version")
-    retval["gpu_memory_used_GB"] = retval.pop("memory.total")
+    retval["driver-version/gpu"] = retval.pop("driver_version")
+    retval["total-memory-GB/gpu"] = retval.pop("memory.total")
 
     return retval
 
@@ -135,12 +135,12 @@ def gpu_log() -> dict[str, float] | None:
         return result
 
     return {
-        "gpu_memory_used_GB": float(result["memory.used"]),
-        "gpu_memory_free_GB": float(result["memory.free"]),
-        "gpu_memory_percent": 100
+        "memory-used-GB/gpu": float(result["memory.used"]),
+        "memory-free-GB/gpu": float(result["memory.free"]),
+        "memory-percent/gpu": 100
         * float(result["memory.used"])
         / float(result["memory.total"]),
-        "gpu_percent": float(result["utilization.gpu"]),
+        "percent-usage/gpu": float(result["utilization.gpu"]),
     }
 
 
@@ -158,8 +158,8 @@ def cpu_constants() -> dict[str, int | float]:
    1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available
    """
    return {
-        "cpu_memory_total_GB": psutil.virtual_memory().total / GB,
-        "cpu_count": psutil.cpu_count(logical=True),
+        "memory-total-GB/cpu": psutil.virtual_memory().total / GB,
+        "count/cpu": psutil.cpu_count(logical=True),
    }
 
 
@@ -238,12 +238,12 @@ class CPULogger:
                 # at this point, but ensures to update counts later on
                 gone.add(k)
 
         return {
-            "cpu_memory_used_GB": psutil.virtual_memory().used / GB,
-            "cpu_rss_GB": sum([k.rss for k in memory_info]) / GB,
-            "cpu_vms_GB": sum([k.vms for k in memory_info]) / GB,
-            "cpu_percent": sum(cpu_percent),
-            "cpu_processes": len(self.cluster) - len(gone),
-            "cpu_open_files": sum(open_files),
+            "memory-used-GB/cpu": psutil.virtual_memory().used / GB,
+            "rss-GB/cpu": sum([k.rss for k in memory_info]) / GB,
+            "vms-GB/cpu": sum([k.vms for k in memory_info]) / GB,
+            "percent-usage/cpu": sum(cpu_percent),
+            "num-processes/cpu": len(self.cluster) - len(gone),
+            "num-open-files/cpu": sum(open_files),
         }
 
@@ -342,10 +342,8 @@ def _monitor_worker(
             ra.acc()  # guarantees at least an entry will be available
 
             if summary_event.is_set():
-                summary = ra.summary().copy()
-                queue.put(summary)
+                queue.put(ra.summary().copy())
                 ra.clear()
-                print(queue.get())
                 summary_event.clear()
 
             time.sleep(interval)
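The last hunk also deletes a debugging leftover that was a real bug, not just noise: print(queue.get()) consumed the summary immediately after the worker enqueued it, so the parent ResourceMonitor could block or come up empty when it later tried to read the same queue. A minimal standalone reproduction of the effect (names and values are illustrative):

```python
import multiprocessing

queue: multiprocessing.Queue = multiprocessing.Queue()
queue.put({"percent-usage/cpu": 12.5})  # the worker publishes its summary
print(queue.get())    # the stray debug line consumes that summary...
print(queue.empty())  # ...so the real consumer finds the queue empty (True)
```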