From f0f7784bdf5b0764b96ed06eed799c7c521f4ef3 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 18 Aug 2023 11:08:44 +0200
Subject: [PATCH] [scripts.experiment] Make it run completely

---
 src/ptbench/engine/trainer.py          |  32 +--
 src/ptbench/scripts/experiment.py      | 214 +-------------
 src/ptbench/scripts/train.py           | 388 ++++++++++++++------------
 src/ptbench/scripts/train_analysis.py  |  30 +-
 src/ptbench/scripts/utils.py           |  30 +-
 src/ptbench/utils/checkpointer.py      |  81 +++---
 src/ptbench/utils/tensorboard.py       |  27 +-
 7 files changed, 324 insertions(+), 478 deletions(-)

diff --git a/src/ptbench/engine/trainer.py b/src/ptbench/engine/trainer.py
index 3173fcbc..fccf47b8 100644
--- a/src/ptbench/engine/trainer.py
+++ b/src/ptbench/engine/trainer.py
@@ -5,6 +5,7 @@
 import csv
 import logging
 import os
+import pathlib
 import shutil
 
 import lightning.pytorch
@@ -20,7 +21,7 @@ logger = logging.getLogger(__name__)
 
 
 def save_model_summary(
-    output_folder: str,
+    output_folder: pathlib.Path,
     model: torch.nn.Module,
 ) -> tuple[lightning.pytorch.callbacks.ModelSummary, int]:
     """Saves a little summary of the model in a txt file.
@@ -43,9 +44,9 @@ def save_model_summary(
     total_parameters
         The number of parameters of the model
     """
-    summary_path = os.path.join(output_folder, "model_summary.txt")
+    summary_path = output_folder / "model-summary.txt"
     logger.info(f"Saving model summary at {summary_path}...")
-    with open(summary_path, "w") as f:
+    with summary_path.open("w") as f:
         summary = lightning.pytorch.utilities.model_summary.ModelSummary(
             model, max_depth=-1
         )
@@ -59,7 +60,7 @@ def save_model_summary(
 
 
 def static_information_to_csv(
-    static_logfile_name: str,
+    static_logfile_name: pathlib.Path,
     device_type: str,
     model_size: int,
 ) -> None:
@@ -70,7 +71,7 @@ def static_information_to_csv(
 
     static_logfile_name
         The static file name which is a join between the output folder and
-        "constant.csv"
+        "constants.csv"
 
     device_type
         The type of device we are using
@@ -78,12 +79,11 @@ def static_information_to_csv(
     model_size
         The size of the model we will be training
     """
-    if os.path.exists(static_logfile_name):
-        backup = static_logfile_name + "~"
-        if os.path.exists(backup):
-            os.unlink(backup)
-        shutil.move(static_logfile_name, backup)
-    with open(static_logfile_name, "w", newline="") as f:
+    if static_logfile_name.exists():
+        backup = static_logfile_name.parent / (static_logfile_name.name + "~")
+        shutil.copy(static_logfile_name, backup)
+
+    with static_logfile_name.open("w", newline="") as f:
         logdata: dict[str, int | float | str] = {}
         logdata.update(cpu_constants())
         if device_type == "cuda":
@@ -102,10 +102,10 @@ def run(
     checkpoint_period: int,
     device_manager: DeviceManager,
     max_epochs: int,
-    output_folder: str,
+    output_folder: pathlib.Path,
     monitoring_interval: int | float,
     batch_chunk_count: int,
-    checkpoint: str,
+    checkpoint: str | None,
 ):
     """Fits a CNN model using supervised learning and save it to disk.
 
@@ -135,7 +135,7 @@ def run(
         The maximum number of epochs to train for.
 
     output_folder
-        Directory in which the results will be saved.
+        Folder in which the results will be saved.
 
     monitoring_interval
         Interval, in seconds (or fractions), through which we should monitor
@@ -149,6 +149,7 @@ def run(
         exchanges for longer processing times in this case.
 
     checkpoint
+        Path to an optional checkpoint file to load
     """
     os.makedirs(output_folder, exist_ok=True)
 
@@ -189,9 +190,8 @@ def run(
     checkpoint_callback.CHECKPOINT_NAME_LAST = "model_final_epoch"
 
     # write static information to a CSV file
-    static_logfile_name = os.path.join(output_folder, "constants.csv")
     static_information_to_csv(
-        static_logfile_name,
+        output_folder / "constants.csv",
         device_manager.device_type,
         no_of_parameters,
     )
diff --git a/src/ptbench/scripts/experiment.py b/src/ptbench/scripts/experiment.py
index 44c9a40b..fdac41a2 100644
--- a/src/ptbench/scripts/experiment.py
+++ b/src/ptbench/scripts/experiment.py
@@ -2,14 +2,13 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import os
-import shutil
-
 import click
 
 from clapper.click import ConfigCommand, ResourceOption, verbosity_option
 from clapper.logging import setup
 
+from .train import reusable_options as training_options
+
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
@@ -28,182 +27,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
     $ ptbench experiment -vv pasa montgomery --epochs=2
 """,
 )
-@click.option(
-    "--output-folder",
-    "-o",
-    help="Path where to store experiment outputs (created if does not exist)",
-    required=True,
-    type=click.Path(),
-    default="results",
-    cls=ResourceOption,
-)
-@click.option(
-    "--model",
-    "-m",
-    help="A lightining module instance implementing the network to be trained",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--datamodule",
-    "-d",
-    help="A lighting data module containing the training and validation sets",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-size",
-    "-b",
-    help="Number of samples in every batch (this parameter affects "
-    "memory requirements for the network). If the number of samples in "
-    "the batch is larger than the total number of samples available for "
-    "training, this value is truncated. If this number is smaller, then "
-    "batches of the specified size are created and fed to the network "
-    "until there are no more new samples to feed (epoch is finished). "
-    "If the total number of training samples is not a multiple of the "
-    "batch-size, the last batch will be smaller than the first, unless "
-    "--drop-incomplete-batch is set, in which case this batch is not used.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-chunk-count",
-    "-c",
-    help="Number of chunks in every batch (this parameter affects "
-    "memory requirements for the network). The number of samples "
-    "loaded for every iteration will be batch-size/batch-chunk-count. "
-    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
-    "error will be raised. This parameter is used to reduce number of "
-    "samples loaded in each iteration, in order to reduce the memory usage "
-    "in exchange for processing time (more iterations). This is specially "
-    "interesting whe one is running with GPUs with limited RAM. The "
-    "default of 1 forces the whole batch to be processed at once. Otherwise "
-    "the batch is broken into batch-chunk-count pieces, and gradients are "
-    "accumulated to complete each batch.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--drop-incomplete-batch/--no-drop-incomplete-batch",
-    "-D",
-    help="If set, then may drop the last batch in an epoch, in case it is "
-    "incomplete. If you set this option, you should also consider "
-    "increasing the total number of epochs of training, as the total number "
-    "of training steps may be reduced",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--epochs",
-    "-e",
-    help="Number of epochs (complete training set passes) to train for. "
-    "If continuing from a saved checkpoint, ensure to provide a greater "
-    "number of epochs than that saved on the checkpoint to be loaded. ",
-    show_default=True,
-    required=True,
-    default=1000,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--checkpoint-period",
-    "-p",
-    help="Number of epochs after which a checkpoint is saved. "
-    "A value of zero will disable check-pointing. If checkpointing is "
-    "enabled and training stops, it is automatically resumed from the "
-    "last saved checkpoint if training is restarted with the same "
-    "configuration.",
-    show_default=True,
-    required=False,
-    default=None,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--device",
-    "-d",
-    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
-    show_default=True,
-    required=True,
-    default="cpu",
-    cls=ResourceOption,
-)
-@click.option(
-    "--cache-samples/--no-cache-samples",
-    help="If set to True, loads the sample into memory, "
-    "otherwise loads them at runtime.",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--seed",
-    "-s",
-    help="Seed to use for the random number generator",
-    show_default=True,
-    required=False,
-    default=42,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--parallel",
-    "-P",
-    help="""Use multiprocessing for data loading: if set to -1 (default),
-    disables multiprocessing data loading. Set to 0 to enable as many data
-    loading instances as processing cores as available in the system. Set to
-    >= 1 to enable that many multiprocessing instances for data loading.""",
-    type=click.IntRange(min=-1),
-    show_default=True,
-    required=True,
-    default=-1,
-    cls=ResourceOption,
-)
-@click.option(
-    "--monitoring-interval",
-    "-I",
-    help="""Time between checks for the use of resources during each training
-    epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
-    resources being probed every 5 seconds during each training epoch.
-    Values registered in the training logs correspond to averages (or maxima)
-    observed through possibly many probes in each epoch. Notice that setting a
-    very small value may cause the probing process to become extremely busy,
-    potentially biasing the overall perception of resource usage.""",
-    type=click.FloatRange(min=0.1),
-    show_default=True,
-    required=True,
-    default=5.0,
-    cls=ResourceOption,
-)
-@click.option(
-    "--resume-from",
-    help="Which checkpoint to resume training from. If set, can be one of "
-    "`best`, `last`, or a path to a model checkpoint.",
-    type=str,
-    required=False,
-    default=None,
-    cls=ResourceOption,
-)
-@click.option(
-    "--balance-classes/--no-balance-classes",
-    "-B/-N",
-    help="""If set, then balances weights of the random sampler during
-    training, so that samples from all sample classes are picked picked
-    equitably.""",
-    required=True,
-    show_default=True,
-    default=True,
-    cls=ResourceOption,
-)
+@training_options
 @verbosity_option(logger=logger, cls=ResourceOption)
 @click.pass_context
 def experiment(
@@ -233,27 +57,21 @@ def experiment(
 
     \b
     └─ <output-folder>/
-       ├── command
+       ├── command.sh
        ├── model/  # the generated model will be here
-       ├── predictions/  # the prediction outputs for the sets
+       ├── predictions.json  # the prediction outputs for the sets
        └── evaluation/  # the outputs of the evaluations for the sets
     """
     from .utils import save_sh_command
 
-    command_sh = os.path.join(output_folder, "command.sh")
-    if os.path.exists(command_sh):
-        backup = command_sh + "~"
-        if os.path.exists(backup):
-            os.unlink(backup)
-        shutil.move(command_sh, backup)
-    save_sh_command(output_folder)
+    save_sh_command(output_folder / "command.sh")
 
     # training
     logger.info("Started training")
 
     from .train import train
 
-    train_output_folder = os.path.join(output_folder, "model")
+    train_output_folder = output_folder / "model"
     ctx.invoke(
         train,
         model=model,
@@ -278,12 +96,12 @@ def experiment(
     logger.info("Started train analysis")
 
     from .train_analysis import train_analysis
 
-    logdir = os.path.join(train_output_folder, "logs")
-    output_pdf = os.path.join(train_output_folder, "train_analysis.pdf")
+    logdir = train_output_folder / "logs"
+    output_pdf = train_output_folder / "trainlog.pdf"
     ctx.invoke(
         train_analysis,
         logdir=logdir,
-        output_pdf=output_pdf,
+        output=output_pdf,
     )
 
     logger.info("Ended train analysis")
@@ -294,13 +112,11 @@ def experiment(
 
     # preferably, we use the best model on the validation set
     # otherwise, we get the last saved model
-    model_file = os.path.join(
-        train_output_folder, "model_lowest_valid_loss.ckpt"
-    )
-    if not os.path.exists(model_file):
-        model_file = os.path.join(train_output_folder, "model_final_epoch.ckpt")
+    model_file = train_output_folder / "model_lowest_valid_loss.ckpt"
+    if not model_file.exists():
+        model_file = train_output_folder / "model_final_epoch.ckpt"
 
-    predictions_output = os.path.join(output_folder, "predictions.json")
+    predictions_output = output_folder / "predictions.json"
 
     ctx.invoke(
         predict,
@@ -319,7 +135,7 @@ def experiment(
 
     from .evaluate import evaluate
 
-    evaluations_folder = os.path.join(output_folder, "evaluation")
+    evaluations_folder = output_folder / "evaluation"
 
     ctx.invoke(
         evaluate,
diff --git a/src/ptbench/scripts/train.py b/src/ptbench/scripts/train.py
index 24b77c7a..8dcd832a 100644
--- a/src/ptbench/scripts/train.py
+++ b/src/ptbench/scripts/train.py
@@ -2,6 +2,9 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
+import functools
+import pathlib
+
 import click
 
 from clapper.click import ResourceOption, verbosity_option
@@ -12,6 +15,212 @@ from .click import ConfigCommand
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
+def reusable_options(f):
+    """Options that can be re-used by top-level scripts (i.e. ``experiment``).
+
+    This decorator equips the target function ``f`` with all (reusable)
+    ``train`` script options.
+
+
+    Parameters
+    ----------
+    f
+        The target function to equip with options.  This function must have
+        parameters that accept such options.
+
+
+    Returns
+    -------
+        The decorated version of function ``f``
+    """
+
+    @click.option(
+        "--output-folder",
+        "-o",
+        help="Path where to store results (created if does not exist)",
+        required=True,
+        type=click.Path(
+            file_okay=False,
+            dir_okay=True,
+            writable=True,
+            path_type=pathlib.Path,
+        ),
+        default="results",
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--model",
+        "-m",
+        help="A lightning module instance implementing the network to be trained",
+        required=True,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--datamodule",
+        "-d",
+        help="A lightning data module containing the training and validation sets.",
+        required=True,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--batch-size",
+        "-b",
+        help="Number of samples in every batch (this parameter affects "
+        "memory requirements for the network). If the number of samples in "
+        "the batch is larger than the total number of samples available for "
+        "training, this value is truncated. If this number is smaller, then "
+        "batches of the specified size are created and fed to the network "
+        "until there are no more new samples to feed (epoch is finished). "
+        "If the total number of training samples is not a multiple of the "
+        "batch-size, the last batch will be smaller than the first, unless "
+        "--drop-incomplete-batch is set, in which case this batch is not used.",
+        required=True,
+        show_default=True,
+        default=1,
+        type=click.IntRange(min=1),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--batch-chunk-count",
+        "-c",
+        help="Number of chunks in every batch (this parameter affects "
+        "memory requirements for the network). The number of samples "
+        "loaded for every iteration will be batch-size/batch-chunk-count. "
+        "batch-size needs to be divisible by batch-chunk-count, otherwise an "
+        "error will be raised. This parameter is used to reduce the number of "
+        "samples loaded in each iteration, in order to reduce the memory usage "
+        "in exchange for processing time (more iterations). This is especially "
+        "interesting when one is running with GPUs with limited RAM. The "
+        "default of 1 forces the whole batch to be processed at once. Otherwise "
+        "the batch is broken into batch-chunk-count pieces, and gradients are "
+        "accumulated to complete each batch.",
+        required=True,
+        show_default=True,
+        default=1,
+        type=click.IntRange(min=1),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--drop-incomplete-batch/--no-drop-incomplete-batch",
+        "-D",
+        help="If set, then the last batch in an epoch may be dropped, in case "
+        "it is incomplete. If you set this option, you should also consider "
+        "increasing the total number of epochs of training, as the total number "
+        "of training steps may be reduced.",
+        required=True,
+        show_default=True,
+        default=False,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--epochs",
+        "-e",
+        help="""Number of epochs (complete training set passes) to train for.
+        If continuing from a saved checkpoint, make sure to provide a greater
+        number of epochs than that saved on the checkpoint to be loaded.""",
+        show_default=True,
+        required=True,
+        default=1000,
+        type=click.IntRange(min=1),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--checkpoint-period",
+        "-p",
+        help="""Number of epochs after which a checkpoint is saved. A value of
+        zero will disable checkpointing. If checkpointing is enabled and
+        training stops, it is automatically resumed from the last saved
+        checkpoint if training is restarted with the same configuration.""",
+        show_default=True,
+        required=False,
+        default=None,
+        type=click.IntRange(min=0),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--device",
+        "-x",
+        help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
+        show_default=True,
+        required=True,
+        default="cpu",
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--cache-samples/--no-cache-samples",
+        help="If set to True, loads samples into memory, "
+        "otherwise loads them at runtime.",
+        required=True,
+        show_default=True,
+        default=False,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--seed",
+        "-s",
+        help="Seed to use for the random number generator",
+        show_default=True,
+        required=False,
+        default=42,
+        type=click.IntRange(min=0),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--parallel",
+        "-P",
+        help="""Use multiprocessing for data loading: if set to -1 (default),
+        disables multiprocessing data loading. Set to 0 to enable as many data
+        loading instances as there are processing cores available in the
+        system. Set to >= 1 to enable that many multiprocessing instances for
+        data loading.""",
+        type=click.IntRange(min=-1),
+        show_default=True,
+        required=True,
+        default=-1,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--monitoring-interval",
+        "-I",
+        help="""Time between checks for the use of resources during each training
+        epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
+        resources being probed every 5 seconds during each training epoch.
+        Values registered in the training logs correspond to averages (or maxima)
+        observed through possibly many probes in each epoch. Notice that setting a
+        very small value may cause the probing process to become extremely busy,
+        potentially biasing the overall perception of resource usage.""",
+        type=click.FloatRange(min=0.1),
+        show_default=True,
+        required=True,
+        default=5.0,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--resume-from",
+        help="""Which checkpoint to resume training from. If set, can be one of
+        `best`, `last`, or a path to a model checkpoint.""",
+        type=click.STRING,
+        required=False,
+        default=None,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--balance-classes/--no-balance-classes",
+        "-B/-N",
+        help="""If set, then balances weights of the random sampler during
+        training, so that samples from all sample classes are picked
+        equitably.""",
+        required=True,
+        show_default=True,
+        default=True,
+        cls=ResourceOption,
+    )
+    @functools.wraps(f)
+    def wrapper_reusable_options(*args, **kwargs):
+        return f(*args, **kwargs)
+
+    return wrapper_reusable_options
+
+
 @click.command(
     entry_point_group="ptbench.config",
     cls=ConfigCommand,
@@ -24,182 +233,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
     ptbench train -vv pasa montgomery --batch-size=4 --device="cuda:0"
 """,
 )
-@click.option(
-    "--output-folder",
-    "-o",
-    help="Path where to store the generated model (created if does not exist)",
-    required=True,
-    type=click.Path(),
-    default="results",
-    cls=ResourceOption,
-)
-@click.option(
-    "--model",
-    "-m",
-    help="A lightining module instance implementing the network to be trained",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--datamodule",
-    "-d",
-    help="A lighting data module containing the training and validation sets.",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-size",
-    "-b",
-    help="Number of samples in every batch (this parameter affects "
-    "memory requirements for the network). If the number of samples in "
-    "the batch is larger than the total number of samples available for "
-    "training, this value is truncated. If this number is smaller, then "
-    "batches of the specified size are created and fed to the network "
-    "until there are no more new samples to feed (epoch is finished). "
-    "If the total number of training samples is not a multiple of the "
-    "batch-size, the last batch will be smaller than the first, unless "
-    "--drop-incomplete-batch is set, in which case this batch is not used.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-chunk-count",
-    "-c",
-    help="Number of chunks in every batch (this parameter affects "
-    "memory requirements for the network). The number of samples "
-    "loaded for every iteration will be batch-size/batch-chunk-count. "
-    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
-    "error will be raised. This parameter is used to reduce number of "
-    "samples loaded in each iteration, in order to reduce the memory usage "
-    "in exchange for processing time (more iterations). This is specially "
-    "interesting whe one is running with GPUs with limited RAM. The "
-    "default of 1 forces the whole batch to be processed at once. Otherwise "
-    "the batch is broken into batch-chunk-count pieces, and gradients are "
-    "accumulated to complete each batch.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--drop-incomplete-batch/--no-drop-incomplete-batch",
-    "-D",
-    help="If set, then may drop the last batch in an epoch, in case it is "
-    "incomplete. If you set this option, you should also consider "
-    "increasing the total number of epochs of training, as the total number "
-    "of training steps may be reduced",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--epochs",
-    "-e",
-    help="Number of epochs (complete training set passes) to train for. "
-    "If continuing from a saved checkpoint, ensure to provide a greater "
-    "number of epochs than that saved on the checkpoint to be loaded. ",
-    show_default=True,
-    required=True,
-    default=1000,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--checkpoint-period",
-    "-p",
-    help="Number of epochs after which a checkpoint is saved. "
-    "A value of zero will disable check-pointing. If checkpointing is "
-    "enabled and training stops, it is automatically resumed from the "
-    "last saved checkpoint if training is restarted with the same "
-    "configuration.",
-    show_default=True,
-    required=False,
-    default=None,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--device",
-    "-x",
-    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
-    show_default=True,
-    required=True,
-    default="cpu",
-    cls=ResourceOption,
-)
-@click.option(
-    "--cache-samples/--no-cache-samples",
-    help="If set to True, loads the sample into memory, "
-    "otherwise loads them at runtime.",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--seed",
-    "-s",
-    help="Seed to use for the random number generator",
-    show_default=True,
-    required=False,
-    default=42,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--parallel",
-    "-P",
-    help="""Use multiprocessing for data loading: if set to -1 (default),
-    disables multiprocessing data loading. Set to 0 to enable as many data
-    loading instances as processing cores as available in the system. Set to
-    >= 1 to enable that many multiprocessing instances for data loading.""",
-    type=click.IntRange(min=-1),
-    show_default=True,
-    required=True,
-    default=-1,
-    cls=ResourceOption,
-)
-@click.option(
-    "--monitoring-interval",
-    "-I",
-    help="""Time between checks for the use of resources during each training
-    epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
-    resources being probed every 5 seconds during each training epoch.
-    Values registered in the training logs correspond to averages (or maxima)
-    observed through possibly many probes in each epoch. Notice that setting a
-    very small value may cause the probing process to become extremely busy,
-    potentially biasing the overall perception of resource usage.""",
-    type=click.FloatRange(min=0.1),
-    show_default=True,
-    required=True,
-    default=5.0,
-    cls=ResourceOption,
-)
-@click.option(
-    "--resume-from",
-    help="Which checkpoint to resume training from. If set, can be one of "
-    "`best`, `last`, or a path to a model checkpoint.",
-    type=str,
-    required=False,
-    default=None,
-    cls=ResourceOption,
-)
-@click.option(
-    "--balance-classes/--no-balance-classes",
-    "-B/-N",
-    help="""If set, then balances weights of the random sampler during
-    training, so that samples from all sample classes are picked picked
-    equitably.""",
-    required=True,
-    show_default=True,
-    default=True,
-    cls=ResourceOption,
-)
+@reusable_options
 @verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
 def train(
     model,
@@ -237,7 +271,7 @@ def train(
     from ..utils.checkpointer import get_checkpoint
     from .utils import save_sh_command
 
-    save_sh_command(output_folder)
+    save_sh_command(output_folder / "command.sh")
 
     seed_everything(seed)
 
     checkpoint_file = get_checkpoint(output_folder, resume_from)
diff --git a/src/ptbench/scripts/train_analysis.py b/src/ptbench/scripts/train_analysis.py
index ed8d7e6b..7fd99a75 100644
--- a/src/ptbench/scripts/train_analysis.py
+++ b/src/ptbench/scripts/train_analysis.py
@@ -2,20 +2,21 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import os
+import pathlib
 
 import click
+import matplotlib.figure
 import matplotlib.pyplot as plt
 import pandas
 
-from clapper.click import ConfigCommand, ResourceOption, verbosity_option
+from clapper.click import verbosity_option
 from clapper.logging import setup
 from matplotlib.ticker import MaxNLocator
 
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
-def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
+def create_figures(df: pandas.DataFrame) -> list[matplotlib.figure.Figure]:
     """Generates figures for each metric in the dataframe.
 
     Each row of the dataframe correspond to an epoch and each column to a metric.
@@ -84,8 +85,6 @@ def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
 
 
 @click.command(
-    entry_point_group="ptbench.config",
-    cls=ConfigCommand,
     epilog="""Examples:
 
 \b
@@ -98,21 +97,21 @@ def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
 )
 @click.argument(
     "logdir",
-    type=click.Path(dir_okay=True, exists=True),
+    type=click.Path(dir_okay=True, exists=True, path_type=pathlib.Path),
 )
 @click.option(
-    "--output-pdf",
+    "--output",
     "-o",
-    help="Name of the output file to dump",
+    help="Name of the output file to dump (multi-page PDF)",
     required=True,
     show_default=True,
     default="trainlog.pdf",
+    type=click.Path(dir_okay=False, file_okay=True, path_type=pathlib.Path),
 )
-@verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
+@verbosity_option(logger=logger, expose_value=False)
 def train_analysis(
-    logdir: str,
-    output_pdf: str,
-    **_,
+    logdir: pathlib.Path,
+    output: pathlib.Path,
 ) -> None:
     """Creates a plot for each metric in the training logs and saves them in a
     pdf file."""
@@ -123,12 +122,9 @@ def train_analysis(
 
     data = get_scalars(logdir)
 
-    # makes sure the directory to save the output PDF is there
-    dirname = os.path.dirname(os.path.realpath(output_pdf))
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
+    output.parent.mkdir(parents=True, exist_ok=True)
 
-    with PdfPages(output_pdf) as pdf:
+    with PdfPages(output) as pdf:
         for figure in create_figures(data):
             pdf.savefig(figure)
             plt.close(figure)
diff --git a/src/ptbench/scripts/utils.py b/src/ptbench/scripts/utils.py
index 614c2988..511fc884 100644
--- a/src/ptbench/scripts/utils.py
+++ b/src/ptbench/scripts/utils.py
@@ -5,13 +5,14 @@ import importlib.metadata
 import logging
 import os
 import pathlib
+import shutil
 import sys
 import time
 
 logger = logging.getLogger(__name__)
 
 
-def save_sh_command(output_folder: str | pathlib.Path) -> None:
+def save_sh_command(path: pathlib.Path) -> None:
     """Records command-line to reproduce this script.
 
     This function can record the current command-line used to call the script
@@ -24,24 +25,27 @@ def save_sh_command(output_folder: str | pathlib.Path) -> None:
 
     Parameters
     ----------
 
-    output_folder : str
-        Path leading to the directory where the commands to reproduce the current
-        run will be recorded. A subdirectory will be created each time this function
-        is called to match lightning's versioning convention for loggers.
+    path
+        Path to a file where the commands to reproduce the current run will be
+        recorded. Parent directories will be created if they do not exist. Any
+        existing copy will be backed up.
     """
-    if isinstance(output_folder, str):
-        output_folder = pathlib.Path(output_folder)
+    logger.info(f"Writing command-line for reproduction at `{path}`...")
 
-    destfile = output_folder / "command.sh"
+    # create parent directories
+    path.parent.mkdir(parents=True, exist_ok=True)
 
-    logger.info(f"Writing command-line for reproduction at '{destfile}'...")
-    os.makedirs(output_folder, exist_ok=True)
+    # backup if exists
+    if path.exists():
+        backup = path.parent / (path.name + "~")
+        shutil.copy(path, backup)
 
+    # write the file
     package = __name__.split(".", 1)[0]
     version = importlib.metadata.version(package)
-    with destfile.open("w") as f:
+    with path.open("w") as f:
         f.write("#!/usr/bin/env sh\n")
         f.write(f"# date: {time.asctime()}\n")
         f.write(f"# version: {version} ({package})\n")
@@ -57,4 +61,6 @@ def save_sh_command(output_folder: str | pathlib.Path) -> None:
         f.write(f"# conda activate {os.environ['CONDA_DEFAULT_ENV']}\n")
         f.write(f"# cd {os.path.realpath(os.curdir)}\n")
         f.write(" ".join(args) + "\n")
-    os.chmod(destfile, 0o755)
+
+    # make it executable
+    path.chmod(0o755)
diff --git a/src/ptbench/utils/checkpointer.py b/src/ptbench/utils/checkpointer.py
index 88cdfbb7..8f6685b8 100644
--- a/src/ptbench/utils/checkpointer.py
+++ b/src/ptbench/utils/checkpointer.py
@@ -3,78 +3,77 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 
 import logging
-import os
+import pathlib
 import typing
 
 logger = logging.getLogger(__name__)
 
 
 def get_checkpoint(
-    output_folder: str, resume_from: typing.Literal["last", "best"] | str | None
+    output_folder: pathlib.Path,
+    resume_from: typing.Literal["last", "best"] | str | None,
 ) -> str | None:
     """Gets a checkpoint file.
 
     Can return the best or last checkpoint, or a checkpoint at a specific path.
     Ensures the checkpoint exists, raising an error if it is not the case.
 
-    If resume_from is ``None``, checks the output directory if a checkpoint
-    already exists and returns it. If no checkpoint is found, returns ``None``.
+    If ``resume_from`` is ``None``, checks the output directory for a "last"
+    checkpoint file and returns it. If no checkpoint is found, returns
+    ``None``.
+
+    ``resume_from`` can also be a path to an existing checkpoint file. In this
+    case, we check that it exists and return it.
 
 
     Parameters
     ----------
 
-    output_folder
-        Directory in which checkpoints are stored.
-
+    output_folder
+        Folder in which checkpoints are stored.
     resume_from
         Which model to get. Can be one of "best", "last", or a path to a
         checkpoint.
-        If None, gets the last checkpoint if it exists, otherwise returns None
+        If ``None``, gets the last checkpoint if it exists, otherwise returns
+        ``None`` (signal to start from scratch).
 
 
     Returns
     -------
+        Path to the requested checkpoint (as a plain string) or ``None`` (start
+        from scratch).
 
-    checkpoint_file
-        Path to the requested checkpoint or None.
-    """
-    last_checkpoint_path = os.path.join(output_folder, "model_final_epoch.ckpt")
-    best_checkpoint_path = os.path.join(
-        output_folder, "model_lowest_valid_loss.ckpt"
-    )
-
-    if resume_from == "last":
-        if os.path.isfile(last_checkpoint_path):
-            checkpoint_file = last_checkpoint_path
-            logger.info(f"Resuming training from {resume_from} checkpoint")
-        else:
-            raise FileNotFoundError(
-                f"Could not find checkpoint {last_checkpoint_path}"
-            )
-    elif resume_from == "best":
-        if os.path.isfile(best_checkpoint_path):
-            checkpoint_file = last_checkpoint_path
-            logger.info(f"Resuming training from {resume_from} checkpoint")
+    Raises
+    ------
+    FileNotFoundError
+        In case a required file cannot be found.
+    """
+    # standard paths where checkpoints may be (if produced with this framework)
+    last_path = output_folder / "model_final_epoch.ckpt"
+    best_path = output_folder / "model_lowest_valid_loss.ckpt"
+
+    if resume_from in ("last", "best"):
+        use_file = last_path if resume_from == "last" else best_path
+        if use_file.is_file():
+            logger.info(f"Found checkpoint at `{str(use_file)}`")
+            return str(use_file)
         else:
             raise FileNotFoundError(
-                f"Could not find checkpoint {best_checkpoint_path}"
+                f"Could not find a checkpoint file at `{str(use_file)}`"
             )
     elif resume_from is None:
-        if os.path.isfile(last_checkpoint_path):
-            checkpoint_file = last_checkpoint_path
-            logger.info(
-                f"Found existing checkpoint {last_checkpoint_path}. Loading."
-            )
+        # use-case: user is re-starting a crashed/cancelled job
+        if last_path.is_file():
+            logger.info(f"Found checkpoint at `{str(last_path)}`")
+            return str(last_path)
         else:
             return None
-    else:
-        if os.path.isfile(resume_from):
-            checkpoint_file = resume_from
-            logger.info(f"Resuming training from checkpoint {resume_from}")
+    elif isinstance(resume_from, str):
+        if pathlib.Path(resume_from).is_file():
+            logger.info(f"Found checkpoint at `{resume_from}`")
+            return resume_from
         else:
-            raise FileNotFoundError(f"Could not find checkpoint {resume_from}")
-
-    return checkpoint_file
+            raise FileNotFoundError(
+                f"Could not find a checkpoint file at `{resume_from}`"
+            )
diff --git a/src/ptbench/utils/tensorboard.py b/src/ptbench/utils/tensorboard.py
index ac838a44..7e2feaa4 100644
--- a/src/ptbench/utils/tensorboard.py
+++ b/src/ptbench/utils/tensorboard.py
@@ -2,11 +2,8 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import glob
-import os
-
-from collections import defaultdict
-from typing import Any
+import pathlib
+import typing
 
 import pandas
 
@@ -15,7 +12,7 @@ from tensorboard.backend.event_processing.event_accumulator import (
 )
 
 
-def get_scalars(logdir: str) -> pandas.DataFrame:
+def get_scalars(logdir: pathlib.Path) -> pandas.DataFrame:
     """Returns scalars stored in tensorboard event files.
 
     Parameters
@@ -28,17 +25,16 @@ def get_scalars(logdir: str) -> pandas.DataFrame:
 
     Returns
     -------
 
     data:
-        Pandas dataframe containing the results. Rows correspond to an epoch, columns to the metrics.
+        Pandas dataframe containing the results. Rows correspond to an epoch,
+        columns to the metrics.
""" - tensorboard_logs = sorted( - glob.glob(os.path.join(logdir, "events.out.tfevents.*")) - ) + tensorboard_logs = sorted(logdir.glob("events.out.tfevents.*")) - data: dict[str, dict[str, Any]] = defaultdict(dict) + data: dict[str, dict[str, typing.Any]] = {} headers = {"step"} for logfile in tensorboard_logs: - event_accumulator = EventAccumulator(logfile) + event_accumulator = EventAccumulator(str(logfile)) event_accumulator.Reload() tags = event_accumulator.Tags() @@ -52,8 +48,7 @@ def get_scalars(logdir: str) -> pandas.DataFrame: step = tag_data.step value = tag_data.value - data[step]["step"] = step - data[step][scalar_tag] = value + data.setdefault(step, {"step": step})["step"] = step + data.setdefault(step, {scalar_tag: value})[scalar_tag] = value - data = pandas.DataFrame.from_dict(data, orient="index") - return data + return pandas.DataFrame.from_dict(data, orient="index") -- GitLab