From f0f7784bdf5b0764b96ed06eed799c7c521f4ef3 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 18 Aug 2023 11:08:44 +0200
Subject: [PATCH] [scripts.experiment] Make it run completely

---
 src/ptbench/engine/trainer.py          |  32 +--
 src/ptbench/scripts/experiment.py      | 214 +-------------
 src/ptbench/scripts/train.py           | 388 ++++++++++++++------------
 src/ptbench/scripts/train_analysis.py  |  30 +-
 src/ptbench/scripts/utils.py           |  30 +-
 src/ptbench/utils/checkpointer.py      |  81 +++---
 src/ptbench/utils/tensorboard.py       |  27 +-
 7 files changed, 324 insertions(+), 478 deletions(-)

diff --git a/src/ptbench/engine/trainer.py b/src/ptbench/engine/trainer.py
index 3173fcbc..fccf47b8 100644
--- a/src/ptbench/engine/trainer.py
+++ b/src/ptbench/engine/trainer.py
@@ -5,6 +5,7 @@
 import csv
 import logging
 import os
+import pathlib
 import shutil
 
 import lightning.pytorch
@@ -20,7 +21,7 @@ logger = logging.getLogger(__name__)
 
 
 def save_model_summary(
-    output_folder: str,
+    output_folder: pathlib.Path,
     model: torch.nn.Module,
 ) -> tuple[lightning.pytorch.callbacks.ModelSummary, int]:
     """Saves a little summary of the model in a txt file.
@@ -43,9 +44,9 @@ def save_model_summary(
     total_parameters
         The number of parameters of the model
     """
-    summary_path = os.path.join(output_folder, "model_summary.txt")
+    summary_path = output_folder / "model-summary.txt"
     logger.info(f"Saving model summary at {summary_path}...")
-    with open(summary_path, "w") as f:
+    with summary_path.open("w") as f:
         summary = lightning.pytorch.utilities.model_summary.ModelSummary(
             model, max_depth=-1
         )
@@ -59,7 +60,7 @@ def save_model_summary(
 
 
 def static_information_to_csv(
-    static_logfile_name: str,
+    static_logfile_name: pathlib.Path,
     device_type: str,
     model_size: int,
 ) -> None:
@@ -70,7 +71,7 @@ def static_information_to_csv(
 
     static_logfile_name
         The static file name which is a join between the output folder and
-        "constant.csv"
+        "constants.csv"
 
     device_type
         The type of device we are using
@@ -78,12 +79,11 @@ def static_information_to_csv(
     model_size
         The size of the model we will be training
     """
-    if os.path.exists(static_logfile_name):
-        backup = static_logfile_name + "~"
-        if os.path.exists(backup):
-            os.unlink(backup)
-        shutil.move(static_logfile_name, backup)
-    with open(static_logfile_name, "w", newline="") as f:
+    if static_logfile_name.exists():
+        backup = static_logfile_name.parent / (static_logfile_name.name + "~")
+        shutil.copy(static_logfile_name, backup)
+
+    with static_logfile_name.open("w", newline="") as f:
         logdata: dict[str, int | float | str] = {}
         logdata.update(cpu_constants())
         if device_type == "cuda":
@@ -102,10 +102,10 @@ def run(
     checkpoint_period: int,
     device_manager: DeviceManager,
     max_epochs: int,
-    output_folder: str,
+    output_folder: pathlib.Path,
     monitoring_interval: int | float,
     batch_chunk_count: int,
-    checkpoint: str,
+    checkpoint: str | None,
 ):
     """Fits a CNN model using supervised learning and save it to disk.
 
@@ -135,7 +135,7 @@ def run(
         The maximum number of epochs to train for.
 
     output_folder
-        Directory in which the results will be saved.
+        Folder in which the results will be saved.
 
     monitoring_interval
         Interval, in seconds (or fractions), through which we should monitor
@@ -149,6 +149,7 @@ def run(
         exchanges for longer processing times in this case.
 
     checkpoint
+        Path to an optional checkpoint file to load
     """
     os.makedirs(output_folder, exist_ok=True)
 
@@ -189,9 +190,8 @@ def run(
     checkpoint_callback.CHECKPOINT_NAME_LAST = "model_final_epoch"
 
     # write static information to a CSV file
-    static_logfile_name = os.path.join(output_folder, "constants.csv")
     static_information_to_csv(
-        static_logfile_name,
+        output_folder / "constants.csv",
         device_manager.device_type,
         no_of_parameters,
     )
diff --git a/src/ptbench/scripts/experiment.py b/src/ptbench/scripts/experiment.py
index 44c9a40b..fdac41a2 100644
--- a/src/ptbench/scripts/experiment.py
+++ b/src/ptbench/scripts/experiment.py
@@ -2,14 +2,13 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import os
-import shutil
-
 import click
 
 from clapper.click import ConfigCommand, ResourceOption, verbosity_option
 from clapper.logging import setup
 
+from .train import reusable_options as training_options
+
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
@@ -28,182 +27,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
     $ ptbench experiment -vv pasa montgomery --epochs=2
 """,
 )
-@click.option(
-    "--output-folder",
-    "-o",
-    help="Path where to store experiment outputs (created if does not exist)",
-    required=True,
-    type=click.Path(),
-    default="results",
-    cls=ResourceOption,
-)
-@click.option(
-    "--model",
-    "-m",
-    help="A lightining module instance implementing the network to be trained",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--datamodule",
-    "-d",
-    help="A lighting data module containing the training and validation sets",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-size",
-    "-b",
-    help="Number of samples in every batch (this parameter affects "
-    "memory requirements for the network). If the number of samples in "
-    "the batch is larger than the total number of samples available for "
-    "training, this value is truncated. If this number is smaller, then "
-    "batches of the specified size are created and fed to the network "
-    "until there are no more new samples to feed (epoch is finished). "
-    "If the total number of training samples is not a multiple of the "
-    "batch-size, the last batch will be smaller than the first, unless "
-    "--drop-incomplete-batch is set, in which case this batch is not used.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-chunk-count",
-    "-c",
-    help="Number of chunks in every batch (this parameter affects "
-    "memory requirements for the network). The number of samples "
-    "loaded for every iteration will be batch-size/batch-chunk-count. "
-    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
-    "error will be raised. This parameter is used to reduce number of "
-    "samples loaded in each iteration, in order to reduce the memory usage "
-    "in exchange for processing time (more iterations). This is specially "
-    "interesting whe one is running with GPUs with limited RAM. The "
-    "default of 1 forces the whole batch to be processed at once. Otherwise "
-    "the batch is broken into batch-chunk-count pieces, and gradients are "
-    "accumulated to complete each batch.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--drop-incomplete-batch/--no-drop-incomplete-batch",
-    "-D",
-    help="If set, then may drop the last batch in an epoch, in case it is "
-    "incomplete. If you set this option, you should also consider "
-    "increasing the total number of epochs of training, as the total number "
-    "of training steps may be reduced",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--epochs",
-    "-e",
-    help="Number of epochs (complete training set passes) to train for. "
-    "If continuing from a saved checkpoint, ensure to provide a greater "
-    "number of epochs than that saved on the checkpoint to be loaded. ",
-    show_default=True,
-    required=True,
-    default=1000,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--checkpoint-period",
-    "-p",
-    help="Number of epochs after which a checkpoint is saved. "
-    "A value of zero will disable check-pointing. If checkpointing is "
-    "enabled and training stops, it is automatically resumed from the "
-    "last saved checkpoint if training is restarted with the same "
-    "configuration.",
-    show_default=True,
-    required=False,
-    default=None,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--device",
-    "-d",
-    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
-    show_default=True,
-    required=True,
-    default="cpu",
-    cls=ResourceOption,
-)
-@click.option(
-    "--cache-samples/--no-cache-samples",
-    help="If set to True, loads the sample into memory, "
-    "otherwise loads them at runtime.",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--seed",
-    "-s",
-    help="Seed to use for the random number generator",
-    show_default=True,
-    required=False,
-    default=42,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--parallel",
-    "-P",
-    help="""Use multiprocessing for data loading: if set to -1 (default),
-    disables multiprocessing data loading. Set to 0 to enable as many data
-    loading instances as processing cores as available in the system. Set to
-    >= 1 to enable that many multiprocessing instances for data loading.""",
-    type=click.IntRange(min=-1),
-    show_default=True,
-    required=True,
-    default=-1,
-    cls=ResourceOption,
-)
-@click.option(
-    "--monitoring-interval",
-    "-I",
-    help="""Time between checks for the use of resources during each training
-    epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
-    resources being probed every 5 seconds during each training epoch.
-    Values registered in the training logs correspond to averages (or maxima)
-    observed through possibly many probes in each epoch. Notice that setting a
-    very small value may cause the probing process to become extremely busy,
-    potentially biasing the overall perception of resource usage.""",
-    type=click.FloatRange(min=0.1),
-    show_default=True,
-    required=True,
-    default=5.0,
-    cls=ResourceOption,
-)
-@click.option(
-    "--resume-from",
-    help="Which checkpoint to resume training from. If set, can be one of "
-    "`best`, `last`, or a path to a model checkpoint.",
-    type=str,
-    required=False,
-    default=None,
-    cls=ResourceOption,
-)
-@click.option(
-    "--balance-classes/--no-balance-classes",
-    "-B/-N",
-    help="""If set, then balances weights of the random sampler during
-    training, so that samples from all sample classes are picked picked
-    equitably.""",
-    required=True,
-    show_default=True,
-    default=True,
-    cls=ResourceOption,
-)
+@training_options
 @verbosity_option(logger=logger, cls=ResourceOption)
 @click.pass_context
 def experiment(
@@ -233,27 +57,21 @@ def experiment(
 
     \b
     └─ <output-folder>/
-       ├── command
+       ├── command.sh
        ├── model/  # the generated model will be here
-       ├── predictions/  # the prediction outputs for the sets
+       ├── predictions.json  # the prediction outputs for the sets
        └── evaluation/  # the outputs of the evaluations for the sets
     """
     from .utils import save_sh_command
 
-    command_sh = os.path.join(output_folder, "command.sh")
-    if os.path.exists(command_sh):
-        backup = command_sh + "~"
-        if os.path.exists(backup):
-            os.unlink(backup)
-        shutil.move(command_sh, backup)
-    save_sh_command(output_folder)
+    save_sh_command(output_folder / "command.sh")
 
     # training
     logger.info("Started training")
 
     from .train import train
 
-    train_output_folder = os.path.join(output_folder, "model")
+    train_output_folder = output_folder / "model"
     ctx.invoke(
         train,
         model=model,
@@ -278,12 +96,12 @@ def experiment(
     logger.info("Started train analysis")
 
     from .train_analysis import train_analysis
 
-    logdir = os.path.join(train_output_folder, "logs")
-    output_pdf = os.path.join(train_output_folder, "train_analysis.pdf")
+    logdir = train_output_folder / "logs"
+    output_pdf = train_output_folder / "trainlog.pdf"
     ctx.invoke(
         train_analysis,
         logdir=logdir,
-        output_pdf=output_pdf,
+        output=output_pdf,
     )
 
     logger.info("Ended train analysis")
@@ -294,13 +112,11 @@ def experiment(
 
     # preferably, we use the best model on the validation set
     # otherwise, we get the last saved model
-    model_file = os.path.join(
-        train_output_folder, "model_lowest_valid_loss.ckpt"
-    )
-    if not os.path.exists(model_file):
-        model_file = os.path.join(train_output_folder, "model_final_epoch.ckpt")
+    model_file = train_output_folder / "model_lowest_valid_loss.ckpt"
+    if not model_file.exists():
+        model_file = train_output_folder / "model_final_epoch.ckpt"
 
-    predictions_output = os.path.join(output_folder, "predictions.json")
+    predictions_output = output_folder / "predictions.json"
 
     ctx.invoke(
         predict,
@@ -319,7 +135,7 @@ def experiment(
 
     from .evaluate import evaluate
 
-    evaluations_folder = os.path.join(output_folder, "evaluation")
+    evaluations_folder = output_folder / "evaluation"
 
     ctx.invoke(
         evaluate,
diff --git a/src/ptbench/scripts/train.py b/src/ptbench/scripts/train.py
index 24b77c7a..8dcd832a 100644
--- a/src/ptbench/scripts/train.py
+++ b/src/ptbench/scripts/train.py
@@ -2,6 +2,9 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
+import functools
+import pathlib
+
 import click
 
 from clapper.click import ResourceOption, verbosity_option
@@ -12,6 +15,212 @@ from .click import ConfigCommand
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
+def reusable_options(f):
+    """Options that can be re-used by top-level scripts (i.e. ``experiment``).
+
+    This decorator equips the target function ``f`` with all (reusable)
+    ``train`` script options.
+
+
+    Parameters
+    ----------
+    f
+        The target function to equip with options.  This function must have
+        parameters that accept such options.
+
+
+    Returns
+    -------
+        The decorated version of function ``f``
+    """
+
+    @click.option(
+        "--output-folder",
+        "-o",
+        help="Path where to store results (created if does not exist)",
+        required=True,
+        type=click.Path(
+            file_okay=False,
+            dir_okay=True,
+            writable=True,
+            path_type=pathlib.Path,
+        ),
+        default="results",
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--model",
+        "-m",
+        help="A lightning module instance implementing the network to be trained",
+        required=True,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--datamodule",
+        "-d",
+        help="A lightning data module containing the training and validation sets.",
+        required=True,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--batch-size",
+        "-b",
+        help="Number of samples in every batch (this parameter affects "
+        "memory requirements for the network). If the number of samples in "
+        "the batch is larger than the total number of samples available for "
+        "training, this value is truncated. If this number is smaller, then "
+        "batches of the specified size are created and fed to the network "
+        "until there are no more new samples to feed (epoch is finished). "
+        "If the total number of training samples is not a multiple of the "
+        "batch-size, the last batch will be smaller than the first, unless "
+        "--drop-incomplete-batch is set, in which case this batch is not used.",
+        required=True,
+        show_default=True,
+        default=1,
+        type=click.IntRange(min=1),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--batch-chunk-count",
+        "-c",
+        help="Number of chunks in every batch (this parameter affects "
+        "memory requirements for the network). The number of samples "
+        "loaded for every iteration will be batch-size/batch-chunk-count. "
+        "batch-size needs to be divisible by batch-chunk-count, otherwise an "
+        "error will be raised. This parameter is used to reduce the number of "
+        "samples loaded in each iteration, in order to reduce the memory usage "
+        "in exchange for processing time (more iterations). This is especially "
+        "interesting when one is running with GPUs with limited RAM. The "
+        "default of 1 forces the whole batch to be processed at once. Otherwise "
+        "the batch is broken into batch-chunk-count pieces, and gradients are "
+        "accumulated to complete each batch.",
+        required=True,
+        show_default=True,
+        default=1,
+        type=click.IntRange(min=1),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--drop-incomplete-batch/--no-drop-incomplete-batch",
+        "-D",
+        help="If set, then the last batch in an epoch may be dropped, in case "
+        "it is incomplete. If you set this option, you should also consider "
+        "increasing the total number of epochs of training, as the total number "
+        "of training steps may be reduced.",
+        required=True,
+        show_default=True,
+        default=False,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--epochs",
+        "-e",
+        help="""Number of epochs (complete training set passes) to train for.
+        If continuing from a saved checkpoint, make sure to provide a greater
+        number of epochs than that saved on the checkpoint to be loaded.""",
+        show_default=True,
+        required=True,
+        default=1000,
+        type=click.IntRange(min=1),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--checkpoint-period",
+        "-p",
+        help="""Number of epochs after which a checkpoint is saved. A value of
+        zero will disable checkpointing. If checkpointing is enabled and
+        training stops, it is automatically resumed from the last saved
+        checkpoint if training is restarted with the same configuration.""",
+        show_default=True,
+        required=False,
+        default=None,
+        type=click.IntRange(min=0),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--device",
+        "-x",
+        help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
+        show_default=True,
+        required=True,
+        default="cpu",
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--cache-samples/--no-cache-samples",
+        help="If set to True, loads samples into memory, "
+        "otherwise loads them at runtime.",
+        required=True,
+        show_default=True,
+        default=False,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--seed",
+        "-s",
+        help="Seed to use for the random number generator",
+        show_default=True,
+        required=False,
+        default=42,
+        type=click.IntRange(min=0),
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--parallel",
+        "-P",
+        help="""Use multiprocessing for data loading: if set to -1 (default),
+        disables multiprocessing data loading. Set to 0 to enable as many data
+        loading instances as there are processing cores available in the
+        system. Set to >= 1 to enable that many multiprocessing instances for
+        data loading.""",
+        type=click.IntRange(min=-1),
+        show_default=True,
+        required=True,
+        default=-1,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--monitoring-interval",
+        "-I",
+        help="""Time between checks for the use of resources during each training
+        epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
+        resources being probed every 5 seconds during each training epoch.
+        Values registered in the training logs correspond to averages (or maxima)
+        observed through possibly many probes in each epoch. Notice that setting a
+        very small value may cause the probing process to become extremely busy,
+        potentially biasing the overall perception of resource usage.""",
+        type=click.FloatRange(min=0.1),
+        show_default=True,
+        required=True,
+        default=5.0,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--resume-from",
+        help="""Which checkpoint to resume training from. If set, can be one of
+        `best`, `last`, or a path to a model checkpoint.""",
+        type=click.STRING,
+        required=False,
+        default=None,
+        cls=ResourceOption,
+    )
+    @click.option(
+        "--balance-classes/--no-balance-classes",
+        "-B/-N",
+        help="""If set, then balances weights of the random sampler during
+        training, so that samples from all sample classes are picked
+        equitably.""",
+        required=True,
+        show_default=True,
+        default=True,
+        cls=ResourceOption,
+    )
+    @functools.wraps(f)
+    def wrapper_reusable_options(*args, **kwargs):
+        return f(*args, **kwargs)
+
+    return wrapper_reusable_options
+
+
 @click.command(
     entry_point_group="ptbench.config",
     cls=ConfigCommand,
@@ -24,182 +233,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
     ptbench train -vv pasa montgomery --batch-size=4 --device="cuda:0"
 """,
 )
-@click.option(
-    "--output-folder",
-    "-o",
-    help="Path where to store the generated model (created if does not exist)",
-    required=True,
-    type=click.Path(),
-    default="results",
-    cls=ResourceOption,
-)
-@click.option(
-    "--model",
-    "-m",
-    help="A lightining module instance implementing the network to be trained",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--datamodule",
-    "-d",
-    help="A lighting data module containing the training and validation sets.",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-size",
-    "-b",
-    help="Number of samples in every batch (this parameter affects "
-    "memory requirements for the network). If the number of samples in "
-    "the batch is larger than the total number of samples available for "
-    "training, this value is truncated. If this number is smaller, then "
-    "batches of the specified size are created and fed to the network "
-    "until there are no more new samples to feed (epoch is finished). "
-    "If the total number of training samples is not a multiple of the "
-    "batch-size, the last batch will be smaller than the first, unless "
-    "--drop-incomplete-batch is set, in which case this batch is not used.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-chunk-count",
-    "-c",
-    help="Number of chunks in every batch (this parameter affects "
-    "memory requirements for the network). The number of samples "
-    "loaded for every iteration will be batch-size/batch-chunk-count. "
-    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
-    "error will be raised. This parameter is used to reduce number of "
-    "samples loaded in each iteration, in order to reduce the memory usage "
-    "in exchange for processing time (more iterations). This is specially "
-    "interesting whe one is running with GPUs with limited RAM. The "
-    "default of 1 forces the whole batch to be processed at once. Otherwise "
-    "the batch is broken into batch-chunk-count pieces, and gradients are "
-    "accumulated to complete each batch.",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--drop-incomplete-batch/--no-drop-incomplete-batch",
-    "-D",
-    help="If set, then may drop the last batch in an epoch, in case it is "
-    "incomplete. If you set this option, you should also consider "
-    "increasing the total number of epochs of training, as the total number "
-    "of training steps may be reduced",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--epochs",
-    "-e",
-    help="Number of epochs (complete training set passes) to train for. "
-    "If continuing from a saved checkpoint, ensure to provide a greater "
-    "number of epochs than that saved on the checkpoint to be loaded. ",
-    show_default=True,
-    required=True,
-    default=1000,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--checkpoint-period",
-    "-p",
-    help="Number of epochs after which a checkpoint is saved. "
-    "A value of zero will disable check-pointing. If checkpointing is "
-    "enabled and training stops, it is automatically resumed from the "
-    "last saved checkpoint if training is restarted with the same "
-    "configuration.",
-    show_default=True,
-    required=False,
-    default=None,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--device",
-    "-x",
-    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
-    show_default=True,
-    required=True,
-    default="cpu",
-    cls=ResourceOption,
-)
-@click.option(
-    "--cache-samples/--no-cache-samples",
-    help="If set to True, loads the sample into memory, "
-    "otherwise loads them at runtime.",
-    required=True,
-    show_default=True,
-    default=False,
-    cls=ResourceOption,
-)
-@click.option(
-    "--seed",
-    "-s",
-    help="Seed to use for the random number generator",
-    show_default=True,
-    required=False,
-    default=42,
-    type=click.IntRange(min=0),
-    cls=ResourceOption,
-)
-@click.option(
-    "--parallel",
-    "-P",
-    help="""Use multiprocessing for data loading: if set to -1 (default),
-    disables multiprocessing data loading. Set to 0 to enable as many data
-    loading instances as processing cores as available in the system. Set to
-    >= 1 to enable that many multiprocessing instances for data loading.""",
-    type=click.IntRange(min=-1),
-    show_default=True,
-    required=True,
-    default=-1,
-    cls=ResourceOption,
-)
-@click.option(
-    "--monitoring-interval",
-    "-I",
-    help="""Time between checks for the use of resources during each training
-    epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
-    resources being probed every 5 seconds during each training epoch.
-    Values registered in the training logs correspond to averages (or maxima)
-    observed through possibly many probes in each epoch. Notice that setting a
-    very small value may cause the probing process to become extremely busy,
-    potentially biasing the overall perception of resource usage.""",
-    type=click.FloatRange(min=0.1),
-    show_default=True,
-    required=True,
-    default=5.0,
-    cls=ResourceOption,
-)
-@click.option(
-    "--resume-from",
-    help="Which checkpoint to resume training from. If set, can be one of "
-    "`best`, `last`, or a path to a model checkpoint.",
-    type=str,
-    required=False,
-    default=None,
-    cls=ResourceOption,
-)
-@click.option(
-    "--balance-classes/--no-balance-classes",
-    "-B/-N",
-    help="""If set, then balances weights of the random sampler during
-    training, so that samples from all sample classes are picked picked
-    equitably.""",
-    required=True,
-    show_default=True,
-    default=True,
-    cls=ResourceOption,
-)
+@reusable_options
 @verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
 def train(
     model,
@@ -237,7 +271,7 @@ def train(
     from ..utils.checkpointer import get_checkpoint
     from .utils import save_sh_command
 
-    save_sh_command(output_folder)
+    save_sh_command(output_folder / "command.sh")
 
     seed_everything(seed)
 
     checkpoint_file = get_checkpoint(output_folder, resume_from)
diff --git a/src/ptbench/scripts/train_analysis.py b/src/ptbench/scripts/train_analysis.py
index ed8d7e6b..7fd99a75 100644
--- a/src/ptbench/scripts/train_analysis.py
+++ b/src/ptbench/scripts/train_analysis.py
@@ -2,20 +2,21 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import os
+import pathlib
 
 import click
+import matplotlib.figure
 import matplotlib.pyplot as plt
 import pandas
 
-from clapper.click import ConfigCommand, ResourceOption, verbosity_option
+from clapper.click import verbosity_option
 from clapper.logging import setup
 from matplotlib.ticker import MaxNLocator
 
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
-def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
+def create_figures(df: pandas.DataFrame) -> list[matplotlib.figure.Figure]:
     """Generates figures for each metric in the dataframe.
 
     Each row of the dataframe correspond to an epoch and each column to a metric.
@@ -84,8 +85,6 @@ def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
 
 
 @click.command(
-    entry_point_group="ptbench.config",
-    cls=ConfigCommand,
     epilog="""Examples:
 
 \b
@@ -98,21 +97,21 @@ def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
 )
 @click.argument(
     "logdir",
-    type=click.Path(dir_okay=True, exists=True),
+    type=click.Path(dir_okay=True, exists=True, path_type=pathlib.Path),
 )
 @click.option(
-    "--output-pdf",
+    "--output",
     "-o",
-    help="Name of the output file to dump",
+    help="Name of the output file to dump (multi-page PDF)",
     required=True,
     show_default=True,
     default="trainlog.pdf",
+    type=click.Path(dir_okay=False, file_okay=True, path_type=pathlib.Path),
 )
-@verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
+@verbosity_option(logger=logger, expose_value=False)
 def train_analysis(
-    logdir: str,
-    output_pdf: str,
-    **_,
+    logdir: pathlib.Path,
+    output: pathlib.Path,
 ) -> None:
     """Creates a plot for each metric in the training logs and saves them in a
     pdf file."""
@@ -123,12 +122,9 @@ def train_analysis(
 
     data = get_scalars(logdir)
 
-    # makes sure the directory to save the output PDF is there
-    dirname = os.path.dirname(os.path.realpath(output_pdf))
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
+    output.parent.mkdir(parents=True, exist_ok=True)
 
-    with PdfPages(output_pdf) as pdf:
+    with PdfPages(output) as pdf:
         for figure in create_figures(data):
             pdf.savefig(figure)
             plt.close(figure)
diff --git a/src/ptbench/scripts/utils.py b/src/ptbench/scripts/utils.py
index 614c2988..511fc884 100644
--- a/src/ptbench/scripts/utils.py
+++ b/src/ptbench/scripts/utils.py
@@ -5,13 +5,14 @@ import importlib.metadata
 import logging
 import os
 import pathlib
+import shutil
 import sys
 import time
 
 logger = logging.getLogger(__name__)
 
 
-def save_sh_command(output_folder: str | pathlib.Path) -> None:
+def save_sh_command(path: pathlib.Path) -> None:
     """Records command-line to reproduce this script.
 
     This function can record the current command-line used to call the script
@@ -24,24 +25,27 @@ def save_sh_command(output_folder: str | pathlib.Path) -> None:
 
     Parameters
     ----------
 
-    output_folder : str
-        Path leading to the directory where the commands to reproduce the current
-        run will be recorded. A subdirectory will be created each time this function
-        is called to match lightning's versioning convention for loggers.
+    path
+        Path to a file where the commands to reproduce the current run will be
+        recorded. Parent directories will be created if they do not exist. Any
+        existing copy will be backed up.
     """
-    if isinstance(output_folder, str):
-        output_folder = pathlib.Path(output_folder)
+    logger.info(f"Writing command-line for reproduction at `{path}`...")
 
-    destfile = output_folder / "command.sh"
+    # create parent directories
+    path.parent.mkdir(parents=True, exist_ok=True)
 
-    logger.info(f"Writing command-line for reproduction at '{destfile}'...")
-    os.makedirs(output_folder, exist_ok=True)
+    # backup if exists
+    if path.exists():
+        backup = path.parent / (path.name + "~")
+        shutil.copy(path, backup)
 
+    # write the file
     package = __name__.split(".", 1)[0]
     version = importlib.metadata.version(package)
-    with destfile.open("w") as f:
+    with path.open("w") as f:
         f.write("#!/usr/bin/env sh\n")
         f.write(f"# date: {time.asctime()}\n")
         f.write(f"# version: {version} ({package})\n")
@@ -57,4 +61,6 @@ def save_sh_command(output_folder: str | pathlib.Path) -> None:
         f.write(f"# conda activate {os.environ['CONDA_DEFAULT_ENV']}\n")
         f.write(f"# cd {os.path.realpath(os.curdir)}\n")
         f.write(" ".join(args) + "\n")
-    os.chmod(destfile, 0o755)
+
+    # make it executable
+    path.chmod(0o755)
diff --git a/src/ptbench/utils/checkpointer.py b/src/ptbench/utils/checkpointer.py
index 88cdfbb7..8f6685b8 100644
--- a/src/ptbench/utils/checkpointer.py
+++ b/src/ptbench/utils/checkpointer.py
@@ -3,78 +3,77 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 
 import logging
-import os
+import pathlib
 import typing
 
 logger = logging.getLogger(__name__)
 
 
 def get_checkpoint(
-    output_folder: str, resume_from: typing.Literal["last", "best"] | str | None
+    output_folder: pathlib.Path,
+    resume_from: typing.Literal["last", "best"] | str | None,
 ) -> str | None:
     """Gets a checkpoint file.
 
     Can return the best or last checkpoint, or a checkpoint at a specific path.
     Ensures the checkpoint exists, raising an error if it is not the case.
 
-    If resume_from is ``None``, checks the output directory if a checkpoint
-    already exists and returns it. If no checkpoint is found, returns ``None``.
+    If ``resume_from`` is ``None``, checks the output directory for a "last"
+    checkpoint file and returns it. If no checkpoint is found, returns
+    ``None``.
+
+    ``resume_from`` can also be a path to an existing checkpoint file. In this
+    case, we check that it exists and return it.
 
 
     Parameters
     ----------
 
-    output_folder
-        Directory in which checkpoints are stored.
-
+    output_folder
+        Folder in which checkpoints are stored.
     resume_from
         Which model to get. Can be one of "best", "last", or a path to a
         checkpoint.
-        If None, gets the last checkpoint if it exists, otherwise returns None
+        If ``None``, gets the last checkpoint if it exists, otherwise returns
+        ``None`` (signal to start from scratch).
 
 
     Returns
     -------
+        Path to the requested checkpoint (as a plain string) or ``None`` (start
+        from scratch).
 
-    checkpoint_file
-        Path to the requested checkpoint or None.
-    """
-    last_checkpoint_path = os.path.join(output_folder, "model_final_epoch.ckpt")
-    best_checkpoint_path = os.path.join(
-        output_folder, "model_lowest_valid_loss.ckpt"
-    )
-
-    if resume_from == "last":
-        if os.path.isfile(last_checkpoint_path):
-            checkpoint_file = last_checkpoint_path
-            logger.info(f"Resuming training from {resume_from} checkpoint")
-        else:
-            raise FileNotFoundError(
-                f"Could not find checkpoint {last_checkpoint_path}"
-            )
-    elif resume_from == "best":
-        if os.path.isfile(best_checkpoint_path):
-            checkpoint_file = last_checkpoint_path
-            logger.info(f"Resuming training from {resume_from} checkpoint")
+    Raises
+    ------
+    FileNotFoundError
+        In case a required file cannot be found.
+    """
+    # standard paths where checkpoints may be (if produced with this framework)
+    last_path = output_folder / "model_final_epoch.ckpt"
+    best_path = output_folder / "model_lowest_valid_loss.ckpt"
+
+    if resume_from in ("last", "best"):
+        use_file = last_path if resume_from == "last" else best_path
+        if use_file.is_file():
+            logger.info(f"Found checkpoint at `{str(use_file)}`")
+            return str(use_file)
         else:
             raise FileNotFoundError(
-                f"Could not find checkpoint {best_checkpoint_path}"
+                f"Could not find a checkpoint file at `{str(use_file)}`"
             )
     elif resume_from is None:
-        if os.path.isfile(last_checkpoint_path):
-            checkpoint_file = last_checkpoint_path
-            logger.info(
-                f"Found existing checkpoint {last_checkpoint_path}. Loading."
-            )
+        # use-case: user is re-starting a crashed/cancelled job
+        if last_path.is_file():
+            logger.info(f"Found checkpoint at `{str(last_path)}`")
+            return str(last_path)
         else:
             return None
-    else:
-        if os.path.isfile(resume_from):
-            checkpoint_file = resume_from
-            logger.info(f"Resuming training from checkpoint {resume_from}")
+    elif isinstance(resume_from, str):
+        if pathlib.Path(resume_from).is_file():
+            logger.info(f"Found checkpoint at `{resume_from}`")
+            return resume_from
         else:
-            raise FileNotFoundError(f"Could not find checkpoint {resume_from}")
-
-    return checkpoint_file
+            raise FileNotFoundError(
+                f"Could not find a checkpoint file at `{resume_from}`"
+            )
diff --git a/src/ptbench/utils/tensorboard.py b/src/ptbench/utils/tensorboard.py
index ac838a44..7e2feaa4 100644
--- a/src/ptbench/utils/tensorboard.py
+++ b/src/ptbench/utils/tensorboard.py
@@ -2,11 +2,8 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import glob
-import os
-
-from collections import defaultdict
-from typing import Any
+import pathlib
+import typing
 
 import pandas
 
@@ -15,7 +12,7 @@ from tensorboard.backend.event_processing.event_accumulator import (
 )
 
 
-def get_scalars(logdir: str) -> pandas.DataFrame:
+def get_scalars(logdir: pathlib.Path) -> pandas.DataFrame:
     """Returns scalars stored in tensorboard event files.
 
     Parameters
@@ -28,17 +25,16 @@ def get_scalars(logdir: str) -> pandas.DataFrame:
 
     Returns
     -------
 
     data:
-        Pandas dataframe containing the results. Rows correspond to an epoch, columns to the metrics.
+        Pandas dataframe containing the results. Rows correspond to an epoch,
+        columns to the metrics.
""" - tensorboard_logs = sorted( - glob.glob(os.path.join(logdir, "events.out.tfevents.*")) - ) + tensorboard_logs = sorted(logdir.glob("events.out.tfevents.*")) - data: dict[str, dict[str, Any]] = defaultdict(dict) + data: dict[str, dict[str, typing.Any]] = {} headers = {"step"} for logfile in tensorboard_logs: - event_accumulator = EventAccumulator(logfile) + event_accumulator = EventAccumulator(str(logfile)) event_accumulator.Reload() tags = event_accumulator.Tags() @@ -52,8 +48,7 @@ def get_scalars(logdir: str) -> pandas.DataFrame: step = tag_data.step value = tag_data.value - data[step]["step"] = step - data[step][scalar_tag] = value + data.setdefault(step, {"step": step})["step"] = step + data.setdefault(step, {scalar_tag: value})[scalar_tag] = value - data = pandas.DataFrame.from_dict(data, orient="index") - return data + return pandas.DataFrame.from_dict(data, orient="index") -- GitLab