diff --git a/src/ptbench/scripts/experiment.py b/src/ptbench/scripts/experiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c43db6a286e7f5d46dc1197867b248aed87dbcd
--- /dev/null
+++ b/src/ptbench/scripts/experiment.py
@@ -0,0 +1,342 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import os
+import shutil
+
+import click
+
+from clapper.click import ConfigCommand, ResourceOption, verbosity_option
+from clapper.logging import setup
+
+logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+
+from .utils import save_sh_command
+
+
+@click.command(
+    entry_point_group="ptbench.config",
+    cls=ConfigCommand,
+    epilog="""Examples:
+
+\b
+ 1. Trains a pasa model with the shenzhen dataset, on the CPU, for only two
+    epochs, then runs inference and evaluation on stock datasets, reporting
+    performance as a table and a figure:
+
+    .. code:: sh
+
+       $ ptbench experiment -vv pasa shenzhen --epochs=2
+""",
+)
+@click.option(
+    "--output-folder",
+    "-o",
+    help="Path where to store experiment outputs (created if it does not exist)",
+    required=True,
+    type=click.Path(),
+    default="results",
+    cls=ResourceOption,
+)
+@click.option(
+    "--model",
+    "-m",
+    help="A lightning module instance implementing the network to be trained",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--datamodule",
+    "-d",
+    help="A lightning data module containing the training and validation sets",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--batch-size",
+    "-b",
+    help="Number of samples in every batch (this parameter affects "
+    "memory requirements for the network). If the number of samples in "
+    "the batch is larger than the total number of samples available for "
+    "training, this value is truncated. If this number is smaller, then "
+    "batches of the specified size are created and fed to the network "
+    "until there are no more new samples to feed (epoch is finished). "
+    "If the total number of training samples is not a multiple of the "
+    "batch-size, the last batch will be smaller than the first, unless "
+    "--drop-incomplete-batch is set, in which case this batch is not used.",
+    required=True,
+    show_default=True,
+    default=1,
+    type=click.IntRange(min=1),
+    cls=ResourceOption,
+)
+@click.option(
+    "--batch-chunk-count",
+    "-c",
+    help="Number of chunks in every batch (this parameter affects "
+    "memory requirements for the network). The number of samples "
+    "loaded for every iteration will be batch-size/batch-chunk-count. "
+    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
+    "error will be raised. This parameter is used to reduce the number of "
+    "samples loaded in each iteration, in order to reduce memory usage "
+    "in exchange for processing time (more iterations). This is especially "
+    "interesting when one is running on GPUs with limited RAM. The "
+    "default of 1 forces the whole batch to be processed at once. Otherwise "
+    "the batch is broken into batch-chunk-count pieces, and gradients are "
+    "accumulated to complete each batch.",
+    required=True,
+    show_default=True,
+    default=1,
+    type=click.IntRange(min=1),
+    cls=ResourceOption,
+)
+@click.option(
+    "--drop-incomplete-batch/--no-drop-incomplete-batch",
+    "-D",
+    help="If set, then may drop the last batch in an epoch in case it is "
+    "incomplete. If you set this option, you should also consider "
+    "increasing the total number of epochs of training, as the total number "
+    "of training steps may be reduced.",
+    required=True,
+    show_default=True,
+    default=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--epochs",
+    "-e",
+    help="Number of epochs (complete training set passes) to train for. "
+    "If continuing from a saved checkpoint, ensure to provide a greater "
+    "number of epochs than that saved in the checkpoint to be loaded.",
+    show_default=True,
+    required=True,
+    default=1000,
+    type=click.IntRange(min=1),
+    cls=ResourceOption,
+)
+@click.option(
+    "--checkpoint-period",
+    "-p",
+    help="Number of epochs after which a checkpoint is saved. "
+    "A value of zero will disable check-pointing. If checkpointing is "
+    "enabled and training stops, it is automatically resumed from the "
+    "last saved checkpoint if training is restarted with the same "
+    "configuration.",
+    show_default=True,
+    required=False,
+    default=None,
+    type=click.IntRange(min=0),
+    cls=ResourceOption,
+)
+@click.option(
+    "--device",
+    "-x",
+    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
+    show_default=True,
+    required=True,
+    default="cpu",
+    cls=ResourceOption,
+)
+@click.option(
+    "--cache-samples/--no-cache-samples",
+    help="If set to True, loads samples into memory, "
+    "otherwise loads them at runtime.",
+    required=True,
+    show_default=True,
+    default=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--seed",
+    "-s",
+    help="Seed to use for the random number generator",
+    show_default=True,
+    required=False,
+    default=42,
+    type=click.IntRange(min=0),
+    cls=ResourceOption,
+)
+@click.option(
+    "--parallel",
+    "-P",
+    help="""Use multiprocessing for data loading: if set to -1 (default),
+    disables multiprocessing data loading.  Set to 0 to enable as many data
+    loading instances as there are processing cores available on the system.
+    Set to >= 1 to enable that many multiprocessing instances for data
+    loading.""",
+    type=click.IntRange(min=-1),
+    show_default=True,
+    required=True,
+    default=-1,
+    cls=ResourceOption,
+)
+@click.option(
+    "--monitoring-interval",
+    "-I",
+    help="""Time between checks for the use of resources during each training
+    epoch.  An interval of 5 seconds, for example, will lead to CPU and GPU
+    resources being probed every 5 seconds during each training epoch.
+    Values registered in the training logs correspond to averages (or maxima)
+    observed through possibly many probes in each epoch.  Notice that setting
+    a very small value may cause the probing process to become extremely
+    busy, potentially biasing the overall perception of resource usage.""",
+    type=click.FloatRange(min=0.1),
+    show_default=True,
+    required=True,
+    default=5.0,
+    cls=ResourceOption,
+)
+@click.option(
+    "--resume-from",
+    help="Which checkpoint to resume training from.  If set, can be one of "
+    "`best`, `last`, or a path to a model checkpoint.",
+    type=str,
+    required=False,
+    default=None,
+    cls=ResourceOption,
+)
+@click.option(
+    "--balance-classes/--no-balance-classes",
+    "-B/-N",
+    help="""If set, then balances weights of the random sampler during
+    training, so that samples from all sample classes are picked equitably.
+    It also sets the training (and validation) losses to account for the
+    populations of each class.""",
+    required=True,
+    show_default=True,
+    default=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--steps",
+    "-S",
+    help="Number of threshold steps to consider when evaluating the highest "
+    "possible F1-score on test data.",
+    default=1000,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--plot-limits",
+    "-L",
+    help="""If set, this option affects the performance comparison plots.  It
+    must be a 4-tuple containing the bounds of the plot for the x and y axes,
+    respectively (format: [x_low, x_high, y_low, y_high]).  If not set, uses
+    the normal bounds ([0, 1, 0, 1]) for the performance curve.""",
+    default=[0.0, 1.0, 0.0, 1.0],
+    show_default=True,
+    nargs=4,
+    type=float,
+    cls=ResourceOption,
+)
+@verbosity_option(logger=logger, cls=ResourceOption)
+@click.pass_context
+def experiment(
+    ctx,
+    model,
+    output_folder,
+    epochs,
+    batch_size,
+    batch_chunk_count,
+    drop_incomplete_batch,
+    datamodule,
+    checkpoint_period,
+    device,
+    cache_samples,
+    seed,
+    parallel,
+    monitoring_interval,
+    resume_from,
+    balance_classes,
+    steps,
+    **kwargs,
+):
+    """Runs a complete experiment, from training to prediction and evaluation.
+
+    This script is just a wrapper around the individual scripts for training,
+    running prediction, evaluating and comparing model performance.  It
+    organises the output in a preset way::
+
+    \b
+       └─ <output-folder>/
+          ├── command
+          ├── model/        # the generated model will be here
+          ├── predictions/  # the prediction outputs for the sets
+          └── evaluations/  # the outputs of the evaluations for the sets
+    """
+
+    command_sh = os.path.join(output_folder, "command.sh")
+    if os.path.exists(command_sh):
+        # a previous run created this file; back it up before overwriting
+        backup = command_sh + "~"
+        if os.path.exists(backup):
+            os.unlink(backup)
+        shutil.move(command_sh, backup)
+    save_sh_command(output_folder)
+
+    # training
+    logger.info("Started training")
+
+    from .train import train
+
+    train_output_folder = os.path.join(output_folder, "model")
+    ctx.invoke(
+        train,
+        model=model,
+        output_folder=train_output_folder,
+        epochs=epochs,
+        batch_size=batch_size,
+        batch_chunk_count=batch_chunk_count,
+        drop_incomplete_batch=drop_incomplete_batch,
+        datamodule=datamodule,
+        checkpoint_period=checkpoint_period,
+        device=device,
+        cache_samples=cache_samples,
+        seed=seed,
+        parallel=parallel,
+        monitoring_interval=monitoring_interval,
+        resume_from=resume_from,
+        balance_classes=balance_classes,
+    )
+    logger.info("Ended training")
+
+    logger.info("Started predicting")
+
+    from .predict import predict
+
+    # preferably, we use the best model on the validation set
+    # otherwise, we get the last saved model
+    model_file = os.path.join(
+        train_output_folder, "model_lowest_valid_loss.ckpt"
+    )
+    if not os.path.exists(model_file):
+        model_file = os.path.join(train_output_folder, "model_final_epoch.ckpt")
+
+    predictions_folder = os.path.join(output_folder, "predictions")
+
+    ctx.invoke(
+        predict,
+        output_folder=predictions_folder,
+        model=model,
+        datamodule=datamodule,
+        device=device,
+        weight=model_file,
+    )
+
+    logger.info("Ended predicting")
+
+    logger.info("Started evaluating")
+
+    from .evaluate import evaluate
+
+    evaluations_folder = os.path.join(output_folder, "evaluations")
+
+    ctx.invoke(
+        evaluate,
+        output_folder=evaluations_folder,
+        predictions_folder=predictions_folder,
+        datamodule=datamodule,
+        threshold="train",
+        steps=steps,
+    )
+
+    logger.info("Ended evaluating")
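
For reviewers, the `--batch-size`/`--batch-chunk-count` contract documented above amounts to gradient accumulation: each iteration loads `batch-size / batch-chunk-count` samples and the optimizer steps once per full batch. A minimal sketch of that arithmetic (the helper name is hypothetical and not part of this patch):

```python
def samples_per_iteration(batch_size: int, batch_chunk_count: int) -> int:
    """Samples loaded per forward/backward pass, per the help text above.

    The batch is split into equally-sized chunks; gradients are
    accumulated across chunks before each optimizer step.
    """
    if batch_size % batch_chunk_count != 0:
        raise ValueError(
            f"batch-size ({batch_size}) must be divisible by "
            f"batch-chunk-count ({batch_chunk_count})"
        )
    return batch_size // batch_chunk_count


# an effective batch of 16 samples, processed as 4 chunks of 4 samples each:
assert samples_per_iteration(16, 4) == 4
```

With `--batch-chunk-count=1` (the default) the whole batch is processed at once; larger values trade extra iterations for lower peak memory on the device.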
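As a usage note, the options compose on the command line; for instance, a hypothetical longer run on a GPU, reusing the stock `pasa` and `shenzhen` configs from the epilog (epoch counts, sizes, and the output path are illustrative):

```sh
$ ptbench experiment -vv pasa shenzhen \
    --epochs=100 \
    --batch-size=16 --batch-chunk-count=4 \
    --device=cuda:0 \
    --output-folder=results/pasa-shenzhen
```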