Commit a9dbca59 authored by Daniel CARRON

Added experiment script

parent 84c7a7f7
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
import os
import shutil

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from .utils import save_sh_command

@click.command(
    entry_point_group="ptbench.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Trains a pasa model with the shenzhen dataset, on the CPU, for only two
       epochs, then runs inference and evaluation on stock datasets, reporting
       performance as a table and a figure:

       .. code:: sh

          $ ptbench experiment -vv pasa shenzhen --epochs=2
""",
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store experiment outputs (created if it does not exist)",
    required=True,
    type=click.Path(),
    default="results",
    cls=ResourceOption,
)
@click.option(
    "--model",
    "-m",
    help="A lightning module instance implementing the network to be trained",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--datamodule",
    "-d",
    help="A lightning data module containing the training and validation sets",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--batch-size",
    "-b",
    help="Number of samples in every batch (this parameter affects "
    "memory requirements for the network). If the number of samples in "
    "the batch is larger than the total number of samples available for "
    "training, this value is truncated. If this number is smaller, then "
    "batches of the specified size are created and fed to the network "
    "until there are no more new samples to feed (epoch is finished). "
    "If the total number of training samples is not a multiple of the "
    "batch-size, the last batch will be smaller than the first, unless "
    "--drop-incomplete-batch is set, in which case this batch is not used.",
    required=True,
    show_default=True,
    default=1,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--batch-chunk-count",
    "-c",
    help="Number of chunks in every batch (this parameter affects "
    "memory requirements for the network). The number of samples "
    "loaded for every iteration will be batch-size/batch-chunk-count. "
    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
    "error will be raised. This parameter is used to reduce the number of "
    "samples loaded in each iteration, in order to reduce the memory usage "
    "in exchange for processing time (more iterations). This is especially "
    "interesting when one is running with GPUs with limited RAM. The "
    "default of 1 forces the whole batch to be processed at once. Otherwise "
    "the batch is broken into batch-chunk-count pieces, and gradients are "
    "accumulated to complete each batch.",
    required=True,
    show_default=True,
    default=1,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--drop-incomplete-batch/--no-drop-incomplete-batch",
    "-D",
    help="If set, the last batch in an epoch is dropped if it is "
    "incomplete. If you set this option, you should also consider "
    "increasing the total number of training epochs, as the total number "
    "of training steps may be reduced.",
    required=True,
    show_default=True,
    default=False,
    cls=ResourceOption,
)
@click.option(
    "--epochs",
    "-e",
    help="Number of epochs (complete training set passes) to train for. "
    "If continuing from a saved checkpoint, ensure you provide a greater "
    "number of epochs than the one saved in the checkpoint to be loaded.",
    show_default=True,
    required=True,
    default=1000,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-period",
    "-p",
    help="Number of epochs after which a checkpoint is saved. "
    "A value of zero will disable checkpointing. If checkpointing is "
    "enabled and training stops, it is automatically resumed from the "
    "last saved checkpoint if training is restarted with the same "
    "configuration.",
    show_default=True,
    required=False,
    default=None,
    type=click.IntRange(min=0),
    cls=ResourceOption,
)
@click.option(
    "--device",
    "-x",
    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
    show_default=True,
    required=True,
    default="cpu",
    cls=ResourceOption,
)
@click.option(
    "--cache-samples/--no-cache-samples",
    help="If set, loads all samples into memory; "
    "otherwise they are loaded at runtime.",
    required=True,
    show_default=True,
    default=False,
    cls=ResourceOption,
)
@click.option(
    "--seed",
    "-s",
    help="Seed to use for the random number generator",
    show_default=True,
    required=False,
    default=42,
    type=click.IntRange(min=0),
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-P",
    help="""Use multiprocessing for data loading: if set to -1 (default),
    disables multiprocessing data loading. Set to 0 to enable as many data
    loading instances as there are processing cores available in the system.
    Set to >= 1 to enable that many multiprocessing instances for data
    loading.""",
    type=click.IntRange(min=-1),
    show_default=True,
    required=True,
    default=-1,
    cls=ResourceOption,
)
@click.option(
    "--monitoring-interval",
    "-I",
    help="""Time between checks for the use of resources during each training
    epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
    resources being probed every 5 seconds during each training epoch.
    Values registered in the training logs correspond to averages (or maxima)
    observed through possibly many probes in each epoch. Notice that setting a
    very small value may cause the probing process to become extremely busy,
    potentially biasing the overall perception of resource usage.""",
    type=click.FloatRange(min=0.1),
    show_default=True,
    required=True,
    default=5.0,
    cls=ResourceOption,
)
@click.option(
    "--resume-from",
    help="Which checkpoint to resume training from. If set, can be one of "
    "`best`, `last`, or a path to a model checkpoint.",
    type=str,
    required=False,
    default=None,
    cls=ResourceOption,
)
@click.option(
    "--balance-classes/--no-balance-classes",
    "-B/-N",
    help="""If set, then balances weights of the random sampler during
    training, so that samples from all sample classes are picked
    equitably. It also sets the training (and validation) losses to account
    for the populations of each class.""",
    required=True,
    show_default=True,
    default=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="Number of threshold steps to consider when evaluating the "
    "highest possible F1-score on test data.",
    default=1000,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--plot-limits",
    "-L",
    help="""If set, this option affects the performance comparison plots. It
    must be a 4-tuple containing the bounds of the plot for the x and y axes
    respectively (format: [x_low, x_high, y_low, y_high]). If not set, use
    normal bounds ([0, 1, 0, 1]) for the performance curve.""",
    default=[0.0, 1.0, 0.0, 1.0],
    show_default=True,
    nargs=4,
    type=float,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def experiment(
    ctx,
    model,
    output_folder,
    epochs,
    batch_size,
    batch_chunk_count,
    drop_incomplete_batch,
    datamodule,
    checkpoint_period,
    device,
    cache_samples,
    seed,
    parallel,
    monitoring_interval,
    resume_from,
    balance_classes,
    steps,
    **kwargs,
):
"""Runs a complete experiment, from training, to prediction and evaluation.
This script is just a wrapper around the individual scripts for training,
running prediction, evaluating and comparing model performance. It
organises the output in a preset way::
\b
└─ <output-folder>/
├── command
├── model/ #the generated model will be here
├── predictions/ #the prediction outputs for the sets
└── evaluations/ #the outputs of the evaluations for the sets
"""
    command_sh = os.path.join(output_folder, "command.sh")
    if os.path.exists(command_sh):
        backup = command_sh + "~"
        if os.path.exists(backup):
            os.unlink(backup)
        shutil.move(command_sh, backup)
    save_sh_command(output_folder)
    # training
    logger.info("Started training")

    from .train import train

    train_output_folder = os.path.join(output_folder, "model")

    ctx.invoke(
        train,
        model=model,
        output_folder=train_output_folder,
        epochs=epochs,
        batch_size=batch_size,
        batch_chunk_count=batch_chunk_count,
        drop_incomplete_batch=drop_incomplete_batch,
        datamodule=datamodule,
        checkpoint_period=checkpoint_period,
        device=device,
        cache_samples=cache_samples,
        seed=seed,
        parallel=parallel,
        monitoring_interval=monitoring_interval,
        resume_from=resume_from,
        balance_classes=balance_classes,
    )
    logger.info("Ended training")
logger.info("Started predicting")
from .predict import predict
# preferably, we use the best model on the validation set
# otherwise, we get the last saved model
model_file = os.path.join(
train_output_folder, "model_lowest_valid_loss.ckpt"
)
if not os.path.exists(model_file):
model_file = os.path.join(train_output_folder, "model_final_epoch.ckpt")
predictions_folder = os.path.join(output_folder, "predictions")
ctx.invoke(
predict,
output_folder=predictions_folder,
model=model,
datamodule=datamodule,
device=device,
weight=model_file,
)
logger.info("Ended predicting")
logger.info("Started evaluating")
from .evaluate import evaluate
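    # evaluation: uses a decision threshold tuned on the training split
    # (threshold="train")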
    evaluations_folder = os.path.join(output_folder, "evaluations")

    ctx.invoke(
        evaluate,
        output_folder=evaluations_folder,
        predictions_folder=predictions_folder,
        datamodule=datamodule,
        threshold="train",
        steps=steps,
    )
    logger.info("Ended evaluating")