diff --git a/src/ptbench/scripts/experiment.py b/src/ptbench/scripts/experiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c43db6a286e7f5d46dc1197867b248aed87dbcd
--- /dev/null
+++ b/src/ptbench/scripts/experiment.py
@@ -0,0 +1,355 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import os
+import shutil
+
+import click
+
+from clapper.click import ConfigCommand, ResourceOption, verbosity_option
+from clapper.logging import setup
+
+logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+
+from .utils import save_sh_command
+
+
+@click.command(
+    entry_point_group="ptbench.config",
+    cls=ConfigCommand,
+    epilog="""Examples:
+
+\b
+  1. Trains a pasa model on the shenzhen dataset, on the CPU, for only two epochs, then runs
+     inference and evaluation on stock datasets, reporting performance as a table and a figure:
+
+     .. code:: sh
+
+        $ ptbench experiment -vv pasa shenzhen --epochs=2
+""",
+)
+@click.option(
+    "--output-folder",
+    "-o",
+    help="Path where to store experiment outputs (created if does not exist)",
+    required=True,
+    type=click.Path(),
+    default="results",
+    cls=ResourceOption,
+)
+@click.option(
+    "--model",
+    "-m",
+    help="A lightining module instance implementing the network to be trained",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--datamodule",
+    "-d",
+    help="A lighting data module containing the training and validation sets",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--batch-size",
+    "-b",
+    help="Number of samples in every batch (this parameter affects "
+    "memory requirements for the network).  If the number of samples in "
+    "the batch is larger than the total number of samples available for "
+    "training, this value is truncated.  If this number is smaller, then "
+    "batches of the specified size are created and fed to the network "
+    "until there are no more new samples to feed (epoch is finished).  "
+    "If the total number of training samples is not a multiple of the "
+    "batch-size, the last batch will be smaller than the first, unless "
+    "--drop-incomplete-batch is set, in which case this batch is not used.",
+    required=True,
+    show_default=True,
+    default=1,
+    type=click.IntRange(min=1),
+    cls=ResourceOption,
+)
+@click.option(
+    "--batch-chunk-count",
+    "-c",
+    help="Number of chunks in every batch (this parameter affects "
+    "memory requirements for the network). The number of samples "
+    "loaded for every iteration will be batch-size/batch-chunk-count. "
+    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
+    "error will be raised. This parameter is used to reduce number of "
+    "samples loaded in each iteration, in order to reduce the memory usage "
+    "in exchange for processing time (more iterations).  This is specially "
+    "interesting whe one is running with GPUs with limited RAM. The "
+    "default of 1 forces the whole batch to be processed at once.  Otherwise "
+    "the batch is broken into batch-chunk-count pieces, and gradients are "
+    "accumulated to complete each batch.",
+    required=True,
+    show_default=True,
+    default=1,
+    type=click.IntRange(min=1),
+    cls=ResourceOption,
+)
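+# Illustrative example (hypothetical values): with --batch-size=16 and
+# --batch-chunk-count=4, each iteration loads 16/4 = 4 samples, and gradients
+# are accumulated over the 4 chunks before the optimizer steps, emulating a
+# full batch of 16 at roughly a quarter of the peak memory.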
+@click.option(
+    "--drop-incomplete-batch/--no-drop-incomplete-batch",
+    "-D",
+    help="If set, then may drop the last batch in an epoch, in case it is "
+    "incomplete.  If you set this option, you should also consider "
+    "increasing the total number of epochs of training, as the total number "
+    "of training steps may be reduced",
+    required=True,
+    show_default=True,
+    default=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--epochs",
+    "-e",
+    help="Number of epochs (complete training set passes) to train for. "
+    "If continuing from a saved checkpoint, ensure to provide a greater "
+    "number of epochs than that saved on the checkpoint to be loaded. ",
+    show_default=True,
+    required=True,
+    default=1000,
+    type=click.IntRange(min=1),
+    cls=ResourceOption,
+)
+@click.option(
+    "--checkpoint-period",
+    "-p",
+    help="Number of epochs after which a checkpoint is saved. "
+    "A value of zero will disable check-pointing. If checkpointing is "
+    "enabled and training stops, it is automatically resumed from the "
+    "last saved checkpoint if training is restarted with the same "
+    "configuration.",
+    show_default=True,
+    required=False,
+    default=None,
+    type=click.IntRange(min=0),
+    cls=ResourceOption,
+)
+@click.option(
+    "--device",
+    "-d",
+    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
+    show_default=True,
+    required=True,
+    default="cpu",
+    cls=ResourceOption,
+)
+@click.option(
+    "--cache-samples/--no-cache-samples",
+    help="If set to True, loads the sample into memory, "
+    "otherwise loads them at runtime.",
+    required=True,
+    show_default=True,
+    default=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--seed",
+    "-s",
+    help="Seed to use for the random number generator",
+    show_default=True,
+    required=False,
+    default=42,
+    type=click.IntRange(min=0),
+    cls=ResourceOption,
+)
+@click.option(
+    "--parallel",
+    "-P",
+    help="""Use multiprocessing for data loading: if set to -1 (default),
+    disables multiprocessing data loading.  Set to 0 to enable as many data
+    loading instances as processing cores as available in the system.  Set to
+    >= 1 to enable that many multiprocessing instances for data loading.""",
+    type=click.IntRange(min=-1),
+    show_default=True,
+    required=True,
+    default=-1,
+    cls=ResourceOption,
+)
+@click.option(
+    "--monitoring-interval",
+    "-I",
+    help="""Time between checks for the use of resources during each training
+    epoch.  An interval of 5 seconds, for example, will lead to CPU and GPU
+    resources being probed every 5 seconds during each training epoch.
+    Values registered in the training logs correspond to averages (or maxima)
+    observed through possibly many probes in each epoch.  Notice that setting a
+    very small value may cause the probing process to become extremely busy,
+    potentially biasing the overall perception of resource usage.""",
+    type=click.FloatRange(min=0.1),
+    show_default=True,
+    required=True,
+    default=5.0,
+    cls=ResourceOption,
+)
+@click.option(
+    "--resume-from",
+    help="Which checkpoint to resume training from. If set, can be one of "
+    "`best`, `last`, or a path to a model checkpoint.",
+    type=str,
+    required=False,
+    default=None,
+    cls=ResourceOption,
+)
+@click.option(
+    "--balance-classes/--no-balance-classes",
+    "-B/-N",
+    help="""If set, then balances weights of the random sampler during
+    training, so that samples from all sample classes are picked picked
+    equitably.  It also sets the training (and validation) losses to account
+    for the populations of each class.""",
+    required=True,
+    show_default=True,
+    default=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--steps",
+    "-S",
+    help="This number is used to define the number of threshold steps to "
+    "consider when evaluating the highest possible F1-score on test data.",
+    default=1000,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--plot-limits",
+    "-L",
+    help="""If set, this option affects the performance comparison plots.  It
+    must be a 4-tuple containing the bounds of the plot for the x and y axis
+    respectively (format: x_low, x_high, y_low, y_high]).  If not set, use
+    normal bounds ([0, 1, 0, 1]) for the performance curve.""",
+    default=[0.0, 1.0, 0.0, 1.0],
+    show_default=True,
+    nargs=4,
+    type=float,
+    cls=ResourceOption,
+)
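+# Example (hypothetical invocation): `-L 0.5 1.0 0.5 1.0` would zoom the
+# comparison plot into the upper-right quadrant of the performance curve.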
+@verbosity_option(logger=logger, cls=ResourceOption)
+@click.pass_context
+def experiment(
+    ctx,
+    model,
+    output_folder,
+    epochs,
+    batch_size,
+    batch_chunk_count,
+    drop_incomplete_batch,
+    datamodule,
+    checkpoint_period,
+    device,
+    cache_samples,
+    seed,
+    parallel,
+    monitoring_interval,
+    resume_from,
+    balance_classes,
+    steps,
+    **kwargs,
+):
+    """Runs a complete experiment, from training, to prediction and evaluation.
+
+    This script is just a wrapper around the individual scripts for training,
+    running prediction, evaluating and comparing model performance.  It
+    organises the output in a preset way::
+
+        \b
+        <output-folder>/
+        ├── command.sh
+        ├── model/  # the generated model will be here
+        ├── predictions/  # the prediction outputs for the sets
+        └── evaluations/  # the outputs of the evaluations for the sets
+    """
+
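+    # Keep a single backup of any previous command record: an existing
+    # command.sh is renamed to command.sh~ (replacing an older backup, if
+    # any) before the new command line is saved.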
+    command_sh = os.path.join(output_folder, "command.sh")
+    if os.path.exists(command_sh):
+        backup = command_sh + "~"
+        if os.path.exists(backup):
+            os.unlink(backup)
+        shutil.move(command_sh, backup)
+    save_sh_command(output_folder)
+
+    # training
+    logger.info("Started training")
+
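+    # Deferred import: presumably keeps CLI start-up light by loading
+    # training dependencies only when this command actually runs.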
+    from .train import train
+
+    train_output_folder = os.path.join(output_folder, "model")
+    ctx.invoke(
+        train,
+        model=model,
+        output_folder=train_output_folder,
+        epochs=epochs,
+        batch_size=batch_size,
+        batch_chunk_count=batch_chunk_count,
+        drop_incomplete_batch=drop_incomplete_batch,
+        datamodule=datamodule,
+        checkpoint_period=checkpoint_period,
+        device=device,
+        cache_samples=cache_samples,
+        seed=seed,
+        parallel=parallel,
+        monitoring_interval=monitoring_interval,
+        resume_from=resume_from,
+        balance_classes=balance_classes,
+    )
+    logger.info("Ended training")
+
+    logger.info("Started predicting")
+
+    from .predict import predict
+
+    # Preferably, use the checkpoint with the lowest validation loss;
+    # otherwise, fall back to the one saved at the final epoch.
+    model_file = os.path.join(
+        train_output_folder, "model_lowest_valid_loss.ckpt"
+    )
+    if not os.path.exists(model_file):
+        model_file = os.path.join(train_output_folder, "model_final_epoch.ckpt")
+
+    predictions_folder = os.path.join(output_folder, "predictions")
+
+    ctx.invoke(
+        predict,
+        output_folder=predictions_folder,
+        model=model,
+        datamodule=datamodule,
+        device=device,
+        weight=model_file,
+    )
+
+    logger.info("Ended predicting")
+
+    logger.info("Started evaluating")
+
+    from .evaluate import evaluate
+
+    evaluations_folder = os.path.join(output_folder, "evaluations")
+
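+    # Passing threshold="train" asks the evaluate script to tune the decision
+    # threshold on the training split before applying it to the other sets.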
+    ctx.invoke(
+        evaluate,
+        output_folder=evaluations_folder,
+        predictions_folder=predictions_folder,
+        datamodule=datamodule,
+        threshold="train",
+        steps=steps,
+    )
+
+    logger.info("Ended evaluating")