diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index 1576f3875ebe5a1ad328293d234ebf0bfdec714d..f4a3f9fc522991b34f6cc25c26ac061e1d3afa9f 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # coding=utf-8
 
+import os
 import click
 from torch.utils.data import DataLoader
 
@@ -58,7 +59,13 @@ logger = logging.getLogger(__name__)
 @click.option(
     "--dataset",
     "-d",
-    help="A torch.utils.data.dataset.Dataset instance implementing a dataset to be used for evaluating predictions, possibly including all pre-processing pipelines required",
+    help="A bob.ip.binseg.data.utils.SampleList2TorchDataset instance "
+    "implementing a dataset to be used for evaluation purposes, possibly "
+    "including all pre-processing pipelines required or, optionally, a "
+    "dictionary mapping string keys to "
+    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  In such a case, "
+    "all datasets will be used for evaluation.  Data augmentation "
+    "operations are excluded automatically in this case",
     required=True,
     cls=ResourceOption,
 )
@@ -96,7 +103,18 @@ def evaluate(output_folder, predictions_folder, dataset, overlayed,
         overlay_threshold, **kwargs):
     """Evaluates an FCN on a binary segmentation task.
     """
-    data_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False,
-            pin_memory=False)
-    run(dataset, predictions_folder, output_folder, overlayed,
-            overlay_threshold)
+    # A single dataset keeps the historical behaviour (results go straight
+    # into ``output_folder``); a dictionary gets one sub-folder per key.
+    if isinstance(dataset, dict):
+        targets = [
+            (os.path.join(output_folder, k), v) for k, v in dataset.items()
+        ]
+    else:
+        targets = [(output_folder, dataset)]
+
+    for analysis_folder, v in targets:
+        # data augmentation is switched off for evaluation, mirroring what
+        # predict.py does before generating the predictions being scored
+        with v.not_augmented() as d:
+            run(d, predictions_folder, analysis_folder, overlayed,
+                overlay_threshold)
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index ec41992defd0cfa201c9da044753dcf0a98fc4f0..e4027229a7dd057065379dc534f6b17ae20083c9 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -22,11 +22,11 @@ logger = logging.getLogger(__name__)
     epilog="""Examples:
 
 \b
-    1. Trains a M2U-Net model (VGG-16 backbone) with STARE (vessel segmentation),
+    1. Trains a M2U-Net model (VGG-16 backbone) with DRIVE (vessel segmentation),
        on the CPU, for only two epochs, then runs inference and evaluation on
        results from its test set:
 
-       $ bob binseg experiment -vv m2unet stare --epochs=2
+       $ bob binseg experiment -vv m2unet drive --epochs=2
 
 """,
 )
@@ -47,20 +47,15 @@ logger = logging.getLogger(__name__)
     cls=ResourceOption,
 )
 @click.option(
-    "--train-dataset",
+    "--dataset",
     "-d",
-    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
-    "to be used for training the model, possibly including all pre-processing"
-    " pipelines required, including data augmentation",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--test-dataset",
-    "-d",
-    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
-    "to be used for testing the model, possibly including all pre-processing"
-    " pipelines required",
+    help="A dictionary mapping string keys to "
+    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  At least one key "
+    "named 'train' must be available.  This dataset will be used for training "
+    "the network model.  All other datasets will be used for prediction and "
+    "evaluation. Dataset descriptions include all required pre-processing, "
+    "including eventual data augmentation, which may be eventually excluded "
+    "for prediction and evaluation purposes",
     required=True,
     cls=ResourceOption,
 )
@@ -209,8 +204,7 @@ def experiment(
     batch_size,
     drop_incomplete_batch,
     criterion,
-    train_dataset,
-    test_dataset,
+    dataset,
     checkpoint_period,
     device,
     seed,
@@ -220,7 +214,7 @@ def experiment(
     verbose,
     **kwargs,
 ):
-    """Runs a complete experiment, from training, prediction and evaluation
+    """Runs a complete experiment, from training, to prediction and evaluation
 
     This script is just a wrapper around the individual scripts for training,
     running prediction and evaluating FCN models.  It organises the output in a
@@ -259,7 +253,7 @@ def experiment(
         batch_size=batch_size,
         drop_incomplete_batch=drop_incomplete_batch,
         criterion=criterion,
-        dataset=train_dataset,
+        dataset=dataset,
         checkpoint_period=checkpoint_period,
         device=device,
         seed=seed,
@@ -282,25 +276,11 @@ def experiment(
         else None
     )
 
-    # train set
-    ctx.invoke(
-        predict,
-        output_folder=predictions_folder,
-        model=model,
-        dataset=train_dataset,
-        batch_size=batch_size,
-        device=device,
-        weight=model_file,
-        overlayed=overlayed_folder,
-        verbose=verbose,
-    )
-
-    # test set
     ctx.invoke(
         predict,
         output_folder=predictions_folder,
         model=model,
-        dataset=test_dataset,
+        dataset=dataset,
         batch_size=batch_size,
         device=device,
         weight=model_file,
@@ -320,41 +300,29 @@ def experiment(
         else None
     )
 
-    # train set
-    train_analysis_folder = os.path.join(output_folder, "analysis", "train")
+    analysis_folder = os.path.join(output_folder, "analysis")
     ctx.invoke(
         evaluate,
-        output_folder=train_analysis_folder,
+        output_folder=analysis_folder,
         predictions_folder=predictions_folder,
-        dataset=train_dataset,
+        dataset=dataset,
         overlayed=overlayed_folder,
         overlay_threshold=0.5,
         verbose=verbose,
     )
 
-    # test set
-    test_analysis_folder = os.path.join(output_folder, "analysis", "test")
-    ctx.invoke(
-        evaluate,
-        output_folder=test_analysis_folder,
-        predictions_folder=predictions_folder,
-        dataset=test_dataset,
-        overlayed=overlayed_folder,
-        overlay_threshold=0.5,
-        verbose=verbose,
-    )
     logger.info("Ended evaluation")
 
     ## Comparison
     logger.info("Started comparison")
 
-    # compare train and test set performances
+    # compare performances on the various sets
     from .compare import compare
 
-    systems = (
-            "train": os.path.join(train_analysis_folder, "metric.csv"),
-            "test": os.path.join(test_analysis_folder, "metric.csv"),
-            )
+    systems = []
+    for k, v in dataset.items():
+        systems += [k, os.path.join(output_folder, "analysis", k, "metrics.csv")]
     output_pdf = os.path.join(output_folder, "comparison.pdf")
     ctx.invoke(compare, label_path=systems, output=output_pdf, verbose=verbose)
-    logger.info("End comparison, and the experiment - bye.")
+
+    logger.info("Ended comparison, and the experiment - bye.")
diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index b079551cdce28f2b92415823584206ea0faea354..d8af8c011c9b5ea2ede8d1056a5c0a9ee1d19bc4 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -61,7 +61,13 @@ logger = logging.getLogger(__name__)
 @click.option(
     "--dataset",
     "-d",
-    help="A torch.utils.data.dataset.Dataset instance implementing a dataset to be used for evaluating the model, possibly including all pre-processing pipelines required",
+    help="A bob.ip.binseg.data.utils.SampleList2TorchDataset instance "
+    "implementing a dataset to be used for running prediction, possibly "
+    "including all pre-processing pipelines required or, optionally, a "
+    "dictionary mapping string keys to "
+    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  In such a case, "
+    "all datasets will be used for running prediction.  Data augmentation "
+    "operations are excluded automatically for prediction purposes",
     required=True,
     cls=ResourceOption,
 )
@@ -108,13 +114,7 @@ def predict(output_folder, model, dataset, batch_size, device, weight,
         overlayed, **kwargs):
     """Predicts vessel map (probabilities) on input images"""
 
-    # PyTorch dataloader
-    data_loader = DataLoader(
-        dataset=dataset,
-        batch_size=batch_size,
-        shuffle=False,
-        pin_memory=torch.cuda.is_available(),
-    )
+    dataset = dataset if isinstance(dataset, dict) else dict(test=dataset)
 
     # checkpointer, loads pre-fit model
     weight_fullpath = os.path.abspath(weight)
@@ -128,4 +128,12 @@ def predict(output_folder, model, dataset, batch_size, device, weight,
     if overlayed is not None:
         overlayed = overlayed.strip()
 
-    run(model, data_loader, device, output_folder, overlayed)
+    for k,v in dataset.items():
+        with v.not_augmented() as d:  # we remove any data augmentation
+            data_loader = DataLoader(
+                dataset=d,
+                batch_size=batch_size,
+                shuffle=False,
+                pin_memory=torch.cuda.is_available(),
+            )
+            run(model, data_loader, device, output_folder, overlayed)
diff --git a/bob/ip/binseg/script/train.py b/bob/ip/binseg/script/train.py
index a027544418d87ac8a9779e74c60d72686d072fee..5df8ccfb9a6a5391761dcc490415f67931832a9f 100644
--- a/bob/ip/binseg/script/train.py
+++ b/bob/ip/binseg/script/train.py
@@ -64,8 +64,12 @@ logger = logging.getLogger(__name__)
     "--dataset",
     "-d",
     help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
-    "to be used for training the model, possibly including all pre-processing"
-    " pipelines required",
+    "to be used for training the model, possibly including all pre-processing "
+    "pipelines required or, optionally, a dictionary mapping string keys to "
+    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  At least one key "
+    "named 'train' must be available.  This dataset will be used for training "
+    "the network model.  The dataset description include all required "
+    "pre-processing, including eventual data augmentation",
     required=True,
     cls=ResourceOption,
 )
@@ -222,7 +226,7 @@ def train(
 
     # PyTorch dataloader
     data_loader = DataLoader(
-        dataset=dataset,
+        dataset=dataset["train"] if isinstance(dataset, dict) else dataset,
         batch_size=batch_size,
         shuffle=True,
         drop_last=drop_incomplete_batch,