diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 96870093a79892e8dfe69fc35cfc022e905c7eb7..d6687554cc89c2340fcd7f653ebb387e05d64c3c 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -19,6 +19,7 @@ from ..utils.metric import base_metrics
 from ..utils.plot import precision_recall_f1iso_confintval
 
 import logging
+
 logger = logging.getLogger(__name__)
 
@@ -86,40 +87,60 @@ def _sample_metrics(pred, gt):
 
     for threshold in numpy.arange(0.0, 1.0, step_size):
 
-        tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
+        tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
+            pred, gt, threshold
+        )
 
         # calc metrics from scalars
         tp_count = torch.sum(tp_tensor).item()
         fp_count = torch.sum(fp_tensor).item()
         tn_count = torch.sum(tn_tensor).item()
         fn_count = torch.sum(fn_tensor).item()
-        precision, recall, specificity, accuracy, jaccard, f1_score = \
-                base_metrics(tp_count, fp_count, tn_count, fn_count)
-
-        data.append([threshold, precision, recall, specificity,
-                     accuracy, jaccard, f1_score])
-
-    return pandas.DataFrame(data, columns=(
-        "threshold",
-        "precision",
-        "recall",
-        "specificity",
-        "accuracy",
-        "jaccard",
-        "f1_score",
-        ))
+        (
+            precision,
+            recall,
+            specificity,
+            accuracy,
+            jaccard,
+            f1_score,
+        ) = base_metrics(tp_count, fp_count, tn_count, fn_count)
+
+        data.append(
+            [
+                threshold,
+                precision,
+                recall,
+                specificity,
+                accuracy,
+                jaccard,
+                f1_score,
+            ]
+        )
+
+    return pandas.DataFrame(
+        data,
+        columns=(
+            "threshold",
+            "precision",
+            "recall",
+            "specificity",
+            "accuracy",
+            "jaccard",
+            "f1_score",
+        ),
+    )
 
 
 def _sample_analysis(
-        img,
-        pred,
-        gt,
-        threshold,
-        tp_color=(0, 255, 0),  # (128,128,128) Gray
-        fp_color=(0, 0, 255),  # (70, 240, 240) Cyan
-        fn_color=(255, 0, 0),  # (245, 130, 48) Orange
-        overlay=True,
-        ):
+    img,
+    pred,
+    gt,
+    threshold,
+    tp_color=(0, 255, 0),  # (128,128,128) Gray
+    fp_color=(0, 0, 255),  # (70, 240, 240) Cyan
+    fn_color=(255, 0, 0),  # (245, 130, 48) Orange
+    overlay=True,
+):
     """Visualizes true positives, false positives and false negatives
 
@@ -186,8 +207,13 @@ def _sample_analysis(
     return tp_pil_colored
 
 
-def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
-        overlay_threshold=None):
+def run(
+    dataset,
+    predictions_folder,
+    output_folder=None,
+    overlayed_folder=None,
+    threshold=None,
+):
     """
     Runs inference and calculates metrics
 
@@ -202,19 +228,21 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
        folder where predictions for the dataset images has been previously
        stored
 
-    output_folder : str
-        folder where to store results
+    output_folder : :py:class:`str`, Optional
+        folder where to store results. If not provided, then do not store any
+        analysis (useful for quickly calculating overlay thresholds)
 
     overlayed_folder : :py:class:`str`, Optional
        if not ``None``, then it should be the name of a folder where to store
       overlayed versions of the images and ground-truths
 
-    overlay_threshold : :py:class:`float`, Optional
+    threshold : :py:class:`float`, Optional
       if ``overlayed_folder``, then this should be threshold (floating point)
       to apply to prediction maps to decide on positives and negatives for
       overlaying analysis (graphical output). This number should come from
       the training set or a separate validation set. Using a test set value
-        may bias your analysis.
+        may bias your analysis. This number is also used to print the a priori
+        F1-score on the evaluated set.
 
     Returns
 
@@ -225,12 +253,6 @@
 
     """
 
-    logger.info(f"Output folder: {output_folder}")
-
-    if not os.path.exists(output_folder):
-        logger.info(f"Creating {output_folder}...")
-        os.makedirs(output_folder, exist_ok=True)
-
     # Collect overall metrics
     data = {}
 
@@ -243,13 +265,15 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
             pred = f["array"][:]
         pred = torch.from_numpy(pred)
         if stem in data:
-            raise RuntimeError(f"{stem} entry already exists in data. "
-                               f"Cannot overwrite.")
+            raise RuntimeError(
+                f"{stem} entry already exists in data. Cannot overwrite."
+            )
         data[stem] = _sample_metrics(pred, gt)
 
         if overlayed_folder is not None:
-            overlay_image = _sample_analysis(image, pred, gt,
-                    threshold=overlay_threshold, overlay=True)
+            overlay_image = _sample_analysis(
+                image, pred, gt, threshold=threshold, overlay=True
+            )
             fullpath = os.path.join(overlayed_folder, f"{stem}.png")
             tqdm.write(f"Saving {fullpath}...")
             fulldir = os.path.dirname(fullpath)
 
@@ -281,30 +305,49 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
     avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
-    metrics_path = os.path.join(output_folder, "metrics.csv")
-    logger.info(f"Saving averages over all input images at {metrics_path}...")
-    avg_metrics.to_csv(metrics_path)
-
     maxf1 = avg_metrics["f1_score"].max()
     optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
-    logger.info(f"Highest F1-score of {maxf1:.5f}, achieved at "
-                f"threshold {optimal_f1_threshold:.2f}")
-
-    # Plotting
-    np_avg_metrics = avg_metrics.to_numpy().T
-    figure_path = os.path.join(output_folder, "precision-recall.pdf")
-    logger.info(f"Saving overall precision-recall plot at {figure_path}...")
-    fig = precision_recall_f1iso_confintval(
-        [np_avg_metrics[0]],
-        [np_avg_metrics[1]],
-        [np_avg_metrics[7]],
-        [np_avg_metrics[8]],
-        [np_avg_metrics[10]],
-        [np_avg_metrics[11]],
-        ["data"],
+    logger.info(
+        f"Highest (a posteriori) F1-score of {maxf1:.5f}, achieved at "
+        f"threshold {optimal_f1_threshold:.2f}"
     )
-    fig.savefig(figure_path)
+
+    if threshold is not None:
+        f1_apriori = avg_metrics["f1_score"][threshold]
+
+        logger.info(
+            f"F1-score (a priori) is {f1_apriori:.5f}, at "
+            f"threshold={threshold:.5f}"
+        )
+
+    if output_folder is not None:
+        logger.info(f"Output folder: {output_folder}")
+
+        if not os.path.exists(output_folder):
+            logger.info(f"Creating {output_folder}...")
+            os.makedirs(output_folder, exist_ok=True)
+
+        metrics_path = os.path.join(output_folder, "metrics.csv")
+        logger.info(
+            f"Saving averages over all input images at {metrics_path}..."
+        )
+        avg_metrics.to_csv(metrics_path)
+
+        # Plotting
+        np_avg_metrics = avg_metrics.to_numpy().T
+        figure_path = os.path.join(output_folder, "precision-recall.pdf")
+        logger.info(f"Saving overall precision-recall plot at {figure_path}...")
+        fig = precision_recall_f1iso_confintval(
+            [np_avg_metrics[0]],
+            [np_avg_metrics[1]],
+            [np_avg_metrics[7]],
+            [np_avg_metrics[8]],
+            [np_avg_metrics[10]],
+            [np_avg_metrics[11]],
+            ["data"],
+        )
+        fig.savefig(figure_path)
 
     return optimal_f1_threshold
 
@@ -331,13 +374,6 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
       if not ``None``, then it should be the name of a folder where to store
       overlayed versions of the images and ground-truths
 
-    overlay_threshold : :py:class:`float`, Optional
-       if ``overlayed_folder``, then this should be threshold (floating point)
-       to apply to prediction maps to decide on positives and negatives for
-       overlaying analysis (graphical output). This number should come from
-       the training set or a separate validation set. Using a test set value
-       may bias your analysis.
-
     """
 
     logger.info(f"Output folder: {output_folder}")
 
@@ -349,19 +385,21 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
     # Collect overall metrics
     data = {}
 
-    for baseline_sample, other_sample in tqdm(zip(baseline, other)):
+    for baseline_sample, other_sample in tqdm(list(zip(baseline, other))):
         stem = baseline_sample[0]
         image = baseline_sample[1]
         gt = baseline_sample[2]
-        pred = other_sample[2] #works as a prediction
+        pred = other_sample[2]  # works as a prediction
         if stem in data:
-            raise RuntimeError(f"{stem} entry already exists in data. "
-                               f"Cannot overwrite.")
+            raise RuntimeError(
+                f"{stem} entry already exists in data. Cannot overwrite."
+            )
         data[stem] = _sample_metrics(pred, gt)
 
         if overlayed_folder is not None:
-            overlay_image = _sample_analysis(image, pred, gt, threshold=0.5,
-                    overlay=True)
+            overlay_image = _sample_analysis(
+                image, pred, gt, threshold=0.5, overlay=True
+            )
             fullpath = os.path.join(overlayed_folder, f"{stem}.png")
             tqdm.write(f"Saving {fullpath}...")
             fulldir = os.path.dirname(fullpath)
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index 5a27eaee91d28ae4e23ddea28e4ea06e79e2ff92..5f82ace1ddaaf6e9dcfd462a254a860771013d72 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -17,6 +17,34 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+def _validate_threshold(t, dataset):
+    """Validates the user threshold selection.  Returns parsed threshold."""
+
+    if t is None:
+        return 0.5
+
+    try:
+        # we try to convert it to float first
+        t = float(t)
+    except ValueError:
+        # it is a bit of text - assert a dataset with that name is available
+        if not isinstance(dataset, dict):
+            raise ValueError(
+                "Threshold should be a floating-point number "
+                "if you provide only a single dataset for evaluation"
+            )
+        if t not in dataset:
+            raise ValueError(
+                f"Text thresholds should match dataset names, "
+                f"but {t} is not available among the datasets provided "
+                f"({', '.join(dataset.keys())})"
+            )
+        return t
+
+    if t < 0.0 or t > 1.0:
+        raise ValueError("Float thresholds must be within range [0.0, 1.0]")
+
+    return t
+
+
 @click.command(
     entry_point_group="bob.ip.binseg.config",
     cls=ConfigCommand,
@@ -104,17 +132,20 @@ logger = logging.getLogger(__name__)
     cls=ResourceOption,
 )
 @click.option(
-    "--overlay-threshold",
+    "--threshold", "-T",
     help="If you set --overlayed, then you can provide a value to be used as "
     "threshold to be applied on probability maps and decide for positives and "
     "negatives. This binary output will be used to define true and false "
     "positives, and false negatives for the overlay analysis. This number "
     "should either come from the training set or a separate validation set "
-    "to avoid biasing the analysis",
-    default=0.5,
-    type=click.FloatRange(min=0.0, max=1.0),
-    show_default=True,
+    "to avoid biasing the analysis. Optionally, if you provide a multi-set "
+    "dataset as input, this may also be the name of an existing set from "
+    "which the threshold will be estimated (highest F1-score) and then "
+    "applied to the subsequent sets. This number is also used to print the "
+    "a priori F1-score on the evaluated sets (default: 0.5)",
+    default=None,
+    show_default=False,
     required=False,
     cls=ResourceOption,
 )
@@ -126,12 +157,14 @@ def evaluate(
     second_annotator,
     second_annotator_folder,
     overlayed,
-    overlay_threshold,
+    threshold,
     **kwargs,
 ):
     """Evaluates an FCN on a binary segmentation task.
""" + threshold = _validate_threshold(threshold, dataset) + # if we work with dictionaries of datasets, then output evaluation # information into sub-directories of the output_folder config = {} @@ -156,18 +189,28 @@ def evaluate( ), } + if isinstance(threshold, str): + # first run evaluation for reference dataset, do not save overlays + logger.info(f"Evaluating threshold on '{threshold}' set") + threshold = run(dataset[threshold], predictions_folder) + logger.info(f"Set --threshold={threshold:.5f}") + + # now run with the for k, v in config.items(): + logger.info(f"Analyzing '{k}' set...") run( v["dataset"], predictions_folder, v["output_folder"], overlayed, - overlay_threshold, + threshold, ) if v["second_annotator"] is not None: compare_annotators( v["dataset"], v["second_annotator"], v["second_annotator_folder"], - os.path.join(overlayed, "second-annotator"), + os.path.join(overlayed, "second-annotator") + if overlayed + else None, ) diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py index a4c74d4594095d19d538bf6e916e3e879b2b139a..074056c1abce92656afcb3186cd0876f4e131f11 100644 --- a/bob/ip/binseg/script/experiment.py +++ b/bob/ip/binseg/script/experiment.py @@ -270,6 +270,12 @@ def experiment( ├── model/ #the generated model will be here ├── predictions/ #the prediction outputs for the train/test set ├── overlayed/ #the overlayed outputs for the train/test set + ├── predictions/ #predictions overlayed on the input images + ├── analysis/ #predictions overlayed on the input images + ├ #including analysis of false positives, negatives + ├ #and true positives + └── second-annotator/ #if set, store overlayed images for the + #second annotator here └── analysis / #the outputs of the analysis of both train/test sets Training is performed for a configurable number of epochs, and generates at @@ -278,6 +284,23 @@ def experiment( during the training and useful to resume the procedure in case it stops abruptly. + N.B.: The tool is designed to prevent analysis bias and allows one to + provide separate subsets for training and evaluation. Instead of using + simple datasets, datasets for full experiment running should be + dictionaries with specific subset names: + + * ``__train__``: dataset used for training, prioritarily. It is typically + the dataset containing data augmentation pipelines. + * ``train`` (optional): a copy of the ``__train__`` dataset, without data + augmentation, that will be evaluated alongside other sets available + * ``*``: any other name, not starting with an underscore character (``_``), + will be considered a test set for evaluation. + + N.B.2: The threshold used for calculating the F1-score on the test set, or + overlay analysis (false positives, negatives and true positives overprinted + on the original image) will be automatically calculated from a + ``validation`` set, if one is provided, otherwise, from the ``train`` set. + If none of those is provided, a fixed threshold value at 0.5 will be used. 
""" _save_sh_command(os.path.join(output_folder, "command.sh")) @@ -347,6 +370,15 @@ def experiment( else None ) + # choosing the overlayed_threshold + if "validation" in dataset: + threshold = "validation" + elif "train" in dataset: + threshold = "train" + else: + threshold = 0.5 + logger.info(f"Setting --threshold={threshold}...") + analysis_folder = os.path.join(output_folder, "analysis") second_annotator_folder = os.path.join(analysis_folder, "second-annotator") ctx.invoke( @@ -357,7 +389,7 @@ def experiment( second_annotator=second_annotator, second_annotator_folder=second_annotator_folder, overlayed=overlayed_folder, - overlay_threshold=0.5, + threshold=threshold, verbose=verbose, ) diff --git a/doc/experiment.rst b/doc/experiment.rst index 4ef87f9004200a0888994de58b4ac086001c57f0..a3f2594426e01b3bed29b3eb92117f2a51201adf 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -8,9 +8,9 @@ We provide an :ref:`aggregator command called "experiment" <bob.ip.binseg.cli.experiment>` that runs training, followed by prediction, -evaluation and comparison. After running, you will be able to find results -from model fitting, prediction, evaluation and comparison under a single output -directory. +evaluation and comparison. After running, you +will be able to find results from model fitting, prediction, evaluation and +comparison under a single output directory. For example, to train a Mobile V2 U-Net architecture on the STARE dataset, evaluate both train and test set performances, output prediction maps and