diff --git a/MANIFEST.in b/MANIFEST.in
index cf1d827b4de456cfd9faa016ac18001948a7caf3..9f9ab6b6edc5c72b48a8a4002ffb71f210e95bd4 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,3 @@
 include README.rst buildout.cfg COPYING version.txt requirements.txt
-recursive-include doc *.rst *.png *.ico *.txt
+recursive-include doc *.sh *.rst *.png *.pdf *.ico *.txt
 recursive-include bob *.json *.png
diff --git a/bob/ip/binseg/configs/datasets/chasedb1/xtest.py b/bob/ip/binseg/configs/datasets/chasedb1/xtest.py
new file mode 100644
index 0000000000000000000000000000000000000000..cade7b85c1a4f600fd1edcc8558e326c6a4c8d10
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/chasedb1/xtest.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+"""CHASE-DB1 cross-evaluation dataset
+"""
+
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _drive
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _stare
+from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
+    dataset as _chase,
+)
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _hrf
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _iostar
+
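+# "train" and "test" below come from CHASE-DB1 itself; the remaining entries
+# point to the test sets of the other databases, used for cross-database
+# ("xtest") evaluation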
+dataset = {
+        "train": _chase["train"],
+        "test": _chase["test"],
+        "drive": _drive["test"],
+        "stare": _stare["test"],
+        "hrf": _hrf["test"],
+        "iostar": _iostar["test"],
+        }
diff --git a/bob/ip/binseg/configs/datasets/drive/xtest.py b/bob/ip/binseg/configs/datasets/drive/xtest.py
new file mode 100644
index 0000000000000000000000000000000000000000..188606b2ce50482feed9624bd40cc625ac8c38b2
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/drive/xtest.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+"""DRIVE cross-evaluation dataset
+"""
+
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _drive
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _stare
+from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
+    dataset as _chase,
+)
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _hrf
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _iostar
+
+dataset = {
+        "train": _drive["train"],
+        "test": _drive["test"],
+        "stare": _stare["test"],
+        "chasedb1": _chase["test"],
+        "hrf": _hrf["test"],
+        "iostar": _iostar["test"],
+        }
diff --git a/bob/ip/binseg/configs/datasets/hrf/xtest.py b/bob/ip/binseg/configs/datasets/hrf/xtest.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f96074fb9709f38d4ce89296e242e065520676f
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/hrf/xtest.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+"""HRF cross-evaluation dataset
+"""
+
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _drive
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _stare
+from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
+    dataset as _chase,
+)
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _hrf
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _iostar
+
+dataset = {
+        "train": _hrf["train"],
+        "test": _hrf["test"],
+        "drive": _drive["test"],
+        "stare": _stare["test"],
+        "chasedb1": _chase["test"],
+        "iostar": _iostar["test"],
+        }
diff --git a/bob/ip/binseg/configs/datasets/iostar/vessel_xtest.py b/bob/ip/binseg/configs/datasets/iostar/vessel_xtest.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d6272751ee4dc916c6e46f0b55209e1263f4190
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/iostar/vessel_xtest.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+"""IOSTAR vessel cross-evaluation dataset
+"""
+
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _drive
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _stare
+from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
+    dataset as _chase,
+)
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _hrf
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _iostar
+
+dataset = {
+        "train": _iostar["train"],
+        "test": _iostar["test"],
+        "drive": _drive["test"],
+        "stare": _stare["test"],
+        "chasedb1": _chase["test"],
+        "hrf": _hrf["test"],
+        }
diff --git a/bob/ip/binseg/configs/datasets/stare/xtest.py b/bob/ip/binseg/configs/datasets/stare/xtest.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcd773e872ac3eaeb49b2737e0e6d78c18578d55
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/stare/xtest.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+"""STARE cross-evaluation dataset
+"""
+
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _drive
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _stare
+from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
+    dataset as _chase,
+)
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _hrf
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _iostar
+
+dataset = {
+        "train": _stare["train"],
+        "test": _stare["test"],
+        "drive": _drive["test"],
+        "chasedb1": _chase["test"],
+        "hrf": _hrf["test"],
+        "iostar": _iostar["test"],
+        }
diff --git a/bob/ip/binseg/script/analyze.py b/bob/ip/binseg/script/analyze.py
index b18fe0f82448c4aa81afdde20913a30477787b72..bd66611d635c5a31b7163c0b69eb9da1ee5e955e 100644
--- a/bob/ip/binseg/script/analyze.py
+++ b/bob/ip/binseg/script/analyze.py
@@ -136,10 +136,9 @@ def analyze(
 
     This script is just a wrapper around the individual scripts for running
     prediction and evaluating FCN models.  It organises the output in a
-    preset way:
-
-    .. code-block:: text
+    preset way::
 
+\b
        └─ <output-folder>/
           ├── predictions/  #the prediction outputs for the train/test set
           ├── overlayed/  #the overlayed outputs for the train/test set
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index db17008c1ff4ddc711cb834725d4b96fffd5d71f..cbbfd56f0754327b6bb93abde03b4718c387d930 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -233,10 +233,9 @@ def experiment(
 
     This script is just a wrapper around the individual scripts for training,
     running prediction, evaluating and comparing FCN model performance.  It
-    organises the output in a preset way:
-
-    .. code-block:: text
+    organises the output in a preset way::
 
+\b
        └─ <output-folder>/
           ├── model/  #the generated model will be here
           ├── predictions/  #the prediction outputs for the train/test set
diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index d06c7557080fda133d1a3fa1e621e672667a49e3..14c9cd7495d05aff04a6edd8f3e85be9ef1b6129 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -140,6 +140,8 @@ def predict(output_folder, model, dataset, batch_size, device, weight,
             logger.info(f"Skipping dataset '{k}' (not to be evaluated)")
             continue
 
+        logger.info(f"Running inference on '{k}' set...")
+
         data_loader = DataLoader(
             dataset=v,
             batch_size=batch_size,
diff --git a/bob/ip/binseg/utils/metric.py b/bob/ip/binseg/utils/metric.py
index d38e80df89a3c225e31a5b4ce7a9e6c930a4f81c..903836f6ef17b231ecb942efc4156c3408cff885 100644
--- a/bob/ip/binseg/utils/metric.py
+++ b/bob/ip/binseg/utils/metric.py
@@ -30,7 +30,7 @@ class SmoothedValue:
 def base_metrics(tp, fp, tn, fn):
     """
     Calculates Precision, Recall (=Sensitivity), Specificity, Accuracy, Jaccard and F1-score (Dice)
-    
+
 
     Parameters
     ----------
@@ -39,7 +39,7 @@ def base_metrics(tp, fp, tn, fn):
         True positives
 
     fp : float
-        False positives 
+        False positives
 
     tn : float
         True negatives
@@ -52,7 +52,7 @@ def base_metrics(tp, fp, tn, fn):
     -------
 
     metrics : list
-    
+
     """
     precision = tp / (tp + fp + ((tp + fp) == 0))
     recall = tp / (tp + fn + ((tp + fn) == 0))
@@ -62,3 +62,26 @@ def base_metrics(tp, fp, tn, fn):
     f1_score = (2.0 * tp) / (2.0 * tp + fp + fn + ((2.0 * tp + fp + fn) == 0))
     # f1_score = (2.0 * precision * recall) / (precision + recall)
     return [precision, recall, specificity, accuracy, jaccard, f1_score]
+
+
+def auc(precision, recall):
+    """Calculates the area under the precision-recall curve (AUC)
+
+    .. todo:: Integrate this into the metrics reporting in compare.py
+    """
+
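+    # sort the recall values and drop duplicates, keeping the index of the first
+    # occurrence so that the matching precision values can be retrieved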
+    rec_unique, rec_unique_ndx = numpy.unique(recall, return_index=True)
+
+    prec_unique = precision[rec_unique_ndx]
+
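+    # with at least two distinct recall values, interpolate precision over a
+    # regular recall grid (step 0.01 in [0, 1)) and integrate with a rectangle
+    # (Riemann) sum of the same width; outside the observed recall range the
+    # interpolated precision is set to zero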
+    if rec_unique.shape[0] > 1:
+        prec_interp = numpy.interp(
+            numpy.arange(0, 1, 0.01),
+            rec_unique,
+            prec_unique,
+            left=0.0,
+            right=0.0,
+        )
+        return prec_interp.sum() * 0.01
+
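+    # a single distinct recall value does not define a curve, so the area
+    # defaults to zero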
+    return 0.0
diff --git a/conda/meta.yaml b/conda/meta.yaml
index d696e7e205fb3a980aa1d0f928a306f1a98089e0..4685c1b569c4be237642cdfcb5da41f1b8d77d58 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -60,6 +60,7 @@ test:
     - sphinx
     - sphinx_rtd_theme
     - sphinxcontrib-programoutput
+    - graphviz
 
 about:
   summary: Binary Segmentation Benchmark Package for Bob
diff --git a/doc/api.rst b/doc/api.rst
index edd9b150da9a768053ad87ba5a630bbec04a7f70..c73aedd230a6c441868f46e6d6e5009e04a79be3 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -137,26 +137,31 @@ Datasets
 
    bob.ip.binseg.configs.datasets.chasedb1.first_annotator
    bob.ip.binseg.configs.datasets.chasedb1.second_annotator
+   bob.ip.binseg.configs.datasets.chasedb1.xtest
    bob.ip.binseg.configs.datasets.chasedb1.covd
    bob.ip.binseg.configs.datasets.chasedb1.ssl
 
    bob.ip.binseg.configs.datasets.drive.default
    bob.ip.binseg.configs.datasets.drive.second_annotator
+   bob.ip.binseg.configs.datasets.drive.xtest
    bob.ip.binseg.configs.datasets.drive.covd
    bob.ip.binseg.configs.datasets.drive.ssl
 
    bob.ip.binseg.configs.datasets.hrf.default
+   bob.ip.binseg.configs.datasets.hrf.xtest
    bob.ip.binseg.configs.datasets.hrf.default_fullres
    bob.ip.binseg.configs.datasets.hrf.covd
    bob.ip.binseg.configs.datasets.hrf.ssl
 
    bob.ip.binseg.configs.datasets.iostar.vessel
+   bob.ip.binseg.configs.datasets.iostar.vessel_xtest
    bob.ip.binseg.configs.datasets.iostar.optic_disc
    bob.ip.binseg.configs.datasets.iostar.covd
    bob.ip.binseg.configs.datasets.iostar.ssl
 
    bob.ip.binseg.configs.datasets.stare.ah
    bob.ip.binseg.configs.datasets.stare.vk
+   bob.ip.binseg.configs.datasets.stare.xtest
    bob.ip.binseg.configs.datasets.stare.covd
    bob.ip.binseg.configs.datasets.stare.ssl
 
diff --git a/doc/baselines.rst b/doc/baselines.rst
deleted file mode 100644
index 3c22af04c729976cff1ee549e4139a838f340989..0000000000000000000000000000000000000000
--- a/doc/baselines.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-.. -*- coding: utf-8 -*-
-
-.. _bob.ip.binseg.results.baselines:
-
-===================
- Baseline Results
-===================
-
-F1 Scores (micro-level)
------------------------
-
-* Benchmark results for models: DRIU, HED, M2U-Net and U-Net.
-* Models are trained and tested on the same dataset using the
-  train-test split as indicated in :ref:`bob.ip.binseg.configs.datasets` (i.e.,
-  these are *intra*-datasets tests)
-* Standard-deviations across all test images are indicated in brakets
-* Database and Model links (table top row and left column) are linked to the
-  originating configuration files used to obtain these results.
-* For some results, the actual deep neural network models are provided (by
-  clicking on the associated F1 Score).
-* Check `our paper`_ for details on the calculation of the F1 Score and standard
-  deviations.
-
-.. list-table::
-   :header-rows: 1
-
-   * - F1 (std)
-     - :py:mod:`driu <bob.ip.binseg.configs.models.driu>`
-     - :py:mod:`hed <bob.ip.binseg.configs.models.hed>`
-     - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
-     - :py:mod:`unet <bob.ip.binseg.configs.models.unet>`
-   * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`
-     - `0.810 (0.021) <driu_chasedb1.pth_>`_
-     - 0.810 (0.022)
-     - `0.802 (0.019) <m2unet_chasedb1.pth_>`_
-     - 0.812 (0.020)
-   * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`
-     - `0.820 (0.014) <driu_drive.pth_>`_
-     - 0.817 (0.013)
-     - `0.803 (0.014) <m2unet_drive.pth_>`_
-     - 0.822 (0.015)
-   * - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`
-     - `0.783 (0.055) <driu_hrf.pth_>`_
-     - 0.783 (0.058)
-     - `0.780 (0.057) <m2unet_hrf.pth_>`_
-     - 0.788 (0.051)
-   * - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`
-     - `0.825 (0.020) <driu_iostar.pth_>`_
-     - 0.825 (0.020)
-     - `0.817 (0.020) <m2unet_iostar.pth_>`_
-     - 0.818 (0.019)
-   * - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`
-     - `0.827 (0.037) <driu_stare.pth_>`_
-     - 0.823 (0.037)
-     - `0.815 (0.041) <m2unet_stare.pth_>`_
-     - 0.829 (0.042)
-
-
-.. include:: links.rst
diff --git a/doc/cli.rst b/doc/cli.rst
index e5b261d6735221d6cfe40e35256ce734e29c4f9b..588cc997c85333e3fd2cf948dd9bd650f26b8a8e 100644
--- a/doc/cli.rst
+++ b/doc/cli.rst
@@ -91,10 +91,52 @@ You may use this command to locally copy a resource file so you can change it.
 .. command-output:: bob binseg config copy --help
 
 
+.. _bob.ip.binseg.cli.combined:
+
+Running and Analyzing Experiments
+---------------------------------
+
+These applications run a combined set of steps in one go.  They work well with
+our preset :ref:`configuration resources <bob.ip.binseg.cli.config.list.all>`.
+
+
+.. _bob.ip.binseg.cli.experiment:
+
+Running a Full Experiment Cycle
+===============================
+
+This command can run training, prediction, evaluation and comparison from a
+single, multi-step application.
+
+.. command-output:: bob binseg experiment --help
+
+
+.. _bob.ip.binseg.cli.analyze:
+
+Running Complete Experiment Analysis
+====================================
+
+This command can run prediction, evaluation and comparison from a
+single, multi-step application.
+
+.. command-output:: bob binseg analyze --help
+
+
+.. _bob.ip.binseg.cli.single:
+
+Single-Step Applications
+------------------------
+
+These applications allow finer control over each step of the experiment cycle.
+They also work well with our preset :ref:`configuration resources
+<bob.ip.binseg.cli.config.list.all>`, but give you more flexibility in the
+choice of input datasets.
+
+
 .. _bob.ip.binseg.cli.train:
 
 Training FCNs
--------------
+=============
 
 Training creates of a new PyTorch_ model.  This model can be used for
 evaluation tests or for inference.
@@ -104,8 +146,8 @@ evaluation tests or for inference.
 
 .. _bob.ip.binseg.cli.predict:
 
-FCN Inference
--------------
+Prediction with FCNs
+====================
 
 Inference takes as input a PyTorch_ model and generates output probabilities as
 HDF5 files.  The probability map has the same size as the input and indicates,
@@ -118,7 +160,7 @@ from less probable (0.0) to more probable (1.0).
 .. _bob.ip.binseg.cli.evaluate:
 
 FCN Performance Evaluation
---------------------------
+==========================
 
 Evaluation takes inference results and compares it to ground-truth, generating
 a series of analysis figures which are useful to understand model performance.
@@ -129,7 +171,7 @@ a series of analysis figures which are useful to understand model performance.
 .. _bob.ip.binseg.cli.compare:
 
 Performance Comparison
-----------------------
+======================
 
 Performance comparison takes the performance evaluation results and generate
 combined figures and tables that compare results of multiple systems.
@@ -137,15 +179,4 @@ combined figures and tables that compare results of multiple systems.
 .. command-output:: bob binseg compare --help
 
 
-.. _bob.ip.binseg.cli.experiment:
-
-Running Complete Experiments
-----------------------------
-
-This command can run training, prediction, evaluation and comparison from a
-single, multi-step application.
-
-.. command-output:: bob binseg experiment --help
-
-
 .. include:: links.rst
diff --git a/doc/experiment.rst b/doc/experiment.rst
index a3f2594426e01b3bed29b3eb92117f2a51201adf..9050af004419e2ca7fb3a2d41a72c245bcc448eb 100644
--- a/doc/experiment.rst
+++ b/doc/experiment.rst
@@ -8,9 +8,9 @@
 
 We provide an :ref:`aggregator command called "experiment"
 <bob.ip.binseg.cli.experiment>` that runs training, followed by prediction,
-evaluation and comparison.  After running, you
-will be able to find results from model fitting, prediction, evaluation and
-comparison under a single output directory.
+evaluation and comparison.  After running, you will be able to find results
+from model fitting, prediction, evaluation and comparison under a single output
+directory.
 
 For example, to train a Mobile V2 U-Net architecture on the STARE dataset,
 evaluate both train and test set performances, output prediction maps and
@@ -20,3 +20,173 @@ overlay analysis, together with a performance curve, run the following:
 
    $ bob binseg experiment -vv m2unet stare --batch-size=16 --overlayed
    # check results in the "results" folder
+
+You may run the system on a GPU by using the ``--device=cuda:0`` option.
+
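+For example, the same experiment could be run on the first CUDA-enabled GPU
+(assuming one is available on your machine) like this:
+
+.. code-block:: sh
+
+   $ bob binseg experiment -vv m2unet stare --batch-size=16 --overlayed --device="cuda:0"
+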
+
+Using your own dataset
+======================
+
+To use your own dataset, we recommend you read our instructions at
+:py:mod:`bob.ip.binseg.configs.datasets.csv`, and set up one or more CSV files
+describing input data, ground-truth (segmentation maps) and, potentially, test
+data.  Then, prepare a configuration file by copying our configuration example
+and editing it to apply the required transforms to your input data.  Once you
+are happy with the result, use it in place of one of our datasets:
+
+.. code-block:: sh
+
+   $ bob binseg config copy csv-dataset-example mydataset.py
+   # edit mydataset following instructions
+   $ bob binseg train ... mydataset.py ...
+
+
+Baseline Benchmarks
+===================
+
+The following table describes recommended batch sizes for a GPU card with 24Gb
+of RAM, for supervised training of the baselines.  Use it like this:
+
+.. code-block:: sh
+
+   # change <model> and <dataset> to one of the items below
+   $ bob binseg experiment -vv <model> <dataset> --batch-size=<see-table> --device="cuda:0"
+   # check results in the "results" folder
+
+.. list-table::
+
+  * - **Models / Datasets**
+    - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`
+    - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`
+    - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`
+    - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`
+    - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`
+  * - :py:mod:`unet <bob.ip.binseg.configs.models.unet>`
+    - 4
+    - 2
+    - 2
+    - 2
+    - 1
+  * - :py:mod:`hed <bob.ip.binseg.configs.models.hed>`
+    - 8
+    - 4
+    - 4
+    - 4
+    - 1
+  * - :py:mod:`driu <bob.ip.binseg.configs.models.driu>` / :py:mod:`driu-bn <bob.ip.binseg.configs.models.driu_bn>`
+    - 8
+    - 5
+    - 4
+    - 4
+    - 1
+  * - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
+    - 16
+    - 6
+    - 6
+    - 6
+    - 1
+
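+For example, reading the table above, training and evaluating DRIU on the DRIVE
+dataset would translate to something like:
+
+.. code-block:: sh
+
+   $ bob binseg experiment -vv driu drive --batch-size=8 --device="cuda:0"
+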
+
+.. tip::
+
+   Instead of the default configurations, you can pass the full path of your
+   customized dataset and model files.  You may :ref:`copy any of the existing
+   configuration resources <bob.ip.binseg.cli.config.copy>` and change them
+   locally.  Once you're happy, you may use the newly created files directly on
+   your command line.  For example, suppose you wanted to slightly change the
+   DRIVE pre-processing pipeline.  You could do the following:
+
+   .. code-block:: bash
+
+      $ bob binseg config copy drive my_drive_remix.py
+      # edit my_drive_remix.py to your needs
+      $ bob binseg train -vv <model> ./my_drive_remix.py
+
+
+.. _bob.ip.binseg.gridtk-tip:
+
+.. tip::
+
+   If you are at Idiap, you may install the package ``gridtk`` (``conda install
+   gridtk``) on your environment, and submit the job like this:
+
+   .. code-block:: sh
+
+      $ jman submit --queue=gpu --memory=24G --name=myjob -- bob binseg train --device='cuda:0' ... #paste the rest of the command-line
+
+.. _bob.ip.binseg.baseline-script:
+
+The :download:`following shell script <scripts/baselines.sh>` can run the
+various baselines described above and place results in a single directory:
+
+.. literalinclude:: scripts/baselines.sh
+   :language: bash
+
+You will find the results obtained by running these baselines :ref:`further in
+this guide <bob.ip.binseg.results.baselines>`.
+
+
+Combined Vessel Dataset (COVD)
+==============================
+
+The following table describes recommended batch sizes for a GPU card with 24Gb
+of RAM, for supervised training of COVD- systems.  Use it like this:
+
+.. code-block:: sh
+
+   # change <model> and <dataset> to one of the items below
+   $ bob binseg train -vv <model> <dataset> --batch-size=<see-table> --device="cuda:0"
+
+.. list-table::
+
+  * - **Models / Datasets**
+    - :py:mod:`drive-covd <bob.ip.binseg.configs.datasets.drive.covd>`
+    - :py:mod:`stare-covd <bob.ip.binseg.configs.datasets.stare.covd>`
+    - :py:mod:`chasedb1-covd <bob.ip.binseg.configs.datasets.chasedb1.covd>`
+    - :py:mod:`iostar-vessel-covd <bob.ip.binseg.configs.datasets.iostar.covd>`
+    - :py:mod:`hrf-covd <bob.ip.binseg.configs.datasets.hrf.covd>`
+  * - :py:mod:`driu <bob.ip.binseg.configs.models.driu>` / :py:mod:`driu-bn <bob.ip.binseg.configs.models.driu_bn>`
+    - 4
+    - 4
+    - 2
+    - 2
+    - 2
+  * - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
+    - 8
+    - 4
+    - 4
+    - 4
+    - 4
+
+
+Combined Vessel Dataset (COVD) and Semi-Supervised Learning (SSL)
+=================================================================
+
+The following table describes recommended batch sizes for a GPU card with 24Gb
+of RAM, for semi-supervised learning of COVD- systems.  Use it like this:
+
+.. code-block:: sh
+
+   # change <model> and <dataset> to one of the items below
+   $ bob binseg train -vv --ssl <model> <dataset> --batch-size=<see-table> --device="cuda:0"
+
+.. list-table::
+
+  * - **Models / Datasets**
+    - :py:mod:`drive-ssl <bob.ip.binseg.configs.datasets.drive.ssl>`
+    - :py:mod:`stare-ssl <bob.ip.binseg.configs.datasets.stare.ssl>`
+    - :py:mod:`chasedb1-ssl <bob.ip.binseg.configs.datasets.chasedb1.ssl>`
+    - :py:mod:`iostar-vessel-ssl <bob.ip.binseg.configs.datasets.iostar.ssl>`
+    - :py:mod:`hrf-ssl <bob.ip.binseg.configs.datasets.hrf.ssl>`
+  * - :py:mod:`driu-ssl <bob.ip.binseg.configs.models.driu_ssl>` / :py:mod:`driu-bn-ssl <bob.ip.binseg.configs.models.driu_bn_ssl>`
+    - 4
+    - 4
+    - 2
+    - 1
+    - 1
+  * - :py:mod:`m2unet-ssl <bob.ip.binseg.configs.models.m2unet_ssl>`
+    - 4
+    - 4
+    - 2
+    - 2
+    - 2
diff --git a/doc/framework.dot b/doc/framework.dot
new file mode 100644
index 0000000000000000000000000000000000000000..50bfce5a2ff0030ee75c0138efd884c4e8f3228b
--- /dev/null
+++ b/doc/framework.dot
@@ -0,0 +1,60 @@
+digraph framework {
+
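+    // data flows left to right: the dataset splits feed "train" and "predict",
+    // while the "analyze" cluster groups prediction, evaluation and comparison
+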
+    graph [
+        rankdir=LR,
+        ];
+    edge [
+        fontname=Helvetica,
+        fontsize=12,
+        fontcolor=blue,
+        minlen=2,
+        labeldistance=2.5,
+        ];
+
+    node [
+        fontname=Helvetica,
+        fontsize=12,
+        fontcolor=black,
+        shape=record,
+        style="filled,rounded",
+        fillcolor=grey92,
+        ];
+
+    dataset [
+        label="<train>\nTraining\n\n|<test>\nTest\n\n",
+        fillcolor=yellow,
+        style="filled",
+        ];
+
+    {rank = min; dataset;}
+
+    subgraph cluster_experiment {
+        label=<<b>experiment</b>>;
+        shape=record;
+        style="filled,rounded";
+        fillcolor=white;
+        train;
+
+        subgraph cluster_analyze {
+            label=<<b>analyze</b>>;
+            predict;
+            evaluate;
+            compare;
+        }
+    }
+
+    figure, table [
+        fillcolor=lightblue,
+        style="filled",
+    ];
+    {rank = max; figure; table; }
+
+    dataset:train -> train [headlabel="sample + label",labelangle=30];
+    dataset:test -> predict [headlabel="sample",labelangle=30];
+    train -> predict [headlabel="model"];
+    dataset:test -> evaluate [headlabel="label"];
+    predict -> evaluate [headlabel="probabilities    ",labelangle=-30];
+    evaluate -> compare [headlabel="metrics"];
+    compare -> figure;
+    compare -> table;
+}
diff --git a/doc/index.rst b/doc/index.rst
index 0cccd77fa80845cdee84d195c67b61c586265050..cae4c4d0bb7680f950583d8d40ac39c6ccd114d2 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -43,7 +43,7 @@ User Guide
 
    setup
    usage
-   results
+   results/index
    acknowledgements
    references
    datasets
diff --git a/doc/results/baselines/index.rst b/doc/results/baselines/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..64a361c1ef947bcfdfb74e9be4a3f940f75b20a6
--- /dev/null
+++ b/doc/results/baselines/index.rst
@@ -0,0 +1,118 @@
+.. -*- coding: utf-8 -*-
+
+.. _bob.ip.binseg.results.baselines:
+
+===================
+ Baseline Results
+===================
+
+F1 Scores (micro-level)
+-----------------------
+
+* Benchmark results for models: DRIU, HED, M2U-Net and U-Net.
+* Models are trained and tested on the same dataset (the numbers in parentheses
+  next to each model name indicate the number of parameters of that model)
+* Database and model resource configuration links (table top row and left
+  column) are linked to the originating configuration files used to obtain
+  these results.
+* Check `our paper`_ for details on the calculation of the F1 Score and standard
+  deviations (in parentheses).
+* Single performance numbers correspond to *a priori* performance indicators,
+  for which the threshold is selected beforehand on the training set
+* You can cross-check the analysis numbers provided in this table by
+  downloading this software package and the raw data, and then running ``bob
+  binseg analyze``, passing the model URL as the ``--weight`` parameter (an
+  example follows this list).  Otherwise, we also provide `CSV files
+  <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/>`_
+  with the estimated performance per threshold (100 steps) per subset.
+* For comparison purposes, we provide "second-annotator" performances on the
+  same test set, where available.
+
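+For example, assuming the DRIVE raw data is installed locally, the M2U-Net row
+above could be re-checked with something along these lines (the output folder
+name is arbitrary):
+
+.. code-block:: sh
+
+   $ bob binseg analyze -vv m2unet drive \
+       --weight=https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/m2unet/drive/model.pth \
+       --output-folder=m2unet-drive-analysis
+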
+
+.. list-table::
+   :header-rows: 1
+
+   * - Dataset
+     - 2nd. Annot.
+     - :py:mod:`driu (15M) <bob.ip.binseg.configs.models.driu>`
+     - :py:mod:`hed (14.7M) <bob.ip.binseg.configs.models.hed>`
+     - :py:mod:`m2unet (0.55M) <bob.ip.binseg.configs.models.m2unet>`
+     - :py:mod:`unet (25.8M) <bob.ip.binseg.configs.models.unet>`
+   * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`
+     - 0.788 (0.021)
+     - `0.819 (0.016) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/driu/drive/model.pth>`_
+     - `0.806 (0.015) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/hed/drive/model.pth>`_
+     - `0.804 (0.014) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/m2unet/drive/model.pth>`_
+     - `0.823 (0.015) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/unet/drive/model.pth>`_
+   * - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`
+     - 0.759 (0.028)
+     - `0.824 (0.037) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/driu/stare/model.pth>`_
+     - `0.810 (0.045) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/hed/stare/model.pth>`_
+     - `0.811 (0.039) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/m2unet/stare/model.pth>`_
+     - `0.828 (0.041) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/unet/stare/model.pth>`_
+   * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`
+     - 0.768 (0.023)
+     - `0.811 (0.018) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/driu/chasedb1/model.pth>`_
+     - `0.806 (0.021) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/hed/chasedb1/model.pth>`_
+     - `0.801 (0.018) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/m2unet/chasedb1/model.pth>`_
+     - `0.802 (0.015) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/unet/chasedb1/model.pth>`_
+   * - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`
+     -
+     - `0.802 (0.039) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/driu/hrf/model.pth>`_
+     - `0.793 (0.041) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/hed/hrf/model.pth>`_
+     - `0.796 (0.043) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/m2unet/hrf/model.pth>`_
+     - `0.798 (0.038) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/unet/hrf/model.pth>`_
+   * - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`
+     -
+     - `0.825 (0.021) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/driu/iostar-vessel/model.pth>`_
+     - `0.822 (0.023) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/hed/iostar-vessel/model.pth>`_
+     - `0.817 (0.021) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/m2unet/iostar-vessel/model.pth>`_
+     - `0.818 (0.019) <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/unet/iostar-vessel/model.pth>`_
+
+
+Precision-Recall (PR) Curves
+----------------------------
+
+Next, you will find the PR plots, showing confidence intervals, for the various
+models explored, arranged per dataset.  All curves correspond to test set
+performances.  Single performance figures (F1-micro scores) correspond to the
+average value across all test set images, for a fixed threshold set to
+``0.5``.
+
+.. figure:: drive.png
+   :align: center
+   :alt: Model comparisons for drive datasets
+
+   :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <drive.pdf>`)
+
+
+.. figure:: stare.png
+   :align: center
+   :alt: Model comparisons for stare datasets
+
+   :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`: PR curve and F1 scores at T=0.5 (:download:`pdf <stare.pdf>`)
+
+
+.. figure:: chasedb1.png
+   :align: center
+   :alt: Model comparisons for chasedb1 datasets
+
+   :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`: PR curve and F1 scores at T=0.5 (:download:`pdf <chasedb1.pdf>`)
+
+
+.. figure:: hrf.png
+   :align: center
+   :alt: Model comparisons for hrf datasets
+
+   :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <hrf.pdf>`)
+
+
+.. figure:: iostar-vessel.png
+   :align: center
+   :alt: Model comparisons for iostar-vessel datasets
+
+   :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`: PR curve and F1 scores at T=0.5 (:download:`pdf <iostar-vessel.pdf>`)
+
+
+.. include:: ../../links.rst
diff --git a/doc/covd.rst b/doc/results/covd/index.rst
similarity index 95%
rename from doc/covd.rst
rename to doc/results/covd/index.rst
index 4dcb1b9df7609924ccc0f58c9ea40dec447c08aa..84c407bc027d7456953348625eb7addf7522266b 100644
--- a/doc/covd.rst
+++ b/doc/results/covd/index.rst
@@ -78,38 +78,40 @@ M2U-Net Precision vs. Recall Curves
 Precision vs. recall curves for each evaluated dataset.  Note that here the
 F1-score is calculated on a macro level (see paper for more details).
 
-.. figure:: img/pr_CHASEDB1.png
+.. figure:: pr_CHASEDB1.png
    :scale: 50 %
    :align: center
    :alt: model comparisons
 
    CHASE_DB1: Precision vs Recall curve and F1 scores
 
-.. figure:: img/pr_DRIVE.png
+.. figure:: pr_DRIVE.png
    :scale: 50 %
    :align: center
    :alt: model comparisons
 
    DRIVE: Precision vs Recall curve and F1 scores
 
-.. figure:: img/pr_HRF.png
+.. figure:: pr_HRF.png
    :scale: 50 %
    :align: center
    :alt: model comparisons
 
    HRF: Precision vs Recall curve and F1 scores
 
-.. figure:: img/pr_IOSTARVESSEL.png
+.. figure:: pr_IOSTARVESSEL.png
    :scale: 50 %
    :align: center
    :alt: model comparisons
 
    IOSTAR: Precision vs Recall curve and F1 scores
 
-.. figure:: img/pr_STARE.png
+.. figure:: pr_STARE.png
    :scale: 50 %
    :align: center
    :alt: model comparisons
 
    STARE: Precision vs Recall curve and F1 scores
 
+
+.. include:: ../../links.rst
diff --git a/doc/img/pr_CHASEDB1.png b/doc/results/covd/pr_CHASEDB1.png
similarity index 100%
rename from doc/img/pr_CHASEDB1.png
rename to doc/results/covd/pr_CHASEDB1.png
diff --git a/doc/img/pr_DRIVE.png b/doc/results/covd/pr_DRIVE.png
similarity index 100%
rename from doc/img/pr_DRIVE.png
rename to doc/results/covd/pr_DRIVE.png
diff --git a/doc/img/pr_HRF.png b/doc/results/covd/pr_HRF.png
similarity index 100%
rename from doc/img/pr_HRF.png
rename to doc/results/covd/pr_HRF.png
diff --git a/doc/img/pr_IOSTARVESSEL.png b/doc/results/covd/pr_IOSTARVESSEL.png
similarity index 100%
rename from doc/img/pr_IOSTARVESSEL.png
rename to doc/results/covd/pr_IOSTARVESSEL.png
diff --git a/doc/img/pr_STARE.png b/doc/results/covd/pr_STARE.png
similarity index 100%
rename from doc/img/pr_STARE.png
rename to doc/results/covd/pr_STARE.png
diff --git a/doc/results.rst b/doc/results/index.rst
similarity index 85%
rename from doc/results.rst
rename to doc/results/index.rst
index 0fcc3f46070a8ee710cecc2a966aa8a0410d7236..f2d7e2ac8de271a3745b1de154d0b848ef70806b 100644
--- a/doc/results.rst
+++ b/doc/results/index.rst
@@ -15,8 +15,9 @@ strategy.
 .. toctree::
    :maxdepth: 2
 
-   baselines
-   covd
+   baselines/index
+   xtest/index
+   covd/index
 
 
-.. include:: links.rst
+.. include:: ../links.rst
diff --git a/doc/scripts/baselines.sh b/doc/scripts/baselines.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6f82b1aff351acd0a217a5e84221110483a75b4b
--- /dev/null
+++ b/doc/scripts/baselines.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+
+# Runs all of our baselines
+
+# set output directory and location of "bob" executable
+OUTDIR=/path/to/output/directory
+BOB=/path/to/bob/executable
+
+# run <modelconfig> <dbconfig> <batchsize> [<device> [<queue>]]
+function run() {
+    local device="cpu"
+    [ $# -gt 3 ] && device="${4}"
+
+    local cmd=(${BOB} binseg experiment)
+    cmd+=("-vv" "--device=${device}" ${1} ${2})
+    cmd+=("--batch-size=${3}" "--output-folder=${OUTDIR}/${1}/${2}")
+
+    # notice this assumes gridtk is installed
+    [ $# -gt 4 ] && cmd=(jman submit "--memory=24G" "--queue=${5}" -- "${cmd[@]}")
+
+    "${cmd[@]}"
+}
+
+# run/submit all baselines
+# the trailing "#cuda:0 #sgpu" (or "#gpu") arguments are commented out by
+# default: uncomment the device to run on a GPU, and also the queue name to
+# submit the job through gridtk instead of running it locally
+run m2unet stare          6 #cuda:0 #sgpu
+run hed    stare          4 #cuda:0 #sgpu
+run driu   stare          5 #cuda:0 #sgpu
+run unet   stare          2 #cuda:0 #sgpu
+run m2unet drive         16 #cuda:0 #sgpu
+run hed    drive          8 #cuda:0 #sgpu
+run driu   drive          8 #cuda:0 #sgpu
+run unet   drive          4 #cuda:0 #sgpu
+run m2unet iostar-vessel  6 #cuda:0 #sgpu
+run hed    iostar-vessel  4 #cuda:0 #sgpu
+run driu   iostar-vessel  4 #cuda:0 #sgpu
+run unet   iostar-vessel  2 #cuda:0 #sgpu
+run m2unet chasedb1       6 #cuda:0 #sgpu
+run hed    chasedb1       4 #cuda:0 #sgpu
+run driu   chasedb1       4 #cuda:0 #sgpu
+run unet   chasedb1       2 #cuda:0 #sgpu
+run m2unet hrf            1 #cuda:0 #gpu
+run hed    hrf            1 #cuda:0 #gpu
+run driu   hrf            1 #cuda:0 #gpu
+run unet   hrf            1 #cuda:0 #gpu
diff --git a/doc/scripts/xtest.sh b/doc/scripts/xtest.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5e409854160719dbab01a712a17d7f5cd60c51c9
--- /dev/null
+++ b/doc/scripts/xtest.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# Runs cross database tests
+
+BOB=$HOME/work/bob/bob.ip.binseg/bin/bob
+
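+# this assumes the script is run from the directory holding the baseline
+# results, laid out as <model>/<dataset>/model/model_final.pth
+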
+for d in drive stare chasedb1 iostar-vessel hrf; do
+    for m in driu hed m2unet unet; do
+        # the model configuration must match the weights being loaded below
+        cmd=(${BOB} binseg analyze -vv ${m} "${d}-xtest")
+        cmd+=("--weight=${m}/${d}/model/model_final.pth")
+        cmd+=("--output-folder=${m}/${d}/xtest")
+        "${cmd[@]}"
+    done
+done
diff --git a/doc/training.rst b/doc/training.rst
index 693561286fd8cf4a19ccddc0e75050743b155d7e..254cac14cfafcaf6f67698c2d41d62096c204d14 100644
--- a/doc/training.rst
+++ b/doc/training.rst
@@ -19,158 +19,3 @@ message containing more detailed instructions.
    size (``--batch``).
 
 
-Baseline Benchmarks
-===================
-
-The following table describes recommended batch sizes for 24Gb of RAM GPU
-card, for supervised training of baselines.  Use it like this:
-
-.. code-block:: sh
-
-   # change <model> and <dataset> by one of items bellow
-   $ bob binseg train -vv <model> <dataset> --batch-size=<see-table> --device="cuda:0"
-   # check results in the "results" folder
-
-.. list-table::
-
-  * - **Models / Datasets**
-    - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`
-    - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`
-    - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`
-    - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`
-    - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`
-  * - :py:mod:`unet <bob.ip.binseg.configs.models.unet>`
-    - 4
-    - 2
-    - 2
-    - 2
-    - 1
-  * - :py:mod:`hed <bob.ip.binseg.configs.models.hed>`
-    - 8
-    - 4
-    - 4
-    - 4
-    - 1
-  * - :py:mod:`driu <bob.ip.binseg.configs.models.driu>` / :py:mod:`driu-bn <bob.ip.binseg.configs.models.driu_bn>`
-    - 8
-    - 5
-    - 4
-    - 4
-    - 1
-  * - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
-    - 16
-    - 6
-    - 6
-    - 6
-    - 1
-
-
-.. tip::
-
-   Instead of the default configurations, you can pass the full path of your
-   customized dataset and model files.  You may :ref:`copy any of the existing
-   configuration resources <bob.ip.binseg.cli.config.copy>` and change them
-   locally.  Once you're happy, you may use the newly created files directly on
-   your command line.  For example, suppose you wanted to slightly change the
-   DRIVE pre-processing pipeline.  You could do the following:
-
-   .. code-block:: bash
-
-      $ bob binseg config copy drive my_drive_remix.py
-      # edit my_drive_remix.py to your needs
-      $ bob binseg train -vv <model> ./my_drive_remix.py
-
-
-.. _bob.ip.binseg.gridtk-tip:
-
-.. tip::
-
-   If you are at Idiap, you may install the package ``gridtk`` (``conda install
-   gridtk``) on your environment, and submit the job like this:
-
-   .. code-block:: sh
-
-      $ jman submit --queue=gpu --memory=24G --name=myjob -- bob binseg train --device='cuda:0' ... #paste the rest of the command-line
-
-
-Combined Vessel Dataset (COVD)
-==============================
-
-The following table describes recommended batch sizes for 24Gb of RAM GPU card,
-for supervised training of COVD- systems.  Use it like this:
-
-.. code-block:: sh
-
-   # change <model> and <dataset> by one of items bellow
-   $ bob binseg train -vv <model> <dataset> --batch-size=<see-table> --device="cuda:0"
-
-.. list-table::
-
-  * - **Models / Datasets**
-    - :py:mod:`drive-covd <bob.ip.binseg.configs.datasets.drive.covd>`
-    - :py:mod:`stare-covd <bob.ip.binseg.configs.datasets.stare.covd>`
-    - :py:mod:`chasedb1-covd <bob.ip.binseg.configs.datasets.chasedb1.covd>`
-    - :py:mod:`iostar-vessel-covd <bob.ip.binseg.configs.datasets.iostar.covd>`
-    - :py:mod:`hrf-covd <bob.ip.binseg.configs.datasets.hrf.covd>`
-  * - :py:mod:`driu <bob.ip.binseg.configs.models.driu>` / :py:mod:`driu-bn <bob.ip.binseg.configs.models.driu_bn>`
-    - 4
-    - 4
-    - 2
-    - 2
-    - 2
-  * - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
-    - 8
-    - 4
-    - 4
-    - 4
-    - 4
-
-
-Combined Vessel Dataset (COVD) and Semi-Supervised Learning (SSL)
-=================================================================
-
-The following table describes recommended batch sizes for 24Gb of RAM GPU
-card, for semi-supervised learning of COVD- systems.  Use it like this:
-
-.. code-block:: sh
-
-   # change <model> and <dataset> by one of items bellow
-   $ bob binseg train -vv --ssl <model> <dataset> --batch-size=<see-table> --device="cuda:0"
-
-.. list-table::
-
-  * - **Models / Datasets**
-    - :py:mod:`drive-ssl <bob.ip.binseg.configs.datasets.drive.ssl>`
-    - :py:mod:`stare-ssl <bob.ip.binseg.configs.datasets.stare.ssl>`
-    - :py:mod:`chasedb1-ssl <bob.ip.binseg.configs.datasets.chasedb1.ssl>`
-    - :py:mod:`iostar-vessel-ssl <bob.ip.binseg.configs.datasets.iostar.ssl>`
-    - :py:mod:`hrf-ssl <bob.ip.binseg.configs.datasets.hrf.ssl>`
-  * - :py:mod:`driu-ssl <bob.ip.binseg.configs.models.driu_ssl>` / :py:mod:`driu-bn-ssl <bob.ip.binseg.configs.models.driu_bn_ssl>`
-    - 4
-    - 4
-    - 2
-    - 1
-    - 1
-  * - :py:mod:`m2unet-ssl <bob.ip.binseg.configs.models.m2unet_ssl>`
-    - 4
-    - 4
-    - 2
-    - 2
-    - 2
-
-
-Using your own dataset
-======================
-
-To use your own dataset, we recommend you read our instructions at
-:py:mod:`bob.ip.binseg.configs.datasets.csv`, and setup one or more CSV file
-describing input data and ground-truth (segmentation maps).  Then, prepare a
-configuration file by copying our configuration example and edit it to apply
-the required transforms to your input data.  Once you are happy with the
-result, use it in place of one of our datasets:
-
-.. code-block:: sh
-
-   $ bob binseg config copy csv-dataset-example mydataset.py
-   # edit mydataset following instructions
-   $ bob binseg train ... mydataset.py ...
diff --git a/doc/usage.rst b/doc/usage.rst
index c9967139f77c1e64bd634b641bf795ffdea73c55..b63cd4378782167b0c91b194167f30da1add4360 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -13,25 +13,38 @@ semantic binary segmentation with support for the following activities:
   that is trained to reconstruct annotations (pre-segmented binary maps),
   automatically, via error back propagation.  The objective of this phase is to
   produce an FCN model.
-* Inference: The FCN is used to generate vessel map predictions
+* Inference (prediction): The FCN is used to generate vessel map predictions
 * Evaluation: Vessel map predictions are used evaluate FCN performance against
-  test data, generate ROC curves or visualize prediction results overlayed on
+  provided annotations, or visualize prediction results overlayed on
   the original raw images.
+* Comparison: Use evaluation results to compare performance as you like.
 
-Each application is implemented as a :ref:`command-line utility
-<bob.ip.binseg.cli>`, that is configurable using :ref:`Bob's extensible
-configuration framework <bob.extension.framework>`.  In essence, each
-command-line option may be provided as a variable with the same name in a
-Python file.  Each file may combine any number of variables that are pertinent
-to an application.
+We provide :ref:`command-line interfaces (CLI)
+<bob.ip.binseg.cli.single>` that implement each of the phases above, as well as
+command aggregators that can :ref:`run all of the phases
+<bob.ip.binseg.cli.combined>`.  Both kinds of interface are configurable using
+:ref:`Bob's extensible configuration framework <bob.extension.framework>`.  In
+essence, each command-line option may be provided as a variable with the same
+name in a Python file.  Each file may combine any number of variables that are
+pertinent to an application.
 
 .. tip::
 
    For reproducibility, we recommend you stick to configuration files when
    parameterizing our CLI.  Notice some of the options in the CLI interface
    (e.g. ``--dataset``) cannot be passed via the actual command-line as it
-   requires a :py:class:`concrete PyTorch dataset instance
-   <torch.utils.data.dataset.Dataset>`.
+   may require complex Python types that cannot be synthesized in a single
+   input parameter.
+
+
+The following flowchart represents the various experiment phases and the output
+results that can be produced by each of our CLI interfaces (rounded white
+rectangles).  Processing sub-products (marked in blue) are stored on disk by
+the end of each step.
+
+.. graphviz:: framework.dot
+   :caption: Framework actions and CLI
+
 
 We provide a number of :ref:`preset configuration files
 <bob.ip.binseg.cli.config.list.all>` that can be used in one or more of the
@@ -46,10 +59,10 @@ modifying one of our configuration resources.
 .. toctree::
    :maxdepth: 2
 
+   experiment
    training
    models
    evaluation
-   experiment
 
 
 .. include:: links.rst
diff --git a/setup.py b/setup.py
index 35c6ff669fa31b2c44fc333b187f317182e17234..11b08d12091a66218622e888af7ac1911f04e8a9 100644
--- a/setup.py
+++ b/setup.py
@@ -61,23 +61,27 @@ setup(
             # drive dataset
             "drive = bob.ip.binseg.configs.datasets.drive.default",
             "drive-2nd = bob.ip.binseg.configs.datasets.drive.second_annotator",
+            "drive-xtest = bob.ip.binseg.configs.datasets.drive.xtest",
             "drive-covd = bob.ip.binseg.configs.datasets.drive.covd",
             "drive-ssl = bob.ip.binseg.configs.datasets.drive.ssl",
 
             # stare dataset
             "stare = bob.ip.binseg.configs.datasets.stare.ah",
             "stare-2nd = bob.ip.binseg.configs.datasets.stare.vk",
+            "stare-xtest = bob.ip.binseg.configs.datasets.stare.xtest",
             "stare-covd = bob.ip.binseg.configs.datasets.stare.covd",
             "stare-ssl = bob.ip.binseg.configs.datasets.stare.ssl",
 
             # iostar
             "iostar-vessel = bob.ip.binseg.configs.datasets.iostar.vessel",
+            "iostar-vessel-xtest = bob.ip.binseg.configs.datasets.iostar.vessel_xtest",
             "iostar-disc = bob.ip.binseg.configs.datasets.iostar.optic_disc",
             "iostar-vessel-covd = bob.ip.binseg.configs.datasets.iostar.covd",
             "iostar-vessel-ssl = bob.ip.binseg.configs.datasets.iostar.ssl",
 
             # hrf
             "hrf = bob.ip.binseg.configs.datasets.hrf.default",
+            "hrf-xtest = bob.ip.binseg.configs.datasets.hrf.xtest",
             "hrf-highres = bob.ip.binseg.configs.datasets.hrf.default_fullres",
             "hrf-covd = bob.ip.binseg.configs.datasets.hrf.covd",
             "hrf-ssl = bob.ip.binseg.configs.datasets.hrf.ssl",
@@ -85,6 +89,7 @@ setup(
             # chase-db1
             "chasedb1 = bob.ip.binseg.configs.datasets.chasedb1.first_annotator",
             "chasedb1-2nd = bob.ip.binseg.configs.datasets.chasedb1.second_annotator",
+            "chasedb1-xtest = bob.ip.binseg.configs.datasets.chasedb1.xtest",
             "chasedb1-covd = bob.ip.binseg.configs.datasets.chasedb1.covd",
             "chasedb1-ssl = bob.ip.binseg.configs.datasets.chasedb1.ssl",