diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 26151579d0e040bc3a6015094eea5316d93fd395..07a7c86f8874dd0af50763d578a91b0a6b8a9f27 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -306,11 +306,11 @@ def run(
     #         (avg_metrics["precision"]+avg_metrics["recall"])
 
     avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
     avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
     maxf1 = avg_metrics["f1_score"].max()
@@ -406,6 +406,8 @@ def compare_annotators(baseline, other, name, output_folder,
 
     # Merges all dataframes together
     df_metrics = pandas.concat(data.values())
+    # keep only the second row (threshold == 0.5) - the first is redundant
+    df_metrics.drop(0, inplace=True)
 
     # Report and Averages
     avg_metrics = df_metrics.groupby("index").mean()
@@ -420,17 +422,13 @@
     #         (avg_metrics["precision"]+avg_metrics["recall"])
 
     avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
     avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
-    # we actually only need to keep the second row of the pandas dataframe
-    # with threshold == 0.5 - the first row is redundant
-    avg_metrics.drop(0, inplace=True)
-
     metrics_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
     os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
     logger.info(f"Saving averages over all input images at {metrics_path}...")
diff --git a/doc/results/baselines/index.rst b/doc/results/baselines/index.rst
index 97afcdd5ae66d0996de9de0d8320695ec07169ef..ce5361fbe43cf1da75b49bcdf03a673f3efce830 100644
--- a/doc/results/baselines/index.rst
+++ b/doc/results/baselines/index.rst
@@ -10,8 +10,10 @@ F1 Scores (micro-level)
 -----------------------
 
 * Benchmark results for models: DRIU, HED, M2U-Net and U-Net.
-* Models are trained and tested on the same dataset (numbers in parenthesis
-  indicate number of parameters per model)
+* Models are trained and tested on the same dataset (**numbers in bold**
+  indicate the number of parameters per model).  Models are trained for a
+  fixed number of 1000 epochs, with a learning rate of 0.001 until epoch 900,
+  and 0.0001 thereafter.
 * Database and model resource configuration links (table top row and left
   column) are linked to the originating configuration files used to obtain
   these results.
@@ -21,24 +23,26 @@ F1 Scores (micro-level)
   where the threshold is previously selected on the training set
 * You can cross check the analysis numbers provided in this table by
   downloading this software package, the raw data, and running ``bob binseg
-  analyze`` providing the model URL as ``--weight`` parameter.  Otherwise, we
-  also provide `CSV files
-  <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/>`_
-  with the estimated performance per threshold (100
-  steps) per subset.
+  analyze`` providing the model URL as ``--weight`` parameter.
 * For comparison purposes, we provide "second-annotator" performances on the
   same test set, where available.
 
 
 .. list-table::
-   :header-rows: 1
+   :header-rows: 2
 
+   * -
+     -
+     - :py:mod:`driu <bob.ip.binseg.configs.models.driu>`
+     - :py:mod:`hed <bob.ip.binseg.configs.models.hed>`
+     - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
+     - :py:mod:`unet <bob.ip.binseg.configs.models.unet>`
    * - Dataset
      - 2nd. Annot.
-     - :py:mod:`driu (15M) <bob.ip.binseg.configs.models.driu>`
-     - :py:mod:`hed (14.7M) <bob.ip.binseg.configs.models.hed>`
-     - :py:mod:`m2unet (0.55M) <bob.ip.binseg.configs.models.m2unet>`
-     - :py:mod:`unet (25.8M) <bob.ip.binseg.configs.models.unet>`
+     - 15M
+     - 14.7M
+     - 0.55M
+     - 25.8M
    * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`
      - 0.788 (0.021)
      - `0.819 (0.016) <baselines_driu_drive_>`_
@@ -52,7 +56,7 @@ F1 Scores (micro-level)
      - `0.811 (0.039) <baselines_m2unet_stare_>`_
      - `0.828 (0.041) <baselines_unet_stare_>`_
    * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`
-     - 0.768  0.023
+     - 0.768 (0.023)
      - `0.811 (0.018) <baselines_driu_chase_>`_
      - `0.806 (0.021) <baselines_hed_chase_>`_
      - `0.801 (0.018) <baselines_m2unet_chase_>`_
@@ -80,39 +84,53 @@ set performances.  Single performance figures (F1-micro scores) correspond to
 its average value across all test set images, for a fixed threshold set to
 ``0.5``.
 
-.. figure:: drive.png
-   :align: center
-   :alt: Model comparisons for drive datasets
-
-   :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <drive.pdf>`)
-
-
-.. figure:: stare.png
-   :align: center
-   :alt: Model comparisons for stare datasets
-
-   :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`: PR curve and F1 scores at T=0.5 (:download:`pdf <stare.pdf>`)
-
-
-.. figure:: chasedb1.png
-   :align: center
-   :alt: Model comparisons for chasedb1 datasets
-
-   :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`: PR curve and F1 scores at T=0.5 (:download:`pdf <chasedb1.pdf>`)
-
-
-.. figure:: hrf.png
-   :align: center
-   :alt: Model comparisons for hrf datasets
-
-   :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <hrf.pdf>`)
-
-
-.. figure:: iostar-vessel.png
-   :align: center
-   :alt: Model comparisons for iostar-vessel datasets
-
-   :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`: PR curve and F1 scores at T=0.5 (:download:`pdf <iostar-vessel.pdf>`)
+.. list-table::
 
+    * - .. figure:: drive.png
+           :align: center
+           :scale: 50%
+           :alt: Model comparisons for drive datasets
+
+           :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <drive.pdf>`)
+      - .. figure:: stare.png
+           :align: center
+           :scale: 50%
+           :alt: Model comparisons for stare datasets
+
+           :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`: PR curve and F1 scores at T=0.5 (:download:`pdf <stare.pdf>`)
+    * - .. figure:: chasedb1.png
+           :align: center
+           :scale: 50%
+           :alt: Model comparisons for chasedb1 datasets
+
+           :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`: PR curve and F1 scores at T=0.5 (:download:`pdf <chasedb1.pdf>`)
+      - .. figure:: hrf.png
+           :align: center
+           :scale: 50%
+           :alt: Model comparisons for hrf datasets
+
+           :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <hrf.pdf>`)
+    * - .. figure:: iostar-vessel.png
+           :align: center
+           :scale: 50%
+           :alt: Model comparisons for iostar-vessel datasets
+
+           :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`: PR curve and F1 scores at T=0.5 (:download:`pdf <iostar-vessel.pdf>`)
+      -
+
+
+Remarks
+-------
+
+* There seems to be no clear winner: confidence intervals based on the
+  standard deviation overlap substantially between the different models and
+  across the different datasets.
+* The number of parameters seems to have almost no effect on performance:
+  U-Net, the largest model, is not a clear winner across the baseline
+  benchmarks.
+* Where second-annotator labels exist, model performance and variability seem
+  on par with such annotations.  One possible exception is CHASE-DB1, where
+  models show consistently less variability than the second annotator.
+  Unfortunately, this observation is not conclusive.
 
 .. include:: ../../links.rst
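
The cross-check described above reduces to one ``bob binseg analyze`` call per
model/database pair.  A hypothetical invocation for the DRIU/DRIVE baseline,
assuming the dataset configuration resource is passed positionally and with
``<model-url>`` standing in for the published model URL (only the ``--weight``
option is confirmed by the documentation above; consult ``bob binseg analyze
--help`` for the actual interface)::

   bob binseg analyze drive --weight=<model-url>
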
diff --git a/doc/results/xtest/driu-chasedb1.pdf b/doc/results/xtest/driu-chasedb1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bb28aafb8479d2f07336a7dfdcdd16fa148f5117
Binary files /dev/null and b/doc/results/xtest/driu-chasedb1.pdf differ
diff --git a/doc/results/xtest/driu-chasedb1.png b/doc/results/xtest/driu-chasedb1.png
new file mode 100644
index 0000000000000000000000000000000000000000..be26e9f8e8b140aaca692c417892abb515a180f4
Binary files /dev/null and b/doc/results/xtest/driu-chasedb1.png differ
diff --git a/doc/results/xtest/driu-drive.pdf b/doc/results/xtest/driu-drive.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1f9fc10b1ffa612f6f889666598620f1007ef03f
Binary files /dev/null and b/doc/results/xtest/driu-drive.pdf differ
diff --git a/doc/results/xtest/driu-drive.png b/doc/results/xtest/driu-drive.png
new file mode 100644
index 0000000000000000000000000000000000000000..fba68683bf3a43a19a0a99c2f2f63d3f5d219473
Binary files /dev/null and b/doc/results/xtest/driu-drive.png differ
diff --git a/doc/results/xtest/driu-hrf.pdf b/doc/results/xtest/driu-hrf.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..01d78c98e041d4db603bce19ad549811b3cc3a81
Binary files /dev/null and b/doc/results/xtest/driu-hrf.pdf differ
diff --git a/doc/results/xtest/driu-hrf.png b/doc/results/xtest/driu-hrf.png
new file mode 100644
index 0000000000000000000000000000000000000000..0cbd94c9f9c9ffa71f5f513cd8f997070bc979ee
Binary files /dev/null and b/doc/results/xtest/driu-hrf.png differ
diff --git a/doc/results/xtest/driu-iostar-vessel.pdf b/doc/results/xtest/driu-iostar-vessel.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..db822d0f5c53d1579b8c71f3e339eefe2519e582
Binary files /dev/null and b/doc/results/xtest/driu-iostar-vessel.pdf differ
diff --git a/doc/results/xtest/driu-iostar-vessel.png b/doc/results/xtest/driu-iostar-vessel.png
new file mode 100644
index 0000000000000000000000000000000000000000..5842c71646109aecb0456cb20f99783b3b3bda0d
Binary files /dev/null and b/doc/results/xtest/driu-iostar-vessel.png differ
diff --git a/doc/results/xtest/driu-stare.pdf b/doc/results/xtest/driu-stare.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f44d12dafca83b3e2d3af52a48244ad1bd43365c
Binary files /dev/null and b/doc/results/xtest/driu-stare.pdf differ
diff --git a/doc/results/xtest/driu-stare.png b/doc/results/xtest/driu-stare.png
new file mode 100644
index 0000000000000000000000000000000000000000..6573b820be3e2402e732c45d1c106f0abb103aef
Binary files /dev/null and b/doc/results/xtest/driu-stare.png differ
diff --git a/doc/results/xtest/index.rst b/doc/results/xtest/index.rst
index 37e5acb1dcd01afa7693c3300ea26d91a9f8f68d..e65c9e9997e8dca62e0a6fcfd0bea45bae11906d 100644
--- a/doc/results/xtest/index.rst
+++ b/doc/results/xtest/index.rst
@@ -2,29 +2,29 @@
 
 .. _bob.ip.binseg.results.xtest:
 
-======================
- Cross-Database Tests
-======================
+==========================
+ Cross-Database (X-)Tests
+==========================
 
 F1 Scores (micro-level)
 -----------------------
 
-* Benchmark results for models: DRIU, HED, M2U-Net and U-Net.
-* Models are trained and tested on the same dataset (numbers in parenthesis
-  indicate number of parameters per model), and then evaluated across the test
-  sets of other datasets.
+* Models are trained and tested on the same database and then evaluated
+  across the test sets of other databases.  X-tested datasets therefore
+  represent *unseen* data and can be a good proxy for generalization
+  analysis.
+* Each table row indicates a trained model and each column a database the
+  model was tested against.  The native (intra-database) performance is
+  marked **in bold**.  Thresholds are chosen *a priori* on the training set of
+  the database used to generate the model being cross-tested.  Hence, all
+  experiments on the same row use the same threshold.
 * You can cross check the analysis numbers provided in this table by
   downloading this software package, the raw data, and running ``bob binseg
   analyze`` providing the model URL as ``--weight`` parameter, and then the
   ``-xtest`` resource variant of the dataset the model was trained on.  For
   example, to run cross-evaluation tests for the DRIVE dataset, use the
   configuration resource :py:mod:`drive-xtest
-  <bob.ip.binseg.configs.datasets.drive.xtest>`.  Otherwise, we
-  also provide `CSV files
-  <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/xtest/>`_
-  with the estimated performance per threshold (100 steps) per subset.
-* For comparison purposes, we provide "second-annotator" performances on the
-  same test set, where available.
+  <bob.ip.binseg.configs.datasets.drive.xtest>`.
 * We only show results for DRIU (~15.4 million parameters) and M2U-Net (~550
   thousand parameters) as these models seem to represent the performance
   extremes according to our :ref:`baseline analysis
@@ -43,48 +43,199 @@ DRIU
 
 
 .. list-table::
-   :header-rows: 1
-
-   * - Model / X-Test
-     - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.xtest>`
-     - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.xtest>`
-     - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.xtest>`
-     - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.xtest>`
-     - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel_xtest>`
-   * - `drive <baselines_driu_drive_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `stare <baselines_driu_stare_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `chasedb1 <baselines_driu_chase_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `hrf <baselines_driu_hrf_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `iostar-vessel <baselines_driu_iostar_>`_
-     -
-     -
-     -
-     -
-     -
-
-
-Precision-Recall (PR) Curves
-----------------------------
+   :header-rows: 2
 
+   * -
+     - drive
+     - stare
+     - chasedb1
+     - hrf
+     - iostar-vessel
+   * - Model / W x H
+     - 544 x 544
+     - 704 x 608
+     - 960 x 960
+     - 1648 x 1168
+     - 1024 x 1024
+   * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>` (`model <baselines_driu_drive_>`_)
+     - **0.819 (0.016)**
+     - 0.759 (0.151)
+     - 0.321 (0.068)
+     - 0.711 (0.067)
+     - 0.493 (0.049)
+   * - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>` (`model <baselines_driu_stare_>`_)
+     - 0.733 (0.037)
+     - **0.824 (0.037)**
+     - 0.491 (0.094)
+     - 0.773 (0.051)
+     - 0.469 (0.055)
+   * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>` (`model <baselines_driu_chase_>`_)
+     - 0.730 (0.023)
+     - 0.730 (0.101)
+     - **0.811 (0.018)**
+     - 0.779 (0.043)
+     - 0.774 (0.019)
+   * - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>` (`model <baselines_driu_hrf_>`_)
+     - 0.702 (0.038)
+     - 0.641 (0.160)
+     - 0.600 (0.072)
+     - **0.802 (0.039)**
+     - 0.546 (0.078)
+   * - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>` (`model <baselines_driu_iostar_>`_)
+     - 0.758 (0.019)
+     - 0.724 (0.115)
+     - 0.777 (0.032)
+     - 0.727 (0.059)
+     - **0.825 (0.021)**
+
+
+Next, you will find the PR plots showing confidence intervals for the various
+cross-tests explored, arranged per cross-tested model.  All curves correspond
+to test set performances.  Single performance figures (F1-micro scores)
+correspond to the average value across all test set images, for a fixed
+threshold set *a priori* on the training set of the database used for
+creating the model.
+
+.. list-table::
+
+    * - .. figure:: driu-drive.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a DRIU model based on DRIVE
+
+           :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.xtest>`: DRIU model X-tested (:download:`pdf <driu-drive.pdf>`)
+      - .. figure:: driu-stare.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a DRIU model based on STARE
+
+           :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.xtest>`: DRIU model X-tested (:download:`pdf <driu-stare.pdf>`)
+    * - .. figure:: driu-chasedb1.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a DRIU model based on CHASE-DB1
+
+           :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.xtest>`: DRIU model X-tested (:download:`pdf <driu-chasedb1.pdf>`)
+      - .. figure:: driu-hrf.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a DRIU model based on HRF
+
+           :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.xtest>`: DRIU model X-tested (:download:`pdf <driu-hrf.pdf>`)
+    * - .. figure:: driu-iostar-vessel.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a DRIU model based on IOSTAR (vessel)
+
+           :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel_xtest>`: DRIU model X-tested (:download:`pdf <driu-iostar-vessel.pdf>`)
+      -
+
+
+M2U-Net
+=======
+
+
+.. list-table::
+   :header-rows: 2
+
+   * -
+     - drive
+     - stare
+     - chasedb1
+     - hrf
+     - iostar-vessel
+   * - Model / W x H
+     - 544 x 544
+     - 704 x 608
+     - 960 x 960
+     - 1648 x 1168
+     - 1024 x 1024
+   * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>` (`model <baselines_m2unet_drive_>`_)
+     - **0.804 (0.014)**
+     - 0.736 (0.144)
+     - 0.548 (0.055)
+     - 0.744 (0.058)
+     - 0.722 (0.036)
+   * - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>` (`model <baselines_m2unet_stare_>`_)
+     - 0.715 (0.031)
+     - **0.811 (0.039)**
+     - 0.632 (0.033)
+     - 0.765 (0.049)
+     - 0.673 (0.033)
+   * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>` (`model <baselines_m2unet_chase_>`_)
+     - 0.677 (0.027)
+     - 0.695 (0.099)
+     - **0.801 (0.018)**
+     - 0.763 (0.040)
+     - 0.761 (0.018)
+   * - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>` (`model <baselines_m2unet_hrf_>`_)
+     - 0.591 (0.071)
+     - 0.460 (0.230)
+     - 0.332 (0.108)
+     - **0.796 (0.043)**
+     - 0.419 (0.088)
+   * - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>` (`model <baselines_m2unet_iostar_>`_)
+     - 0.743 (0.019)
+     - 0.745 (0.076)
+     - 0.771 (0.030)
+     - 0.749 (0.052)
+     - **0.817 (0.021)**
+
+
+Next, you will find the PR plots showing confidence intervals for the various
+cross-tests explored, arranged per cross-tested model.  All curves correspond
+to test set performances.  Single performance figures (F1-micro scores)
+correspond to the average value across all test set images, for a fixed
+threshold set *a priori* on the training set of the database used for
+creating the model.
+
+.. list-table::
+
+    * - .. figure:: m2unet-drive.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a M2U-Net model based on DRIVE
+
+           :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-drive.pdf>`)
+      - .. figure:: m2unet-stare.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a M2U-Net model based on STARE
+
+           :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-stare.pdf>`)
+    * - .. figure:: m2unet-chasedb1.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a M2U-Net model based on CHASE-DB1
+
+           :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-chasedb1.pdf>`)
+      - .. figure:: m2unet-hrf.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a M2U-Net model based on HRF
+
+           :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-hrf.pdf>`)
+    * - .. figure:: m2unet-iostar-vessel.png
+           :align: center
+           :scale: 40%
+           :alt: X-tests for a M2U-Net model based on IOSTAR (vessel)
+
+           :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel_xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-iostar-vessel.pdf>`)
+      -
+
+
+
+Remarks
+-------
+
+* For each row, the peak performance is always obtained in an intra-database
+  test (training and testing on the same database).  Performance degrades
+  (albeit not catastrophically in most cases) on all other datasets in the
+  cross-test.
+* X-test performance of the model trained on HRF suggests a strong bias, as
+  performance does not generalize well to other (unseen) datasets.
+* Models trained on CHASE-DB1 and IOSTAR (vessel) seem to generalize quite
+  well to unseen data, when compared to the relatively poor generalization
+  capabilities of models trained on HRF or DRIVE.
 
 .. include:: ../../links.rst
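
The same recipe applies to the cross-database tests above, swapping in the
``-xtest`` resource variant of the dataset the model was trained on.  A
hypothetical invocation for a DRIVE-trained model, under the same assumptions
as the baseline example (positional configuration resource, ``<model-url>`` as
a placeholder for the published model URL)::

   bob binseg analyze drive-xtest --weight=<model-url>
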
diff --git a/doc/results/xtest/m2unet-chasedb1.pdf b/doc/results/xtest/m2unet-chasedb1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..22368ff4c89b9968a63a2c937ba1945f1dc881ec
Binary files /dev/null and b/doc/results/xtest/m2unet-chasedb1.pdf differ
diff --git a/doc/results/xtest/m2unet-chasedb1.png b/doc/results/xtest/m2unet-chasedb1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f7fbaffad64fd42012f2394e412b7f4183ba2f05
Binary files /dev/null and b/doc/results/xtest/m2unet-chasedb1.png differ
diff --git a/doc/results/xtest/m2unet-drive.pdf b/doc/results/xtest/m2unet-drive.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e8090cecb4c178646e0a0bb51da5b3a89f0b1548
Binary files /dev/null and b/doc/results/xtest/m2unet-drive.pdf differ
diff --git a/doc/results/xtest/m2unet-drive.png b/doc/results/xtest/m2unet-drive.png
new file mode 100644
index 0000000000000000000000000000000000000000..0b628ddfab5d6dc678d5f665f4e4eb3c7edec1fd
Binary files /dev/null and b/doc/results/xtest/m2unet-drive.png differ
diff --git a/doc/results/xtest/m2unet-hrf.pdf b/doc/results/xtest/m2unet-hrf.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..73d400cf279864de7fb3d372824606c2cdba79aa
Binary files /dev/null and b/doc/results/xtest/m2unet-hrf.pdf differ
diff --git a/doc/results/xtest/m2unet-hrf.png b/doc/results/xtest/m2unet-hrf.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab4bcb45f2fa74bd6fa3b6a575723d71160a5c32
Binary files /dev/null and b/doc/results/xtest/m2unet-hrf.png differ
diff --git a/doc/results/xtest/m2unet-iostar-vessel.pdf b/doc/results/xtest/m2unet-iostar-vessel.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..6a59bc6ea23d7d17d54d08db8c8735b54445aae0
Binary files /dev/null and b/doc/results/xtest/m2unet-iostar-vessel.pdf differ
diff --git a/doc/results/xtest/m2unet-iostar-vessel.png b/doc/results/xtest/m2unet-iostar-vessel.png
new file mode 100644
index 0000000000000000000000000000000000000000..df9cc400f92f759f752161788e28a31826c17c94
Binary files /dev/null and b/doc/results/xtest/m2unet-iostar-vessel.png differ
diff --git a/doc/results/xtest/m2unet-stare.pdf b/doc/results/xtest/m2unet-stare.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..127f8d2abbc8aafd33fcbdcd4cc9613bea15b3b0
Binary files /dev/null and b/doc/results/xtest/m2unet-stare.pdf differ
diff --git a/doc/results/xtest/m2unet-stare.png b/doc/results/xtest/m2unet-stare.png
new file mode 100644
index 0000000000000000000000000000000000000000..e80cd25d1bce4604f62358d7d01bdfb1d4f67c6d
Binary files /dev/null and b/doc/results/xtest/m2unet-stare.png differ