diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 26151579d0e040bc3a6015094eea5316d93fd395..07a7c86f8874dd0af50763d578a91b0a6b8a9f27 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -306,11 +306,11 @@ def run(
     # (avg_metrics["precision"]+avg_metrics["recall"])
 
     avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
     avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
     maxf1 = avg_metrics["f1_score"].max()
@@ -406,6 +406,7 @@ def compare_annotators(baseline, other, name, output_folder,
 
     # Merges all dataframes together
     df_metrics = pandas.concat(data.values())
+    df_metrics.drop(0, inplace=True)
 
     # Report and Averages
    avg_metrics = df_metrics.groupby("index").mean()
@@ -420,17 +421,13 @@ def compare_annotators(baseline, other, name, output_folder,
     # (avg_metrics["precision"]+avg_metrics["recall"])
 
     avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
     avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
-    # we actually only need to keep the second row of the pandas dataframe
-    # with threshold == 0.5 - the first row is redundant
-    avg_metrics.drop(0, inplace=True)
-
     metrics_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
     os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
     logger.info(f"Saving averages over all input images at {metrics_path}...")
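Note on the hunks above: ``pr_upper``/``pr_lower`` (and the recall analogues) are now derived directly from ``std_metrics``, rather than re-reading the ``std_pr``/``std_re`` columns just copied into ``avg_metrics``; the values are identical, with one less indirection. The redundant threshold row in ``compare_annotators()`` is now dropped once, right after concatenation, instead of after averaging. A minimal runnable sketch of this mean-plus-or-minus-std aggregation pattern follows; the column names mirror the hunks, while the dataframes and numbers are toy stand-ins, not the package's real data model:

import pandas

# Toy stand-in for the per-image dataframes that get concatenated in
# compare_annotators(): one row per (image, threshold-step) pair.  Only
# the column names come from the hunks above; the numbers are made up.
df_metrics = pandas.concat([
    pandas.DataFrame({"index": [0, 1], "precision": [0.80, 0.75],
                      "recall": [0.70, 0.78], "f1_score": [0.75, 0.76]}),
    pandas.DataFrame({"index": [0, 1], "precision": [0.90, 0.85],
                      "recall": [0.65, 0.72], "f1_score": [0.75, 0.78]}),
])

# Per-threshold mean and standard deviation over all images
avg_metrics = df_metrics.groupby("index").mean()
std_metrics = df_metrics.groupby("index").std()

# The fixed lines: band limits come straight from std_metrics
avg_metrics["std_pr"] = std_metrics["precision"]
avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]

print(avg_metrics[["precision", "pr_lower", "pr_upper"]])

Plotting code can then shade the region between ``pr_lower`` and ``pr_upper`` around the mean precision at each threshold, which is what the "confidence intervals" in the documentation below refer to.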
diff --git a/doc/results/baselines/index.rst b/doc/results/baselines/index.rst
index 97afcdd5ae66d0996de9de0d8320695ec07169ef..ce5361fbe43cf1da75b49bcdf03a673f3efce830 100644
--- a/doc/results/baselines/index.rst
+++ b/doc/results/baselines/index.rst
@@ -10,8 +10,10 @@ F1 Scores (micro-level)
 -----------------------
 
 * Benchmark results for models: DRIU, HED, M2U-Net and U-Net.
-* Models are trained and tested on the same dataset (numbers in parenthesis
-  indicate number of parameters per model)
+* Models are trained and tested on the same dataset (**numbers in bold**
+  indicate the number of parameters per model).  Models are trained for a
+  fixed 1000 epochs, with a learning rate of 0.001 until epoch 900 and then
+  0.0001 until the end of training.
 * Database and model resource configuration links (table top row and left
   column) are linked to the originating configuration files used to obtain
   these results.
@@ -21,24 +23,26 @@ F1 Scores (micro-level)
   where the threshold is previously selected on the training set
 * You can cross check the analysis numbers provided in this table by
   downloading this software package, the raw data, and running ``bob binseg
-  analyze`` providing the model URL as ``--weight`` parameter.  Otherwise, we
-  also provide `CSV files
-  <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/baselines/>`_
-  with the estimated performance per threshold (100
-  steps) per subset.
+  analyze`` providing the model URL as ``--weight`` parameter.
 * For comparison purposes, we provide "second-annotator" performances on the
   same test set, where available.
 
 
 .. list-table::
-   :header-rows: 1
+   :header-rows: 2
 
+   * -
+     -
+     - :py:mod:`driu <bob.ip.binseg.configs.models.driu>`
+     - :py:mod:`hed <bob.ip.binseg.configs.models.hed>`
+     - :py:mod:`m2unet <bob.ip.binseg.configs.models.m2unet>`
+     - :py:mod:`unet <bob.ip.binseg.configs.models.unet>`
    * - Dataset
      - 2nd. Annot.
-     - :py:mod:`driu (15M) <bob.ip.binseg.configs.models.driu>`
-     - :py:mod:`hed (14.7M) <bob.ip.binseg.configs.models.hed>`
-     - :py:mod:`m2unet (0.55M) <bob.ip.binseg.configs.models.m2unet>`
-     - :py:mod:`unet (25.8M) <bob.ip.binseg.configs.models.unet>`
+     - 15M
+     - 14.7M
+     - 0.55M
+     - 25.8M
    * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`
      - 0.788 (0.021)
      - `0.819 (0.016) <baselines_driu_drive_>`_
@@ -52,7 +56,7 @@ F1 Scores (micro-level)
      - `0.811 (0.039) <baselines_m2unet_stare_>`_
      - `0.828 (0.041) <baselines_unet_stare_>`_
    * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`
-     - 0.768 0.023
+     - 0.768 (0.023)
      - `0.811 (0.018) <baselines_driu_chase_>`_
      - `0.806 (0.021) <baselines_hed_chase_>`_
      - `0.801 (0.018) <baselines_m2unet_chase_>`_
@@ -80,39 +84,53 @@ set performances.  Single performance figures (F1-micro scores) correspond
 to its average value across all test set images, for a fixed threshold set to
 ``0.5``.
 
-.. figure:: drive.png
-   :align: center
-   :alt: Model comparisons for drive datasets
-
-   :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <drive.pdf>`)
-
-
-.. figure:: stare.png
-   :align: center
-   :alt: Model comparisons for stare datasets
-
-   :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`: PR curve and F1 scores at T=0.5 (:download:`pdf <stare.pdf>`)
-
-
-.. figure:: chasedb1.png
-   :align: center
-   :alt: Model comparisons for chasedb1 datasets
-
-   :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`: PR curve and F1 scores at T=0.5 (:download:`pdf <chasedb1.pdf>`)
-
-
-.. figure:: hrf.png
-   :align: center
-   :alt: Model comparisons for hrf datasets
-
-   :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <hrf.pdf>`)
-
-
-.. figure:: iostar-vessel.png
-   :align: center
-   :alt: Model comparisons for iostar-vessel datasets
-
-   :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`: PR curve and F1 scores at T=0.5 (:download:`pdf <iostar-vessel.pdf>`)
+.. list-table::
+
+   * - .. figure:: drive.png
+          :align: center
+          :scale: 50%
+          :alt: Model comparisons for drive datasets
+
+          :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <drive.pdf>`)
+     - .. figure:: stare.png
+          :align: center
+          :scale: 50%
+          :alt: Model comparisons for stare datasets
+
+          :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>`: PR curve and F1 scores at T=0.5 (:download:`pdf <stare.pdf>`)
+   * - .. figure:: chasedb1.png
+          :align: center
+          :scale: 50%
+          :alt: Model comparisons for chasedb1 datasets
+
+          :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>`: PR curve and F1 scores at T=0.5 (:download:`pdf <chasedb1.pdf>`)
+     - .. figure:: hrf.png
+          :align: center
+          :scale: 50%
+          :alt: Model comparisons for hrf datasets
+
+          :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>`: PR curve and F1 scores at T=0.5 (:download:`pdf <hrf.pdf>`)
+   * - .. figure:: iostar-vessel.png
+          :align: center
+          :scale: 50%
+          :alt: Model comparisons for iostar-vessel datasets
+
+          :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>`: PR curve and F1 scores at T=0.5 (:download:`pdf <iostar-vessel.pdf>`)
+     -
+
+
+Remarks
+-------
+
+* There seems to be no clear winner, as confidence intervals based on the
+  standard deviation overlap substantially between the different models, and
+  across different datasets.
+* The number of parameters seems to have almost no effect on performance:
+  U-Net, the largest model, is not a clear winner across all baseline
+  benchmarks.
+* Where second-annotator labels exist, model performance and variability seem
+  on par with such annotations.  One possible exception is CHASE-DB1, where
+  models show consistently less variability than the second annotator.
+  This observation is, however, not conclusive.
 
 .. include:: ../../links.rst
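On reading the table above: each cell reports the mean, over all test-set images, of the per-image F1-micro score at a fixed threshold of 0.5, with what the new remarks describe as the standard deviation in parentheses. A toy illustration of that convention, with made-up per-image scores:

import statistics

# One F1-micro score per test image, at threshold T=0.5 (made-up values)
per_image_f1 = [0.81, 0.77, 0.79, 0.75, 0.82]

mean_f1 = statistics.mean(per_image_f1)
std_f1 = statistics.stdev(per_image_f1)

# Same "mean (std)" layout as the table cells, e.g. "0.788 (0.029)"
print(f"{mean_f1:.3f} ({std_f1:.3f})")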
diff --git a/doc/results/xtest/driu-chasedb1.pdf b/doc/results/xtest/driu-chasedb1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bb28aafb8479d2f07336a7dfdcdd16fa148f5117
Binary files /dev/null and b/doc/results/xtest/driu-chasedb1.pdf differ
diff --git a/doc/results/xtest/driu-chasedb1.png b/doc/results/xtest/driu-chasedb1.png
new file mode 100644
index 0000000000000000000000000000000000000000..be26e9f8e8b140aaca692c417892abb515a180f4
Binary files /dev/null and b/doc/results/xtest/driu-chasedb1.png differ
diff --git a/doc/results/xtest/driu-drive.pdf b/doc/results/xtest/driu-drive.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1f9fc10b1ffa612f6f889666598620f1007ef03f
Binary files /dev/null and b/doc/results/xtest/driu-drive.pdf differ
diff --git a/doc/results/xtest/driu-drive.png b/doc/results/xtest/driu-drive.png
new file mode 100644
index 0000000000000000000000000000000000000000..fba68683bf3a43a19a0a99c2f2f63d3f5d219473
Binary files /dev/null and b/doc/results/xtest/driu-drive.png differ
diff --git a/doc/results/xtest/driu-hrf.pdf b/doc/results/xtest/driu-hrf.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..01d78c98e041d4db603bce19ad549811b3cc3a81
Binary files /dev/null and b/doc/results/xtest/driu-hrf.pdf differ
diff --git a/doc/results/xtest/driu-hrf.png b/doc/results/xtest/driu-hrf.png
new file mode 100644
index 0000000000000000000000000000000000000000..0cbd94c9f9c9ffa71f5f513cd8f997070bc979ee
Binary files /dev/null and b/doc/results/xtest/driu-hrf.png differ
diff --git a/doc/results/xtest/driu-iostar-vessel.pdf b/doc/results/xtest/driu-iostar-vessel.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..db822d0f5c53d1579b8c71f3e339eefe2519e582
Binary files /dev/null and b/doc/results/xtest/driu-iostar-vessel.pdf differ
diff --git a/doc/results/xtest/driu-iostar-vessel.png b/doc/results/xtest/driu-iostar-vessel.png
new file mode 100644
index 0000000000000000000000000000000000000000..5842c71646109aecb0456cb20f99783b3b3bda0d
Binary files /dev/null and b/doc/results/xtest/driu-iostar-vessel.png differ
diff --git a/doc/results/xtest/driu-stare.pdf b/doc/results/xtest/driu-stare.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f44d12dafca83b3e2d3af52a48244ad1bd43365c
Binary files /dev/null and b/doc/results/xtest/driu-stare.pdf differ
diff --git a/doc/results/xtest/driu-stare.png b/doc/results/xtest/driu-stare.png
new file mode 100644
index 0000000000000000000000000000000000000000..6573b820be3e2402e732c45d1c106f0abb103aef
Binary files /dev/null and b/doc/results/xtest/driu-stare.png differ
diff --git a/doc/results/xtest/index.rst b/doc/results/xtest/index.rst
index 37e5acb1dcd01afa7693c3300ea26d91a9f8f68d..e65c9e9997e8dca62e0a6fcfd0bea45bae11906d 100644
--- a/doc/results/xtest/index.rst
+++ b/doc/results/xtest/index.rst
@@ -2,29 +2,29 @@
 
 .. _bob.ip.binseg.results.xtest:
 
-======================
- Cross-Database Tests
-======================
+==========================
+ Cross-Database (X-)Tests
+==========================
 
 F1 Scores (micro-level)
 -----------------------
 
-* Benchmark results for models: DRIU, HED, M2U-Net and U-Net.
 * Models are trained and tested on the same dataset (numbers in parenthesis
   indicate number of parameters per model), and then evaluated across the test
-  sets of other datasets.
+  sets of other databases.  X-tested datasets therefore represent *unseen*
+  data and can be a good proxy for generalization analysis.
+* Each table row indicates a base trained model and each column the databases
+  the model was tested against.  The native (intra-database) performance is
+  marked **in bold**.  Thresholds are chosen *a priori* on the training set of
+  the database used to generate the model being cross-tested.  Hence, the
+  threshold used for all experiments in the same row is always the same.
 * You can cross check the analysis numbers provided in this table by
   downloading this software package, the raw data, and running ``bob binseg
   analyze`` providing the model URL as ``--weight`` parameter, and then the
   ``-xtest`` resource variant of the dataset the model was trained on.  For
   example, to run cross-evaluation tests for the DRIVE dataset, use the
   configuration resource :py:mod:`drive-xtest
-  <bob.ip.binseg.configs.datasets.drive.xtest>`.  Otherwise, we
-  also provide `CSV files
-  <https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/xtest/>`_
-  with the estimated performance per threshold (100 steps) per subset.
-* For comparison purposes, we provide "second-annotator" performances on the
-  same test set, where available.
+  <bob.ip.binseg.configs.datasets.drive.xtest>`.
 * We only show results for DRIU (~15.4 million parameters) and M2U-Net (~550
   thousand parameters) as these models seem to represent the performance
   extremes according to our :ref:`baseline analysis
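The threshold rule stated in the new bullets above boils down to: pick the threshold that maximizes mean F1 on the training set of the database the model was fitted on, then reuse it unchanged for every cross-test in that table row. A toy sketch with made-up F1 values (the real evaluation sweeps on the order of 100 threshold steps):

# Hypothetical mean-F1 values measured on the *training* set of the
# database the model was fitted on, one entry per candidate threshold
train_mean_f1 = {0.3: 0.71, 0.4: 0.78, 0.5: 0.80, 0.6: 0.79, 0.7: 0.74}

# Pick the threshold maximizing training-set F1...
threshold = max(train_mean_f1, key=train_mean_f1.get)

# ...and reuse it, unchanged, for every cross-test in the same table row
print(f"evaluating stare, chasedb1, hrf, ... at threshold {threshold}")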
@@ -43,48 +43,199 @@ DRIU
 .. list-table::
-   :header-rows: 1
-
-   * - Model / X-Test
-     - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.xtest>`
-     - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.xtest>`
-     - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.xtest>`
-     - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.xtest>`
-     - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel_xtest>`
-   * - `drive <baselines_driu_drive_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `stare <baselines_driu_stare_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `chasedb1 <baselines_driu_chase_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `hrf <baselines_driu_hrf_>`_
-     -
-     -
-     -
-     -
-     -
-   * - `iostar-vessel <baselines_driu_iostar_>`_
-     -
-     -
-     -
-     -
-     -
-
-
-Precision-Recall (PR) Curves
-----------------------------
+   :header-rows: 2
+
+   * -
+     - drive
+     - stare
+     - chasedb1
+     - hrf
+     - iostar-vessel
+   * - Model / W x H
+     - 544 x 544
+     - 704 x 608
+     - 960 x 960
+     - 1648 x 1168
+     - 1024 x 1024
+   * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>` (`model <baselines_driu_drive_>`_)
+     - **0.819 (0.016)**
+     - 0.759 (0.151)
+     - 0.321 (0.068)
+     - 0.711 (0.067)
+     - 0.493 (0.049)
+   * - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>` (`model <baselines_driu_stare_>`_)
+     - 0.733 (0.037)
+     - **0.824 (0.037)**
+     - 0.491 (0.094)
+     - 0.773 (0.051)
+     - 0.469 (0.055)
+   * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>` (`model <baselines_driu_chase_>`_)
+     - 0.730 (0.023)
+     - 0.730 (0.101)
+     - **0.811 (0.018)**
+     - 0.779 (0.043)
+     - 0.774 (0.019)
+   * - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>` (`model <baselines_driu_hrf_>`_)
+     - 0.702 (0.038)
+     - 0.641 (0.160)
+     - 0.600 (0.072)
+     - **0.802 (0.039)**
+     - 0.546 (0.078)
+   * - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>` (`model <baselines_driu_iostar_>`_)
+     - 0.758 (0.019)
+     - 0.724 (0.115)
+     - 0.777 (0.032)
+     - 0.727 (0.059)
+     - **0.825 (0.021)**
+
+
+Next, you will find the PR plots showing confidence intervals for the various
+cross-tests explored, arranged per cross-tested model.  All curves correspond
+to test set performances.  Single performance figures (F1-micro scores)
+correspond to the average value across all test set images, for a fixed
+threshold set *a priori* on the training set of the dataset used to create
+the model.
+
+.. list-table::
+
+   * - .. figure:: driu-drive.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for a DRIU model based on DRIVE
+
+          :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.xtest>`: DRIU model X-tested (:download:`pdf <driu-drive.pdf>`)
+     - .. figure:: driu-stare.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for a DRIU model based on STARE
+
+          :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.xtest>`: DRIU model X-tested (:download:`pdf <driu-stare.pdf>`)
+   * - .. figure:: driu-chasedb1.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for a DRIU model based on CHASE-DB1
+
+          :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.xtest>`: DRIU model X-tested (:download:`pdf <driu-chasedb1.pdf>`)
+     - .. figure:: driu-hrf.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for a DRIU model based on HRF
+
+          :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.xtest>`: DRIU model X-tested (:download:`pdf <driu-hrf.pdf>`)
+   * - .. figure:: driu-iostar-vessel.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for a DRIU model based on IOSTAR (vessel)
+
+          :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel_xtest>`: DRIU model X-tested (:download:`pdf <driu-iostar-vessel.pdf>`)
+     -
+
+
+M2U-Net
+=======
+
+
+.. list-table::
+   :header-rows: 2
+
+   * -
+     - drive
+     - stare
+     - chasedb1
+     - hrf
+     - iostar-vessel
+   * - Model / W x H
+     - 544 x 544
+     - 704 x 608
+     - 960 x 960
+     - 1648 x 1168
+     - 1024 x 1024
+   * - :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.default>` (`model <baselines_m2unet_drive_>`_)
+     - **0.804 (0.014)**
+     - 0.736 (0.144)
+     - 0.548 (0.055)
+     - 0.744 (0.058)
+     - 0.722 (0.036)
+   * - :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.ah>` (`model <baselines_m2unet_stare_>`_)
+     - 0.715 (0.031)
+     - **0.811 (0.039)**
+     - 0.632 (0.033)
+     - 0.765 (0.049)
+     - 0.673 (0.033)
+   * - :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.first_annotator>` (`model <baselines_m2unet_chase_>`_)
+     - 0.677 (0.027)
+     - 0.695 (0.099)
+     - **0.801 (0.018)**
+     - 0.763 (0.040)
+     - 0.761 (0.018)
+   * - :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.default>` (`model <baselines_m2unet_hrf_>`_)
+     - 0.591 (0.071)
+     - 0.460 (0.230)
+     - 0.332 (0.108)
+     - **0.796 (0.043)**
+     - 0.419 (0.088)
+   * - :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel>` (`model <baselines_m2unet_iostar_>`_)
+     - 0.743 (0.019)
+     - 0.745 (0.076)
+     - 0.771 (0.030)
+     - 0.749 (0.052)
+     - **0.817 (0.021)**
+
+
+Next, you will find the PR plots showing confidence intervals for the various
+cross-tests explored, arranged per cross-tested model.  All curves correspond
+to test set performances.  Single performance figures (F1-micro scores)
+correspond to the average value across all test set images, for a fixed
+threshold set *a priori* on the training set of the dataset used to create
+the model.
+
+.. list-table::
+
+   * - .. figure:: m2unet-drive.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for an M2U-Net model based on DRIVE
+
+          :py:mod:`drive <bob.ip.binseg.configs.datasets.drive.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-drive.pdf>`)
+     - .. figure:: m2unet-stare.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for an M2U-Net model based on STARE
+
+          :py:mod:`stare <bob.ip.binseg.configs.datasets.stare.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-stare.pdf>`)
+   * - .. figure:: m2unet-chasedb1.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for an M2U-Net model based on CHASE-DB1
+
+          :py:mod:`chasedb1 <bob.ip.binseg.configs.datasets.chasedb1.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-chasedb1.pdf>`)
+     - .. figure:: m2unet-hrf.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for an M2U-Net model based on HRF
+
+          :py:mod:`hrf <bob.ip.binseg.configs.datasets.hrf.xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-hrf.pdf>`)
+   * - .. figure:: m2unet-iostar-vessel.png
+          :align: center
+          :scale: 40%
+          :alt: X-tests for an M2U-Net model based on IOSTAR (vessel)
+
+          :py:mod:`iostar-vessel <bob.ip.binseg.configs.datasets.iostar.vessel_xtest>`: M2U-Net model X-tested (:download:`pdf <m2unet-iostar-vessel.pdf>`)
+     -
+
+
+
+Remarks
+-------
+
+* For each row, the peak performance is always obtained in an intra-database
+  test (training and testing on the same database).  Conversely, we observe a
+  performance degradation (albeit not catastrophic in most cases) for all
+  other datasets in the cross-tests.
+* X-test performance of a model created from HRF suggests a strong bias, as
+  performance does not generalize well to other (unseen) datasets.
+* Models generated from CHASE-DB1 and IOSTAR (vessel) seem to generalize
+  quite well to unseen data, when compared to the relatively poor
+  generalization capabilities of models generated from HRF or DRIVE.
 
 .. include:: ../../links.rst
diff --git a/doc/results/xtest/m2unet-chasedb1.pdf b/doc/results/xtest/m2unet-chasedb1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..22368ff4c89b9968a63a2c937ba1945f1dc881ec
Binary files /dev/null and b/doc/results/xtest/m2unet-chasedb1.pdf differ
diff --git a/doc/results/xtest/m2unet-chasedb1.png b/doc/results/xtest/m2unet-chasedb1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f7fbaffad64fd42012f2394e412b7f4183ba2f05
Binary files /dev/null and b/doc/results/xtest/m2unet-chasedb1.png differ
diff --git a/doc/results/xtest/m2unet-drive.pdf b/doc/results/xtest/m2unet-drive.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e8090cecb4c178646e0a0bb51da5b3a89f0b1548
Binary files /dev/null and b/doc/results/xtest/m2unet-drive.pdf differ
diff --git a/doc/results/xtest/m2unet-drive.png b/doc/results/xtest/m2unet-drive.png
new file mode 100644
index 0000000000000000000000000000000000000000..0b628ddfab5d6dc678d5f665f4e4eb3c7edec1fd
Binary files /dev/null and b/doc/results/xtest/m2unet-drive.png differ
diff --git a/doc/results/xtest/m2unet-hrf.pdf b/doc/results/xtest/m2unet-hrf.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..73d400cf279864de7fb3d372824606c2cdba79aa
Binary files /dev/null and b/doc/results/xtest/m2unet-hrf.pdf differ
diff --git a/doc/results/xtest/m2unet-hrf.png b/doc/results/xtest/m2unet-hrf.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab4bcb45f2fa74bd6fa3b6a575723d71160a5c32
Binary files /dev/null and b/doc/results/xtest/m2unet-hrf.png differ
diff --git a/doc/results/xtest/m2unet-iostar-vessel.pdf b/doc/results/xtest/m2unet-iostar-vessel.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..6a59bc6ea23d7d17d54d08db8c8735b54445aae0
Binary files /dev/null and b/doc/results/xtest/m2unet-iostar-vessel.pdf differ
diff --git a/doc/results/xtest/m2unet-iostar-vessel.png b/doc/results/xtest/m2unet-iostar-vessel.png
new file mode 100644
index 0000000000000000000000000000000000000000..df9cc400f92f759f752161788e28a31826c17c94
Binary files /dev/null and b/doc/results/xtest/m2unet-iostar-vessel.png differ
diff --git a/doc/results/xtest/m2unet-stare.pdf b/doc/results/xtest/m2unet-stare.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..127f8d2abbc8aafd33fcbdcd4cc9613bea15b3b0
Binary files /dev/null and b/doc/results/xtest/m2unet-stare.pdf differ
diff --git a/doc/results/xtest/m2unet-stare.png b/doc/results/xtest/m2unet-stare.png
new file mode 100644
index 0000000000000000000000000000000000000000..e80cd25d1bce4604f62358d7d01bdfb1d4f67c6d
Binary files /dev/null and b/doc/results/xtest/m2unet-stare.png differ
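A quick numerical check of the remarks in doc/results/xtest/index.rst: subtracting, per table row, the mean of the four cross-database F1 scores from the intra-database (bold) score gives a rough generalization gap. The snippet below hardcodes the DRIU numbers from the table above:

# Intra-database F1 and the four cross-database F1 means per DRIU row,
# copied from the table in doc/results/xtest/index.rst
rows = {
    "drive":         (0.819, [0.759, 0.321, 0.711, 0.493]),
    "stare":         (0.824, [0.733, 0.491, 0.773, 0.469]),
    "chasedb1":      (0.811, [0.730, 0.730, 0.779, 0.774]),
    "hrf":           (0.802, [0.702, 0.641, 0.600, 0.546]),
    "iostar-vessel": (0.825, [0.758, 0.724, 0.777, 0.727]),
}

for name, (intra, cross) in rows.items():
    gap = intra - sum(cross) / len(cross)
    print(f"{name:14s} generalization gap = {gap:.3f}")

# chasedb1 (0.058) and iostar-vessel (0.079) show by far the smallest gaps,
# consistent with the remark that these models generalize best to unseen data.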