From 6b6fa4d8ce4ad71e584a05dc918ebeed19ae083c Mon Sep 17 00:00:00 2001 From: Andre Anjos <andre.dos.anjos@gmail.com> Date: Fri, 17 Apr 2020 15:41:39 +0200 Subject: [PATCH] [doc] Fix result section --- doc/baselines.rst | 59 ++++++++++++++++++++++++ doc/covd.rst | 115 ++++++++++++++++++++++++++++++++++++++++++++++ doc/index.rst | 3 +- doc/results.rst | 22 +++++++++ doc/usage.rst | 6 +-- 5 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 doc/baselines.rst create mode 100644 doc/covd.rst create mode 100644 doc/results.rst diff --git a/doc/baselines.rst b/doc/baselines.rst new file mode 100644 index 00000000..622aae3d --- /dev/null +++ b/doc/baselines.rst @@ -0,0 +1,59 @@ +.. -*- coding: utf-8 -*- + +.. _bob.ip.binseg.results.baselines: + +=================== + Baseline Results +=================== + +F1 Scores (micro-level) +----------------------- + +* Benchmark results for models: DRIU, HED, M2U-Net and U-Net. +* Models are trained and tested on the same dataset using the + train-test split as indicated in :ref:`bob.ip.binseg.configs.datasets` (i.e., + these are *intra*-datasets tests) +* Standard-deviations across all test images are indicated in brackets +* Database and model names (table top row and left column) are linked to the + originating configuration files used to obtain these results. +* For some results, the actual deep neural network models are provided (by + clicking on the associated F1 Score). +* Check `our paper`_ for details on the calculation of the F1 Score and standard + deviations. + +.. 
list-table:: + :header-rows: 1 + + * - F1 (std) + - :py:mod:`DRIU <bob.ip.binseg.configs.models.driu>` + - :py:mod:`HED <bob.ip.binseg.configs.models.hed>` + - :py:mod:`M2U-Net <bob.ip.binseg.configs.models.m2unet>` + - :py:mod:`U-Net <bob.ip.binseg.configs.models.unet>` + * - :py:mod:`CHASE-DB1 <bob.ip.binseg.configs.datasets.chasedb1>` + - `0.810 (0.021) <driu_chasedb1.pth_>`_ + - 0.810 (0.022) + - `0.802 (0.019) <m2unet_chasedb1.pth_>`_ + - 0.812 (0.020) + * - :py:mod:`DRIVE <bob.ip.binseg.configs.datasets.drive>` + - `0.820 (0.014) <driu_drive.pth_>`_ + - 0.817 (0.013) + - `0.803 (0.014) <m2unet_drive.pth_>`_ + - 0.822 (0.015) + * - :py:mod:`HRF <bob.ip.binseg.configs.datasets.hrf_1168>` + - `0.783 (0.055) <driu_hrf.pth_>`_ + - 0.783 (0.058) + - `0.780 (0.057) <m2unet_hrf.pth_>`_ + - 0.788 (0.051) + * - :py:mod:`IOSTAR (vessel) <bob.ip.binseg.configs.datasets.iostar_vessel>` + - `0.825 (0.020) <driu_iostar.pth_>`_ + - 0.825 (0.020) + - `0.817 (0.020) <m2unet_iostar.pth_>`_ + - 0.818 (0.019) + * - :py:mod:`STARE <bob.ip.binseg.configs.datasets.stare>` + - `0.827 (0.037) <driu_stare.pth_>`_ + - 0.823 (0.037) + - `0.815 (0.041) <m2unet_stare.pth_>`_ + - 0.829 (0.042) + + +.. include:: links.rst diff --git a/doc/covd.rst b/doc/covd.rst new file mode 100644 index 00000000..3abe8445 --- /dev/null +++ b/doc/covd.rst @@ -0,0 +1,115 @@ +.. -*- coding: utf-8 -*- + +.. _bob.ip.binseg.covdresults: + +============================ + COVD- and COVD-SSL Results +============================ + +In addition to the M2U-Net architecture, we also evaluated the larger DRIU +network and a variation of it that contains batch normalization (DRIU+BN) on +COVD- (Combined Vessel Dataset from all training data minus target test set) +and COVD-SSL (COVD- and Semi-Supervised Learning). Perhaps surprisingly, for +the majority of combinations, the performance of the DRIU variants is roughly +equal to or worse than those obtained with the much smaller M2U-Net. 
We +anticipate that one reason for this could be overparameterization of large +VGG-16 models that are pretrained on ImageNet. + + +F1 Scores +--------- + +Comparison of F1 Scores (micro-level and standard deviation) of DRIU and +M2U-Net on COVD- and COVD-SSL. Standard deviation across test-images in +brackets. + +.. list-table:: + :header-rows: 1 + + * - F1 score + - :py:mod:`DRIU <bob.ip.binseg.configs.models.driu>`/:py:mod:`DRIU@SSL <bob.ip.binseg.configs.models.driu_ssl>` + - :py:mod:`DRIU+BN <bob.ip.binseg.configs.models.driu_bn>`/:py:mod:`DRIU+BN@SSL <bob.ip.binseg.configs.models.driu_bn_ssl>` + - :py:mod:`M2U-Net <bob.ip.binseg.configs.models.m2unet>`/:py:mod:`M2U-Net@SSL <bob.ip.binseg.configs.models.m2unet_ssl>` + * - :py:mod:`COVD-DRIVE <bob.ip.binseg.configs.datasets.covd_drive>` + - 0.788 (0.018) + - 0.797 (0.019) + - `0.789 (0.018) <m2unet_covd-drive.pth>`_ + * - :py:mod:`COVD-DRIVE+SSL <bob.ip.binseg.configs.datasets.covd_drive_ssl>` + - 0.785 (0.018) + - 0.783 (0.019) + - `0.791 (0.014) <m2unet_covd-drive_ssl.pth>`_ + * - :py:mod:`COVD-STARE <bob.ip.binseg.configs.datasets.covd_stare>` + - 0.778 (0.117) + - 0.778 (0.122) + - `0.812 (0.046) <m2unet_covd-stare.pth>`_ + * - :py:mod:`COVD-STARE+SSL <bob.ip.binseg.configs.datasets.covd_stare_ssl>` + - 0.788 (0.102) + - 0.811 (0.074) + - `0.820 (0.044) <m2unet_covd-stare_ssl.pth>`_ + * - :py:mod:`COVD-CHASEDB1 <bob.ip.binseg.configs.datasets.covd_chasedb1>` + - 0.796 (0.027) + - 0.791 (0.025) + - `0.788 (0.024) <m2unet_covd-chasedb1.pth>`_ + * - :py:mod:`COVD-CHASEDB1+SSL <bob.ip.binseg.configs.datasets.covd_chasedb1_ssl>` + - 0.796 (0.024) + - 0.798 (0.025) + - `0.799 (0.026) <m2unet_covd-chasedb1_ssl.pth>`_ + * - :py:mod:`COVD-HRF <bob.ip.binseg.configs.datasets.covd_hrf>` + - 0.799 (0.044) + - 0.800 (0.045) + - `0.802 (0.045) <m2unet_covd-hrf.pth>`_ + * - :py:mod:`COVD-HRF+SSL <bob.ip.binseg.configs.datasets.covd_hrf_ssl>` + - 0.799 (0.044) + - 0.784 (0.048) + - `0.797 (0.044) 
<m2unet_covd-hrf_ssl.pth>`_ + * - :py:mod:`COVD-IOSTAR-VESSEL <bob.ip.binseg.configs.datasets.covd_iostar_vessel>` + - 0.791 (0.021) + - 0.777 (0.032) + - `0.793 (0.015) <m2unet_covd-iostar.pth>`_ + * - :py:mod:`COVD-IOSTAR-VESSEL+SSL <bob.ip.binseg.configs.datasets.covd_iostar_vessel_ssl>` + - 0.797 (0.017) + - 0.811 (0.074) + - `0.785 (0.018) <m2unet_covd-iostar_ssl.pth>`_ + + +M2U-Net Precision vs. Recall Curves +----------------------------------- + +Precision vs. recall curves for each evaluated dataset. Note that here the +F1-score is calculated on a macro level (see paper for more details). + +.. figure:: img/pr_CHASEDB1.png + :scale: 50 % + :align: center + :alt: model comparisons + + CHASE_DB1: Precision vs Recall curve and F1 scores + +.. figure:: img/pr_DRIVE.png + :scale: 50 % + :align: center + :alt: model comparisons + + DRIVE: Precision vs Recall curve and F1 scores + +.. figure:: img/pr_HRF.png + :scale: 50 % + :align: center + :alt: model comparisons + + HRF: Precision vs Recall curve and F1 scores + +.. figure:: img/pr_IOSTARVESSEL.png + :scale: 50 % + :align: center + :alt: model comparisons + + IOSTAR: Precision vs Recall curve and F1 scores + +.. figure:: img/pr_STARE.png + :scale: 50 % + :align: center + :alt: model comparisons + + STARE: Precision vs Recall curve and F1 scores + diff --git a/doc/index.rst b/doc/index.rst index 2e11ada6..0cccd77f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -43,8 +43,7 @@ User Guide setup usage - benchmarkresults - covdresults + results acknowledgements references datasets diff --git a/doc/results.rst b/doc/results.rst new file mode 100644 index 00000000..0fcc3f46 --- /dev/null +++ b/doc/results.rst @@ -0,0 +1,22 @@ +.. -*- coding: utf-8 -*- + +.. _bob.ip.binseg.results: + +========= + Results +========= + +This section summarizes results that can be obtained with this package, and +were presented in our paper. 
We organize the result section in two parts, for +covering baseline results (training and testing on the same dataset) and +results using our Combined Vessel Dataset minus target dataset (COVD-) training +strategy. + +.. toctree:: + :maxdepth: 2 + + baselines + covd + + +.. include:: links.rst diff --git a/doc/usage.rst b/doc/usage.rst index d9c1ef87..be0b7f89 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -2,9 +2,9 @@ .. _bob.ip.binseg.usage: -================== - Usage Guidelines -================== +======= + Usage +======= This package supports a fully reproducible research experimentation cycle for semantic binary segmentation with support for the following activities: -- GitLab