From 3e8932234ae52683a465350f20cf64c908e6b200 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Wed, 21 Aug 2024 14:48:30 +0200
Subject: [PATCH] [doc] Clean-up installation documentation; Move all repeated
 documentation to database docstrings

---
 doc/index.rst                           |   2 +-
 doc/install.rst                         | 371 ------------------------
 doc/links.rst                           |   5 +-
 doc/{models/classify.rst => models.rst} |  54 +++-
 doc/models/index.rst                    |  17 --
 doc/models/segment.rst                  |  44 ---
 doc/references.rst                      |  15 -
 src/mednet/data/classify/hivtb.py       |  23 +-
 src/mednet/data/classify/indian.py      |  31 +-
 src/mednet/data/classify/montgomery.py  |  12 +-
 src/mednet/data/classify/nih_cxr14.py   |  24 +-
 src/mednet/data/classify/shenzhen.py    |  22 +-
 src/mednet/data/classify/tbpoc.py       |  15 +-
 src/mednet/data/classify/tbx11k.py      |  10 +
 src/mednet/data/segment/chasedb1.py     |  27 +-
 src/mednet/data/segment/cxr8.py         |  52 +++-
 src/mednet/data/segment/drive.py        |  28 +-
 src/mednet/data/segment/hrf.py          |  30 +-
 src/mednet/data/segment/jsrt.py         |  52 +++-
 src/mednet/data/segment/montgomery.py   |  14 +-
 src/mednet/data/segment/refuge.py       |  45 +--
 src/mednet/data/segment/shenzhen.py     |  55 +++-
 src/mednet/data/segment/stare.py        |  30 +-
 src/mednet/scripts/database.py          |  30 +-
 src/mednet/scripts/utils.py             |   7 +-
 25 files changed, 412 insertions(+), 603 deletions(-)
 rename doc/{models/classify.rst => models.rst} (50%)
 delete mode 100644 doc/models/index.rst
 delete mode 100644 doc/models/segment.rst

diff --git a/doc/index.rst b/doc/index.rst
index bfaedab5..aff3c441 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -74,7 +74,7 @@ User Guide
    baselines
    data-model
    databases/index
-   models/index
+   models
    references
    cli
    api
diff --git a/doc/install.rst b/doc/install.rst
index ff05a1fc..c2478b78 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -194,375 +194,4 @@ A list of out-of-the-box supported data modules for :ref:`classification
 is available in this guide.
 
 
-The following databases contain only the tuberculosis final diagnosis (0 or 1).
-In addition to the splits presented in the following table, 10 folds
-(for cross-validation) randomly generated are available for these databases.
-
-.. list-table::
-
-   * - Database
-     - Reference
-     - H x W
-     - Samples
-     - Training
-     - Validation
-     - Test
-   * - Montgomery_
-     - [MONTGOMERY-SHENZHEN-2014]_
-     - 4020 x 4892
-     - 138
-     - 88
-     - 22
-     - 28
-   * - Shenzhen_
-     - [MONTGOMERY-SHENZHEN-2014]_
-     - Varying
-     - 662
-     - 422
-     - 107
-     - 133
-   * - Indian_
-     - [INDIAN-2013]_
-     - Varying
-     - 155
-     - 83
-     - 20
-     - 52
-
-
-.. _mednet.setup.databases.tb+signs:
-
-Tuberculosis multilabel databases
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following databases contain the labels healthy, sick & non-TB, active TB,
-and latent TB. The implemented tbx11k database in this package is based on
-the simplified version, which is just a more compact version of the original.
-In addition to the splits presented in the following table, 10 folds
-(for cross-validation) randomly generated are available for these databases.
-
-.. list-table::
-
-   * - Database
-     - Reference
-     - H x W
-     - Samples
-     - Training
-     - Validation
-     - Test
-   * - TBX11K_
-     - [TBX11K-2020]_
-     - 512 x 512
-     - 11'200
-     - 6600
-     - 1800
-     - 2800
-   * - TBX11K_SIMPLIFIED_
-     - [TBX11K-SIMPLIFIED-2020]_
-     - 512 x 512
-     - 11'200
-     - 6600
-     - 1800
-     - 2800
-
-
-.. _mednet.setup.databases.tbmultilabel+signs:
-
-Tuberculosis + radiological findings databases
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following databases contain both the tuberculosis final diagnosis (0 or 1)
-and radiological findings.
-
-.. list-table::
-
-   * - Database
-     - Reference
-     - H x W
-     - Samples
-     - Train
-     - Test
-   * - PadChest_
-     - [PADCHEST-2019]_
-     - Varying
-     - 160'861
-     - 160'861
-     - 0
-
-
-.. _mednet.setup.databases.signs:
-
-Radiological findings databases
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following database contains only the radiological findings without any
-information about tuberculosis.
-
-.. note::
-
-   NIH CXR14 labels for training and validation sets are the relabeled
-   versions done by the author of the CheXNeXt study [CHEXNEXT-2018]_.
-
-.. list-table::
-
-   * - Database
-     - Reference
-     - H x W
-     - Samples
-     - Training
-     - Validation
-     - Test
-   * - NIH_CXR14_re_
-     - [NIH-CXR14-2017]_
-     - 1024 x 1024
-     - 109'041
-     - 98'637
-     - 6'350
-     - 4'054
-
-
-.. _mednet.setup.databases.hiv-tb:
-
-HIV-Tuberculosis databases
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following databases contain only the tuberculosis final diagnosis (0 or 1)
-and come from HIV infected patients. 10 folds (for cross-validation) randomly
-generated are available for these databases.
-
-Please contact the authors of these databases to have access to the data.
-
-.. list-table::
-
-   * - Database
-     - Reference
-     - H x W
-     - Samples
-   * - TB POC
-     - [TB-POC-2018]_
-     - 2048 x 2500
-     - 407
-   * - HIV TB
-     - [HIV-TB-2019]_
-     - 2048 x 2500
-     - 243
-
-
-.. _mednet.setup.databases.retinography:
-
-Retinography
-------------
-
-
-.. list-table:: Supported Retinography Datasets (``*``: provided within this package)
-
-   * - Dataset
-     - Reference
-     - H x W
-     - Samples
-     - Mask
-     - Vessel
-     - OD
-     - Cup
-     - Split Reference
-     - Train
-     - Test
-   * - DRIVE_
-     - [DRIVE-2004]_
-     - 584 x 565
-     - 40
-     - ``x``
-     - ``x``
-     -
-     -
-     - [DRIVE-2004]_
-     - 20
-     - 20
-   * - STARE_
-     - [STARE-2000]_
-     - 605 x 700
-     - 20
-     - ``*``
-     - ``x``
-     -
-     -
-     - [MANINIS-2016]_
-     - 10
-     - 10
-   * - CHASE-DB1_
-     - [CHASEDB1-2012]_
-     - 960 x 999
-     - 28
-     - ``*``
-     - ``x``
-     -
-     -
-     - [CHASEDB1-2012]_
-     - 8
-     - 20
-   * - HRF_
-     - [HRF-2013]_
-     - 2336 x 3504
-     - 45
-     - ``x``
-     - ``x``
-     -
-     -
-     - [ORLANDO-2017]_
-     - 15
-     - 30
-   * - IOSTAR_
-     - [IOSTAR-2016]_
-     - 1024 x 1024
-     - 30
-     - ``x``
-     - ``x``
-     - ``x``
-     -
-     - [MEYER-2017]_
-     - 20
-     - 10
-   * - DRIONS-DB_
-     - [DRIONSDB-2008]_
-     - 400 x 600
-     - 110
-     -
-     -
-     - ``x``
-     -
-     - [MANINIS-2016]_
-     - 60
-     - 50
-   * - `RIM-ONE r3`_
-     - [RIMONER3-2015]_
-     - 1424 x 1072
-     - 159
-     -
-     -
-     - ``x``
-     - ``x``
-     - [MANINIS-2016]_
-     - 99
-     - 60
-   * - Drishti-GS1_
-     - [DRISHTIGS1-2014]_
-     - varying
-     - 101
-     -
-     -
-     - ``x``
-     - ``x``
-     - [DRISHTIGS1-2014]_
-     - 50
-     - 51
-   * - REFUGE_
-     - [REFUGE-2018]_
-     - 2056 x 2124 (1634 x 1634)
-     - 1200
-     -
-     -
-     - ``x``
-     - ``x``
-     - [REFUGE-2018]_
-     - 400 (+400)
-     - 400
-   * - DRHAGIS_
-     - [DRHAGIS-2017]_
-     - Varying
-     - 39
-     - ``x``
-     - ``x``
-     -
-     -
-     - [DRHAGIS-2017]_
-     - 19
-     - 20
-
-.. warning:: **REFUGE Dataset Support**
-
-  The original directory ``Training400/AMD`` in REFUGE is considered to be
-  replaced by an updated version provided by the `AMD Grand-Challenge`_ (with
-  matching names).
-
-  The changes concerns images ``A0012.jpg``, which was corrupted in REFUGE, and
-  ``A0013.jpg``, which only exists in the AMD Grand-Challenge version.
-
-
-.. _mednet.setup.databases.xray:
-
-X-Ray
------
-
-.. list-table:: Supported X-Ray Datasets
-
-   * - Dataset
-     - Reference
-     - H x W
-     - Radiography Type
-     - Samples
-     - Mask
-     - Split Reference
-     - Train
-     - Test
-   * - `Montgomery County`_
-     - [MC-2014]_
-     - 4020 x 4892, or 4892 x 4020
-     - Digital Radiography (DR)
-     - 138
-     - ``*``
-     - [GAAL-2020]_
-     - 96 (+14)
-     - 28
-   * - JSRT_
-     - [JSRT-2000]_
-     - 2048 x 2048
-     - Digitized Radiography (laser digitizer)
-     - 247
-     - ``*``
-     - [GAAL-2020]_
-     - 172 (+25)
-     - 50
-   * - Shenzhen_
-     - [SHENZHEN-2014]_
-     - Varying
-     - Computed Radiography (CR)
-     - 662
-     - ``*``
-     - [GAAL-2020]_
-     - 396 (+56)
-     - 114
-   * - CXR8_
-     - [CXR8-2017]_
-     - 1024 x 1024
-     - Digital Radiography
-     - 112120
-     - ``x``
-     - [GAAL-2020]_
-     - 78484 (+11212)
-     - 22424
-
-.. warning:: **SHENZHEN/JSRT/CXR8 Dataset Support**
-
-  For some datasets (in which the annotations/masks are downloaded separately
-  from the dataset with the original images), both the original images and
-  annotations must be downloaded and placed inside the same directory, to match
-  the dataset reference dictionary's path.
-
-  * The Shenzhen_ root directory should then contain at least these two
-    subdirectories:
-
-    - ``CXR_png/`` (directory containing the CXR images)
-    - ``mask/`` (contains masks downloaded from `Shenzhen Annotations`_)
-
-  * The CXR8_ root directory:
-
-    - ``images/`` (directory containing the CXR images)
-    - ``segmentations/`` (contains masks downloaded from `CXR8 Annotations`_)
-
-  * The JSRT_ root directory:
-
-    - ``All247images/`` (directory containing the CXR images, in raw format)
-    - ``scratch/`` (contains masks downloaded from `JSRT Annotations`_)
-
-
 .. include:: links.rst
diff --git a/doc/links.rst b/doc/links.rst
index 4e458599..0899e0da 100644
--- a/doc/links.rst
+++ b/doc/links.rst
@@ -27,7 +27,6 @@
 .. _NIH_CXR14_re: https://nihcc.app.box.com/v/ChestXray-NIHCC
 .. _PadChest: https://bimcv.cipf.es/bimcv-projects/padchest/
 .. _TBX11K: https://mmcheng.net/tb/
-.. _TBX11K_simplified: https://www.kaggle.com/datasets/vbookshelf/tbx11k-simplified
 
 .. _drive: https://github.com/wfdubowen/Retina-Unet/tree/master/DRIVE/
 .. _stare: http://cecas.clemson.edu/~ahoover/stare/
@@ -47,8 +46,8 @@
 
 .. Annotation data websites
 .. _shenzhen annotations: https://www.kaggle.com/yoctoman/shcxr-lung-mask
-.. _cxr8 annotations: https://github.com/lucasmansilla/NIH_chest_xray14_segmentations
-.. _jsrt annotations: https://www.isi.uu.nl/Research/Databases/SCR/download.php
+.. _cxr8-annotations: https://github.com/lucasmansilla/NIH_chest_xray14_segmentations
+.. _jsrt-annotations: https://www.isi.uu.nl/Research/Databases/SCR/download.php
 
 .. models
 .. _imagenet: https://www.image-net.org
diff --git a/doc/models/classify.rst b/doc/models.rst
similarity index 50%
rename from doc/models/classify.rst
rename to doc/models.rst
index a06c4c76..c075e115 100644
--- a/doc/models/classify.rst
+++ b/doc/models.rst
@@ -2,11 +2,19 @@
 ..
 .. SPDX-License-Identifier: GPL-3.0-or-later
 
+.. _mednet.models:
+
+=====================
+ Model Architectures
+=====================
+
+Deep-neural network models are categorized by tasks.
+
+
 .. _mednet.models.classify:
 
-================
- Classification
-================
+Classification
+--------------
 
 Pre-configured models supporting classification tasks.
 
@@ -41,4 +49,42 @@ Pre-configured models supporting classification tasks.
      - :py:class:`.models.classify.pasa.Pasa`
 
 
-.. include:: ../links.rst
+.. _mednet.models.segment:
+
+Semantic Segmentation
+---------------------
+
+Pre-configured models supporting semantic segmentation tasks.
+
+.. list-table:: Pre-configured models
+
+   * - Config. key
+     - Module
+     - Base type
+   * - ``driu``
+     - :py:mod:`.config.segment.models.driu`
+     - :py:class:`.models.segment.driu.DRIU`
+   * - ``driu-bn``
+     - :py:mod:`.config.segment.models.driu_bn`
+     - :py:class:`.models.segment.driu_bn.DRIUBN`
+   * - ``driu-od``
+     - :py:mod:`.config.segment.models.driu_od`
+     - :py:class:`.models.segment.driu_od.DRIUOD`
+   * - ``driu-pix``
+     - :py:mod:`.config.segment.models.driu_pix`
+     - :py:class:`.models.segment.driu_pix.DRIUPix`
+   * - ``hed``
+     - :py:mod:`.config.segment.models.hed`
+     - :py:class:`.models.segment.hed.HED`
+   * - ``lwnet``
+     - :py:mod:`.config.segment.models.lwnet`
+     - :py:class:`.models.segment.lwnet.LittleWNet`
+   * - ``m2unet``
+     - :py:mod:`.config.segment.models.m2unet`
+     - :py:class:`.models.segment.m2unet.M2Unet`
+   * - ``unet``
+     - :py:mod:`.config.segment.models.unet`
+     - :py:class:`.models.segment.unet.Unet`
+
+
+.. include:: links.rst
diff --git a/doc/models/index.rst b/doc/models/index.rst
deleted file mode 100644
index 86ab1778..00000000
--- a/doc/models/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. SPDX-FileCopyrightText: Copyright © 2024 Idiap Research Institute <contact@idiap.ch>
-..
-.. SPDX-License-Identifier: GPL-3.0-or-later
-
-.. _mednet.models:
-
-========
- Models
-========
-
-Deep-neural network models are categorized by tasks.
-
-.. toctree::
-   :maxdepth: 2
-
-   classify
-   segment
diff --git a/doc/models/segment.rst b/doc/models/segment.rst
deleted file mode 100644
index 55f7b3a4..00000000
--- a/doc/models/segment.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-.. SPDX-FileCopyrightText: Copyright © 2024 Idiap Research Institute <contact@idiap.ch>
-..
-.. SPDX-License-Identifier: GPL-3.0-or-later
-
-.. _mednet.models.segment:
-
-=======================
- Semantic Segmentation
-=======================
-
-Pre-configured models supporting semantic segmentation tasks.
-
-.. list-table:: Pre-configured models
-
-   * - Config. key
-     - Module
-     - Base type
-   * - ``driu``
-     - :py:mod:`.config.segment.models.driu`
-     - :py:class:`.models.segment.driu.DRIU`
-   * - ``driu-bn``
-     - :py:mod:`.config.segment.models.driu_bn`
-     - :py:class:`.models.segment.driu_bn.DRIUBN`
-   * - ``driu-od``
-     - :py:mod:`.config.segment.models.driu_od`
-     - :py:class:`.models.segment.driu_od.DRIUOD`
-   * - ``driu-pix``
-     - :py:mod:`.config.segment.models.driu_pix`
-     - :py:class:`.models.segment.driu_pix.DRIUPix`
-   * - ``hed``
-     - :py:mod:`.config.segment.models.hed`
-     - :py:class:`.models.segment.hed.HED`
-   * - ``lwnet``
-     - :py:mod:`.config.segment.models.lwnet`
-     - :py:class:`.models.segment.lwnet.LittleWNet`
-   * - ``m2unet``
-     - :py:mod:`.config.segment.models.m2unet`
-     - :py:class:`.models.segment.m2unet.M2Unet`
-   * - ``unet``
-     - :py:mod:`.config.segment.models.unet`
-     - :py:class:`.models.segment.unet.Unet`
-
-
-.. include:: ../links.rst
diff --git a/doc/references.rst b/doc/references.rst
index eeade046..60b7812a 100644
--- a/doc/references.rst
+++ b/doc/references.rst
@@ -61,11 +61,6 @@
    In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern
    Recognition, pages 2646–2655.
 
-.. [TBX11K-SIMPLIFIED-2020] *Liu, Y., Wu, Y.-H., Ban, Y., Wang, H., and Cheng, M.-*,
-   **Rethinking computer-aided tuberculosis diagnosis**,
-   In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern
-   Recognition, pages 2646–2655.
-
 .. [SCORECAM-2020] *H. Wang et al.*, **Score-CAM: Score-Weighted Visual
    Explanations for Convolutional Neural Networks** 2020 IEEE/CVF Conference on
    Computer Vision and Pattern Recognition Workshops (CVPRW), Seattle, WA, USA,
@@ -164,11 +159,6 @@
    Computer Graphics, Visualization and Computer Vision, 2015.
    https://dspace5.zcu.cz/bitstream/11025/29670/1/Fumero.pdf
 
-.. [SHENZHEN-2014] *S. Jaeger, S. Candemir, S. Antani, Y. X. Wáng, P. X. Lu, G.
-   Thoma*, **Two public chest X-ray datasets for computer-aided screening of
-   pulmonary diseases.**, Quantitative imaging in medicine and surgery. 2014.
-   https://doi:10.3978/j.issn.2223-4292.2014.11.20
-
 .. [STARE-2000] *A. D. Hoover, V. Kouznetsova and M. Goldbaum*, **Locating blood
    vessels in retinal images by piecewise threshold probing of a matched filter
    response**, in IEEE Transactions on Medical Imaging, vol. 19, no. 3, pp.
@@ -187,11 +177,6 @@
    Surface Vessels**, SPIE Journal of Medical Imaging, 2017.
    https://doi.org/10.1117/1.jmi.4.1.014503
 
-.. [MC-2014] *S. Jaeger, S. Candemir, S. Antani, Y. X. Wáng, P. X. Lu, G.
-   Thoma*, **Two public chest X-ray datasets for computer-aided screening of
-   pulmonary diseases.**, Quantitative imaging in medicine and surgery. 2014.
-   https://doi.org/10.3978/j.issn.2223-4292.2014.11.20
-
 .. [VISCERAL-2016] *O. Jimenez-del-Toro et al.*, **Cloud-Based Evaluation of
    Anatomical Structure Segmentation and Landmark Detection Algorithms:
    VISCERAL Anatomy Benchmarks**, IEEE Transactions on Medical Imaging, vol.
diff --git a/src/mednet/data/classify/hivtb.py b/src/mednet/data/classify/hivtb.py
index 79d41475..1c3f4b9c 100644
--- a/src/mednet/data/classify/hivtb.py
+++ b/src/mednet/data/classify/hivtb.py
@@ -3,30 +3,39 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 """HIV-TB dataset for computer-aided diagnosis (only BMP files).
 
+This database contains only the tuberculosis final diagnosis (0 or 1) and
+comes from HIV-infected patients.
+
 * Database reference: [HIV-TB-2019]_
-* Original resolution, varying with most images being 2048 x 2500 pixels
-  or 2500 x 2048 pixels, but not all.
+
+.. important:: **Raw data organization**
+
+    The HIV-TB base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least the directory
+    ``HIV-TB/HIV-TB_Algorithm_study_X-rays`` with all BMP and JPEG images.
 
 Data specifications:
 
 * Raw data input (on disk):
 
   * BMP (BMP3) and JPEG grayscale images encoded as 8-bit RGB, with
-    varying resolution
+    varying resolution (most images being 2048 x 2500 pixels or 2500 x 2048
+    pixels, but not all).
+  * Total samples: 243
 
 * Output image:
 
   * Transforms:
 
-    * Load raw BMP or JPEG with :py:mod:`PIL`
+    * Load raw BMP or JPEG with :py:mod:`PIL`, with auto-conversion to
+      grayscale
     * Remove black borders
     * Convert to torch tensor
-    * Torch center cropping to get square image
 
 * Final specifications
 
-  * Grayscale, encoded as a single plane tensor, 32-bit floats,
-    square at 2048 x 2048 pixels
+  * Grayscale, encoded as a single plane tensor, 32-bit floats, with varying
+    resolution depending on input.
   * Labels: 0 (healthy), 1 (active tuberculosis)
 
 This module contains the base declaration of common data modules and raw-data
diff --git a/src/mednet/data/classify/indian.py b/src/mednet/data/classify/indian.py
index 74d57ab7..88049253 100644
--- a/src/mednet/data/classify/indian.py
+++ b/src/mednet/data/classify/indian.py
@@ -9,31 +9,28 @@ pulmonary tuberculosis (TB).  This database is also known as the "Database
 A/Database B" database.
 
 * Database reference: [INDIAN-2013]_
-* Original images PNG, 8-bit grayscale, 1024 x 1024 pixels
-* Split reference: [INDIAN-2013]_ with 20% of train set for the validation
+* Split references: [INDIAN-2013]_ with 20% of train set for the validation
   set
 
-Data specifications:
-
-* Raw data input (on disk):
+.. important:: **Raw data organization**
 
-  * PNG RGB 8-bit depth images with "inverted" grayscale scale
-  * Variable width and height
+    The Indian_ base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least these two
+    subdirectories:
 
-* Output image:
+    - ``DatasetA/`` (directory containing the dataset A images in JPG format)
+    - ``DatasetB/`` (directory containing the dataset B images in DICOM format)
 
-  * Transforms:
+Data specifications:
 
-    * Load raw PNG with :py:mod:`PIL`
-    * Remove black borders
-    * Convert to torch tensor
-    * Torch center cropping to get square image
+* Raw data input (on disk):
 
-  * Final specifications:
+  * JPG RGB 8-bit depth images with "inverted" grayscale scale, with varying
+    resolution of at least 1024 x 1024 pixels per sample
+  * Samples: 156 images and associated labels
 
-    * Grayscale, encoded as a single plane tensor, 32-bit floats,
-      square, with varying resolutions, depending on the input raw image
-    * Labels: 0 (healthy), 1 (active tuberculosis)
+* Output image:  Use the same transforms and specifications as for
+  :py:mod:`.classify.shenzhen`
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/classify/montgomery.py b/src/mednet/data/classify/montgomery.py
index 04c8362d..a7bd1c1a 100644
--- a/src/mednet/data/classify/montgomery.py
+++ b/src/mednet/data/classify/montgomery.py
@@ -7,16 +7,16 @@ The standard digital image database for Tuberculosis was created by the
 National Library of Medicine, Maryland, USA in collaboration with Shenzhen No.3
 People’s Hospital, Guangdong Medical College, Shenzhen, China.
 
-* Database reference: [MONTGOMERY-SHENZHEN-2014]_
-* Original resolution (height x width or width x height): 4020x4892 px or
-  4892x4020 px
+* Database reference: [MONTGOMERY-SHENZHEN-2014]_
 
 Data specifications:
 
 * Raw data input (on disk):
 
-  * PNG images 8 bit grayscale
-  * resolution: fixed to one of the cases above
+  * PNG images 8 bit grayscale issued from digital radiography machines
+  * Original resolution (height x width or width x height): 4020x4892 px or
+    4892x4020 px
+  * Samples: 138 images and associated labels
 
 * Output image:
 
@@ -29,7 +29,7 @@ Data specifications:
   * Final specifications
 
     * Grayscale, encoded as a single plane tensor, 32-bit floats,
-      square at 4020 x 4020 pixels
+      square, at most 4020 x 4020 pixels
     * Labels: 0 (healthy), 1 (active tuberculosis)
 
 This module contains the base declaration of common data modules and raw-data
diff --git a/src/mednet/data/classify/nih_cxr14.py b/src/mednet/data/classify/nih_cxr14.py
index df91662f..5a1981a8 100644
--- a/src/mednet/data/classify/nih_cxr14.py
+++ b/src/mednet/data/classify/nih_cxr14.py
@@ -10,19 +10,35 @@ cardiomegaly, emphysema, effusion, hernia, infiltration, mass, nodule,
 atelectasis, pneumothorax, pleural thickening, pneumonia, fibrosis, edema and
 consolidation. This is the relabeled version created in the CheXNeXt study.
 
-* Reference: [NIH-CXR14-2017]_
+* Database references:
+
+  * Original data: [NIH-CXR14-2017]_
+  * Labels and split references: [CHEXNEXT-2018]_
+
+.. important:: **Raw data organization**
+
+    The NIH_CXR14_re_ base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least the directory
+    ``images/`` with all the images of the database.
+
+    The labels from [CHEXNEXT-2018]_ are already incorporated in this library
+    and do **not** need to be re-downloaded.
+
+    The flag ``idiap_folder_structure`` makes the loader search for files
+    named, e.g. ``images/00030621_006.png``, as
+    ``images/00030/00030621_006.png``.
+
 * Raw data input (on disk):
 
   * PNG RGB 8-bit depth images
   * Resolution: 1024 x 1024 pixels
+  * Total samples available: 109'041
 
-* Labels: [CHEXNEXT-2018]_
-* Split reference: [CHEXNEXT-2018]_
 * Output image:
 
   * Transforms:
 
-    * Load raw PNG with :py:mod:`PIL`
+    * Load raw PNG with :py:mod:`PIL`, with auto-conversion to grayscale
     * Convert to torch tensor
 
   * Final specifications:
diff --git a/src/mednet/data/classify/shenzhen.py b/src/mednet/data/classify/shenzhen.py
index 5a13789d..239be760 100644
--- a/src/mednet/data/classify/shenzhen.py
+++ b/src/mednet/data/classify/shenzhen.py
@@ -11,21 +11,31 @@ using Philips DR Digital Diagnose systems.
 
 * Database reference: [MONTGOMERY-SHENZHEN-2014]_
 
+.. important:: **Raw data organization**
+
+    The Shenzhen_ base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least this subdirectory:
+
+    - ``CXR_png/`` (directory containing the CXR images)
+
 Data specifications:
 
 * Raw data input (on disk):
 
-  * PNG 8-bit RGB images (grayscale, but encoded as RGB images with
-    "inverted" grayscale scale requiring special treatment).
-  * Variable width and height of 3000 x 3000 pixels or less
+  * PNG 8-bit RGB images issued from digital radiography machines (grayscale,
+    but encoded as RGB images with "inverted" grayscale scale requiring special
+    treatment).
+  * Original resolution: variable width and height of 3000 x 3000 pixels or
+    less
+  * Samples: 662 images and associated labels
 
 * Output image:
 
   * Transforms:
 
-    * Load raw PNG with :py:mod:`PIL`
-    * Remove black borders
-    * Torch center cropping to get square image
+    * Load raw data with :py:mod:`PIL` with auto-conversion to grayscale
+    * Remove (completely) black borders
+    * Convert to torch tensor
 
   * Final specifications:
 
diff --git a/src/mednet/data/classify/tbpoc.py b/src/mednet/data/classify/tbpoc.py
index bb25f2ac..5f4c5d4b 100644
--- a/src/mednet/data/classify/tbpoc.py
+++ b/src/mednet/data/classify/tbpoc.py
@@ -3,16 +3,25 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 """TB-POC dataset for computer-aided diagnosis.
 
+This database contains only the tuberculosis final diagnosis (0 or 1) and
+comes from HIV-infected patients.
+
 * Database reference: [TB-POC-2018]_
-* Original resolution (height x width or width x height): 2048 x 2500 pixels
-  or 2500 x 2048 pixels
+
+.. important:: **Raw data organization**
+
+    The TB-POC base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least the directory
+    ``TBPOC_CXR`` with all JPEG images.
 
 Data specifications:
 
 * Raw data input (on disk):
 
   * JPEG 8-bit Grayscale images
-  * resolution: fixed to one of the cases above
+  * Original resolution (height x width or width x height): 2048 x 2500 pixels
+    or 2500 x 2048 pixels
+  * Total samples: 407
 
 * Output image:
 
diff --git a/src/mednet/data/classify/tbx11k.py b/src/mednet/data/classify/tbx11k.py
index 2752a53c..58516d73 100644
--- a/src/mednet/data/classify/tbx11k.py
+++ b/src/mednet/data/classify/tbx11k.py
@@ -81,6 +81,15 @@
     The selection of samples is stratified (see comments through our split
     code, which is shipped alongside this file.)
 
+.. important:: **Raw data organization**
+
+    The TBX11k_ base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least these two
+    subdirectories:
+
+    - ``imgs/`` (directory containing sub-directories and images in PNG format)
+    - ``annotations/`` (directory containing labels in JSON and XML format)
+
 Data specifications:
 
 * Raw data input (on disk): PNG images 8 bits RGB, 512 x 512 pixels
@@ -90,6 +99,7 @@ Data specifications:
   * Transforms:
 
     - Load raw PNG with :py:mod:`PIL`
+    - Convert to torch tensor
 
   * Final specifications:
 
diff --git a/src/mednet/data/segment/chasedb1.py b/src/mednet/data/segment/chasedb1.py
index da4c0fba..e55def08 100644
--- a/src/mednet/data/segment/chasedb1.py
+++ b/src/mednet/data/segment/chasedb1.py
@@ -17,17 +17,28 @@ blood vessels as compared with the background and wider arteriolars that have a
 bright strip running down the centre known as the central vessel reflex.
 
 * Reference: [CHASEDB1-2012]_
-* Original resolution (height x width): 960 x 999
-* Split reference: [CHASEDB1-2012]_
-* Protocol ``first-annotator``:
 
-  * Training samples: 8 (including labels from annotator "1stHO")
-  * Test samples: 20 (including labels from annotator "1stHO")
+Data specifications:
 
-* Protocol ``second-annotator``:
+* Raw data input (on disk):
 
-  * Training samples: 8 (including labels from annotator "2ndHO")
-  * Test samples: 20 (including labels from annotator "2ndHO")
+  * RGB images encoded in JPG format with resolution (HxW) = 960 x 999 pixels.
+  * Vessel annotations are encoded as PNG images with the same resolution as
+    input samples.
+  * Masks for the eye fundus are provided by this package.
+  * Total samples: 28
+
+* Output sample:
+
+    * Image: Load raw JPG images with :py:mod:`PIL`, with auto-conversion to RGB.
+    * Vessel annotations: Load annotations with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+    * Eye fundus mask: Load mask with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+
+Split ``first-annotator`` contains 8 training samples and 20 test samples
+annotated by expert 1.  Split ``second-annotator`` contains the same samples
+as in ``first-annotator``, but annotated by expert 2.
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/cxr8.py b/src/mednet/data/segment/cxr8.py
index 6adefe4d..570c2e59 100644
--- a/src/mednet/data/segment/cxr8.py
+++ b/src/mednet/data/segment/cxr8.py
@@ -3,18 +3,46 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 """ChestX-ray8: Hospital-scale Chest X-ray Database.
 
-The database contains a total of 112120 images. Image size for each X-ray is
-1024 x 1024. One set of mask annotations is available for all images.
-
-* Reference: [CXR8-2017]_
-* Original resolution (height x width): 1024 x 1024
-* Configuration resolution: 256 x 256 (after rescaling)
-* Split reference: [GAAL-2020]_
-* Protocol ``default``:
-
-  * Training samples: 78484 (including labels)
-  * Validation samples: 11212 (including labels)
-  * Test samples: 22424 (including labels)
+The database contains a total of 112'120 images. Image size for each X-ray is
+1024 x 1024. One set of automatically generated mask annotations is available
+for all images.
+
+* Database references:
+
+  * Original data: [CXR8-2017]_
+  * Split reference: [GAAL-2020]_
+
+.. important:: **Raw data organization**
+
+    The CXR8_ base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least the following
+    directories:
+
+    - ``images/`` (directory containing the CXR images, in PNG format)
+    - ``segmentations/`` (must contain masks downloaded from `CXR8-Annotations`_)
+
+    The flag ``idiap_folder_structure`` makes the loader search for files
+    named, e.g. ``images/00030621_006.png``, as
+    ``images/00030/00030621_006.png`` (this is valid for both images and
+    segmentation masks).
+
+* Raw data input (on disk):
+
+  * PNG RGB 8-bit depth images
+  * Resolution: 1024 x 1024 pixels
+  * Total samples available: 112'120
+
+* Output image:
+
+  * Transforms:
+
+    * Load raw PNG with :py:mod:`PIL`, with auto-conversion to RGB, convert to
+      tensor
+    * Labels for each of the lungs are read from the provided GIF files and
+      merged into a single output image.
+
+The ``default`` split contains 78'484 images for training, 11'212 images for
+validation, and 22'424 images for testing.
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/drive.py b/src/mednet/data/segment/drive.py
index 0b704d3a..995cf1ea 100644
--- a/src/mednet/data/segment/drive.py
+++ b/src/mednet/data/segment/drive.py
@@ -4,19 +4,29 @@
 """DRIVE dataset for vessel segmentation.
 
 The DRIVE database has been established to enable comparative studies on
-segmentation of blood vessels in retinal images.
+segmentation of blood vessels in retinal images.  The database contains
+annotations from 2 different experts (only for the test set).
 
-* Reference: [DRIVE-2004]_
-* Original resolution (height x width): 584 x 565
-* Split reference: [DRIVE-2004]_
-* Protocol ``default``:
+* Database reference: [DRIVE-2004]_
 
-  * Training samples: 20 (including labels and masks)
-  * Test samples: 20 (including labels from annotator 1 and masks)
+Data specifications:
 
-* Protocol ``second-annotator``:
+* Raw data input (on disk):
 
-  * Test samples: 20 (including labels from annotator 2 and masks)
+  * RGB images encoded in TIFF format with resolution (HxW) = 584 x 565 pixels
+  * Total samples: 40
+
+* Output sample:
+
+    * Image: Load raw TIFF images with :py:mod:`PIL`, with auto-conversion to RGB.
+    * Vessel annotations: Load annotations with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+    * Eye fundus mask: Load mask with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+
+Split ``default`` includes 20 images for training and another 20 for
+testing.  Split ``second-annotator`` includes only the 20 test images with
+different vessel annotations (expert 2).
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/hrf.py b/src/mednet/data/segment/hrf.py
index 80a7f2c0..720a35ae 100644
--- a/src/mednet/data/segment/hrf.py
+++ b/src/mednet/data/segment/hrf.py
@@ -8,14 +8,28 @@ glaucomatous eyes.  It contains a total  of 45 eye fundus images with a
 resolution of 3304 x 2336. One set of ground-truth vessel annotations is
 available.
 
-* Reference: [HRF-2013]_
-* Original resolution (height x width): 2336 x 3504
-* Configuration resolution: 1168 x 1648 (after specific cropping and rescaling)
-* Split reference: [ORLANDO-2017]_
-* Protocol ``default``:
-
-* Training samples: 15 (including labels)
-* Test samples: 30 (including labels)
+* Database references:
+
+  * Original data: [HRF-2013]_
+  * Split reference: [ORLANDO-2017]_
+
+Data specifications:
+
+* Raw data input (on disk):
+
+  * Original images encoded in (color) JPG format, with resolution 3504 x 2336
+    pixels (width x height).
+  * Vessel labels: encoded as TIFF files, with the same resolution as original
+    images.
+  * Total samples: 45
+
+* Output sample:
+
+  * Image: Load raw JPG images with :py:mod:`PIL`, with auto-conversion to RGB.
+  * Vessel annotations: Load annotations with :py:mod:`PIL`, with
+    auto-conversion to mode ``1`` with no dithering.
+
+The ``default`` split contains 15 images for training and 30 for testing.
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/jsrt.py b/src/mednet/data/segment/jsrt.py
index 5398b5d2..c0c96e83 100644
--- a/src/mednet/data/segment/jsrt.py
+++ b/src/mednet/data/segment/jsrt.py
@@ -4,18 +4,48 @@
 """Japanese Society of Radiological Technology dataset for lung segmentation.
 
 The database includes 154 nodule and 93 non-nodule images.  It contains a total
-of 247 resolution of 2048 x 2048.  One set of ground-truth lung annotations is
+of 247 images with a resolution of 2048 x 2048 pixels, issued from original digitized
+Radiographies (laser scanner). One set of ground-truth lung annotations is
 available.
 
-* Reference: [JSRT-2000]_
-* Original resolution (height x width): 2048 x 2048
-* Configuration resolution: 1024 x 1024 (after rescaling)
-* Split reference: [GAAL-2020]_
-* Protocol ``default``:
+* Database references:
 
-* Training samples: 172 (including labels)
-* Validation samples: 25 (including labels)
-* Test samples: 50 (including labels)
+  * Original data: [JSRT-2000]_
+  * Split: [GAAL-2020]_
+
+.. important:: **Raw data organization**
+
+   The JSRT_ base datadir, which you should configure following the
+   :ref:`mednet.setup` instructions, must contain at least the following
+   directories:
+
+   - ``All247images/`` (directory containing the CXR images, in raw format)
+   - ``scratch/`` (must contain masks downloaded from `JSRT-Annotations`_)
+
+Data specifications:
+
+* Raw data input (on disk):
+
+  * Original images encoded in proprietary 12-bit RAW format.  A PNG-converted
+    set of images is provided at JSRT-Kaggle_ for your reference.  Input
+    resolution is 2048 x 2048 pixels.
+  * Masks: encoded as GIF files with separate portions for left and right
+    lungs, with a resolution of 1024 x 1024 pixels
+  * Total samples: 247
+
+* Output sample:
+
+    * Image: Load raw image from folder ``All247images/`` using
+      :py:func:`numpy.fromfile`, then applies a simple histogram equalization
+      to the 8-bit representation of the image, to obtain something along the
+      lines of the PNG (unofficial) version distributed at JSRT-Kaggle_.
+      Output images have a size of 1024 x 1024 pixels, achieved by resizing the
+      original input with bilinear interpolation.
+    * Labels for each of the lungs are read from the provided GIF files and
+      merged into a single output image.
+
+The ``default`` split contains 172 samples for training, 25 for validation and
+50 for test.
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
@@ -97,7 +127,9 @@ class RawDataLoader(SegmentationRawDataLoader):
             The sample representation.
         """
 
-        image = to_tensor(self.load_pil_raw_12bit_jsrt(self.datadir / sample[0]))
+        image = self.load_pil_raw_12bit_jsrt(self.datadir / sample[0])
+        assert image.size == (2048, 2048)
+        image = to_tensor(image.resize((1024, 1024), PIL.Image.Resampling.BILINEAR))
 
         # Combine left and right lung masks into a single tensor
         assert sample[2] is not None
diff --git a/src/mednet/data/segment/montgomery.py b/src/mednet/data/segment/montgomery.py
index 69a21fcb..c14f14d5 100644
--- a/src/mednet/data/segment/montgomery.py
+++ b/src/mednet/data/segment/montgomery.py
@@ -7,7 +7,7 @@ The standard digital image database for Tuberculosis was created by the National
 Library of Medicine, Maryland, USA in collaboration with Shenzhen No.3 People’s
 Hospital, Guangdong Medical College, Shenzhen, China. The Chest X-rays are from
 
-* Database reference: [MONTGOMERY-SHENZHEN-2014]_
+* Database reference: [MONTGOMERY-SHENZHEN-2014]_, [GAAL-2020]_
 * Original resolution (height x width or width x height): 4020x4892 px or
   4892x4020 px
 
@@ -15,8 +15,10 @@ Data specifications:
 
 * Raw data input (on disk):
 
-  * PNG images 8 bit grayscale
-  * resolution: fixed to one of the cases above
+  * PNG images 8 bit grayscale issued from digital radiography machines
+  * Original resolution (height x width or width x height): 4020x4892 px or
+    4892x4020 px
+  * Samples: 138 images and associated labels
 
 * Output image:
 
@@ -29,9 +31,9 @@ Data specifications:
 
     * image: Grayscale, encoded as a single plane tensor, 32-bit floats,
       original size.
-    * target: A mask containing ones where lungs are in the original image,
-      otherwise, zeroes.
-    * mask: All ones (no specific mask)
+    * target: A binary mask containing ones where lungs are in the original
+      image, otherwise, zeroes.
+    * mask: Binary, with all ones (no specific mask)
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/refuge.py b/src/mednet/data/segment/refuge.py
index 6b985189..1aa6d028 100644
--- a/src/mednet/data/segment/refuge.py
+++ b/src/mednet/data/segment/refuge.py
@@ -8,29 +8,40 @@ challenge. The goal of the challenge is to evaluate and compare automated
 algorithms for glaucoma detection and optic disc/cup segmentation on a common
 dataset of retinal fundus images.
 
-* Reference (including train/dev/test split): [REFUGE-2018]_
-* Protocols ``optic-disc`` and ``cup``:
+* Database reference (including train/dev/test split): [REFUGE-2018]_
 
-* Training samples:
+.. warning::
 
-  * 400
-  * includes optic-disc and cup labels
-  * includes label: glaucomatous and non-glaucomatous
-  * original resolution: 2056 x 2124
+   The original directory ``Training400/AMD`` in REFUGE is considered to be
+   replaced by an updated version provided by the `AMD Grand-Challenge`_ (with
+   matching names).
 
-* Validation samples:
+   The changes concern images ``A0012.jpg``, which was corrupted in REFUGE,
+   and ``A0013.jpg``, which only exists in the AMD Grand-Challenge version.
 
-  * 400
-  * includes optic-disc and cup labels
-  * includes label: glaucomatous and non-glaucomatous
-  * original resolution: 1634 x 1634
+Data specifications:
 
-* Test samples:
+* Raw data input (on disk):
 
-  * 400
-  * includes optic-disc and cup labels
-  * includes label: glaucomatous and non-glaucomatous
-  * original resolution:
+  * RGB images encoded in JPG format with varying resolution.  Training images
+    are (HxW) 2056 x 2124 pixels; Validation (and test) images are 1634 x 1634
+    pixels.
+  * Optic-disc/cup annotations are encoded as BMP images with the same resolution
+    as input samples.
+  * Masks for the eye fundus are provided by this package.
+  * Total samples: 1200 distributed as 400 (training), 400 (validation) and 400
+    (test).
+
+* Output sample:
+
+    * Image: Load raw JPG images with :py:mod:`PIL`, with auto-conversion to RGB.
+    * Optic-disc/cup annotations: Load annotations with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+    * Eye fundus mask: Load mask with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+
+Splits ``optic-disc`` and ``cup`` contain annotations for optic-disc or cup
+segmentation.
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/shenzhen.py b/src/mednet/data/segment/shenzhen.py
index 188d5778..c90cc7f1 100644
--- a/src/mednet/data/segment/shenzhen.py
+++ b/src/mednet/data/segment/shenzhen.py
@@ -1,22 +1,57 @@
 # SPDX-FileCopyrightText: Copyright © 2024 Idiap Research Institute <contact@idiap.ch>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
-"""Shenzhen No.3 People’s Hospital dataset for lung segmentation.
+"""Shenzhen DataModule for computer-aided semantic segmentation of lungs.
+
+The standard digital image database for Tuberculosis was created by the
+National Library of Medicine, Maryland, USA in collaboration with Shenzhen No.3
+People’s Hospital, Guangdong Medical College, Shenzhen, China. The Chest X-rays
+are from out-patient clinics, and were captured as part of the daily routine
+using Philips DR Digital Diagnose systems.
 
 The database includes 336 cases with manifestation of tuberculosis, and 326
-normal cases.  It contains a total  of 662 images. Image size varies for each
+normal cases.  It contains a total  of 662 images.  Image size varies for each
 X-ray. It is approximately 3K x 3K. One set of ground-truth lung annotations is
 available for 566 of the 662 images.
 
-* Reference: [SHENZHEN-2014]_
-* Original resolution (height x width): Approximately 3K x 3K (varies)
-* Configuration resolution: 512 x 512 (after rescaling)
-* Split reference: [GAAL-2020]_
-* Protocol ``default``:
+* Database references:
+
+  * Original data: [MONTGOMERY-SHENZHEN-2014]_
+  * Splits: [GAAL-2020]_
+
+.. important:: **Raw data organization**
+
+    The Shenzhen_ base datadir, which you should configure following the
+    :ref:`mednet.setup` instructions, must contain at least these two
+    subdirectories:
+
+    - ``CXR_png/`` (directory containing the CXR images)
+    - ``mask/`` (contains masks downloaded from `Shenzhen Annotations`_)
+
+Data specifications:
+
+* Raw data input (on disk):
+
+  * PNG 8-bit RGB images issued from digital radiography machines (grayscale,
+    but encoded as RGB images with "inverted" grayscale scale requiring special
+    treatment).
+  * Original resolution: variable width and height of 3000 x 3000 pixels or
+    less
+  * Samples: 566 images and associated labels
+
+* Output image:
+
+  * Transforms:
+
+    * Load raw PNG with :py:mod:`PIL`
+    * Torch center cropping to get square image
+
+  * Final specifications:
 
-* Training samples: 396 (including labels)
-* Validation samples: 56 (including labels)
-* Test samples: 114 (including labels)
+    * Grayscale, encoded as a 3-plane tensor, 32-bit floats,
+      square with varying resolutions, depending on the input image
+    * Labels: Binary mask with annotated lungs (1 where lungs are; 0 otherwise)
+    * Mask: Binary mask with all ones
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/data/segment/stare.py b/src/mednet/data/segment/stare.py
index 1a89c1e7..8be5936a 100644
--- a/src/mednet/data/segment/stare.py
+++ b/src/mednet/data/segment/stare.py
@@ -9,18 +9,30 @@ vessel annotations are available. The first set by Adam Hoover ("ah") is
 commonly used for training and testing. The second set by Valentina Kouznetsova
 ("vk") is typically used as a “human” baseline.
 
-* Reference: [STARE-2000]_
-* Original resolution (width x height): 700 x 605
-* Split reference: [MANINIS-2016]_
-* Protocol ``ah`` (default baseline):
+* Database references:
 
-  * Training samples: 10 (including labels from annotator "ah")
-  * Test samples: 10 (including labels from annotator "ah")
+  * Original data: [STARE-2000]_
+  * Split reference: [MANINIS-2016]_
 
-* Protocol ``vk`` (normally used as human comparison):
+Data specifications:
 
-  * Training samples: 10 (including labels from annotator "vk")
-  * Test samples: 10 (including labels from annotator "vk")
+* Raw data input (on disk):
+
+  * RGB images encoded in PPM format with resolution (HxW) = 605 x 700
+  * Total samples: 397 (out of which only 20 are annotated for vessel
+    segmentation)
+
+* Output sample:
+
+    * Image: Load raw PPM images with :py:mod:`PIL`, with auto-conversion to RGB.
+    * Vessel annotations: Load annotations with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+    * Eye fundus mask: Load mask with :py:mod:`PIL`, with
+      auto-conversion to mode ``1`` with no dithering.
+
+Protocol ``ah`` (default baseline, with first, more detailed annotator)
+includes 10 training samples and 10 test samples.  Protocol ``vk`` (second
+annotator) includes the same samples but annotated by a second expert.
 
 This module contains the base declaration of common data modules and raw-data
 loaders for this database. All configured splits inherit from this definition.
diff --git a/src/mednet/scripts/database.py b/src/mednet/scripts/database.py
index a987e30e..50a89a6b 100644
--- a/src/mednet/scripts/database.py
+++ b/src/mednet/scripts/database.py
@@ -21,7 +21,7 @@ def _get_raw_databases() -> dict[str, dict[str, str | list]]:
         * ``module``: the full Pythonic module name (e.g.
           ``mednet.data.classify.montgomery``).
         * ``datadir``: points to the user-configured data directory for the
-          current dataset, if set, or ``None`` otherwise.
+          current database, if set, or ``None`` otherwise.
     """
 
     import importlib
@@ -52,7 +52,7 @@ def _get_raw_databases() -> dict[str, dict[str, str | list]]:
 
 
 def _list_raw_databases():
-    """List raw datasets to a string representation."""
+    """List raw databases to a string representation."""
 
     def _echo(left: str, right: str, color: str = "white") -> None:
         s = [
@@ -117,7 +117,7 @@ def list_():
 @database.command(
     epilog="""Examples:
 
-    1. Check if all files from the split 'montgomery-f0' of the Montgomery
+    1. Check if all files from the config split 'montgomery-f0' of the Montgomery
        database can be loaded:
 
        .. code:: sh
@@ -133,25 +133,27 @@ def list_():
 @click.option(
     "--limit",
     "-l",
-    help="Limit check to the first N samples in each split dataset, making the "
-    "check sensibly faster.  Set it to zero (default) to check everything.",
+    help="Limit check to the first N samples in each split in the "
+    "configuration, making the check sensibly faster. Set it to "
+    "zero (default) to check everything.",
     required=True,
     type=click.IntRange(0),
     default=0,
+    show_default=True,
 )
 @verbosity_option(logger=logger, expose_value=False)
 def check(entrypoint, limit):  # numpydoc ignore=PR01
-    """Check file access on one or more DataModules."""
+    """Check file access on a database configuration split."""
     import importlib.metadata
     import sys
 
-    click.secho(f"Checking entrypoint `{entrypoint}`...", fg="yellow")
+    click.secho(f"Checking database split config `{entrypoint}`...", fg="yellow")
     try:
         module = importlib.metadata.entry_points(group="mednet.config")[
             entrypoint
         ].module
     except KeyError:
-        raise Exception(f"Could not find database entrypoint `{entrypoint}`")
+        raise Exception(f"Could not find database split config `{entrypoint}`")
 
     datamodule = importlib.import_module(module).datamodule
 
@@ -165,19 +167,21 @@ def check(entrypoint, limit):  # numpydoc ignore=PR01
     for k, loader in loaders.items():
         if limit == 0:
             click.secho(
-                f"Checking all samples of dataset `{k}` at entrypoint `{entrypoint}`...",
+                f"Checking all {len(loader)} samples of split `{k}` at config "
+                f"`{entrypoint}`...",
                 fg="yellow",
             )
             loader_limit = sys.maxsize
         else:
             click.secho(
                 f"Checking first {limit} samples of dataset "
-                f"`{k}` at entrypoint `{entrypoint}`...",
+                f"`{k}` at config `{entrypoint}`...",
                 fg="yellow",
             )
             loader_limit = limit
-        # the for loop will trigger raw data loading (ie. user code), protect
-        # it
+
+        # the for loop will trigger raw data loading (ie. user code), protect it
+        i = 0
         try:
             for i, batch in enumerate(loader):
                 if loader_limit == 0:
@@ -194,7 +198,7 @@ def check(entrypoint, limit):  # numpydoc ignore=PR01
                     )
                 loader_limit -= 1
         except Exception:
-            logger.exception(f"Unable to load batch {i} in dataset {k}")
+            logger.exception(f"Unable to load sample {i} at split {k}")
             errors += 1
 
     if not errors:
diff --git a/src/mednet/scripts/utils.py b/src/mednet/scripts/utils.py
index 86eedabb..bcd246d6 100644
--- a/src/mednet/scripts/utils.py
+++ b/src/mednet/scripts/utils.py
@@ -117,12 +117,13 @@ def execution_metadata() -> dict[str, int | float | str | dict[str, str] | list[
         * ``accelerator``: acceleration devices available (e.g. ``cuda``)
     """
 
+    import datetime
     import importlib.metadata
     import importlib.util
     import os
     import sys
 
-    args = []
+    args: list[str] = []
     for k in sys.argv:
         if " " in k:
             args.append(f"'{k}'")
@@ -130,7 +131,7 @@ def execution_metadata() -> dict[str, int | float | str | dict[str, str] | list[
             args.append(k)
 
     # current date time, in ISO8610 format
-    datetime = __import__("datetime").datetime.now().astimezone().isoformat()
+    current_datetime = datetime.datetime.now().astimezone().isoformat()
 
     # collects dependency information
     package_name = __package__.split(".")[0] if __package__ is not None else "unknown"
@@ -194,7 +195,7 @@ def execution_metadata() -> dict[str, int | float | str | dict[str, str] | list[
     }
 
     return {
-        "datetime": datetime,
+        "datetime": current_datetime,
         "package-name": package_name,
         "package-version": current_version,
         "python": python,
-- 
GitLab