From 4c71da477379f7b3a60aa0136f28db21e36e867c Mon Sep 17 00:00:00 2001
From: mdelitroz <maxime.delitroz@idiap.ch>
Date: Wed, 19 Jul 2023 11:33:01 +0200
Subject: [PATCH] updated Montgomery dataset following new design principles
 added in the add-Datamodule branch

---
 src/ptbench/data/montgomery/__init__.py   |  88 -----------------
 src/ptbench/data/montgomery/default.py    |  72 +++++++-------
 src/ptbench/data/montgomery/fold_0.py     |  66 ++++++-------
 src/ptbench/data/montgomery/fold_0_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_1.py     |  66 ++++++-------
 src/ptbench/data/montgomery/fold_1_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_2.py     |  66 ++++++-------
 src/ptbench/data/montgomery/fold_2_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_3.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_3_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_4.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_4_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_5.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_5_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_6.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_6_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_7.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_7_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_8.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_8_rgb.py |  47 ---------
 src/ptbench/data/montgomery/fold_9.py     |  67 +++++++------
 src/ptbench/data/montgomery/fold_9_rgb.py |  47 ---------
 src/ptbench/data/montgomery/loader.py     | 114 ++++++++++++++++++++++
 src/ptbench/data/montgomery/rgb.py        |  47 ---------
 24 files changed, 479 insertions(+), 979 deletions(-)
 delete mode 100644 src/ptbench/data/montgomery/fold_0_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_1_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_2_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_3_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_4_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_5_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_6_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_7_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_8_rgb.py
 delete mode 100644 src/ptbench/data/montgomery/fold_9_rgb.py
 create mode 100644 src/ptbench/data/montgomery/loader.py
 delete mode 100644 src/ptbench/data/montgomery/rgb.py

diff --git a/src/ptbench/data/montgomery/__init__.py b/src/ptbench/data/montgomery/__init__.py
index 65239cbf..e69de29b 100644
--- a/src/ptbench/data/montgomery/__init__.py
+++ b/src/ptbench/data/montgomery/__init__.py
@@ -1,88 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for computer-aided diagnosis.
-
-The Montgomery database has been established to foster research
-in computer-aided diagnosis of pulmonary diseases with a special
-focus on pulmonary tuberculosis (TB).
-
-* Reference: [MONTGOMERY-SHENZHEN-2014]_
-* Original resolution (height x width or width x height): 4020 x 4892
-* Split reference: none
-* Protocol ``default``:
-
-  * Training samples: 64% of TB and healthy CXR (including labels)
-  * Validation samples: 16% of TB and healthy CXR (including labels)
-  * Test samples: 20% of TB and healthy CXR (including labels)
-"""
-
-import importlib.resources
-import os
-
-from ...utils.rc import load_rc
-from .. import make_dataset
-from ..dataset import JSONDataset
-from ..loader import load_pil_baw, make_delayed
-
-_protocols = [
-    importlib.resources.files(__name__).joinpath("default.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_0.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_1.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_2.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_3.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_4.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_5.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_6.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_7.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_8.json.bz2"),
-    importlib.resources.files(__name__).joinpath("fold_9.json.bz2"),
-]
-
-_datadir = load_rc().get("datadir.montgomery", os.path.realpath(os.curdir))
-
-
-def _raw_data_loader(sample):
-    return dict(
-        data=load_pil_baw(os.path.join(_datadir, sample["data"])),  # type: ignore
-        label=sample["label"],
-    )
-
-
-def _loader(context, sample):
-    # "context" is ignored in this case - database is homogeneous
-    # we return delayed samples to avoid loading all images at once
-    return make_delayed(sample, _raw_data_loader)
-
-
-json_dataset = JSONDataset(
-    protocols=_protocols,
-    fieldnames=("data", "label"),
-    loader=_loader,
-)
-"""Montgomery dataset object."""
-
-
-def _maker(protocol, resize_size=512, cc_size=512, RGB=False):
-    from torchvision import transforms
-
-    from ..transforms import ElasticDeformation, RemoveBlackBorders
-
-    post_transforms = []
-    if RGB:
-        post_transforms = [
-            transforms.Lambda(lambda x: x.convert("RGB")),
-            transforms.ToTensor(),
-        ]
-
-    return make_dataset(
-        [json_dataset.subsets(protocol)],
-        [
-            RemoveBlackBorders(),
-            transforms.Resize(resize_size),
-            transforms.CenterCrop(cc_size),
-        ],
-        [ElasticDeformation(p=0.8)],
-        post_transforms,
-    )
diff --git a/src/ptbench/data/montgomery/default.py b/src/ptbench/data/montgomery/default.py
index 1f5c0809..bc93c593 100644
--- a/src/ptbench/data/montgomery/default.py
+++ b/src/ptbench/data/montgomery/default.py
@@ -2,46 +2,50 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (default protocol)
+"""Montgomery datamodule for TB detection (default protocol)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
 
-    def setup(self, stage: str):
-        self.dataset = _maker("default")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
 
+Protocol ``default``:
+    
+    * Training samples: first 64% of TB and healthy CXR (including labels)
+    * Validation samples: 16% of TB and healthy CXR (including labels)
+    * Test samples: 20% of TB and healthy CXR (including labels)
+"""
+
+import importlib.resources
 
-datamodule = DefaultModule
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "default.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
diff --git a/src/ptbench/data/montgomery/fold_0.py b/src/ptbench/data/montgomery/fold_0.py
index c60791be..04376951 100644
--- a/src/ptbench/data/montgomery/fold_0.py
+++ b/src/ptbench/data/montgomery/fold_0.py
@@ -2,46 +2,44 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 0)
+"""Montgomery datamodule for TB detection (cross validation fold 0)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_0")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
 
+import importlib.resources
+
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_0.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_0_rgb.py b/src/ptbench/data/montgomery/fold_0_rgb.py
deleted file mode 100644
index 8e8b0c89..00000000
--- a/src/ptbench/data/montgomery/fold_0_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 0, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_0", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_1.py b/src/ptbench/data/montgomery/fold_1.py
index d6627e67..8500456b 100644
--- a/src/ptbench/data/montgomery/fold_1.py
+++ b/src/ptbench/data/montgomery/fold_1.py
@@ -2,46 +2,44 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 1)
+"""Montgomery datamodule for TB detection (cross validation fold 1)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_1")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
 
+import importlib.resources
+
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_1.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_1_rgb.py b/src/ptbench/data/montgomery/fold_1_rgb.py
deleted file mode 100644
index bc47a322..00000000
--- a/src/ptbench/data/montgomery/fold_1_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 1, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_1", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_2.py b/src/ptbench/data/montgomery/fold_2.py
index 8c5f4a66..e4b7a614 100644
--- a/src/ptbench/data/montgomery/fold_2.py
+++ b/src/ptbench/data/montgomery/fold_2.py
@@ -2,46 +2,44 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 2)
+"""Montgomery datamodule for TB detection (cross validation fold 2)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_2")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
 
+import importlib.resources
+
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_2.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_2_rgb.py b/src/ptbench/data/montgomery/fold_2_rgb.py
deleted file mode 100644
index b81a877b..00000000
--- a/src/ptbench/data/montgomery/fold_2_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 2, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_2", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_3.py b/src/ptbench/data/montgomery/fold_3.py
index 8e685d7e..719bf004 100644
--- a/src/ptbench/data/montgomery/fold_3.py
+++ b/src/ptbench/data/montgomery/fold_3.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 3)
+"""Montgomery datamodule for TB detection (cross validation fold 3)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_3")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_3.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_3_rgb.py b/src/ptbench/data/montgomery/fold_3_rgb.py
deleted file mode 100644
index 7b600371..00000000
--- a/src/ptbench/data/montgomery/fold_3_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 3, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_3", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_4.py b/src/ptbench/data/montgomery/fold_4.py
index 9459cb93..2e97b114 100644
--- a/src/ptbench/data/montgomery/fold_4.py
+++ b/src/ptbench/data/montgomery/fold_4.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 4)
+"""Montgomery datamodule for TB detection (cross validation fold 4)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_4")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_4.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_4_rgb.py b/src/ptbench/data/montgomery/fold_4_rgb.py
deleted file mode 100644
index 3eb136f6..00000000
--- a/src/ptbench/data/montgomery/fold_4_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 4, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_4", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_5.py b/src/ptbench/data/montgomery/fold_5.py
index 147690f6..4df1451d 100644
--- a/src/ptbench/data/montgomery/fold_5.py
+++ b/src/ptbench/data/montgomery/fold_5.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 5)
+"""Montgomery datamodule for TB detection (cross validation fold 5)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_5")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_5.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_5_rgb.py b/src/ptbench/data/montgomery/fold_5_rgb.py
deleted file mode 100644
index 3e7cb73f..00000000
--- a/src/ptbench/data/montgomery/fold_5_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 5, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_5", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_6.py b/src/ptbench/data/montgomery/fold_6.py
index 69f24390..d0b36115 100644
--- a/src/ptbench/data/montgomery/fold_6.py
+++ b/src/ptbench/data/montgomery/fold_6.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 6)
+"""Montgomery datamodule for TB detection (cross validation fold 6)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_6")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_6.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_6_rgb.py b/src/ptbench/data/montgomery/fold_6_rgb.py
deleted file mode 100644
index ff3a8cdb..00000000
--- a/src/ptbench/data/montgomery/fold_6_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 6, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_6", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_7.py b/src/ptbench/data/montgomery/fold_7.py
index 20ba9d3a..b132b30e 100644
--- a/src/ptbench/data/montgomery/fold_7.py
+++ b/src/ptbench/data/montgomery/fold_7.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 7)
+"""Montgomery datamodule for TB detection (cross validation fold 7)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_7")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_7.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_7_rgb.py b/src/ptbench/data/montgomery/fold_7_rgb.py
deleted file mode 100644
index 05664b06..00000000
--- a/src/ptbench/data/montgomery/fold_7_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 7, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_7", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_8.py b/src/ptbench/data/montgomery/fold_8.py
index e92ff959..73169ca0 100644
--- a/src/ptbench/data/montgomery/fold_8.py
+++ b/src/ptbench/data/montgomery/fold_8.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 8)
+"""Montgomery datamodule for TB detection (cross validation fold 8)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_8")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_8.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_8_rgb.py b/src/ptbench/data/montgomery/fold_8_rgb.py
deleted file mode 100644
index b7d59359..00000000
--- a/src/ptbench/data/montgomery/fold_8_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 8, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_8", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_9.py b/src/ptbench/data/montgomery/fold_9.py
index 81bbf72e..18561f80 100644
--- a/src/ptbench/data/montgomery/fold_9.py
+++ b/src/ptbench/data/montgomery/fold_9.py
@@ -2,46 +2,45 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Montgomery dataset for TB detection (cross validation fold 9)
+"""Montgomery datamodule for TB detection (cross validation fold 9)
 
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
+* See :py:mod:`ptbench.data.montgomery` for more database details.
 
-from clapper.logging import setup
+This configuration:
 
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
+* Raw data input (on disk):
 
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
 
+* Output image:
+    
+    * Transforms:
 
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (512 x 512 px)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Color RGB encoding
+"""
+
+import importlib.resources
 
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_9")
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
+from ..datamodule import CachingDataModule
+from ..split import JSONDatabaseSplit
+from .loader import RawDataLoader
+
+datamodule = CachingDataModule(
+    database_split=JSONDatabaseSplit(
+        importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
+            "fold_9.json.bz2"
+        )
+    ),
+    raw_data_loader=RawDataLoader(),
+)
 
 
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/fold_9_rgb.py b/src/ptbench/data/montgomery/fold_9_rgb.py
deleted file mode 100644
index e961e08f..00000000
--- a/src/ptbench/data/montgomery/fold_9_rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (cross validation fold 9, RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("fold_9", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
diff --git a/src/ptbench/data/montgomery/loader.py b/src/ptbench/data/montgomery/loader.py
new file mode 100644
index 00000000..0dce8738
--- /dev/null
+++ b/src/ptbench/data/montgomery/loader.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Specialized raw-data loader for the Montgomery dataset (TB detection)
+
+* See :py:mod:`ptbench.data.montgomery` for more database details.
+
+This configuration:
+
+* Raw data input (on disk):
+
+    * PNG images 12 bit grayscale
+    * resolution: 4020 x 4892 px or 4892 x 4020 px
+
+* Output image:
+    
+    * Transforms:
+
+        * Load raw PNG with :py:mod:`PIL`
+        * Remove black borders
+        * Torch resizing (smaller edge to 512 px, aspect ratio preserved)
+        * Torch center cropping (512 x 512 px)
+
+    * Final specifications
+
+        * Fixed resolution: 512 x 512 px
+        * Grayscale (black-and-white) encoding, as loaded by ``load_pil_baw``
+"""
+
+import os
+
+import torchvision.transforms
+
+from ...utils.rc import load_rc
+from ..image_utils import RemoveBlackBorders, load_pil_baw
+from ..typing import RawDataLoader as _BaseRawDataLoader
+from ..typing import Sample
+
+
+class RawDataLoader(_BaseRawDataLoader):
+    """A specialized raw-data-loader for the Montgomery dataset.
+
+    Attributes
+    ----------
+
+    datadir
+        This variable contains the base directory where the database raw data
+        is stored.
+
+    transform
+        Transforms that are always applied to the loaded raw images.
+    """
+
+    datadir: str
+    transform: torchvision.transforms.Compose
+
+    def __init__(self):
+        self.datadir = load_rc().get(
+            "datadir.montgomery", os.path.realpath(os.curdir)
+        )
+
+        self.transform = torchvision.transforms.Compose(
+            [
+                RemoveBlackBorders(),
+                torchvision.transforms.Resize(512),
+                torchvision.transforms.CenterCrop(512),
+                torchvision.transforms.ToTensor(),
+            ]
+        )
+
+    def sample(self, sample: tuple[str, int]) -> Sample:
+        """Loads a single image sample from the disk.
+
+        Parameters
+        ----------
+
+        sample:
+            A tuple containing the path suffix, within the dataset root folder,
+            where to find the image to be loaded, and an integer, representing the
+            sample label.
+
+
+        Returns
+        -------
+
+        sample
+            The sample representation
+        """
+        tensor = self.transform(
+            load_pil_baw(os.path.join(self.datadir, sample[0]))
+        )
+
+        return tensor, dict(label=sample[1], name=sample[0])  # type: ignore[arg-type]
+
+    def label(self, sample: tuple[str, int]) -> int:
+        """Returns the label of a single sample, without loading its image.
+
+        Parameters
+        ----------
+
+        sample:
+            A tuple containing the path suffix, within the dataset root folder,
+            where to find the image to be loaded, and an integer, representing the
+            sample label.
+
+
+        Returns
+        -------
+
+        label
+            The integer label associated with the sample
+        """
+        return sample[1]
diff --git a/src/ptbench/data/montgomery/rgb.py b/src/ptbench/data/montgomery/rgb.py
deleted file mode 100644
index c1621266..00000000
--- a/src/ptbench/data/montgomery/rgb.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-"""Montgomery dataset for TB detection (default protocol, converted in RGB)
-
-* Split reference: first 64% of TB and healthy CXR for "train" 16% for
-* "validation", 20% for "test"
-* This configuration resolution: 512 x 512 (default)
-* See :py:mod:`ptbench.data.montgomery` for dataset details
-"""
-
-from clapper.logging import setup
-
-from .. import return_subsets
-from ..base_datamodule import BaseDataModule
-from . import _maker
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-
-class DefaultModule(BaseDataModule):
-    def __init__(
-        self,
-        train_batch_size=1,
-        predict_batch_size=1,
-        drop_incomplete_batch=False,
-        multiproc_kwargs=None,
-    ):
-        super().__init__(
-            train_batch_size=train_batch_size,
-            predict_batch_size=predict_batch_size,
-            drop_incomplete_batch=drop_incomplete_batch,
-            multiproc_kwargs=multiproc_kwargs,
-        )
-
-    def setup(self, stage: str):
-        self.dataset = _maker("default", RGB=True)
-        (
-            self.train_dataset,
-            self.validation_dataset,
-            self.extra_validation_datasets,
-            self.predict_dataset,
-        ) = return_subsets(self.dataset)
-
-
-datamodule = DefaultModule
-- 
GitLab