From 4c71da477379f7b3a60aa0136f28db21e36e867c Mon Sep 17 00:00:00 2001 From: mdelitroz <maxime.delitroz@idiap.ch> Date: Wed, 19 Jul 2023 11:33:01 +0200 Subject: [PATCH] updated Montgomery dataset following new design principles added in the add-Datamodule branch --- src/ptbench/data/montgomery/__init__.py | 88 ----------------- src/ptbench/data/montgomery/default.py | 72 +++++++------- src/ptbench/data/montgomery/fold_0.py | 66 ++++++------- src/ptbench/data/montgomery/fold_0_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_1.py | 66 ++++++------- src/ptbench/data/montgomery/fold_1_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_2.py | 66 ++++++------- src/ptbench/data/montgomery/fold_2_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_3.py | 67 +++++++------ src/ptbench/data/montgomery/fold_3_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_4.py | 67 +++++++------ src/ptbench/data/montgomery/fold_4_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_5.py | 67 +++++++------ src/ptbench/data/montgomery/fold_5_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_6.py | 67 +++++++------ src/ptbench/data/montgomery/fold_6_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_7.py | 67 +++++++------ src/ptbench/data/montgomery/fold_7_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_8.py | 67 +++++++------ src/ptbench/data/montgomery/fold_8_rgb.py | 47 --------- src/ptbench/data/montgomery/fold_9.py | 67 +++++++------ src/ptbench/data/montgomery/fold_9_rgb.py | 47 --------- src/ptbench/data/montgomery/loader.py | 114 ++++++++++++++++++++++ src/ptbench/data/montgomery/rgb.py | 47 --------- 24 files changed, 479 insertions(+), 979 deletions(-) delete mode 100644 src/ptbench/data/montgomery/fold_0_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_1_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_2_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_3_rgb.py delete mode 100644 
src/ptbench/data/montgomery/fold_4_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_5_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_6_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_7_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_8_rgb.py delete mode 100644 src/ptbench/data/montgomery/fold_9_rgb.py create mode 100644 src/ptbench/data/montgomery/loader.py delete mode 100644 src/ptbench/data/montgomery/rgb.py diff --git a/src/ptbench/data/montgomery/__init__.py b/src/ptbench/data/montgomery/__init__.py index 65239cbf..e69de29b 100644 --- a/src/ptbench/data/montgomery/__init__.py +++ b/src/ptbench/data/montgomery/__init__.py @@ -1,88 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for computer-aided diagnosis. - -The Montgomery database has been established to foster research -in computer-aided diagnosis of pulmonary diseases with a special -focus on pulmonary tuberculosis (TB). - -* Reference: [MONTGOMERY-SHENZHEN-2014]_ -* Original resolution (height x width or width x height): 4020 x 4892 -* Split reference: none -* Protocol ``default``: - - * Training samples: 64% of TB and healthy CXR (including labels) - * Validation samples: 16% of TB and healthy CXR (including labels) - * Test samples: 20% of TB and healthy CXR (including labels) -""" - -import importlib.resources -import os - -from ...utils.rc import load_rc -from .. 
import make_dataset -from ..dataset import JSONDataset -from ..loader import load_pil_baw, make_delayed - -_protocols = [ - importlib.resources.files(__name__).joinpath("default.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_0.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_1.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_2.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_3.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_4.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_5.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_6.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_7.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_8.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_9.json.bz2"), -] - -_datadir = load_rc().get("datadir.montgomery", os.path.realpath(os.curdir)) - - -def _raw_data_loader(sample): - return dict( - data=load_pil_baw(os.path.join(_datadir, sample["data"])), # type: ignore - label=sample["label"], - ) - - -def _loader(context, sample): - # "context" is ignored in this case - database is homogeneous - # we return delayed samples to avoid loading all images at once - return make_delayed(sample, _raw_data_loader) - - -json_dataset = JSONDataset( - protocols=_protocols, - fieldnames=("data", "label"), - loader=_loader, -) -"""Montgomery dataset object.""" - - -def _maker(protocol, resize_size=512, cc_size=512, RGB=False): - from torchvision import transforms - - from ..transforms import ElasticDeformation, RemoveBlackBorders - - post_transforms = [] - if RGB: - post_transforms = [ - transforms.Lambda(lambda x: x.convert("RGB")), - transforms.ToTensor(), - ] - - return make_dataset( - [json_dataset.subsets(protocol)], - [ - RemoveBlackBorders(), - transforms.Resize(resize_size), - transforms.CenterCrop(cc_size), - ], - [ElasticDeformation(p=0.8)], - post_transforms, - ) diff --git 
a/src/ptbench/data/montgomery/default.py b/src/ptbench/data/montgomery/default.py index 1f5c0809..bc93c593 100644 --- a/src/ptbench/data/montgomery/default.py +++ b/src/ptbench/data/montgomery/default.py @@ -2,46 +2,50 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (default protocol) +"""Montgomery datamodule for TB detection (default protocol) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding - def setup(self, stage: str): - self.dataset = _maker("default") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +Protocol ``default``: + + * Training samples: first 64% of TB and healthy CXR (including labels) + * Validation samples: 16% of TB and healthy CXR 
(including labels) + * Test samples: 20% of TB and healthy CXR (including labels) +""" + +import importlib.resources -datamodule = DefaultModule +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "default.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/montgomery/fold_0.py b/src/ptbench/data/montgomery/fold_0.py index c60791be..04376951 100644 --- a/src/ptbench/data/montgomery/fold_0.py +++ b/src/ptbench/data/montgomery/fold_0.py @@ -2,46 +2,44 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 0) +"""Montgomery datamodule for TB detection (cross validation fold 0) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) - def setup(self, stage: str): - self.dataset = _maker("fold_0") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" +import importlib.resources + +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_0.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_0_rgb.py b/src/ptbench/data/montgomery/fold_0_rgb.py deleted file mode 100644 index 8e8b0c89..00000000 --- a/src/ptbench/data/montgomery/fold_0_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 0, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_0", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_1.py b/src/ptbench/data/montgomery/fold_1.py index d6627e67..8500456b 100644 --- a/src/ptbench/data/montgomery/fold_1.py +++ b/src/ptbench/data/montgomery/fold_1.py @@ -2,46 +2,44 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 1) +"""Montgomery datamodule for TB detection (cross validation fold 1) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) - def setup(self, stage: str): - self.dataset = _maker("fold_1") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" +import importlib.resources + +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_1.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_1_rgb.py b/src/ptbench/data/montgomery/fold_1_rgb.py deleted file mode 100644 index bc47a322..00000000 --- a/src/ptbench/data/montgomery/fold_1_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 1, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_1", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_2.py b/src/ptbench/data/montgomery/fold_2.py index 8c5f4a66..e4b7a614 100644 --- a/src/ptbench/data/montgomery/fold_2.py +++ b/src/ptbench/data/montgomery/fold_2.py @@ -2,46 +2,44 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 2) +"""Montgomery datamodule for TB detection (cross validation fold 2) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) - def setup(self, stage: str): - self.dataset = _maker("fold_2") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" +import importlib.resources + +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_2.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_2_rgb.py b/src/ptbench/data/montgomery/fold_2_rgb.py deleted file mode 100644 index b81a877b..00000000 --- a/src/ptbench/data/montgomery/fold_2_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 2, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_2", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_3.py b/src/ptbench/data/montgomery/fold_3.py index 8e685d7e..719bf004 100644 --- a/src/ptbench/data/montgomery/fold_3.py +++ b/src/ptbench/data/montgomery/fold_3.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 3) +"""Montgomery datamodule for TB detection (cross validation fold 3) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_3") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_3.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_3_rgb.py b/src/ptbench/data/montgomery/fold_3_rgb.py deleted file mode 100644 index 7b600371..00000000 --- a/src/ptbench/data/montgomery/fold_3_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 3, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_3", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_4.py b/src/ptbench/data/montgomery/fold_4.py index 9459cb93..2e97b114 100644 --- a/src/ptbench/data/montgomery/fold_4.py +++ b/src/ptbench/data/montgomery/fold_4.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 4) +"""Montgomery datamodule for TB detection (cross validation fold 4) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_4") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_4.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_4_rgb.py b/src/ptbench/data/montgomery/fold_4_rgb.py deleted file mode 100644 index 3eb136f6..00000000 --- a/src/ptbench/data/montgomery/fold_4_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 4, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_4", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_5.py b/src/ptbench/data/montgomery/fold_5.py index 147690f6..4df1451d 100644 --- a/src/ptbench/data/montgomery/fold_5.py +++ b/src/ptbench/data/montgomery/fold_5.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 5) +"""Montgomery datamodule for TB detection (cross validation fold 5) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_5") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_5.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_5_rgb.py b/src/ptbench/data/montgomery/fold_5_rgb.py deleted file mode 100644 index 3e7cb73f..00000000 --- a/src/ptbench/data/montgomery/fold_5_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 5, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_5", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_6.py b/src/ptbench/data/montgomery/fold_6.py index 69f24390..d0b36115 100644 --- a/src/ptbench/data/montgomery/fold_6.py +++ b/src/ptbench/data/montgomery/fold_6.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 6) +"""Montgomery datamodule for TB detection (cross validation fold 6) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_6") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_6.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_6_rgb.py b/src/ptbench/data/montgomery/fold_6_rgb.py deleted file mode 100644 index ff3a8cdb..00000000 --- a/src/ptbench/data/montgomery/fold_6_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 6, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_6", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_7.py b/src/ptbench/data/montgomery/fold_7.py index 20ba9d3a..b132b30e 100644 --- a/src/ptbench/data/montgomery/fold_7.py +++ b/src/ptbench/data/montgomery/fold_7.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 7) +"""Montgomery datamodule for TB detection (cross validation fold 7) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_7") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_7.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_7_rgb.py b/src/ptbench/data/montgomery/fold_7_rgb.py deleted file mode 100644 index 05664b06..00000000 --- a/src/ptbench/data/montgomery/fold_7_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 7, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_7", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_8.py b/src/ptbench/data/montgomery/fold_8.py index e92ff959..73169ca0 100644 --- a/src/ptbench/data/montgomery/fold_8.py +++ b/src/ptbench/data/montgomery/fold_8.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 8) +"""Montgomery datamodule for TB detection (cross validation fold 8) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_8") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_8.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_8_rgb.py b/src/ptbench/data/montgomery/fold_8_rgb.py deleted file mode 100644 index b7d59359..00000000 --- a/src/ptbench/data/montgomery/fold_8_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 8, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_8", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_9.py b/src/ptbench/data/montgomery/fold_9.py index 81bbf72e..18561f80 100644 --- a/src/ptbench/data/montgomery/fold_9.py +++ b/src/ptbench/data/montgomery/fold_9.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 9) +"""Montgomery datamodule for TB detection (cross validation fold 9) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_9") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_9.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_9_rgb.py b/src/ptbench/data/montgomery/fold_9_rgb.py deleted file mode 100644 index e961e08f..00000000 --- a/src/ptbench/data/montgomery/fold_9_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 9, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 
(default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_9", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/loader.py b/src/ptbench/data/montgomery/loader.py new file mode 100644 index 00000000..0dce8738 --- /dev/null +++ b/src/ptbench/data/montgomery/loader.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> +# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Montgomery raw-data loader for TB detection (shared by all splits) + +* See :py:mod:`ptbench.data.montgomery` for more database details. 
+ +This configuration: + +* Raw data input (on disk): + + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px + +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import os + +import torchvision.transforms + +from ...utils.rc import load_rc +from ..image_utils import RemoveBlackBorders, load_pil_baw +from ..typing import RawDataLoader as _BaseRawDataLoader +from ..typing import Sample + + +class RawDataLoader(_BaseRawDataLoader): + """A specialized raw-data-loader for the Montgomery dataset. + + Attributes + ---------- + + datadir + This variable contains the base directory where the database raw data + is stored. + + transform + Transforms that are always applied to the loaded raw images. + """ + + datadir: str + transform: torchvision.transforms.Compose + + def __init__(self): + self.datadir = load_rc().get( + "datadir.montgomery", os.path.realpath(os.curdir) + ) + + self.transform = torchvision.transforms.Compose( + [ + RemoveBlackBorders(), + torchvision.transforms.Resize(512), + torchvision.transforms.CenterCrop(512), + torchvision.transforms.ToTensor(), + ] + ) + + def sample(self, sample: tuple[str, int]) -> Sample: + """Loads a single image sample from the disk. + + Parameters + ---------- + + sample: + A tuple containing the path suffix, within the dataset root folder, + where to find the image to be loaded, and an integer, representing the + sample label. + + + Returns + ------- + + sample + The sample representation + """ + tensor = self.transform( + load_pil_baw(os.path.join(self.datadir, sample[0])) + ) + + return tensor, dict(label=sample[1], name=sample[0]) # type: ignore[arg-type] + + def label(self, sample: tuple[str, int]) -> int: + """Returns the label of a single sample without loading its image. 
+ + Parameters + ---------- + + sample: + A tuple containing the path suffix, within the dataset root folder, + where to find the image to be loaded, and an integer, representing the + sample label. + + + Returns + ------- + + label + The integer label associated with the sample + """ + return sample[1] diff --git a/src/ptbench/data/montgomery/rgb.py b/src/ptbench/data/montgomery/rgb.py deleted file mode 100644 index c1621266..00000000 --- a/src/ptbench/data/montgomery/rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (default protocol, converted in RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("default", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule -- GitLab