diff --git a/src/ptbench/data/montgomery/__init__.py b/src/ptbench/data/montgomery/__init__.py index 65239cbf5d908075346675ad10e7c86569383f77..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/src/ptbench/data/montgomery/__init__.py +++ b/src/ptbench/data/montgomery/__init__.py @@ -1,88 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for computer-aided diagnosis. - -The Montgomery database has been established to foster research -in computer-aided diagnosis of pulmonary diseases with a special -focus on pulmonary tuberculosis (TB). - -* Reference: [MONTGOMERY-SHENZHEN-2014]_ -* Original resolution (height x width or width x height): 4020 x 4892 -* Split reference: none -* Protocol ``default``: - - * Training samples: 64% of TB and healthy CXR (including labels) - * Validation samples: 16% of TB and healthy CXR (including labels) - * Test samples: 20% of TB and healthy CXR (including labels) -""" - -import importlib.resources -import os - -from ...utils.rc import load_rc -from .. 
import make_dataset -from ..dataset import JSONDataset -from ..loader import load_pil_baw, make_delayed - -_protocols = [ - importlib.resources.files(__name__).joinpath("default.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_0.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_1.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_2.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_3.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_4.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_5.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_6.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_7.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_8.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_9.json.bz2"), -] - -_datadir = load_rc().get("datadir.montgomery", os.path.realpath(os.curdir)) - - -def _raw_data_loader(sample): - return dict( - data=load_pil_baw(os.path.join(_datadir, sample["data"])), # type: ignore - label=sample["label"], - ) - - -def _loader(context, sample): - # "context" is ignored in this case - database is homogeneous - # we return delayed samples to avoid loading all images at once - return make_delayed(sample, _raw_data_loader) - - -json_dataset = JSONDataset( - protocols=_protocols, - fieldnames=("data", "label"), - loader=_loader, -) -"""Montgomery dataset object.""" - - -def _maker(protocol, resize_size=512, cc_size=512, RGB=False): - from torchvision import transforms - - from ..transforms import ElasticDeformation, RemoveBlackBorders - - post_transforms = [] - if RGB: - post_transforms = [ - transforms.Lambda(lambda x: x.convert("RGB")), - transforms.ToTensor(), - ] - - return make_dataset( - [json_dataset.subsets(protocol)], - [ - RemoveBlackBorders(), - transforms.Resize(resize_size), - transforms.CenterCrop(cc_size), - ], - [ElasticDeformation(p=0.8)], - post_transforms, - ) diff --git 
a/src/ptbench/data/montgomery/default.py b/src/ptbench/data/montgomery/default.py index 1f5c0809869be5f011880e808e160024b3c1c1b0..bc93c593f78c15d3a6b1e267117006ab408af2a4 100644 --- a/src/ptbench/data/montgomery/default.py +++ b/src/ptbench/data/montgomery/default.py @@ -2,46 +2,50 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (default protocol) +"""Montgomery datamodule for TB detection (default protocol) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding - def setup(self, stage: str): - self.dataset = _maker("default") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +Protocol ``default``: + + * Training samples: first 64% of TB and healthy CXR 
(including labels) + * Validation samples: 16% of TB and healthy CXR (including labels) + * Test samples: 20% of TB and healthy CXR (including labels) +""" + +import importlib.resources -datamodule = DefaultModule +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "default.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/montgomery/fold_0.py b/src/ptbench/data/montgomery/fold_0.py index c60791be50ccd5186ce8e4af263efb7d7513b07a..043769518603766044d804e2c6dd39a2176cb1db 100644 --- a/src/ptbench/data/montgomery/fold_0.py +++ b/src/ptbench/data/montgomery/fold_0.py @@ -2,46 +2,44 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 0) +"""Montgomery datamodule for TB detection (cross validation fold 0) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) - def setup(self, stage: str): - self.dataset = _maker("fold_0") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" +import importlib.resources + +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_0.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_0_rgb.py b/src/ptbench/data/montgomery/fold_0_rgb.py deleted file mode 100644 index 8e8b0c8914b6a63dd9ab854984ff2bc51cb4e255..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_0_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 0, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 
20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_0", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_1.py b/src/ptbench/data/montgomery/fold_1.py index d6627e673978bcf960b8fb5f72add7cb4a13a141..8500456bca2c2d2330d56085986b69ed91ab9d91 100644 --- a/src/ptbench/data/montgomery/fold_1.py +++ b/src/ptbench/data/montgomery/fold_1.py @@ -2,46 +2,44 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 1) +"""Montgomery datamodule for TB detection (cross validation fold 1) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) - def setup(self, stage: str): - self.dataset = _maker("fold_1") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" +import importlib.resources + +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_1.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_1_rgb.py b/src/ptbench/data/montgomery/fold_1_rgb.py deleted file mode 100644 index bc47a322c3fd779e3bc19924f6d7ac7c13e71847..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_1_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 1, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 
20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_1", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_2.py b/src/ptbench/data/montgomery/fold_2.py index 8c5f4a66fd2af0b9f26b67241f45c630f69bd06a..e4b7a61482c391fb2b4b04d189b6637c0ced3222 100644 --- a/src/ptbench/data/montgomery/fold_2.py +++ b/src/ptbench/data/montgomery/fold_2.py @@ -2,46 +2,44 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 2) +"""Montgomery datamodule for TB detection (cross validation fold 2) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) - def setup(self, stage: str): - self.dataset = _maker("fold_2") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" +import importlib.resources + +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_2.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_2_rgb.py b/src/ptbench/data/montgomery/fold_2_rgb.py deleted file mode 100644 index b81a877b2bc7372a99812a27935e6daf42401568..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_2_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 2, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 
20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_2", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_3.py b/src/ptbench/data/montgomery/fold_3.py index 8e685d7e3baa3a23924c62a77ffc61bf51e12056..719bf004979db29ea723433ce5bc2dc046aa05ab 100644 --- a/src/ptbench/data/montgomery/fold_3.py +++ b/src/ptbench/data/montgomery/fold_3.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 3) +"""Montgomery datamodule for TB detection (cross validation fold 3) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_3") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_3.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_3_rgb.py b/src/ptbench/data/montgomery/fold_3_rgb.py deleted file mode 100644 index 7b600371c8d434d79049c6e6423b36e99f2a32cb..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_3_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 3, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_3", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_4.py b/src/ptbench/data/montgomery/fold_4.py index 9459cb938605df06823a86a96fbd1cf374fe9738..2e97b114f805846a0cb557ae85613322b4b9a73b 100644 --- a/src/ptbench/data/montgomery/fold_4.py +++ b/src/ptbench/data/montgomery/fold_4.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 4) +"""Montgomery datamodule for TB detection (default protocol) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_4") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_4.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_4_rgb.py b/src/ptbench/data/montgomery/fold_4_rgb.py deleted file mode 100644 index 3eb136f654ab8d8d648468948e05dad774d85076..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_4_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 4, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_4", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_5.py b/src/ptbench/data/montgomery/fold_5.py index 147690f6d54f15d50b52f88288dbc8a41dfb7f33..4df1451de2eb8bdaca8f5d46408a298f070ab6f0 100644 --- a/src/ptbench/data/montgomery/fold_5.py +++ b/src/ptbench/data/montgomery/fold_5.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 5) +"""Montgomery datamodule for TB detection (default protocol) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_5") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_5.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_5_rgb.py b/src/ptbench/data/montgomery/fold_5_rgb.py deleted file mode 100644 index 3e7cb73f6957086b99147812b07f733dc51af9ec..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_5_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 5, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_5", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_6.py b/src/ptbench/data/montgomery/fold_6.py index 69f24390ac01271c3e961950d429d973e535c380..d0b36115690ee65f47e421dab91351c4b48f5309 100644 --- a/src/ptbench/data/montgomery/fold_6.py +++ b/src/ptbench/data/montgomery/fold_6.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 6) +"""Montgomery datamodule for TB detection (default protocol) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_6") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_6.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_6_rgb.py b/src/ptbench/data/montgomery/fold_6_rgb.py deleted file mode 100644 index ff3a8cdb0c00f511f4ebb7abcfabb10ae7853e99..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_6_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 6, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_6", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_7.py b/src/ptbench/data/montgomery/fold_7.py index 20ba9d3a7da5ffcb8673e685a0534d82fdb7ed2b..b132b30ea14356774e820bfba5d6b66475c56b17 100644 --- a/src/ptbench/data/montgomery/fold_7.py +++ b/src/ptbench/data/montgomery/fold_7.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 7) +"""Montgomery datamodule for TB detection (default protocol) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_7") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_7.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_7_rgb.py b/src/ptbench/data/montgomery/fold_7_rgb.py deleted file mode 100644 index 05664b06ab6393911a77b32418d6f2afb9d455fa..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_7_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 7, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_7", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_8.py b/src/ptbench/data/montgomery/fold_8.py index e92ff959a9b1028c174c95719867f5086831d6c9..73169ca0b40688e379bd31948e5925552bd7b5b0 100644 --- a/src/ptbench/data/montgomery/fold_8.py +++ b/src/ptbench/data/montgomery/fold_8.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 8) +"""Montgomery datamodule for TB detection (cross validation fold 8) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_8") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_8.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_8_rgb.py b/src/ptbench/data/montgomery/fold_8_rgb.py deleted file mode 100644 index b7d59359dcde32694affea0e3df88ad747f48e31..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_8_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 8, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_8", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_9.py b/src/ptbench/data/montgomery/fold_9.py index 81bbf72e78826f7e9560189be149d51cb729064e..18561f80221b05a467c72308f6ebacd5fc3280f8 100644 --- a/src/ptbench/data/montgomery/fold_9.py +++ b/src/ptbench/data/montgomery/fold_9.py @@ -2,46 +2,45 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Montgomery dataset for TB detection (cross validation fold 9) +"""Montgomery datamodule for TB detection (cross validation fold 9) -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" +* See :py:mod:`ptbench.data.montgomery` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . 
import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px +* Output image: + + * Transforms: -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import importlib.resources - def setup(self, stage: str): - self.dataset = _maker("fold_9") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_9.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/fold_9_rgb.py b/src/ptbench/data/montgomery/fold_9_rgb.py deleted file mode 100644 index e961e08ffe49a94001252c641ba8bee86758b44f..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/fold_9_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (cross validation fold 9, RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* 
"validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_9", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule diff --git a/src/ptbench/data/montgomery/loader.py b/src/ptbench/data/montgomery/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..0dce873853603dfeab4de7f819dc0ab4cd1f6d92 --- /dev/null +++ b/src/ptbench/data/montgomery/loader.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> +# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Montgomery datamodule for TB detection (default protocol) + +* See :py:mod:`ptbench.data.montgomery` for more database details. 
+ +This configuration: + +* Raw data input (on disk): + + * PNG images 12 bit grayscale + * resolution: 4020 x 4892 px or 4892 x 4020 px + +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing (512 x 512 px) + * Torch center cropping (512 x 512 px) + + * Final specifications + + * Fixed resolution: 512 x 512 px + * Color RGB encoding +""" + +import os + +import torchvision.transforms + +from ...utils.rc import load_rc +from ..image_utils import RemoveBlackBorders, load_pil_baw +from ..typing import RawDataLoader as _BaseRawDataLoader +from ..typing import Sample + + +class RawDataLoader(_BaseRawDataLoader): + """A specialized raw-data-loader for the Montgomery dataset. + + Attributes + ---------- + + datadir + This variable contains the base directory where the database raw data + is stored. + + transform + Transforms that are always applied to the loaded raw images. + """ + + datadir: str + transform: torchvision.transforms.Compose + + def __init__(self): + self.datadir = load_rc().get( + "datadir.montgomery", os.path.realpath(os.curdir) + ) + + self.transform = torchvision.transforms.Compose( + [ + RemoveBlackBorders(), + torchvision.transforms.Resize(512), + torchvision.transforms.CenterCrop(512), + torchvision.transforms.ToTensor(), + ] + ) + + def sample(self, sample: tuple[str, int]) -> Sample: + """Loads a single image sample from the disk. + + Parameters + ---------- + + sample: + A tuple containing the path suffix, within the dataset root folder, + where to find the image to be loaded, and an integer, representing the + sample label. + + + Returns + ------- + + sample + The sample representation + """ + tensor = self.transform( + load_pil_baw(os.path.join(self.datadir, sample[0])) + ) + + return tensor, dict(label=sample[1], name=sample[0]) # type: ignore[arg-type] + + def label(self, sample: tuple[str, int]) -> int: + """Loads a single image sample label from the disk. 
+ + Parameters + ---------- + + sample: + A tuple containing the path suffix, within the dataset root folder, + where to find the image to be loaded, and an integer, representing the + sample label. + + + Returns + ------- + + label + The integer label associated with the sample + """ + return sample[1] diff --git a/src/ptbench/data/montgomery/rgb.py b/src/ptbench/data/montgomery/rgb.py deleted file mode 100644 index c162126648f0baae5a921fa7f009da171fb8ccc7..0000000000000000000000000000000000000000 --- a/src/ptbench/data/montgomery/rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Montgomery dataset for TB detection (default protocol, converted in RGB) - -* Split reference: first 64% of TB and healthy CXR for "train" 16% for -* "validation", 20% for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.montgomery` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class DefaultModule(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("default", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = DefaultModule