diff --git a/src/ptbench/data/shenzhen/__init__.py b/src/ptbench/data/shenzhen/__init__.py index 1645962e8cc00443399dd60b88f017c71824e086..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/src/ptbench/data/shenzhen/__init__.py +++ b/src/ptbench/data/shenzhen/__init__.py @@ -1,34 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for computer-aided diagnosis. - -The standard digital image database for Tuberculosis is created by the -National Library of Medicine, Maryland, USA in collaboration with Shenzhen -No.3 People’s Hospital, Guangdong Medical College, Shenzhen, China. -The Chest X-rays are from out-patient clinics, and were captured as part of -the daily routine using Philips DR Digital Diagnose systems. - -* Reference: [MONTGOMERY-SHENZHEN-2014]_ -* Original resolution (height x width or width x height): 3000 x 3000 or less -* Split reference: none - * Training samples: 64% of TB and healthy CXR (including labels) - * Validation samples: 16% of TB and healthy CXR (including labels) - * Test samples: 20% of TB and healthy CXR (including labels) -""" -import importlib.resources - -_protocols = [ - importlib.resources.files(__name__).joinpath("default.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_0.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_1.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_2.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_3.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_4.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_5.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_6.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_7.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_8.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_9.json.bz2"), -] diff --git a/src/ptbench/data/shenzhen/fold_0.py b/src/ptbench/data/shenzhen/fold_0.py index 5b4d45602d13a0e6bd0e2724a6c3202c1532eef6..888a0e60024480a3aaff65f6e3d819370fd22669 100644 --- a/src/ptbench/data/shenzhen/fold_0.py +++ b/src/ptbench/data/shenzhen/fold_0.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 0) +"""Shenzhen datamodule for computer-aided diagnosis (fold 0) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_0") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_0.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_0_rgb.py b/src/ptbench/data/shenzhen/fold_0_rgb.py deleted file mode 100644 index 143ef731fae0f7d3c746e384e08e400f36c92511..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_0_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 0, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_0", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_1.py b/src/ptbench/data/shenzhen/fold_1.py index f01adef0af4da152ad756036aacb4bf83c5c20cc..62d7fbd55c83ed746754cbc99dcc65fe48efbc6a 100644 --- a/src/ptbench/data/shenzhen/fold_1.py +++ b/src/ptbench/data/shenzhen/fold_1.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 1) +"""Shenzhen datamodule for computer-aided diagnosis (fold 1) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_1") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_1.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_1_rgb.py b/src/ptbench/data/shenzhen/fold_1_rgb.py deleted file mode 100644 index 9d457adfa8835e45c7d5dc993ab23ffc8baafeb9..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_1_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 1, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_1", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_2.py b/src/ptbench/data/shenzhen/fold_2.py index 04dd656263cc6e71d2c61e3f5e221f7129fa7035..b41284cd9d1c4a56c70eff715078f82213dabb3c 100644 --- a/src/ptbench/data/shenzhen/fold_2.py +++ b/src/ptbench/data/shenzhen/fold_2.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 2) +"""Shenzhen datamodule for computer-aided diagnosis (fold 2) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_2") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_2.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_2_rgb.py b/src/ptbench/data/shenzhen/fold_2_rgb.py deleted file mode 100644 index 37cbe10ebd72057535bacd241ecb4dbfb2bded3d..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_2_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 2, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_2", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_3.py b/src/ptbench/data/shenzhen/fold_3.py index b43fcb29c9392d6e9bc5c20d9e93969b33bd2708..cca555064e9923433ef39f591b3e342365cf7afc 100644 --- a/src/ptbench/data/shenzhen/fold_3.py +++ b/src/ptbench/data/shenzhen/fold_3.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 3) +"""Shenzhen datamodule for computer-aided diagnosis (fold 3) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_3") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_3.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_3_rgb.py b/src/ptbench/data/shenzhen/fold_3_rgb.py deleted file mode 100644 index 162a3f82d640633ce6d13d25b2a500dc0fb63a54..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_3_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 3, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_3", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_4.py b/src/ptbench/data/shenzhen/fold_4.py index 58e0a2f221e0a6a48b3466c015c9c5790fc6ae3a..897420076303e47406cc9efb3b6bf0d294ab3611 100644 --- a/src/ptbench/data/shenzhen/fold_4.py +++ b/src/ptbench/data/shenzhen/fold_4.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 4) +"""Shenzhen datamodule for computer-aided diagnosis (fold 4) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_4") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_4.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_4_rgb.py b/src/ptbench/data/shenzhen/fold_4_rgb.py deleted file mode 100644 index 0dd4ccf89c553a19030774772447bb66bf0cf9b7..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_4_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 4, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_4", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_5.py b/src/ptbench/data/shenzhen/fold_5.py index ff115340f723d33ebdeb49524cc7e3b269738563..c520399d98ead9eeb1e3bdcfbe4dc48393adcebc 100644 --- a/src/ptbench/data/shenzhen/fold_5.py +++ b/src/ptbench/data/shenzhen/fold_5.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 5) +"""Shenzhen datamodule for computer-aided diagnosis (fold 5) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_5") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_5.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_5_rgb.py b/src/ptbench/data/shenzhen/fold_5_rgb.py deleted file mode 100644 index 46e255e7c37c1f547d25626970324f88d58ba03e..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_5_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 5, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_5", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_6.py b/src/ptbench/data/shenzhen/fold_6.py index eb81ae882369a1a99bca60e93b1922c7a9d7e9fc..a28f8fc5ca3e0ebd4b49fceaec99d3a2e94dd34c 100644 --- a/src/ptbench/data/shenzhen/fold_6.py +++ b/src/ptbench/data/shenzhen/fold_6.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 6) +"""Shenzhen datamodule for computer-aided diagnosis (fold 6) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_6") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_6.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_6_rgb.py b/src/ptbench/data/shenzhen/fold_6_rgb.py deleted file mode 100644 index b9654d08008bd10f0a5d953f2a79870214218651..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_6_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 6, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_6", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_7.py b/src/ptbench/data/shenzhen/fold_7.py index 79b0d1fff4483ef98eed42b6dce6c6b64076bd1e..b0ea7b4324334980a2e55e4496ac4ab6af705d17 100644 --- a/src/ptbench/data/shenzhen/fold_7.py +++ b/src/ptbench/data/shenzhen/fold_7.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 7) +"""Shenzhen datamodule for computer-aided diagnosis (fold 7) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_7") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_7.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_7_rgb.py b/src/ptbench/data/shenzhen/fold_7_rgb.py deleted file mode 100644 index 8a36acb2c79a4467f5dccad532c17cf528613ab5..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_7_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 7, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_7", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_8.py b/src/ptbench/data/shenzhen/fold_8.py index cf1cd36a8d2d2a40113a775131f9d5b8ba0a092a..9bbfbe84ab942cf5da5a8c5fc8318724908998f9 100644 --- a/src/ptbench/data/shenzhen/fold_8.py +++ b/src/ptbench/data/shenzhen/fold_8.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 8) +"""Shenzhen datamodule for computer-aided diagnosis (fold 8) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_8") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_8.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_8_rgb.py b/src/ptbench/data/shenzhen/fold_8_rgb.py deleted file mode 100644 index 1aa0bcec76d875d8e9b966b4461228230bdf79f2..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_8_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 8, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_8", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_9.py b/src/ptbench/data/shenzhen/fold_9.py index e1bb569d0389a8eb67b0ad186e48c6f98aa7cf57..87c2afb328f9b09f420a1ddce5f5d0ea54346c43 100644 --- a/src/ptbench/data/shenzhen/fold_9.py +++ b/src/ptbench/data/shenzhen/fold_9.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 9) +"""Shenzhen datamodule for computer-aided diagnosis (fold 9) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_9") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_9.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_9_rgb.py b/src/ptbench/data/shenzhen/fold_9_rgb.py deleted file mode 100644 index c0a577df0a0d3667420c32042816f65ba9ad20ce..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/fold_9_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 9, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_9", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/rgb.py b/src/ptbench/data/shenzhen/rgb.py deleted file mode 100644 index 211b49236cae22af68ad61d38849429dedb606d7..0000000000000000000000000000000000000000 --- a/src/ptbench/data/shenzhen/rgb.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen datamodule for computer-aided diagnosis (default protocol) - -See :py:mod:`ptbench.data.shenzhen` for dataset details. - -This configuration: -* raw data (default): :py:obj:`ptbench.data.shenzhen._tranforms` -* augmentations: elastic deformation (probability = 80%) -* output image resolution: 512x512 pixels -""" - -import importlib.resources - -from torchvision import transforms - -from ..datamodule import CachingDataModule -from ..split import JSONDatabaseSplit -from .loader import RawDataLoader - -datamodule = CachingDataModule( - database_split=JSONDatabaseSplit( - importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( - "default.json.bz2" - ) - ), - raw_data_loader=RawDataLoader(), - model_transforms=[ - transforms.ToPILImage(), - transforms.Lambda(lambda x: x.convert("RGB")), - transforms.ToTensor(), - ], -)