From 87a65d97aba6e73133d2deebd595d9988eac42a1 Mon Sep 17 00:00:00 2001 From: dcarron <daniel.carron@idiap.ch> Date: Tue, 18 Jul 2023 14:07:15 +0200 Subject: [PATCH] Update shenzhen dataset --- src/ptbench/data/shenzhen/__init__.py | 34 ------------- src/ptbench/data/shenzhen/fold_0.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_0_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_1.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_1_rgb.py | 47 ----------------- src/ptbench/data/shenzhen/fold_2.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_2_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_3.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_3_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_4.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_4_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_5.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_5_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_6.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_6_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_7.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_7_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_8.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_8_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/fold_9.py | 67 +++++++++++++------------ src/ptbench/data/shenzhen/fold_9_rgb.py | 46 ----------------- src/ptbench/data/shenzhen/rgb.py | 35 ------------- 22 files changed, 340 insertions(+), 860 deletions(-) delete mode 100644 src/ptbench/data/shenzhen/fold_0_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_1_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_2_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_3_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_4_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_5_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_6_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_7_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_8_rgb.py delete mode 100644 src/ptbench/data/shenzhen/fold_9_rgb.py delete mode 100644 src/ptbench/data/shenzhen/rgb.py diff --git a/src/ptbench/data/shenzhen/__init__.py b/src/ptbench/data/shenzhen/__init__.py index 1645962e..e69de29b 100644 --- a/src/ptbench/data/shenzhen/__init__.py +++ b/src/ptbench/data/shenzhen/__init__.py @@ -1,34 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for computer-aided diagnosis. - -The standard digital image database for Tuberculosis is created by the -National Library of Medicine, Maryland, USA in collaboration with Shenzhen -No.3 People’s Hospital, Guangdong Medical College, Shenzhen, China. -The Chest X-rays are from out-patient clinics, and were captured as part of -the daily routine using Philips DR Digital Diagnose systems. - -* Reference: [MONTGOMERY-SHENZHEN-2014]_ -* Original resolution (height x width or width x height): 3000 x 3000 or less -* Split reference: none - * Training samples: 64% of TB and healthy CXR (including labels) - * Validation samples: 16% of TB and healthy CXR (including labels) - * Test samples: 20% of TB and healthy CXR (including labels) -""" -import importlib.resources - -_protocols = [ - importlib.resources.files(__name__).joinpath("default.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_0.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_1.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_2.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_3.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_4.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_5.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_6.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_7.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_8.json.bz2"), - importlib.resources.files(__name__).joinpath("fold_9.json.bz2"), -] diff --git a/src/ptbench/data/shenzhen/fold_0.py b/src/ptbench/data/shenzhen/fold_0.py index 5b4d4560..888a0e60 100644 --- a/src/ptbench/data/shenzhen/fold_0.py +++ b/src/ptbench/data/shenzhen/fold_0.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 0) +"""Shenzhen datamodule for computer-aided diagnosis (fold 0) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_0") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_0.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_0_rgb.py b/src/ptbench/data/shenzhen/fold_0_rgb.py deleted file mode 100644 index 143ef731..00000000 --- a/src/ptbench/data/shenzhen/fold_0_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 0, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_0", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_1.py b/src/ptbench/data/shenzhen/fold_1.py index f01adef0..62d7fbd5 100644 --- a/src/ptbench/data/shenzhen/fold_1.py +++ b/src/ptbench/data/shenzhen/fold_1.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 1) +"""Shenzhen datamodule for computer-aided diagnosis (fold 1) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_1") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_1.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_1_rgb.py b/src/ptbench/data/shenzhen/fold_1_rgb.py deleted file mode 100644 index 9d457adf..00000000 --- a/src/ptbench/data/shenzhen/fold_1_rgb.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 1, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_1", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_2.py b/src/ptbench/data/shenzhen/fold_2.py index 04dd6562..b41284cd 100644 --- a/src/ptbench/data/shenzhen/fold_2.py +++ b/src/ptbench/data/shenzhen/fold_2.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 2) +"""Shenzhen datamodule for computer-aided diagnosis (fold 2) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_2") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_2.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_2_rgb.py b/src/ptbench/data/shenzhen/fold_2_rgb.py deleted file mode 100644 index 37cbe10e..00000000 --- a/src/ptbench/data/shenzhen/fold_2_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 2, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_2", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_3.py b/src/ptbench/data/shenzhen/fold_3.py index b43fcb29..cca55506 100644 --- a/src/ptbench/data/shenzhen/fold_3.py +++ b/src/ptbench/data/shenzhen/fold_3.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 3) +"""Shenzhen datamodule for computer-aided diagnosis (fold 3) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_3") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_3.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_3_rgb.py b/src/ptbench/data/shenzhen/fold_3_rgb.py deleted file mode 100644 index 162a3f82..00000000 --- a/src/ptbench/data/shenzhen/fold_3_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 3, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_3", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_4.py b/src/ptbench/data/shenzhen/fold_4.py index 58e0a2f2..89742007 100644 --- a/src/ptbench/data/shenzhen/fold_4.py +++ b/src/ptbench/data/shenzhen/fold_4.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 4) +"""Shenzhen datamodule for computer-aided diagnosis (fold 4) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_4") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_4.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_4_rgb.py b/src/ptbench/data/shenzhen/fold_4_rgb.py deleted file mode 100644 index 0dd4ccf8..00000000 --- a/src/ptbench/data/shenzhen/fold_4_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 4, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_4", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_5.py b/src/ptbench/data/shenzhen/fold_5.py index ff115340..c520399d 100644 --- a/src/ptbench/data/shenzhen/fold_5.py +++ b/src/ptbench/data/shenzhen/fold_5.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 5) +"""Shenzhen datamodule for computer-aided diagnosis (fold 5) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_5") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_5.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_5_rgb.py b/src/ptbench/data/shenzhen/fold_5_rgb.py deleted file mode 100644 index 46e255e7..00000000 --- a/src/ptbench/data/shenzhen/fold_5_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 5, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_5", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_6.py b/src/ptbench/data/shenzhen/fold_6.py index eb81ae88..a28f8fc5 100644 --- a/src/ptbench/data/shenzhen/fold_6.py +++ b/src/ptbench/data/shenzhen/fold_6.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 6) +"""Shenzhen datamodule for computer-aided diagnosis (fold 6) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_6") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_6.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_6_rgb.py b/src/ptbench/data/shenzhen/fold_6_rgb.py deleted file mode 100644 index b9654d08..00000000 --- a/src/ptbench/data/shenzhen/fold_6_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 6, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_6", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_7.py b/src/ptbench/data/shenzhen/fold_7.py index 79b0d1ff..b0ea7b43 100644 --- a/src/ptbench/data/shenzhen/fold_7.py +++ b/src/ptbench/data/shenzhen/fold_7.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 7) +"""Shenzhen datamodule for computer-aided diagnosis (fold 7) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_7") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_7.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_7_rgb.py b/src/ptbench/data/shenzhen/fold_7_rgb.py deleted file mode 100644 index 8a36acb2..00000000 --- a/src/ptbench/data/shenzhen/fold_7_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 7, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_7", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_8.py b/src/ptbench/data/shenzhen/fold_8.py index cf1cd36a..9bbfbe84 100644 --- a/src/ptbench/data/shenzhen/fold_8.py +++ b/src/ptbench/data/shenzhen/fold_8.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 8) +"""Shenzhen datamodule for computer-aided diagnosis (fold 8) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_8") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_8.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_8_rgb.py b/src/ptbench/data/shenzhen/fold_8_rgb.py deleted file mode 100644 index 1aa0bcec..00000000 --- a/src/ptbench/data/shenzhen/fold_8_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 8, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_8", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/fold_9.py b/src/ptbench/data/shenzhen/fold_9.py index e1bb569d..87c2afb3 100644 --- a/src/ptbench/data/shenzhen/fold_9.py +++ b/src/ptbench/data/shenzhen/fold_9.py @@ -2,45 +2,46 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Shenzhen dataset for TB detection (cross validation fold 9) +"""Shenzhen datamodule for computer-aided diagnosis (fold 9) -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" +See :py:mod:`ptbench.data.shenzhen` for more database details. -from clapper.logging import setup +This configuration: -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker +* Raw data input (on disk): -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") + * PNG images (black and white, encoded as color images) + * Variable width and height: + * widths: from 1130 to 3001 pixels + * heights: from 948 to 3001 pixels -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) +* Output image: + + * Transforms: + + * Load raw PNG with :py:mod:`PIL` + * Remove black borders + * Torch resizing(512px, 512px) + * Torch center cropping (512px, 512px) - def setup(self, stage: str): - self.dataset = _maker("fold_9") - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) + * Final specifications: + * Fixed resolution: 512x512 pixels + * Color RGB encoding +""" + +import importlib.resources -datamodule = Fold0Module +from ..datamodule import CachingDataModule +from ..split import JSONDatabaseSplit +from .loader import RawDataLoader + +datamodule = CachingDataModule( + database_split=JSONDatabaseSplit( + importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( + "fold_9.json.bz2" + ) + ), + raw_data_loader=RawDataLoader(), +) diff --git a/src/ptbench/data/shenzhen/fold_9_rgb.py b/src/ptbench/data/shenzhen/fold_9_rgb.py deleted file mode 100644 index c0a577df..00000000 --- a/src/ptbench/data/shenzhen/fold_9_rgb.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen dataset for TB detection (cross validation fold 9, RGB) - -* Split reference: first 80% of TB and healthy CXR for "train", rest for "test" -* This configuration resolution: 512 x 512 (default) -* See :py:mod:`ptbench.data.shenzhen` for dataset details -""" - -from clapper.logging import setup - -from .. import return_subsets -from ..base_datamodule import BaseDataModule -from . import _maker - -logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") - - -class Fold0Module(BaseDataModule): - def __init__( - self, - train_batch_size=1, - predict_batch_size=1, - drop_incomplete_batch=False, - multiproc_kwargs=None, - ): - super().__init__( - train_batch_size=train_batch_size, - predict_batch_size=predict_batch_size, - drop_incomplete_batch=drop_incomplete_batch, - multiproc_kwargs=multiproc_kwargs, - ) - - def setup(self, stage: str): - self.dataset = _maker("fold_9", RGB=True) - ( - self.train_dataset, - self.validation_dataset, - self.extra_validation_datasets, - self.predict_dataset, - ) = return_subsets(self.dataset) - - -datamodule = Fold0Module diff --git a/src/ptbench/data/shenzhen/rgb.py b/src/ptbench/data/shenzhen/rgb.py deleted file mode 100644 index 211b4923..00000000 --- a/src/ptbench/data/shenzhen/rgb.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -# -# SPDX-License-Identifier: GPL-3.0-or-later - -"""Shenzhen datamodule for computer-aided diagnosis (default protocol) - -See :py:mod:`ptbench.data.shenzhen` for dataset details. - -This configuration: -* raw data (default): :py:obj:`ptbench.data.shenzhen._tranforms` -* augmentations: elastic deformation (probability = 80%) -* output image resolution: 512x512 pixels -""" - -import importlib.resources - -from torchvision import transforms - -from ..datamodule import CachingDataModule -from ..split import JSONDatabaseSplit -from .loader import RawDataLoader - -datamodule = CachingDataModule( - database_split=JSONDatabaseSplit( - importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath( - "default.json.bz2" - ) - ), - raw_data_loader=RawDataLoader(), - model_transforms=[ - transforms.ToPILImage(), - transforms.Lambda(lambda x: x.convert("RGB")), - transforms.ToTensor(), - ], -) -- GitLab