Skip to content
Snippets Groups Projects
Commit 05bbb35f authored by André Anjos's avatar André Anjos :speech_balloon:
Browse files

[data.montgomery/shenzhen] Make all declarations into a single Python module (c.f. issue #27)

parent e6db9362
No related branches found
No related tags found
1 merge request!6Making use of LightningDataModule and simplification of data loading
Showing
with 130 additions and 489 deletions
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
# #
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
"""Specialized raw-data loaders for the Montgomery dataset.""" import importlib.resources
import os import os
import PIL.Image import PIL.Image
...@@ -11,7 +10,9 @@ import PIL.Image ...@@ -11,7 +10,9 @@ import PIL.Image
from torchvision.transforms.functional import center_crop, to_tensor from torchvision.transforms.functional import center_crop, to_tensor
from ...utils.rc import load_rc from ...utils.rc import load_rc
from ..datamodule import CachingDataModule
from ..image_utils import remove_black_borders from ..image_utils import remove_black_borders
from ..split import JSONDatabaseSplit
from ..typing import RawDataLoader as _BaseRawDataLoader from ..typing import RawDataLoader as _BaseRawDataLoader
from ..typing import Sample from ..typing import Sample
...@@ -85,3 +86,64 @@ class RawDataLoader(_BaseRawDataLoader): ...@@ -85,3 +86,64 @@ class RawDataLoader(_BaseRawDataLoader):
The integer label associated with the sample The integer label associated with the sample
""" """
return sample[1] return sample[1]
class DataModule(CachingDataModule):
    """Montgomery datamodule for TB detection.

    The standard digital image database for Tuberculosis was created by the
    National Library of Medicine, Maryland, USA in collaboration with Shenzhen
    No.3 People’s Hospital, Guangdong Medical College, Shenzhen, China.

    * Database reference: [MONTGOMERY-SHENZHEN-2014]_
    * Original resolution (height x width or width x height): 4020x4892 px or
      4892x4020 px
    * This split:

      * Split reference: None
      * Training samples: ?? of TB and healthy CXR
      * Validation samples: ?? of TB and healthy CXR
      * Test samples: ?? of TB and healthy CXR

    Data specifications:

    * Raw data input (on disk):

      * PNG images 8 bit grayscale
      * resolution: fixed to one of the cases above

    * Output image:

      * Transforms:

        * Load raw PNG with :py:mod:`PIL`
        * Remove black borders
        * Torch center cropping to get square image

      * Final specifications

        * Grayscale, encoded as a single plane image, 8 bits
        * Square (4020x4020 px)

    Parameters
    ----------
    split_filename
        Name of the split file to use (e.g. ``default.json.bz2``).  It is
        looked up as a resource of the package that contains this module and
        handed to :py:class:`JSONDatabaseSplit`.
    """

    def __init__(self, split_filename: str):
        # Split files live next to this module: strip the last dotted
        # component of ``__name__`` to obtain the enclosing package name.
        package_name = __name__.rsplit(".", 1)[0]
        split_path = importlib.resources.files(package_name).joinpath(
            split_filename
        )
        super().__init__(
            database_split=JSONDatabaseSplit(split_path),
            raw_data_loader=RawDataLoader(),
        )
# Ready-made datamodule instances, one per split file: the ``default`` split
# plus ten folds (``fold_0`` .. ``fold_9``).  Each is built from the split file
# named in its argument, and all of them are instantiated when this module is
# imported.
default = DataModule("default.json.bz2")
fold_0 = DataModule("fold_0.json.bz2")
fold_1 = DataModule("fold_1.json.bz2")
fold_2 = DataModule("fold_2.json.bz2")
fold_3 = DataModule("fold_3.json.bz2")
fold_4 = DataModule("fold_4.json.bz2")
fold_5 = DataModule("fold_5.json.bz2")
fold_6 = DataModule("fold_6.json.bz2")
fold_7 = DataModule("fold_7.json.bz2")
fold_8 = DataModule("fold_8.json.bz2")
fold_9 = DataModule("fold_9.json.bz2")
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``default`` protocol)
The standard digital image database for Tuberculosis was created by the National
Library of Medicine, Maryland, USA in collaboration with Shenzhen No.3 People’s
Hospital, Guangdong Medical College, Shenzhen, China.
* Database reference: [MONTGOMERY-SHENZHEN-2014]_
* Original resolution (height x width or width x height): 4020x4892 px or 4892x4020 px
* This split:
* Split reference: None
* Training samples: ?? of TB and healthy CXR
* Validation samples: ?? of TB and healthy CXR
* Test samples: ?? of TB and healthy CXR
Data specifications:
* Raw data input (on disk):
* PNG images 8 bit grayscale
* resolution: fixed to one of the cases above
* Output image:
* Transforms:
* Load raw PNG with :py:mod:`PIL`
* Remove black borders
* Torch center cropping to get square image
* Final specifications
* Grayscale, encoded as a single plane image, 8 bits
* Square (4020x4020 px)
Protocol ``default``:
* Training samples: first 64% of TB and healthy CXR (including labels)
* Validation samples: 16% of TB and healthy CXR (including labels)
* Test samples: 20% of TB and healthy CXR (including labels)
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``default`` split.  The split file
# ``default.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"default.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 0`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_0`` split.  The split file
# ``fold_0.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_0.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 1`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_1`` split.  The split file
# ``fold_1.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_1.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 2`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_2`` split.  The split file
# ``fold_2.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_2.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 3`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_3`` split.  The split file
# ``fold_3.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_3.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 4`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_4`` split.  The split file
# ``fold_4.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_4.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 5`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_5`` split.  The split file
# ``fold_5.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_5.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 6`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_6`` split.  The split file
# ``fold_6.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_6.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 7`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_7`` split.  The split file
# ``fold_7.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_7.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 8`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_8`` split.  The split file
# ``fold_8.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_8.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Montgomery datamodule for TB detection (``fold 9`` protocol)
See :py:mod:`ptbench.data.montgomery.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_9`` split.  The split file
# ``fold_9.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_9.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
# #
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
"""Specialized raw-data loaders for the Shenzhen dataset.""" import importlib.resources
import os import os
import PIL.Image import PIL.Image
...@@ -11,7 +10,9 @@ import PIL.Image ...@@ -11,7 +10,9 @@ import PIL.Image
from torchvision.transforms.functional import center_crop, to_tensor from torchvision.transforms.functional import center_crop, to_tensor
from ...utils.rc import load_rc from ...utils.rc import load_rc
from ..datamodule import CachingDataModule
from ..image_utils import remove_black_borders from ..image_utils import remove_black_borders
from ..split import JSONDatabaseSplit
from ..typing import RawDataLoader as _BaseRawDataLoader from ..typing import RawDataLoader as _BaseRawDataLoader
from ..typing import Sample from ..typing import Sample
...@@ -90,3 +91,66 @@ class RawDataLoader(_BaseRawDataLoader): ...@@ -90,3 +91,66 @@ class RawDataLoader(_BaseRawDataLoader):
The integer label associated with the sample The integer label associated with the sample
""" """
return sample[1] return sample[1]
class DataModule(CachingDataModule):
    """Shenzhen datamodule for computer-aided diagnosis.

    The standard digital image database for Tuberculosis was created by the
    National Library of Medicine, Maryland, USA in collaboration with Shenzhen
    No.3 People’s Hospital, Guangdong Medical College, Shenzhen, China. The
    Chest X-rays are from out-patient clinics, and were captured as part of the
    daily routine using Philips DR Digital Diagnose systems.

    * Database reference: [MONTGOMERY-SHENZHEN-2014]_
    * Original resolution (height x width or width x height): 3000 x 3000 or
      less
    * This split:

      * Split reference: None
      * Training samples: 64% of TB and healthy CXR (including labels)
      * Validation samples: 16% of TB and healthy CXR (including labels)
      * Test samples: 20% of TB and healthy CXR (including labels)

    Data specifications:

    * Raw data input (on disk):

      * PNG images (grayscale, encoded as RGB images with "inverted" grayscale
        scale)
      * Variable width and height

    * Output image:

      * Transforms:

        * Load raw PNG with :py:mod:`PIL`
        * Remove black borders
        * Torch center cropping to get square image

      * Final specifications:

        * Grayscale, encoded as a single plane image, 8 bits
        * Square, with varying resolutions, depending on the input image

    Parameters
    ----------
    split_filename
        Name of the split file to use (e.g. ``default.json.bz2``).  It is
        looked up as a resource of the package that contains this module and
        handed to :py:class:`JSONDatabaseSplit`.
    """

    def __init__(self, split_filename: str):
        # Split files live next to this module: strip the last dotted
        # component of ``__name__`` to obtain the enclosing package name.
        package_name = __name__.rsplit(".", 1)[0]
        split_path = importlib.resources.files(package_name).joinpath(
            split_filename
        )
        super().__init__(
            database_split=JSONDatabaseSplit(split_path),
            raw_data_loader=RawDataLoader(),
        )
# Ready-made datamodule instances, one per split file: the ``default`` split
# plus ten folds (``fold_0`` .. ``fold_9``).  Each is built from the split file
# named in its argument, and all of them are instantiated when this module is
# imported.
default = DataModule("default.json.bz2")
fold_0 = DataModule("fold_0.json.bz2")
fold_1 = DataModule("fold_1.json.bz2")
fold_2 = DataModule("fold_2.json.bz2")
fold_3 = DataModule("fold_3.json.bz2")
fold_4 = DataModule("fold_4.json.bz2")
fold_5 = DataModule("fold_5.json.bz2")
fold_6 = DataModule("fold_6.json.bz2")
fold_7 = DataModule("fold_7.json.bz2")
fold_8 = DataModule("fold_8.json.bz2")
fold_9 = DataModule("fold_9.json.bz2")
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (``default`` protocol)
The standard digital image database for Tuberculosis was created by the National
Library of Medicine, Maryland, USA in collaboration with Shenzhen No.3 People’s
Hospital, Guangdong Medical College, Shenzhen, China. The Chest X-rays are from
out-patient clinics, and were captured as part of the daily routine using
Philips DR Digital Diagnose systems.
* Database reference: [MONTGOMERY-SHENZHEN-2014]_
* Original resolution (height x width or width x height): 3000 x 3000 or less
* This split:
* Split reference: None
* Training samples: 64% of TB and healthy CXR (including labels)
* Validation samples: 16% of TB and healthy CXR (including labels)
* Test samples: 20% of TB and healthy CXR (including labels)
Data specifications:
* Raw data input (on disk):
* PNG images (grayscale, encoded as RGB images with "inverted" grayscale scale)
* Variable width and height
* Output image:
* Transforms:
* Load raw PNG with :py:mod:`PIL`
* Remove black borders
* Torch center cropping to get square image
* Final specifications:
* Grayscale, encoded as a single plane image, 8 bits
* Square, with varying resolutions, depending on the input image
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``default`` split.  The split file
# ``default.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"default.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (fold 0)
See :py:mod:`ptbench.data.shenzhen.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_0`` split.  The split file
# ``fold_0.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_0.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (fold 1)
See :py:mod:`ptbench.data.shenzhen.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_1`` split.  The split file
# ``fold_1.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_1.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (fold 2)
See :py:mod:`ptbench.data.shenzhen.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_2`` split.  The split file
# ``fold_2.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_2.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (fold 3)
See :py:mod:`ptbench.data.shenzhen.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_3`` split.  The split file
# ``fold_3.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_3.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (fold 4)
See :py:mod:`ptbench.data.shenzhen.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_4`` split.  The split file
# ``fold_4.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_4.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Shenzhen datamodule for computer-aided diagnosis (fold 5)
See :py:mod:`ptbench.data.shenzhen.default` for input/output details.
"""
import importlib.resources
from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .loader import RawDataLoader
# Module-level datamodule for the ``fold_5`` split.  The split file
# ``fold_5.json.bz2`` is looked up as a resource of the parent package
# (``__name__`` with its last dotted component removed) and handed to
# ``JSONDatabaseSplit``; a fresh ``RawDataLoader`` is passed as the raw-data
# loader.  The object is created when this module is imported.
datamodule = CachingDataModule(
database_split=JSONDatabaseSplit(
importlib.resources.files(__name__.rsplit(".", 1)[0]).joinpath(
"fold_5.json.bz2"
)
),
raw_data_loader=RawDataLoader(),
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment