From 93df5969f3eb875f668b9cea7664eea03c38a9f0 Mon Sep 17 00:00:00 2001
From: "ogueler@idiap.ch" <ogueler@vws110.idiap.ch>
Date: Mon, 3 Apr 2023 02:50:20 +0200
Subject: [PATCH] initialized tbx11k files

---
 .../data/tbx11k_simplified/__init__.py        | 68 +++++++++++++++++++
 .../data/tbx11k_simplified_RS/__init__.py     | 62 +++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 src/ptbench/data/tbx11k_simplified/__init__.py
 create mode 100644 src/ptbench/data/tbx11k_simplified_RS/__init__.py

diff --git a/src/ptbench/data/tbx11k_simplified/__init__.py b/src/ptbench/data/tbx11k_simplified/__init__.py
new file mode 100644
index 00000000..129bbe3e
--- /dev/null
+++ b/src/ptbench/data/tbx11k_simplified/__init__.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""TBX11K simplified dataset for computer-aided diagnosis.
+
+The TBX11K database has been established to foster research
+in computer-aided diagnosis of pulmonary diseases with a special
+focus on tuberculosis (aTB). The dataset was specifically
+designed to be used with CNNs. It contains 11,000 chest X-ray
+images, each of a unique patient. They were labeled by expert
+radiologists with 5 - 10+ years of experience. Possible labels
+are: "healthy", "active TB", "latent TB", and "sick & non-tb".
+The version of the dataset used in this benchmark is a simplified one.
+
+* Reference: [TBX11K-SIMPLIFIED-2020]_
+* Original resolution (height x width or width x height): 4020 x 4892
+* Split reference: none
+* Protocol ``default``:
+
+  * Training samples: 62.5% of TB and healthy CXR (including labels)
+  * Validation samples: 15.9% of TB and healthy CXR (including labels)
+  * Test samples: 21.6% of TB and healthy CXR (including labels)
+"""
+
+import importlib.resources
+import os
+
+from ...utils.rc import load_rc
+from ..dataset import JSONDataset
+from ..loader import load_pil_baw, make_delayed
+
+_protocols = [
+    importlib.resources.files(__name__).joinpath("default.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_0.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_1.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_2.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_3.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_4.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_5.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_6.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_7.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_8.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_9.json.bz2"),
+]
+
+_datadir = load_rc().get("datadir.tbx11k_simplified", os.path.realpath(os.curdir))
+
+
+def _raw_data_loader(sample):
+    return dict(
+        data=load_pil_baw(os.path.join(_datadir, sample["data"])),  # type: ignore
+        label=sample["label"],
+    )
+
+
+def _loader(context, sample):
+    # "context" is ignored in this case - database is homogeneous
+    # we return delayed samples to avoid loading all images at once
+    return make_delayed(sample, _raw_data_loader)
+
+
+dataset = JSONDataset(
+    protocols=_protocols,
+    fieldnames=("data", "label"),
+    loader=_loader,
+)
+"""TBX11K simplified dataset object."""
diff --git a/src/ptbench/data/tbx11k_simplified_RS/__init__.py b/src/ptbench/data/tbx11k_simplified_RS/__init__.py
new file mode 100644
index 00000000..dce01794
--- /dev/null
+++ b/src/ptbench/data/tbx11k_simplified_RS/__init__.py
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Extended TBX11K simplified dataset for computer-aided diagnosis
+(extended with DensenetRS predictions).
+
+The TBX11K database has been established to foster research
+in computer-aided diagnosis of pulmonary diseases with a special
+focus on tuberculosis (aTB). The dataset was specifically
+designed to be used with CNNs. It contains 11,000 chest X-ray
+images, each of a unique patient. They were labeled by expert
+radiologists with 5 - 10+ years of experience. Possible labels
+are: "healthy", "active TB", "latent TB", and "sick & non-tb".
+The version of the dataset used in this benchmark is a simplified one.
+
+* Reference: [TBX11K-SIMPLIFIED-2020]_
+* Original (released) resolution (height x width or width x height): 512 x 512
+* Split reference: none
+* Protocol ``default``:
+
+  * Training samples: 62.5% of TB and healthy CXR (including labels)
+  * Validation samples: 15.9% of TB and healthy CXR (including labels)
+  * Test samples: 21.6% of TB and healthy CXR (including labels)
+"""
+
+import importlib.resources
+
+from ..dataset import JSONDataset
+from ..loader import make_delayed
+
+_protocols = [
+    importlib.resources.files(__name__).joinpath("default.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_0.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_1.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_2.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_3.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_4.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_5.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_6.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_7.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_8.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_9.json.bz2"),
+]
+
+
+def _raw_data_loader(sample):
+    return dict(data=sample["data"], label=sample["label"])
+
+
+def _loader(context, sample):
+    # "context" is ignored in this case - database is homogeneous
+    # we return delayed samples to avoid loading all images at once
+    return make_delayed(sample, _raw_data_loader, key=sample["filename"])
+
+
+dataset = JSONDataset(
+    protocols=_protocols,
+    fieldnames=("filename", "label", "data"),
+    loader=_loader,
+)
+"""Extended TBX11K simplified dataset object."""
-- 
GitLab