From 93df5969f3eb875f668b9cea7664eea03c38a9f0 Mon Sep 17 00:00:00 2001
From: "ogueler@idiap.ch" <ogueler@vws110.idiap.ch>
Date: Mon, 3 Apr 2023 02:50:20 +0200
Subject: [PATCH] initialized tbx11k files

---
 .../data/tbx11k_simplified/__init__.py        | 68 +++++++++++++++++++
 .../data/tbx11k_simplified_RS/__init__.py     | 62 +++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 src/ptbench/data/tbx11k_simplified/__init__.py
 create mode 100644 src/ptbench/data/tbx11k_simplified_RS/__init__.py

diff --git a/src/ptbench/data/tbx11k_simplified/__init__.py b/src/ptbench/data/tbx11k_simplified/__init__.py
new file mode 100644
index 00000000..129bbe3e
--- /dev/null
+++ b/src/ptbench/data/tbx11k_simplified/__init__.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""TBX11K simplified dataset for computer-aided diagnosis.
+
+The TBX11K database has been established to foster research
+in computer-aided diagnosis of pulmonary diseases with a special
+focus on tuberculosis (aTB). The dataset was specifically
+designed to be used with CNNs. It contains 11,000 chest X-ray
+images, each of a unique patient. They were labeled by expert
+radiologists with 5 - 10+ years of experience. Possible labels
+are: "healthy", "active TB", "latent TB", and "sick & non-tb".
+The version of the dataset used in this benchmark is a simplified one.
+
+* Reference: [TBX11K-SIMPLIFIED-2020]_
+* Original resolution (height x width or width x height): 4020 x 4892
+* Split reference: none
+* Protocol ``default``:
+
+  * Training samples: 62.5% of TB and healthy CXR (including labels)
+  * Validation samples: 15.9% of TB and healthy CXR (including labels)
+  * Test samples: 21.6% of TB and healthy CXR (including labels)
+"""
+
+import importlib.resources
+import os
+
+from ...utils.rc import load_rc
+from ..dataset import JSONDataset
+from ..loader import load_pil_baw, make_delayed
+
+# Protocol definition files, resolved relative to this package.  The
+# ``default`` split is listed first, followed by the folds ``fold_0`` to
+# ``fold_9``.
+_protocols = [
+    importlib.resources.files(__name__).joinpath(f"{stem}.json.bz2")
+    for stem in ("default", *(f"fold_{k}" for k in range(10)))
+]
+
+# Base directory that sample paths are joined onto: the
+# ``datadir.tbx11k_simplified`` entry of whatever ``load_rc()`` returns, or
+# the resolved current working directory when that entry is missing.
+_datadir = load_rc().get(
+    "datadir.tbx11k_simplified",
+    os.path.realpath(os.curdir),
+)
+
+
+def _raw_data_loader(sample):
+    """Load ``sample["data"]`` (joined onto ``_datadir``) and attach the label."""
+    image = load_pil_baw(os.path.join(_datadir, sample["data"]))  # type: ignore
+    # the label is copied through unchanged
+    return {"data": image, "label": sample["label"]}
+
+
+def _loader(context, sample):
+    """Defer loading: wrap ``sample`` so its image is not read up front."""
+    del context  # unused: the database is homogeneous
+    return make_delayed(sample, _raw_data_loader)
+
+
+dataset = JSONDataset(
+    protocols=_protocols,  # the "default" split plus fold_0 .. fold_9
+    fieldnames=("data", "label"),  # presumably the per-sample keys - confirm
+    loader=_loader,  # hands back delayed samples built by make_delayed
+)
+"""TBX11K simplified dataset object."""
diff --git a/src/ptbench/data/tbx11k_simplified_RS/__init__.py b/src/ptbench/data/tbx11k_simplified_RS/__init__.py
new file mode 100644
index 00000000..dce01794
--- /dev/null
+++ b/src/ptbench/data/tbx11k_simplified_RS/__init__.py
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Extended TBX11K simplified dataset for computer-aided diagnosis 
+(extended with DensenetRS predictions).
+
+The TBX11K database has been established to foster research
+in computer-aided diagnosis of pulmonary diseases with a special
+focus on tuberculosis (aTB). The dataset was specifically
+designed to be used with CNNs. It contains 11,000 chest X-ray
+images, each of a unique patient. They were labeled by expert
+radiologists with 5 - 10+ years of experience. Possible labels
+are: "healthy", "active TB", "latent TB", and "sick & non-tb".
+The version of the dataset used in this benchmark is a simplified one.
+
+* Reference: [TBX11K-SIMPLIFIED-2020]_
+* Original (released) resolution (height x width or width x height): 512 x 512
+* Split reference: none
+* Protocol ``default``:
+
+  * Training samples: 62.5% of TB and healthy CXR (including labels)
+  * Validation samples: 15.9% of TB and healthy CXR (including labels)
+  * Test samples: 21.6% of TB and healthy CXR (including labels)
+"""
+
+import importlib.resources
+
+from ..dataset import JSONDataset
+from ..loader import make_delayed
+
+_protocols = [  # package-relative files: "default" first, then fold_0 .. fold_9
+    importlib.resources.files(__name__).joinpath("default.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_0.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_1.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_2.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_3.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_4.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_5.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_6.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_7.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_8.json.bz2"),
+    importlib.resources.files(__name__).joinpath("fold_9.json.bz2"),
+]
+
+
+def _raw_data_loader(sample):
+    return {field: sample[field] for field in ("data", "label")}  # "filename" is not copied
+
+
+def _loader(context, sample):
+    """Wrap ``sample`` as a delayed sample keyed by its ``filename`` entry."""
+    del context  # unused: the database is homogeneous
+    return make_delayed(sample, _raw_data_loader, key=sample["filename"])
+
+
+dataset = JSONDataset(
+    protocols=_protocols,  # the "default" split plus fold_0 .. fold_9
+    fieldnames=("filename", "label", "data"),  # presumably per-sample keys - confirm
+    loader=_loader,  # delayed samples, keyed by each sample's "filename"
+)
+"""Extended TBX11K simplified dataset object."""
-- 
GitLab