From 22caca5f13c21c1b040d6f6ddd654dd5881d543d Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Wed, 2 Aug 2023 20:03:34 +0200
Subject: [PATCH] [data.tbpoc] Minor adjustments

---
 src/ptbench/data/tbpoc/datamodule.py | 20 ++++++++++----------
 src/ptbench/data/tbpoc/fold_0.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_1.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_2.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_3.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_4.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_5.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_6.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_7.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_8.py     | 18 ++++--------------
 src/ptbench/data/tbpoc/fold_9.py     | 18 ++++--------------
 11 files changed, 50 insertions(+), 150 deletions(-)

diff --git a/src/ptbench/data/tbpoc/datamodule.py b/src/ptbench/data/tbpoc/datamodule.py
index 35465bac..31e2aac4 100644
--- a/src/ptbench/data/tbpoc/datamodule.py
+++ b/src/ptbench/data/tbpoc/datamodule.py
@@ -11,7 +11,7 @@ from torchvision.transforms.functional import center_crop, to_tensor
 
 from ...utils.rc import load_rc
 from ..datamodule import CachingDataModule
-from ..image_utils import load_pil_grayscale, remove_black_borders
+from ..image_utils import remove_black_borders
 from ..split import JSONDatabaseSplit
 from ..typing import DatabaseSplit
 from ..typing import RawDataLoader as _BaseRawDataLoader
@@ -57,7 +57,9 @@ class RawDataLoader(_BaseRawDataLoader):
         sample
             The sample representation
         """
-        image = load_pil_grayscale(os.path.join(self.datadir, sample[0]))
+        # TB-POC images are stored as grayscale JPEGs, so there is no need
+        # to call convert("L") here.
+        image = PIL.Image.open(os.path.join(self.datadir, sample[0]))
         image = remove_black_borders(image)
         tensor = to_tensor(image)
         tensor = center_crop(tensor, min(*tensor.shape[1:]))
@@ -102,21 +104,21 @@ class DataModule(CachingDataModule):
     """TB-POC dataset for computer-aided diagnosis.
 
     * Database reference: [TB-POC-2018]_
-    * Original resolution (height x width or width x height): 2048 x 2500 pixels 
-    or 2500 x 2048 pixels
+    * Original resolution (height x width or width x height): 2048 x 2500 pixels
+      or 2500 x 2048 pixels
 
     Data specifications:
 
     * Raw data input (on disk):
 
-        * jpeg 8-bit grayscale images
+        * JPEG 8-bit grayscale images
         * resolution: fixed to one of the cases above
 
     * Output image:
 
         * Transforms:
 
-            * Load raw jpeg with :py:mod:`PIL`
+            * Load raw grayscale JPEG with :py:mod:`PIL`
             * Remove black borders
             * Convert to torch tensor
             * Torch center cropping to get square image
@@ -124,8 +126,8 @@ class DataModule(CachingDataModule):
         * Final specifications:
 
             * Grayscale, encoded as a single plane tensor, 32-bit floats,
-              square with varying resolutions, depending on black borders' sizes
-              on the input image
+              square with varying resolutions (at most 2048 x 2048 pixels),
+              depending on the size of black borders on the input image.
             * Labels: 0 (healthy), 1 (active tuberculosis)
     """
 
@@ -134,5 +136,3 @@ class DataModule(CachingDataModule):
             database_split=make_split(split_filename),
             raw_data_loader=RawDataLoader(),
         )
-
-
diff --git a/src/ptbench/data/tbpoc/fold_0.py b/src/ptbench/data/tbpoc/fold_0.py
index 972e7188..775f64cf 100644
--- a/src/ptbench/data/tbpoc/fold_0.py
+++ b/src/ptbench/data/tbpoc/fold_0.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 0)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-0.json")
+"""TB-POC dataset for TB detection (cross validation fold 0).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_1.py b/src/ptbench/data/tbpoc/fold_1.py
index 79b9bfca..6f0f137f 100644
--- a/src/ptbench/data/tbpoc/fold_1.py
+++ b/src/ptbench/data/tbpoc/fold_1.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 1)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-1.json")
+"""TB-POC dataset for TB detection (cross validation fold 1).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_2.py b/src/ptbench/data/tbpoc/fold_2.py
index 9d41fb59..662fd32c 100644
--- a/src/ptbench/data/tbpoc/fold_2.py
+++ b/src/ptbench/data/tbpoc/fold_2.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 2)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-2.json")
+"""TB-POC dataset for TB detection (cross validation fold 2).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_3.py b/src/ptbench/data/tbpoc/fold_3.py
index 08672b3f..c52b8c2e 100644
--- a/src/ptbench/data/tbpoc/fold_3.py
+++ b/src/ptbench/data/tbpoc/fold_3.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 3)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-3.json")
+"""TB-POC dataset for TB detection (cross validation fold 3).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_4.py b/src/ptbench/data/tbpoc/fold_4.py
index 8354a4c2..6de0dc13 100644
--- a/src/ptbench/data/tbpoc/fold_4.py
+++ b/src/ptbench/data/tbpoc/fold_4.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 4)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-4.json")
+"""TB-POC dataset for TB detection (cross validation fold 4).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_5.py b/src/ptbench/data/tbpoc/fold_5.py
index cb7f9561..bdca5a36 100644
--- a/src/ptbench/data/tbpoc/fold_5.py
+++ b/src/ptbench/data/tbpoc/fold_5.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 5)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-5.json")
+"""TB-POC dataset for TB detection (cross validation fold 5).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_6.py b/src/ptbench/data/tbpoc/fold_6.py
index 379211aa..c17ba0ba 100644
--- a/src/ptbench/data/tbpoc/fold_6.py
+++ b/src/ptbench/data/tbpoc/fold_6.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 6)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-6.json")
+"""TB-POC dataset for TB detection (cross validation fold 6).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_7.py b/src/ptbench/data/tbpoc/fold_7.py
index b846b88a..4310f2f4 100644
--- a/src/ptbench/data/tbpoc/fold_7.py
+++ b/src/ptbench/data/tbpoc/fold_7.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 7)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-7.json")
+"""TB-POC dataset for TB detection (cross validation fold 7).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_8.py b/src/ptbench/data/tbpoc/fold_8.py
index acfd4296..d7fa5d10 100644
--- a/src/ptbench/data/tbpoc/fold_8.py
+++ b/src/ptbench/data/tbpoc/fold_8.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 8)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-8.json")
+"""TB-POC dataset for TB detection (cross validation fold 8).
+
+See :py:class:`DataModule` for technical details.
+"""
diff --git a/src/ptbench/data/tbpoc/fold_9.py b/src/ptbench/data/tbpoc/fold_9.py
index 4634068e..f37e1f36 100644
--- a/src/ptbench/data/tbpoc/fold_9.py
+++ b/src/ptbench/data/tbpoc/fold_9.py
@@ -2,20 +2,10 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""TB-POC dataset for TB detection (cross validation fold 9)
-
-* Split reference: none (stratified kfolding)
-
-* Stratified kfold protocol:
-    * Training samples: 72% of TB and healthy CXR (including labels)
-    * Validation samples: 18% of TB and healthy CXR (including labels)
-    * Test samples: 10% of TB and healthy CXR (including labels)
-
-* This configuration resolution: varying depending of black borders on original
-  image
-* See :py:mod:`ptbench.data.tbpoc` for dataset details
-"""
-
 from .datamodule import DataModule
 
 datamodule = DataModule("fold-9.json")
+"""TB-POC dataset for TB detection (cross validation fold 9).
+
+See :py:class:`DataModule` for technical details.
+"""
-- 
GitLab