Commit 05644f0c authored by Matheus Armani Renzo's avatar Matheus Armani Renzo

Adding config/data/test files for Montgomery County, JSRT, Shenzhen datasets

parent a290e9a2
Merge request: !27
Pipeline #52300 passed
4797 additions, 0 deletions
#!/usr/bin/env python
# coding=utf-8
def _maker(protocol):

    from ....data.transforms import Resize
    from ....data.JSRT import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((512, 512))])
#!/usr/bin/env python
# coding=utf-8
"""Japanese Society of Radiological Technology dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.JSRT` for dataset details
"""
from bob.ip.binseg.configs.datasets.JSRT import _maker
dataset = _maker("default")
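# Illustrative usage (a sketch): the config's ``dataset`` behaves like a
# dictionary of subsets, which the cross-evaluation configs below rely on by
# indexing it with "train"/"validation"/"test".  Assuming the package is
# installed and the JSRT data directory is configured, one could inspect it
# like this:
#
#     from bob.ip.binseg.configs.datasets.JSRT.default import dataset
#     for name in ("train", "validation", "test"):
#         print(name, len(dataset[name]))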
#!/usr/bin/env python
# coding=utf-8
"""JSRT cross-evaluation dataset
"""
from bob.ip.binseg.configs.datasets.JSRT.default import (
    dataset as _jsrt,
)
from bob.ip.binseg.configs.datasets.MC.default import dataset as _mc
from bob.ip.binseg.configs.datasets.Shenzhen.default import (
    dataset as _shenzhen,
)

dataset = {
    "train": _jsrt["train"],
    "validation": _jsrt["validation"],
    "test": _jsrt["test"],
    "MC (train)": _mc["train"],
    "MC (validation)": _mc["validation"],
    "MC (test)": _mc["test"],
    "Shenzhen (train)": _shenzhen["train"],
    "Shenzhen (validation)": _shenzhen["validation"],
    "Shenzhen (test)": _shenzhen["test"],
}
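# Illustrative usage (a sketch): the extra keys make it easy to score a model
# trained on JSRT against the other two databases, e.g.:
#
#     for name, subset in dataset.items():
#         if not name.startswith(("MC", "Shenzhen")):
#             continue  # keep only the cross-database splits
#         print(f"cross-evaluating on {name}: {len(subset)} samples")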
#!/usr/bin/env python
# coding=utf-8
def _maker(protocol):

    from ....data.transforms import Resize
    from ....data.MC import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((512, 512))])
#!/usr/bin/env python
# coding=utf-8
"""Montgomery County dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.MC` for dataset details
"""
from bob.ip.binseg.configs.datasets.MC import _maker
dataset = _maker("default")
#!/usr/bin/env python
# coding=utf-8
"""MC cross-evaluation dataset
"""
from bob.ip.binseg.configs.datasets.MC.default import (
    dataset as _mc,
)
from bob.ip.binseg.configs.datasets.JSRT.default import dataset as _jsrt
from bob.ip.binseg.configs.datasets.Shenzhen.default import (
    dataset as _shenzhen,
)

dataset = {
    "train": _mc["train"],
    "validation": _mc["validation"],
    "test": _mc["test"],
    "JSRT (train)": _jsrt["train"],
    "JSRT (validation)": _jsrt["validation"],
    "JSRT (test)": _jsrt["test"],
    "Shenzhen (train)": _shenzhen["train"],
    "Shenzhen (validation)": _shenzhen["validation"],
    "Shenzhen (test)": _shenzhen["test"],
}
#!/usr/bin/env python
# coding=utf-8
def _maker(protocol):

    from ....data.transforms import Resize
    from ....data.Shenzhen import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((512, 512))])


def _maker_256(protocol):

    from ....data.transforms import Resize
    from ....data.Shenzhen import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((256, 256))])
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.Shenzhen` for dataset details
"""
from bob.ip.binseg.configs.datasets.Shenzhen import _maker
dataset = _maker("default")
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.Shenzhen` for dataset details
"""
from bob.ip.binseg.configs.datasets.Shenzhen import _maker_256
dataset = _maker_256("default")
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen cross-evaluation dataset
"""
from bob.ip.binseg.configs.datasets.Shenzhen.default import (
    dataset as _shenzhen,
)
from bob.ip.binseg.configs.datasets.JSRT.default import dataset as _jsrt
from bob.ip.binseg.configs.datasets.MC.default import (
    dataset as _mc,
)

dataset = {
    "train": _shenzhen["train"],
    "validation": _shenzhen["validation"],
    "test": _shenzhen["test"],
    "MC (train)": _mc["train"],
    "MC (validation)": _mc["validation"],
    "MC (test)": _mc["test"],
    "JSRT (train)": _jsrt["train"],
    "JSRT (validation)": _jsrt["validation"],
    "JSRT (test)": _jsrt["test"],
}
#!/usr/bin/env python
# coding=utf-8
"""Japanese Society of Radiological Technology dataset for Lung Segmentation
The database includes 154 nodule and 93 non-nodule images. It contains a total
of 247 resolution of 2048 x 2048 One set of ground-truth lung annotations is
available.
* Reference: [SHIRAISHI-2000]_
* Original resolution (height x width): 2048 x 2048
* Configuration resolution: 512 x 512 (after rescaling)
* Split reference: [GAÁL-2020]_
* Protocol ``default``:
* Training samples: 172 (including labels)
* Validation samples: 25 (including labels)
* Test samples: 50 (including labels)
"""
import os
import pkg_resources
import numpy as np
import bob.extension
from PIL import Image
from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, make_delayed
_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.JSRT.datadir", os.path.realpath(os.curdir)
)
def _raw_data_loader(sample):
    # the left and right lung masks are merged into a single binary label
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=Image.fromarray(
            np.ma.mask_or(
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_l"]))),
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_r"]))),
            )
        ),
    )
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    return make_delayed(sample, _raw_data_loader)
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label_l", "label_r"),
    loader=_loader,
)
"""Japanese Society of Radiological Technology dataset object"""
#!/usr/bin/env python
# coding=utf-8
"""Montgomery County dataset for Lung Segmentation
The database includes 58 cases with manifestation of tuberculosis, and 80
normal cases. It contains a total of 138 resolution of 4020 x 4892, or
4892 x 4020. One set of ground-truth lung annotations is available.
* Reference: [JAEGER-2014]_
* Original resolution (height x width): 4020 x 4892, or 4892 x 4020
* Configuration resolution: 512 x 512 (after rescaling)
* Split reference: [GAÁL-2020]_
* Protocol ``default``:
* Training samples: 96 (including labels)
* Validation samples: 14 (including labels)
* Test samples: 28 (including labels)
"""
import os
import pkg_resources
import numpy as np
import bob.extension
from PIL import Image
from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, make_delayed
_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.MC.datadir", os.path.realpath(os.curdir)
)
def _raw_data_loader(sample):
    # the left and right lung masks are merged into a single binary label
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=Image.fromarray(
            np.ma.mask_or(
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_l"]))),
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_r"]))),
            )
        ),
    )
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    return make_delayed(sample, _raw_data_loader)
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label_l", "label_r"),
    loader=_loader,
)
"""Montgomery County dataset object"""
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen No.3 People’s Hospital dataset for Lung Segmentation
The database includes 336 cases with manifestation of tuberculosis, and 326
normal cases. It contains a total of 662 images. Image size varies for each
X-ray. It is approximately 3K x 3K. One set of ground-truth lung annotations is
available for 566 of the 662 images.
* Reference: [JAEGER-2014]_
* Original resolution (height x width): Approximately 3K x 3K (varies)
* Configuration resolution: 512 x 512 (after rescaling)
* Split reference: [GAÁL-2020]_
* Protocol ``default``:
* Training samples: 396 (including labels)
* Validation samples: 56 (including labels)
* Test samples: 114 (including labels)
"""
import os
import pkg_resources
import numpy as np
import bob.extension
from PIL import Image
from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, make_delayed
_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.Shenzhen.datadir", os.path.realpath(os.curdir)
)
def _raw_data_loader(sample):
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_1(os.path.join(_root_path, sample["label"])),
    )
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    return make_delayed(sample, _raw_data_loader)
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label"),
    loader=_loader,
)
"""Shenzhen dataset object"""
#!/usr/bin/env python
# coding=utf-8
"""Tests for Japanese Society of Radiological Technology"""
import os
import numpy
import pytest
from ..data.JSRT import dataset
from .utils import count_bw
def test_protocol_consistency():

    subset = dataset.subsets("default")
    assert len(subset) == 3

    assert "train" in subset
    assert len(subset["train"]) == 172
    for s in subset["train"]:
        assert s.key.startswith("JSRT")

    assert "validation" in subset
    assert len(subset["validation"]) == 25
    for s in subset["validation"]:
        assert s.key.startswith("JSRT")

    assert "test" in subset
    assert len(subset["test"]) == 50
    for s in subset["test"]:
        assert s.key.startswith("JSRT")
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.JSRT.datadir')
def test_loading():

    image_size = (2048, 2048)
    label_size = (1024, 1024)

    def _check_sample(s, bw_threshold_label):

        data = s.data
        assert isinstance(data, dict)
        assert len(data) == 2

        assert "data" in data
        assert data["data"].size == image_size
        assert data["data"].mode == "RGB"

        assert "label" in data
        assert data["label"].size == label_size
        assert data["label"].mode == "1"

        b, w = count_bw(data["label"])
        assert (b + w) == numpy.prod(label_size), (
            f"Counts of black + white ({b}+{w}) do not add up to total "
            f"image size ({numpy.prod(label_size)}) at '{s.key}':label"
        )
        assert (w / b) < bw_threshold_label, (
            f"The proportion between black and white pixels "
            f"({w}/{b}={w/b:.3f}) is larger than the allowed threshold "
            f"of {bw_threshold_label} at '{s.key}':label - this could "
            f"indicate a loading problem!"
        )

        return w / b

    limit = None  # use this to limit testing to the first images only
    subset = dataset.subsets("default")
    proportions = [_check_sample(s, 0.85) for s in subset["train"][:limit]]
    proportions = [_check_sample(s, 0.85) for s in subset["validation"][:limit]]
    proportions = [_check_sample(s, 0.85) for s in subset["test"][:limit]]
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.JSRT.datadir')
def test_check():
    assert dataset.check() == 0
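# Hypothetical re-implementation of the ``count_bw`` helper used above (the
# actual helper lives in ``.utils``); a minimal sketch consistent with how the
# tests call it on mode-"1" label images:
#
#     import numpy
#     def count_bw(img):
#         """Returns (black, white) pixel counts of a mode-"1" PIL image"""
#         arr = numpy.asarray(img)
#         w = int(arr.sum())
#         return arr.size - w, w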
#!/usr/bin/env python
# coding=utf-8
"""Tests for Montgomery County"""
import os
import numpy
import pytest
from ..data.MC import dataset
from .utils import count_bw
def test_protocol_consistency():

    subset = dataset.subsets("default")
    assert len(subset) == 3

    assert "train" in subset
    assert len(subset["train"]) == 96
    for s in subset["train"]:
        assert s.key.startswith("MontgomerySet")

    assert "validation" in subset
    assert len(subset["validation"]) == 14
    for s in subset["validation"]:
        assert s.key.startswith("MontgomerySet")

    assert "test" in subset
    assert len(subset["test"]) == 28
    for s in subset["test"]:
        assert s.key.startswith("MontgomerySet")
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.MC.datadir')
def test_loading():

    image_size_1 = (4892, 4020)
    image_size_2 = (4020, 4892)

    def _check_sample(s, bw_threshold_label):

        data = s.data
        assert isinstance(data, dict)
        assert len(data) == 2

        assert "data" in data
        assert (data["data"].size == image_size_1
                or data["data"].size == image_size_2)
        assert data["data"].mode == "RGB"

        assert "label" in data
        assert (data["label"].size == image_size_1
                or data["label"].size == image_size_2)
        assert data["label"].mode == "1"

        b, w = count_bw(data["label"])
        # both orientations contain the same number of pixels, so a single
        # product works for either image size
        assert (b + w) == numpy.prod(image_size_1), (
            f"Counts of black + white ({b}+{w}) do not add up to total "
            f"image size ({numpy.prod(image_size_1)}) at '{s.key}':label"
        )
        assert (w / b) < bw_threshold_label, (
            f"The proportion between black and white pixels "
            f"({w}/{b}={w/b:.3f}) is larger than the allowed threshold "
            f"of {bw_threshold_label} at '{s.key}':label - this could "
            f"indicate a loading problem!"
        )

        return w / b

    limit = None  # use this to limit testing to the first images only
    subset = dataset.subsets("default")
    proportions = [_check_sample(s, 0.67) for s in subset["train"][:limit]]
    proportions = [_check_sample(s, 0.67) for s in subset["validation"][:limit]]
    proportions = [_check_sample(s, 0.67) for s in subset["test"][:limit]]
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.MC.datadir')
def test_check():
    assert dataset.check() == 0
#!/usr/bin/env python
# coding=utf-8
"""Tests for Shenzhen"""
import os
import numpy
import pytest
from ..data.Shenzhen import dataset
from .utils import count_bw
def test_protocol_consistency():

    subset = dataset.subsets("default")
    assert len(subset) == 3

    assert "train" in subset
    assert len(subset["train"]) == 396
    for s in subset["train"]:
        assert s.key.startswith("ChinaSet_AllFiles")

    assert "validation" in subset
    assert len(subset["validation"]) == 56
    for s in subset["validation"]:
        assert s.key.startswith("ChinaSet_AllFiles")

    assert "test" in subset
    assert len(subset["test"]) == 114
    for s in subset["test"]:
        assert s.key.startswith("ChinaSet_AllFiles")
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.Shenzhen.datadir')
def test_loading():

    min_image_size = (1130, 948)
    max_image_size = (3001, 3001)

    def _check_sample(s, bw_threshold_label):

        data = s.data
        assert isinstance(data, dict)
        assert len(data) == 2

        assert "data" in data
        assert data["data"].mode == "RGB"

        assert "label" in data
        assert data["label"].mode == "1"

        b, w = count_bw(data["label"])
        assert (b + w) >= numpy.prod(min_image_size), (
            f"Counts of black + white ({b}+{w}) are lower than the smallest "
            f"total image size ({numpy.prod(min_image_size)}) at '{s.key}':label"
        )
        assert (b + w) <= numpy.prod(max_image_size), (
            f"Counts of black + white ({b}+{w}) are higher than the largest "
            f"total image size ({numpy.prod(max_image_size)}) at '{s.key}':label"
        )
        assert (w / b) < bw_threshold_label, (
            f"The proportion between black and white pixels "
            f"({w}/{b}={w/b:.3f}) is larger than the allowed threshold "
            f"of {bw_threshold_label} at '{s.key}':label - this could "
            f"indicate a loading problem!"
        )

        return w / b

    limit = None  # use this to limit testing to the first images only
    subset = dataset.subsets("default")
    proportions = [_check_sample(s, 0.77) for s in subset["train"][:limit]]
    proportions = [_check_sample(s, 0.77) for s in subset["validation"][:limit]]
    proportions = [_check_sample(s, 0.77) for s in subset["test"][:limit]]
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.Shenzhen.datadir')
def test_check():
    assert dataset.check() == 0