Commit 05644f0c authored by Matheus Armani Renzo's avatar Matheus Armani Renzo

Adding config/data/test files for Montgomery County, JSRT, Shenzhen datasets

parent a290e9a2
Merge request: !27
Pipeline #52300 passed
4797 additions, 0 deletions
#!/usr/bin/env python
# coding=utf-8
def _maker(protocol):

    from ....data.transforms import Resize
    from ....data.JSRT import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((512, 512))])
#!/usr/bin/env python
# coding=utf-8
"""Japanese Society of Radiological Technology dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.JSRT` for dataset details
"""
from bob.ip.binseg.configs.datasets.JSRT import _maker
dataset = _maker("default")
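# Illustrative usage (a sketch): the config's ``dataset`` behaves like a
# dictionary of subsets, which the cross-evaluation configs below rely on by
# indexing it with "train"/"validation"/"test".  Assuming the package is
# installed and the JSRT data directory is configured, one could inspect it
# like this:
#
#     from bob.ip.binseg.configs.datasets.JSRT.default import dataset
#     for name in ("train", "validation", "test"):
#         print(name, len(dataset[name]))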
#!/usr/bin/env python
# coding=utf-8
"""JSRT cross-evaluation dataset
"""
from bob.ip.binseg.configs.datasets.JSRT.default import (
    dataset as _jsrt,
)
from bob.ip.binseg.configs.datasets.MC.default import dataset as _mc
from bob.ip.binseg.configs.datasets.Shenzhen.default import (
    dataset as _shenzhen,
)

dataset = {
    "train": _jsrt["train"],
    "validation": _jsrt["validation"],
    "test": _jsrt["test"],
    "MC (train)": _mc["train"],
    "MC (validation)": _mc["validation"],
    "MC (test)": _mc["test"],
    "Shenzhen (train)": _shenzhen["train"],
    "Shenzhen (validation)": _shenzhen["validation"],
    "Shenzhen (test)": _shenzhen["test"],
}
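# Illustrative usage (a sketch): the extra keys make it easy to score a model
# trained on JSRT against the other two databases, e.g.:
#
#     for name, subset in dataset.items():
#         if not name.startswith(("MC", "Shenzhen")):
#             continue  # keep only the cross-database splits
#         print(f"cross-evaluating on {name}: {len(subset)} samples")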
#!/usr/bin/env python
# coding=utf-8
def _maker(protocol):

    from ....data.transforms import Resize
    from ....data.MC import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((512, 512))])
#!/usr/bin/env python
# coding=utf-8
"""Montgomery County dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.MC` for dataset details
"""
from bob.ip.binseg.configs.datasets.MC import _maker
dataset = _maker("default")
#!/usr/bin/env python
# coding=utf-8
"""MC cross-evaluation dataset
"""
from bob.ip.binseg.configs.datasets.MC.default import (
    dataset as _mc,
)
from bob.ip.binseg.configs.datasets.JSRT.default import dataset as _jsrt
from bob.ip.binseg.configs.datasets.Shenzhen.default import (
    dataset as _shenzhen,
)

dataset = {
    "train": _mc["train"],
    "validation": _mc["validation"],
    "test": _mc["test"],
    "JSRT (train)": _jsrt["train"],
    "JSRT (validation)": _jsrt["validation"],
    "JSRT (test)": _jsrt["test"],
    "Shenzhen (train)": _shenzhen["train"],
    "Shenzhen (validation)": _shenzhen["validation"],
    "Shenzhen (test)": _shenzhen["test"],
}
#!/usr/bin/env python
# coding=utf-8
def _maker(protocol):

    from ....data.transforms import Resize
    from ....data.Shenzhen import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((512, 512))])


def _maker_256(protocol):

    from ....data.transforms import Resize
    from ....data.Shenzhen import dataset as raw
    from .. import make_dataset as mk

    return mk(raw.subsets(protocol), [Resize((256, 256))])
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.Shenzhen` for dataset details
"""
from bob.ip.binseg.configs.datasets.Shenzhen import _maker
dataset = _maker("default")
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen dataset for Lung Segmentation (default protocol)
* Split reference: [GAÁL-2020]_
* Configuration resolution: 512 x 512
* See :py:mod:`bob.ip.binseg.data.Shenzhen` for dataset details
"""
from bob.ip.binseg.configs.datasets.Shenzhen import _maker_256
dataset = _maker_256("default")
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen cross-evaluation dataset
"""
from bob.ip.binseg.configs.datasets.Shenzhen.default import (
    dataset as _shenzhen,
)
from bob.ip.binseg.configs.datasets.JSRT.default import dataset as _jsrt
from bob.ip.binseg.configs.datasets.MC.default import (
    dataset as _mc,
)

dataset = {
    "train": _shenzhen["train"],
    "validation": _shenzhen["validation"],
    "test": _shenzhen["test"],
    "MC (train)": _mc["train"],
    "MC (validation)": _mc["validation"],
    "MC (test)": _mc["test"],
    "JSRT (train)": _jsrt["train"],
    "JSRT (validation)": _jsrt["validation"],
    "JSRT (test)": _jsrt["test"],
}
#!/usr/bin/env python
# coding=utf-8
"""Japanese Society of Radiological Technology dataset for Lung Segmentation
The database includes 154 nodule and 93 non-nodule images. It contains a total
of 247 resolution of 2048 x 2048 One set of ground-truth lung annotations is
available.
* Reference: [SHIRAISHI-2000]_
* Original resolution (height x width): 2048 x 2048
* Configuration resolution: 512 x 512 (after rescaling)
* Split reference: [GAÁL-2020]_
* Protocol ``default``:
* Training samples: 172 (including labels)
* Validation samples: 25 (including labels)
* Test samples: 50 (including labels)
"""
import os
import pkg_resources
import numpy as np
import bob.extension
from PIL import Image
from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, make_delayed
_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.JSRT.datadir", os.path.realpath(os.curdir)
)
def _raw_data_loader(sample):
    # the left and right lung masks are merged into a single binary label
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=Image.fromarray(
            np.ma.mask_or(
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_l"]))),
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_r"]))),
            )
        ),
    )
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    return make_delayed(sample, _raw_data_loader)
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label_l", "label_r"),
    loader=_loader,
)
"""Japanese Society of Radiological Technology dataset object"""
#!/usr/bin/env python
# coding=utf-8
"""Montgomery County dataset for Lung Segmentation
The database includes 58 cases with manifestation of tuberculosis, and 80
normal cases. It contains a total of 138 resolution of 4020 x 4892, or
4892 x 4020. One set of ground-truth lung annotations is available.
* Reference: [JAEGER-2014]_
* Original resolution (height x width): 4020 x 4892, or 4892 x 4020
* Configuration resolution: 512 x 512 (after rescaling)
* Split reference: [GAÁL-2020]_
* Protocol ``default``:
* Training samples: 96 (including labels)
* Validation samples: 14 (including labels)
* Test samples: 28 (including labels)
"""
import os
import pkg_resources
import numpy as np
import bob.extension
from PIL import Image
from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, make_delayed
_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.MC.datadir", os.path.realpath(os.curdir)
)
def _raw_data_loader(sample):
    # the left and right lung masks are merged into a single binary label
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=Image.fromarray(
            np.ma.mask_or(
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_l"]))),
                np.asarray(load_pil_1(os.path.join(_root_path, sample["label_r"]))),
            )
        ),
    )
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    return make_delayed(sample, _raw_data_loader)
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label_l", "label_r"),
    loader=_loader,
)
"""Montgomery County dataset object"""
#!/usr/bin/env python
# coding=utf-8
"""Shenzhen No.3 People’s Hospital dataset for Lung Segmentation
The database includes 336 cases with manifestation of tuberculosis, and 326
normal cases. It contains a total of 662 images. Image size varies for each
X-ray. It is approximately 3K x 3K. One set of ground-truth lung annotations is
available for 566 of the 662 images.
* Reference: [JAEGER-2014]_
* Original resolution (height x width): Approximately 3K x 3K (varies)
* Configuration resolution: 512 x 512 (after rescaling)
* Split reference: [GAÁL-2020]_
* Protocol ``default``:
* Training samples: 396 (including labels)
* Validation samples: 56 (including labels)
* Test samples: 114 (including labels)
"""
import os
import pkg_resources
import numpy as np
import bob.extension
from PIL import Image
from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, make_delayed
_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.Shenzhen.datadir", os.path.realpath(os.curdir)
)
def _raw_data_loader(sample):
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_1(os.path.join(_root_path, sample["label"])),
    )
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    return make_delayed(sample, _raw_data_loader)
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label"),
    loader=_loader,
)
"""Shenzhen dataset object"""
#!/usr/bin/env python
# coding=utf-8
"""Tests for Japanese Society of Radiological Technology"""
import os
import numpy
import pytest
from ..data.JSRT import dataset
from .utils import count_bw
def test_protocol_consistency():

    subset = dataset.subsets("default")
    assert len(subset) == 3

    assert "train" in subset
    assert len(subset["train"]) == 172
    for s in subset["train"]:
        assert s.key.startswith("JSRT")

    assert "validation" in subset
    assert len(subset["validation"]) == 25
    for s in subset["validation"]:
        assert s.key.startswith("JSRT")

    assert "test" in subset
    assert len(subset["test"]) == 50
    for s in subset["test"]:
        assert s.key.startswith("JSRT")
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.JSRT.datadir')
def test_loading():

    image_size = (2048, 2048)
    label_size = (1024, 1024)

    def _check_sample(s, bw_threshold_label):

        data = s.data
        assert isinstance(data, dict)
        assert len(data) == 2

        assert "data" in data
        assert data["data"].size == image_size
        assert data["data"].mode == "RGB"

        assert "label" in data
        assert data["label"].size == label_size
        assert data["label"].mode == "1"

        b, w = count_bw(data["label"])
        assert (b + w) == numpy.prod(label_size), (
            f"Counts of black + white ({b}+{w}) do not add up to total "
            f"image size ({numpy.prod(label_size)}) at '{s.key}':label"
        )
        assert (w / b) < bw_threshold_label, (
            f"The proportion between black and white pixels "
            f"({w}/{b}={w/b:.3f}) is larger than the allowed threshold "
            f"of {bw_threshold_label} at '{s.key}':label - this could "
            f"indicate a loading problem!"
        )

        return w / b

    limit = None  # use this to limit testing to the first images only
    subset = dataset.subsets("default")
    proportions = [_check_sample(s, 0.85) for s in subset["train"][:limit]]
    proportions = [_check_sample(s, 0.85) for s in subset["validation"][:limit]]
    proportions = [_check_sample(s, 0.85) for s in subset["test"][:limit]]
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.JSRT.datadir')
def test_check():
    assert dataset.check() == 0
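# Hypothetical re-implementation of the ``count_bw`` helper used above (the
# actual helper lives in ``.utils``); a minimal sketch consistent with how the
# tests call it on mode-"1" label images:
#
#     import numpy
#     def count_bw(img):
#         """Returns (black, white) pixel counts of a mode-"1" PIL image"""
#         arr = numpy.asarray(img)
#         w = int(arr.sum())
#         return arr.size - w, w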
#!/usr/bin/env python
# coding=utf-8
"""Tests for Montgomery County"""
import os
import numpy
import pytest
from ..data.MC import dataset
from .utils import count_bw
def test_protocol_consistency():

    subset = dataset.subsets("default")
    assert len(subset) == 3

    assert "train" in subset
    assert len(subset["train"]) == 96
    for s in subset["train"]:
        assert s.key.startswith("MontgomerySet")

    assert "validation" in subset
    assert len(subset["validation"]) == 14
    for s in subset["validation"]:
        assert s.key.startswith("MontgomerySet")

    assert "test" in subset
    assert len(subset["test"]) == 28
    for s in subset["test"]:
        assert s.key.startswith("MontgomerySet")
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.MC.datadir')
def test_loading():

    image_size_1 = (4892, 4020)
    image_size_2 = (4020, 4892)

    def _check_sample(s, bw_threshold_label):

        data = s.data
        assert isinstance(data, dict)
        assert len(data) == 2

        assert "data" in data
        assert (data["data"].size == image_size_1
                or data["data"].size == image_size_2)
        assert data["data"].mode == "RGB"

        assert "label" in data
        assert (data["label"].size == image_size_1
                or data["label"].size == image_size_2)
        assert data["label"].mode == "1"

        b, w = count_bw(data["label"])
        # both orientations contain the same number of pixels, so a single
        # product works for either image size
        assert (b + w) == numpy.prod(image_size_1), (
            f"Counts of black + white ({b}+{w}) do not add up to total "
            f"image size ({numpy.prod(image_size_1)}) at '{s.key}':label"
        )
        assert (w / b) < bw_threshold_label, (
            f"The proportion between black and white pixels "
            f"({w}/{b}={w/b:.3f}) is larger than the allowed threshold "
            f"of {bw_threshold_label} at '{s.key}':label - this could "
            f"indicate a loading problem!"
        )

        return w / b

    limit = None  # use this to limit testing to the first images only
    subset = dataset.subsets("default")
    proportions = [_check_sample(s, 0.67) for s in subset["train"][:limit]]
    proportions = [_check_sample(s, 0.67) for s in subset["validation"][:limit]]
    proportions = [_check_sample(s, 0.67) for s in subset["test"][:limit]]
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.MC.datadir')
def test_check():
    assert dataset.check() == 0
#!/usr/bin/env python
# coding=utf-8
"""Tests for Shenzhen"""
import os
import numpy
import pytest
from ..data.Shenzhen import dataset
from .utils import count_bw
def test_protocol_consistency():

    subset = dataset.subsets("default")
    assert len(subset) == 3

    assert "train" in subset
    assert len(subset["train"]) == 396
    for s in subset["train"]:
        assert s.key.startswith("ChinaSet_AllFiles")

    assert "validation" in subset
    assert len(subset["validation"]) == 56
    for s in subset["validation"]:
        assert s.key.startswith("ChinaSet_AllFiles")

    assert "test" in subset
    assert len(subset["test"]) == 114
    for s in subset["test"]:
        assert s.key.startswith("ChinaSet_AllFiles")
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.Shenzhen.datadir')
def test_loading():

    min_image_size = (1130, 948)
    max_image_size = (3001, 3001)

    def _check_sample(s, bw_threshold_label):

        data = s.data
        assert isinstance(data, dict)
        assert len(data) == 2

        assert "data" in data
        assert data["data"].mode == "RGB"

        assert "label" in data
        assert data["label"].mode == "1"

        b, w = count_bw(data["label"])
        assert (b + w) >= numpy.prod(min_image_size), (
            f"Counts of black + white ({b}+{w}) are lower than the smallest "
            f"total image size ({numpy.prod(min_image_size)}) at '{s.key}':label"
        )
        assert (b + w) <= numpy.prod(max_image_size), (
            f"Counts of black + white ({b}+{w}) are higher than the largest "
            f"total image size ({numpy.prod(max_image_size)}) at '{s.key}':label"
        )
        assert (w / b) < bw_threshold_label, (
            f"The proportion between black and white pixels "
            f"({w}/{b}={w/b:.3f}) is larger than the allowed threshold "
            f"of {bw_threshold_label} at '{s.key}':label - this could "
            f"indicate a loading problem!"
        )

        return w / b

    limit = None  # use this to limit testing to the first images only
    subset = dataset.subsets("default")
    proportions = [_check_sample(s, 0.77) for s in subset["train"][:limit]]
    proportions = [_check_sample(s, 0.77) for s in subset["validation"][:limit]]
    proportions = [_check_sample(s, 0.77) for s in subset["test"][:limit]]
@pytest.mark.skip_if_rc_var_not_set('bob.ip.binseg.Shenzhen.datadir')
def test_check():
    assert dataset.check() == 0