diff --git a/bob/ip/binseg/configs/datasets/csv.py b/bob/ip/binseg/configs/datasets/csv.py index 9da9bd1d2140c9061f6b9db820cad2578b0e4762..14c394de88a321c179cf51309a4884fccddaf000 100644 --- a/bob/ip/binseg/configs/datasets/csv.py +++ b/bob/ip/binseg/configs/datasets/csv.py @@ -4,15 +4,12 @@ """Example CSV-based filelist dataset In case you have your own dataset that is organized on your filesystem, this -configuration shows an example setup so you can feed such files and -ground-truth data to train one of the available network models or to evaluate +configuration shows an example setup so you can feed such files (and +ground-truth data) to train one of the available network models or to evaluate it. You must write CSV based file (e.g. using comma as separator) that describes -the image and ground-truth locations for each image pair on your dataset. -Relative paths are considered with respect to the location of the CSV file -itself by default, also pass the ``root_path`` parameter to the -:py:class:`bob.ip.binseg.data.csvdataset.CSVDataset` object constructor. So, +the image and ground-truth locations for each image pair on your dataset. So, for example, if you have a structure like this: .. code-block:: text @@ -26,8 +23,7 @@ for example, if you have a structure like this: ├── ... └── gt_n.png -Then create a file in the same level of ``images`` and ``ground-truth`` with -the following contents: +Then create a file with the following contents: .. code-block:: text @@ -38,13 +34,13 @@ the following contents: To create a dataset without ground-truth (e.g., for prediction purposes), then omit the second column on the CSV file. -Use the path leading to the CSV file and replace ``<path.csv>`` on the example -code for this configuration, that you must copy locally to make changes: +Use the path leading to the CSV file and carefully read the comments in this +configuration. **Copy it locally to make changes**: .. code-block:: sh $ bob binseg config copy csv-dataset-example mydataset.py - # edit mydataset.py as explained here + # edit mydataset.py as explained here, follow the comments Fine-tune the transformations for your particular purpose: @@ -58,20 +54,79 @@ certain restrictions (input dimensions, image centering, etc.). Check the configuration that was used to train models and try to match it as well as possible. -See: +Finally, you must create a connector that will act as a "dataset" for pytorch. +The connector make a list of samples, returned by your raw dataset, look like +something our pytorch setup can digest (tuples of data with a certain +organisation). -* :py:class:`bob.ip.binseg.data.csvdataset.CSVDataset` for operational details. -* :py:class:`bob.ip.binseg.data.folderdataset.FolderDataset` for an alternative - implementation of an easier to generate **prediction** dataset. +More information: + +* :py:class:`bob.ip.binseg.data.dataset.CSVDataset` for operational details. +* :py:class:`bob.ip.binseg.data.dataset.JSONDataset` for an alternative for + multi-protocol datasets (all of our supported raw datasets are implemented + using this) +* :py:class:`bob.ip.binseg.data.utils.SampleList2TorchDataset` for extra + information on the sample list to pytorch connector """ -# add your transforms below - these are just examples +# First, define how to access and load the raw data. Our package provides some +# stock loaders we use for other datasets. You may have a look at the +# documentation of that module for details. 
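+#
+# For reference, these stock loaders are the thin PIL wrappers defined in
+# ``bob.ip.binseg.data.loader``.  A rough, illustrative sketch of what they do,
+# using made-up file names:
+#
+#     import PIL.Image
+#     image = PIL.Image.open("images/image_1.png").convert(mode="RGB")
+#     label = PIL.Image.open("ground-truth/gt_1.png").convert(mode="1", dither=None)
+#
+# ``load_pil_rgb`` and ``load_pil_1`` perform this kind of conversion, while
+# ``data_path_keymaker`` derives a unique key from the "data" path of a sample.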
+from bob.ip.binseg.data.loaders import ( + load_pil_rgb, + load_pil_1, + data_path_keymaker, +) + +# How we use the loaders - "sample" is a dictionary where keys are defined +# below and map to the columns of the CSV files you input. +def _loader(context, sample): + # "context" is ignored in this case - database is homogeneous + # it is a dictionary that passes e.g., the name of the subset + # being loaded, so you can take contextual decisions on the loading + + # Using the path leading to the various data files stored in disk allows + # the CSV file to contain only relative paths and is, therefore, more + # compact. Of course, you can make those paths absolute and then simplify + # it here. + import os + root_path = "/path/where/raw/files/sit" + + return dict( + data=load_pil_rgb(os.path.join(root_path, sample["data"])), + label=load_pil_1(os.path.join(root_path, sample["label"])), + ) + +# This is just a class that puts everything together: the CSV file, how to load +# each sample defined in the dataset, names for the various columns of the CSV +# file and how to make unique keys for each sample (keymaker). Once created, +# this object can be called to generate sample lists. +from bob.ip.binseg.data.dataset import CSVDataset +raw_dataset = CSVDataset( + # path to the CSV file(s) - you may add as many subsets as you want, each + # with an unique name, you'll use later to generate sample lists + subsets=dict(data="<path/to/train.csv>"), + fieldnames=("data", "label"), #these are the column names + loader=_loader, + keymaker=data_path_keymaker, +) + +# Finally, we build a connector to passes our dataset to the pytorch framework +# so we can, for example, evaluate a trained pytorch model + +# Add/tune your transforms below - these are just examples compatible with a +# model that requires image inputs of 544 x 544 pixels. from bob.ip.binseg.data.transforms import CenterCrop -#from bob.ip.binseg.configs.datasets.utils import DATA_AUGMENTATION as _DA -_transforms = [ - CenterCrop((544, 544)), - ] # + _DA -from bob.ip.binseg.data.csvdataset import CSVDataset -#dataset = CSVDataset("<path.csv>", check_available=False, transforms=_transforms) +# from bob.ip.binseg.configs.datasets.utils import DATA_AUGMENTATION as _DA +_transforms = [ + CenterCrop((544, 544)), +] # + _DA + +# This class will simply trigger data loading and re-arrange the data so that +# data is fed in the right order to pytorch: (key, image[, label[, mask]]). +# This class also inherits from pytorch Dataset and respect its required API. +# See the documentation for details. 
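+#
+# Once built, the object below can be used like any other
+# :py:class:`torch.utils.data.Dataset`.  A minimal, illustrative sketch of how
+# it could be consumed (the batch size is arbitrary):
+#
+#     from torch.utils.data import DataLoader
+#     data_loader = DataLoader(dataset, batch_size=4, shuffle=False)
+#     for batch in data_loader:
+#         ...  # each batch carries (key, image, label) data, in this order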
+from bob.ip.binseg.data.utils import SampleList2TorchDataset +dataset = SampleList2TorchDataset(raw_dataset.subset("data"), _transforms) diff --git a/bob/ip/binseg/data/chasedb1/__init__.py b/bob/ip/binseg/data/chasedb1/__init__.py index 22286f791cbd660560b0a209a62bc889ee1544bb..0f91b3bd5f945196ed202a2115151dfda78e6e53 100644 --- a/bob/ip/binseg/data/chasedb1/__init__.py +++ b/bob/ip/binseg/data/chasedb1/__init__.py @@ -36,8 +36,8 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb, load_pil_1 +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker _protocols = [ pkg_resources.resource_filename(__name__, "default.json"), @@ -48,11 +48,12 @@ _root_path = bob.extension.rc.get('bob.ip.binseg.chasedb1.datadir', os.path.realpath(os.curdir)) def _loader(context, sample): - #"context" is ignore in this case - database is homogeneous + #"context" is ignored in this case - database is homogeneous return dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_1(sample["label"]), + data=load_pil_rgb(os.path.join(_root_path, sample["data"])), + label=load_pil_1(os.path.join(_root_path, sample["label"])), ) -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) +dataset = JSONDataset(protocols=_protocols, fieldnames=("data", "label"), + loader=_loader, keymaker=data_path_keymaker) """CHASE-DB1 dataset object""" diff --git a/bob/ip/binseg/data/csvdataset.py b/bob/ip/binseg/data/csvdataset.py deleted file mode 100644 index 43699faf9e41721573bbbc55649f5dd9ddcce8ae..0000000000000000000000000000000000000000 --- a/bob/ip/binseg/data/csvdataset.py +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -import os -import csv - -from PIL import Image - -from torch.utils.data import Dataset -import torch -import torchvision.transforms.functional as VF - -import bob.io.base - -from .transforms import Compose, ToTensor - -import logging - -logger = logging.getLogger(__name__) - - -class CSVDataset(Dataset): - """ - Generic filelist dataset - - To create a new dataset, you only need to provide a CSV formatted filelist - using any separator (e.g. comma, space, semi-colon) with the following - information: - - .. code-block:: text - - image[,label[,mask]] - - Where: - - * ``image``: absolute or relative path leading to original image - * ``label``: (optional) absolute or relative path with manual segmentation - information - * ``mask``: (optional) absolute or relative path with a mask that indicates - valid regions in the image where automatic segmentation should occur - - Relative paths are interpreted with respect to the location where the CSV - file is or to an optional ``root_path`` parameter, that can be provided. - - There are no requirements concerning image or ground-truth homogenity. - Anything that can be loaded by our image and data loaders is OK. Use - a non-white character as separator. Example - - .. code-block:: text - - image1.jpg,gt1.tif,mask1.png - image2.png,gt2.png,mask2.png - ... - - - Notice that all rows must have the same number of entries. - - .. important:: - - Images are converted to RGB after readout via PIL. Ground-truth data is - loaded using the same technique, but converted to mode ``1`` instead of - ``RGB``. If ground-truth data is encoded as an HDF5 file, we use - instead :py:func:`bob.io.base.load`, and then converted it to 32-bit - float data. - - To generate a dataset without ground-truth (e.g. 
for prediction tasks), - then omit the second and third columns. - - - Parameters - ---------- - path : str - Full path to the file containing the dataset description, in CSV - format as described above - - root_path : :py:class:`str`, Optional - Path to a common filesystem root where files with relative paths should - be sitting. If not set, then we use the absolute path leading to the - CSV file as ``root_path`` - - check_available : :py:class:`bool`, Optional - If set to ``True``, then checks if files in the file list are - available. Otherwise does not. - - transforms : :py:class:`list`, Optional - a list of transformations to be applied to **both** image and - ground-truth data. Notice that image changing transformations such as - :py:class:`.transforms.ColorJitter` are only applied to the image and - **not** to ground-truth. Also notice a last transform - (:py:class:`bob.ip.binseg.data.transforms.ToTensor`) is always applied. - - """ - - def __init__( - self, path, root_path=None, check_available=True, transforms=[] - ): - - self.root_path = root_path or os.path.dirname(path) - self.transform = Compose(transforms + [ToTensor()]) - - def _make_abs_path(root, s): - retval = [] - for p in s: - if not os.path.isabs(p): - retval.append(os.path.join(root, p)) - return retval - - with open(path, newline="") as f: - reader = csv.reader(f) - self.data = [_make_abs_path(self.root_path, k) for k in reader] - - # check if all files are readable, warn otherwise - if check_available: - errors = 0 - for s in self.data: - for p in s: - if not os.path.exists(p): - errors += 1 - logger.error(f"Cannot find {p}") - assert errors == 0, ( - f"There {errors} files which cannot be " - f"found on your filelist ({path}) dataset" - ) - - # check all data entries have the same size - assert all(len(k) == len(self.data[0]) for k in self.data), ( - f"There is an inconsistence on your dataset - not all " - f"entries have length=={len(self.data[0])}" - ) - - def __len__(self): - """ - - Returns - ------- - - length : int - size of the dataset - """ - - return len(self.data) - - def __getitem__(self, index): - """ - - Parameters - ---------- - index : int - - Returns - ------- - sample : list - ``[name, img, gt, mask]``, ``[name, img, gt]`` or ``[name, img]`` - depending on whether this dataset has or not ground-truth - annotations and masks. The value of ``name`` is relative to - ``root_path``, in cases it starts with ``root_path``. 
- """ - - sample_paths = self.data[index] - - img_path = sample_paths[0] - meta_data = sample_paths[1:] - - # images are converted to RGB mode automatically - sample = [Image.open(img_path).convert(mode="RGB")] - - # ground-truth annotations and masks are treated the same - for path in meta_data: - if path is not None: - if path.endswith(".hdf5"): - data = bob.io.base.load(str(path)).astype("float32") - # a bit hackish, but will get what we need - data = VF.to_pil_image(torch.from_numpy(data)) - else: - data = Image.open(path) - sample += [data.convert(mode="1", dither=None)] - - if self.transform: - sample = self.transform(*sample) - - # make paths relative if necessary - stem = img_path - if stem.startswith(self.root_path): - stem = os.path.relpath(stem, self.root_path) - elif stem.startswith(os.pathsep): - stem = stem[len(os.pathsep) :] - - return [stem] + sample diff --git a/bob/ip/binseg/data/dataset.py b/bob/ip/binseg/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..687a7d0bab474f774ffe2a6bfca6d703f0acf7da --- /dev/null +++ b/bob/ip/binseg/data/dataset.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python +# coding=utf-8 + +import os +import copy +import json +import functools + +import logging + +logger = logging.getLogger(__name__) + +from .sample import DelayedSample + + +class JSONDataset: + """ + Generic multi-protocol/subset filelist dataset that yields samples + + To create a new dataset, you need to provide one or more JSON formatted + filelists (one per protocol) with the following contents: + + .. code-block:: json + + { + "subset1": [ + [ + "value1", + "value2", + "value3" + ], + [ + "value4", + "value5", + "value6" + ] + ], + "subset2": [ + ] + } + + Your dataset many contain any number of subsets, but all sample entries + must contain the same number of fields. + + + Parameters + ---------- + + protocols : list, dict + Paths to one or more JSON formatted files containing the various + protocols to be recognized by this dataset, or a dictionary, mapping + protocol names to paths of JSON files. Internally, we save a + dictionary where keys default to the basename of paths. + + fieldnames : list, tuple + An iterable over the field names (strings) to assign to each entry in + the JSON file. It should have as many items as fields in each entry of + the JSON file. + + loader : object + A function that receives as input, a context dictionary (with a + "protocol" and "subset" keys indicating which protocol and subset are + being served), and a dictionary with ``{key: path}`` entries, and + returns a dictionary with the loaded data. + + keymaker : object + A function that receives as input the same input from the ``loader``, + but outputs a single string that uniquely identifies a sample within + a given protocol. It is typically the path, without extension, of one + of the file entries for the sample, but you can tune it as you like. 
+ + """ + + def __init__(self, protocols, fieldnames, loader, keymaker): + + if isinstance(protocols, dict): + self.protocols = dict( + (k, os.path.realpath(v)) for k, v in protocols.items() + ) + else: + self.protocols = dict( + (os.path.splitext(os.path.basename(k))[0], os.path.realpath(k)) + for k in protocols + ) + self.fieldnames = fieldnames + self.loader = loader + self.keymaker = keymaker + + def check(self): + """For each protocol, check if all data can be correctly accessed + + Returns + ------- + + errors : int + Number of errors found + + """ + + logger.info(f"Checking dataset...") + errors = 0 + for proto in self.protocols: + logger.info(f"Checking protocol '{proto}'...") + for name, samples in self.subsets(proto).items(): + logger.info(f"Checking subset '{name}'...") + for sample in samples: + try: + sample.data # triggers loading + logger.info(f"{sample.key}: OK") + except Exception as e: + logger.error(f"{sample.key}: {e}") + errors += 1 + return errors + + def _make_delayed(self, pos, sample, context): + """Checks consistence and builds a delayed loading sample + """ + assert len(sample) == len(self.fieldnames), ( + f"Entry {k} in subset {context['subset']} of protocol " + f"{context['protocol']} has {len(sample)} entries instead of " + f"{len(self.fieldnames)} (expected). Fix file " + f"{self.protocols[context['protocol']]}" + ) + item = dict(zip(self.fieldnames, sample)) + return DelayedSample( + functools.partial(self.loader, context, item), + key=self.keymaker(context, item), + ) + + def subsets(self, protocol): + """Returns all subsets in a protocol + + This method will load JSON information for a given protocol and return + all subsets of the given protocol after converting each entry into a + :py:class:`bob.ip.binseg.data.sample.DelayedSample`. + + Parameters + ---------- + + protocol : str + Name of the protocol data to load + + + Returns + ------- + + subsets : dict + A dictionary mapping subset names to lists of + :py:class:`bob.ip.binseg.data.sample.DelayedSample` objects, with + the proper loading implemented. Each delayed sample also carries a + ``key`` parameter, that contains the output of the sample + contextual data after passing through the ``keymaker``. This + parameter can be used for recording sample transforms during + check-pointing. + + """ + + with open(self.protocols[protocol], "r") as f: + data = json.load(f) + + retval = {} + for subset, samples in data.items(): + context = dict(protocol=protocol, subset=subset) + retval[subset] = [ + self._make_delayed(k, v, context) for (k, v) in enumerate(samples) + ] + return retval + + +class CSVDataset: + """ + Generic single subset filelist dataset that yields samples + + To create a new dataset, you only need to provide a CSV formatted filelist + using any separator (e.g. comma, space, semi-colon) with the following + information: + + .. code-block:: text + + value1,value2,value3 + value4,value5,value6 + ... + + Notice that all rows must have the same number of entries. + + Parameters + ---------- + + subsets : list, dict + Paths to one or more CSV formatted files containing the various + subsets to be recognized by this dataset, or a dictionary, mapping + subset names to paths of CSV files. Internally, we save a + dictionary where keys default to the basename of paths. + + fieldnames : list, tuple + An iterable over the field names (strings) to assign to each column in + the CSV file. It should have as many items as fields in each row of + the CSV file(s). 
+
+    loader : object
+        A function that receives, as input, a dictionary with ``{name: value}``
+        entries (for each header in the input CSV file), and returns a
+        dictionary with the loaded data.
+
+    keymaker : object
+        A function that receives as input the same input from the ``loader``,
+        but outputs a single string that uniquely identifies a sample within
+        the dataset.  It is typically the path, without extension, of one
+        of the file entries for the sample, but you can tune it as you like.
+
+    """
+
+    def __init__(self, subsets, fieldnames, loader, keymaker):
+
+        if isinstance(subsets, dict):
+            self.subsets = dict(
+                (k, os.path.realpath(v)) for k, v in subsets.items()
+            )
+        else:
+            self.subsets = dict(
+                (os.path.splitext(os.path.basename(k))[0], os.path.realpath(k))
+                for k in subsets
+            )
+        self.fieldnames = fieldnames
+        self.loader = loader
+        self.keymaker = keymaker
+
+    def check(self):
+        """For each subset, check if all data can be correctly accessed
+
+        Returns
+        -------
+
+        errors : int
+            Number of errors found
+
+        """
+
+        logger.info(f"Checking dataset...")
+        errors = 0
+        for name in self.subsets.keys():
+            logger.info(f"Checking subset '{name}'...")
+            for sample in self.samples(name):
+                try:
+                    sample.data  # triggers loading
+                    logger.info(f"{sample.key}: OK")
+                except Exception as e:
+                    logger.error(f"{sample.key}: {e}")
+                    errors += 1
+        return errors
+
+    def _make_delayed(self, pos, sample, context):
+        """Checks consistency and builds a delayed loading sample
+        """
+        assert len(sample) == len(self.fieldnames), (
+            f"Entry {pos} in subset {context['subset']} has {len(sample)} "
+            f"entries instead of {len(self.fieldnames)} (expected).  Fix "
+            f"file {self.subsets[context['subset']]}"
+        )
+        item = dict(zip(self.fieldnames, sample))
+        return DelayedSample(
+            functools.partial(self.loader, context, item),
+            key=self.keymaker(context, item),
+        )
+
+    def samples(self, subset):
+        """Returns all samples in a subset
+
+        This method will load CSV information for a given subset and return
+        all samples of the given subset after converting each entry into a
+        :py:class:`bob.ip.binseg.data.sample.DelayedSample`.
+
+
+        Parameters
+        ----------
+
+        subset : str
+            Name of the subset data to load
+
+
+        Returns
+        -------
+
+        subset : list
+            A list of :py:class:`bob.ip.binseg.data.sample.DelayedSample`
+            objects, with the proper loading implemented.  Each delayed sample
+            also carries a ``key`` parameter, that contains the output of the
+            sample contextual data after passing through the ``keymaker``.
+            This parameter can be used for recording sample transforms during
+            check-pointing.
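+
+        Examples
+        --------
+
+        For illustration, assuming this object was created with a subset
+        registered under the (hypothetical) name ``data``:
+
+        .. code-block:: python
+
+           samples = dataset.samples("data")
+           print(samples[0].key)  # unique key produced by the ``keymaker``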
+ + """ + + with open(self.subsets[subset], newline="") as f: + cf = csv.reader(f) + samples = [k for k in cf] + + context = dict(subset=subset) + return [self._make_delayed(k, v, context) for (k, v) in enumerate(samples)] diff --git a/bob/ip/binseg/data/drionsdb/__init__.py b/bob/ip/binseg/data/drionsdb/__init__.py index 3e1b5798c45636ebdc7f6169f2f91cdf33288644..33aea1ab9cb0ba19f793920e4857361d89248f10 100644 --- a/bob/ip/binseg/data/drionsdb/__init__.py +++ b/bob/ip/binseg/data/drionsdb/__init__.py @@ -30,8 +30,8 @@ import PIL.ImageDraw import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, data_path_keymaker _protocols = [ pkg_resources.resource_filename(__name__, "default.json"), @@ -63,8 +63,8 @@ def _pad_right(img): def _loader(context, sample): - data = load_pil_rgb(sample["data"]) - label = _txt_to_pil_1(sample["label"], data.size) + data = load_pil_rgb(os.path.join(_root_path, sample["data"])) + label = _txt_to_pil_1(os.path.join(_root_path, sample["label"]), data.size) if sample["data"].endswith("_101.jpg"): # pads the image on the right side to account for a difference in @@ -74,5 +74,6 @@ def _loader(context, sample): return dict(data=data, label=label) -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) +dataset = JSONDataset(protocols=_protocols, fieldnames=("data", "label"), + loader=_loader, keymaker=data_path_keymaker) """DRIONSDB dataset object""" diff --git a/bob/ip/binseg/data/drishtigs1/__init__.py b/bob/ip/binseg/data/drishtigs1/__init__.py index 7ab1543cfdb2dd5e32a4296c6b4801c4c9b44cdb..538139232ed83b82eb4ca4d64521c714dbb08c86 100644 --- a/bob/ip/binseg/data/drishtigs1/__init__.py +++ b/bob/ip/binseg/data/drishtigs1/__init__.py @@ -27,33 +27,49 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, data_path_keymaker _protocols = { - "optic-disc-all": pkg_resources.resource_filename(__name__, "optic-disc.json"), - "optic-cup-all": pkg_resources.resource_filename(__name__, "optic-cup.json"), - "optic-disc-any": pkg_resources.resource_filename(__name__, "optic-disc.json"), - "optic-cup-any": pkg_resources.resource_filename(__name__, "optic-cup.json"), - } + "optic-disc-all": pkg_resources.resource_filename( + __name__, "optic-disc.json" + ), + "optic-cup-all": pkg_resources.resource_filename( + __name__, "optic-cup.json" + ), + "optic-disc-any": pkg_resources.resource_filename( + __name__, "optic-disc.json" + ), + "optic-cup-any": pkg_resources.resource_filename( + __name__, "optic-cup.json" + ), +} + +_root_path = bob.extension.rc.get( + "bob.ip.binseg.drishtigs1.datadir", os.path.realpath(os.curdir) +) -_root_path = bob.extension.rc.get('bob.ip.binseg.drishtigs1.datadir', - os.path.realpath(os.curdir)) def _loader(context, sample): retval = dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_rgb(sample["label"]).convert("L"), - ) + data=load_pil_rgb(os.path.join(_root_path, sample["data"])), + label=load_pil_rgb(os.path.join(_root_path, sample["label"])).convert( + "L" + ), + ) # Drishti-GS provides softmaps of multiple annotators # we threshold to get gt where all/any of the annotators overlap if context["protocol"].endswith("-all"): - retval["label"] = retval["label"].point(lambda p: p>254, mode="1") + retval["label"] = retval["label"].point(lambda p: p > 254, mode="1") elif 
context["protocol"].endswith("-any"): - retval["label"] = retval["label"].point(lambda p: p>0, mode="1") + retval["label"] = retval["label"].point(lambda p: p > 0, mode="1") else: raise RuntimeError(f"Unknown protocol {context['protocol']}") return retval -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) + +dataset = JSONDataset( + protocols=_protocols, fieldnames=("data", "label"), loader=_loader, + keymaker=data_path_keymaker +) """Drishti-GS1 dataset object""" diff --git a/bob/ip/binseg/data/drive/__init__.py b/bob/ip/binseg/data/drive/__init__.py index 5298e66131ced5a2a0e428c76d21098c1d1c7903..2253bcbecd7c15db4b4cacd850f6e3d655d15ca7 100644 --- a/bob/ip/binseg/data/drive/__init__.py +++ b/bob/ip/binseg/data/drive/__init__.py @@ -25,24 +25,32 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb, load_pil_1 +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker _protocols = [ - pkg_resources.resource_filename(__name__, "default.json"), - pkg_resources.resource_filename(__name__, "second-annotation.json"), - ] + pkg_resources.resource_filename(__name__, "default.json"), + pkg_resources.resource_filename(__name__, "second-annotation.json"), +] + +_root_path = bob.extension.rc.get( + "bob.ip.binseg.drive.datadir", os.path.realpath(os.curdir) +) -_root_path = bob.extension.rc.get('bob.ip.binseg.drive.datadir', - os.path.realpath(os.curdir)) def _loader(context, sample): - #"context" is ignore in this case - database is homogeneous + # "context" is ignored in this case - database is homogeneous return dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_1(sample["label"]), - mask=load_pil_1(sample["mask"]), - ) - -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) + data=load_pil_rgb(os.path.join(_root_path, sample["data"])), + label=load_pil_1(os.path.join(_root_path, sample["label"])), + mask=load_pil_1(os.path.join(_root_path, sample["mask"])), + ) + + +dataset = JSONDataset( + protocols=_protocols, + fieldnames=("data", "label", "mask"), + loader=_loader, + keymaker=data_path_keymaker, +) """DRIVE dataset object""" diff --git a/bob/ip/binseg/data/folderdataset.py b/bob/ip/binseg/data/folderdataset.py deleted file mode 100644 index 2566fd77ca64dd357e9e3f8998bb830f10ec0c97..0000000000000000000000000000000000000000 --- a/bob/ip/binseg/data/folderdataset.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -from pathlib import Path - -from PIL import Image - -from torch.utils.data import Dataset - -from .transforms import Compose, ToTensor - - -def _find_files(data_path, glob): - """ - Recursively retrieves file lists from a given path, matching a given glob - - This function will use :py:meth:`pathlib.Path.rglob`, together with the - provided glob pattern to search for anything the desired filename. - """ - - data_path = Path(data_path) - return sorted(list(data_path.rglob(glob))) - - -class FolderDataset(Dataset): - """ - Generic image folder containing images for prediction - - .. important:: - - This implementation, contrary to its sister - :py:class:`.csvdataset.CSVDataset`, does not *automatically* convert - the input image to RGB, before passing it to the transforms, so it is - possible to accomodate a wider range of input types (e.g. 16-bit PNG - images). 
- - Parameters - ---------- - - path : str - full path to root of dataset - - glob : str - glob that can be used to filter-down files to be loaded on the provided - path - - transforms : :py:class:`list`, Optional - a list of transformations to be applied to **both** image and - ground-truth data. Notice that image changing transformations such as - :py:class:`.transforms.ColorJitter` are only applied to the image and - **not** to ground-truth. Also notice a last transform - (:py:class:`bob.ip.binseg.data.transforms.ToTensor`) is always applied. - - """ - - def __init__(self, path, glob="*", transforms=[]): - self.transform = Compose(transforms + [ToTensor()]) - self.path = path - self.data = _find_files(path, glob) - - def __len__(self): - """ - Returns - ------- - int - size of the dataset - """ - - return len(self.data) - - def __getitem__(self, index): - """ - Parameters - ---------- - index : int - - Returns - ------- - sample : list - [name, img] - """ - - sample = [Image.open(self.data[index])] - if self.transform: - sample = self.transform(*sample) - return [self.data[index].relative_to(self.path).as_posix()] + sample diff --git a/bob/ip/binseg/data/hrf/__init__.py b/bob/ip/binseg/data/hrf/__init__.py index 8f0a387b5026e73750dd1ec6b44de6b751688cfb..1ab2919c8ee77d85560711bfbcb2a5c15ba24512 100644 --- a/bob/ip/binseg/data/hrf/__init__.py +++ b/bob/ip/binseg/data/hrf/__init__.py @@ -24,23 +24,31 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb, load_pil_1 +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker _protocols = [ - pkg_resources.resource_filename(__name__, "default.json"), - ] + pkg_resources.resource_filename(__name__, "default.json"), +] + +_root_path = bob.extension.rc.get( + "bob.ip.binseg.hrf.datadir", os.path.realpath(os.curdir) +) -_root_path = bob.extension.rc.get('bob.ip.binseg.hrf.datadir', - os.path.realpath(os.curdir)) def _loader(context, sample): - #"context" is ignore in this case - database is homogeneous + # "context" is ignore in this case - database is homogeneous return dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_1(sample["label"]), - mask=load_pil_1(sample["mask"]), - ) - -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) + data=load_pil_rgb(os.path.join(_root_path, sample["data"])), + label=load_pil_1(os.path.join(_root_path, sample["label"])), + mask=load_pil_1(os.path.join(_root_path, sample["mask"])), + ) + + +dataset = JSONDataset( + protocols=_protocols, + fieldnames=("data", "label", "mask"), + loader=_loader, + keymaker=data_path_keymaker, +) """HRF dataset object""" diff --git a/bob/ip/binseg/data/iostar/__init__.py b/bob/ip/binseg/data/iostar/__init__.py index 1654cbfda75eca139a07734f31ed3e0b14b4b095..0ce427195777310966b1cd4e9579dce0571ad23f 100644 --- a/bob/ip/binseg/data/iostar/__init__.py +++ b/bob/ip/binseg/data/iostar/__init__.py @@ -27,35 +27,44 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb, load_pil_1 +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker from ..utils import invert_mode1_image, subtract_mode1_images _protocols = [ - pkg_resources.resource_filename(__name__, "vessel.json"), - pkg_resources.resource_filename(__name__, "optic-disc.json"), - ] + pkg_resources.resource_filename(__name__, "vessel.json"), + 
pkg_resources.resource_filename(__name__, "optic-disc.json"), +] + +_root_path = bob.extension.rc.get( + "bob.ip.binseg.iostar.datadir", os.path.realpath(os.curdir) +) -_root_path = bob.extension.rc.get('bob.ip.binseg.iostar.datadir', - os.path.realpath(os.curdir)) def _loader(context, sample): retval = dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_1(sample["label"]), - mask=load_pil_1(sample["mask"]), - ) + data=load_pil_rgb(os.path.join(_root_path, sample["data"])), + label=load_pil_1(os.path.join(_root_path, sample["label"])), + mask=load_pil_1(os.path.join(_root_path, sample["mask"])), + ) if context["protocol"] == "optic-disc": # For optic-disc analysis, the label provided by IOSTAR raw data is the # "inverted" (negative) label, and does not consider the mask region, # which must be subtracted. We do this special manipulation here. retval["label"] = subtract_mode1_images( - invert_mode1_image(retval["label"]), - invert_mode1_image(retval["mask"])) + invert_mode1_image(retval["label"]), + invert_mode1_image(retval["mask"]), + ) return retval elif context["protocol"] == "vessel": return retval raise RuntimeError(f"Unknown protocol {context['protocol']}") -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) + +dataset = JSONDataset( + protocols=_protocols, + fieldnames=("data", "label", "mask"), + loader=_loader, + keymaker=data_path_keymaker, +) """IOSTAR dataset object""" diff --git a/bob/ip/binseg/data/jsondataset.py b/bob/ip/binseg/data/jsondataset.py deleted file mode 100644 index 427bd6e8de7caa8f197c1348403f5392d9d837a3..0000000000000000000000000000000000000000 --- a/bob/ip/binseg/data/jsondataset.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -import os -import copy -import json -import functools - -import logging -logger = logging.getLogger(__name__) - -from .sample import DelayedSample - - -class JSONDataset: - """ - Generic multi-protocol filelist dataset that yields samples - - To create a new dataset, you need to provide one or more JSON formatted - filelists (one per protocol) with the following contents: - - .. code-block:: json - - { - "subset1": [ - { - "data": "path/to/data", - "label": "path/to/optional/label", - "mask": "path/to/optional/mask" - } - ], - "subset2": [ - ] - } - - Optionally, you may also format your JSON file like this, where each sample - is described as a list of up to 3 elements: - - .. code-block:: json - - { - "subset1": [ - [ - "path/to/data", - "path/to/optional/label", - "path/to/optional/mask" - ] - ], - "subset2": [ - ] - } - - If your dataset does not have labels or masks, you may also represent it - like this: - - .. code-block:: json - - { - "subset1": [ - "path/to/data1", - "path/to/data2" - ], - "subset2": [ - ] - } - - Where: - - * ``data``: absolute or relative path leading to original image, in RGB - format - * ``label``: (optional) absolute or relative path with manual segmentation - information. This image will be converted to a binary image. This - dataset shall always yield label images in which white pixels (value=1) - indicate the **presence** of the object, and black pixels (value=0), its - absence. - * ``mask``: (optional) absolute or relative path with a mask that indicates - valid regions in the image where automatic segmentation should occur. - This image will be converted to a binary image. 
This dataset shall - always yield mask images in which white pixels (value=1) indicate the - **valid** regions of the mask, and black pixels (value=0), invalid parts. - - Relative paths are interpreted with respect to the location where the JSON - file is or to an optional ``root_path`` parameter, that can be provided. - - There are no requirements concerning image or ground-truth homogenity. - Anything that can be loaded by our image and data loaders is OK. - - Notice that all rows must have the same number of entries. - - To generate a dataset without ground-truth (e.g. for prediction tasks), - then omit the ``label`` and ``mask`` entries. - - - Parameters - ---------- - - protocols : list, dict - Paths to one or more JSON formatted files containing the various - protocols to be recognized by this dataset, or a dictionary, mapping - protocol names to paths of JSON files. Internally, we save a - dictionary where keys default to the basename of paths. - - root_path : str - Path to a common filesystem root where files with relative paths should - be sitting. If not set, then we use the current directory to resolve - relative paths. - - loader : object - A function that receives, as input, a context dictionary (with a - "protocol" and "subset" keys indicating which protocol and subset are - being served), and a dictionary with ``{key: path}`` entries, and - returns a dictionary with the loaded data. It shall respect the - loading principles of data, label and mask objects as stated above. - - """ - - def __init__(self, protocols, root_path, loader): - - if isinstance(protocols, dict): - self.protocols = dict((k,os.path.realpath(v)) for k,v in - protocols.items()) - else: - self.protocols = dict( - (os.path.splitext(os.path.basename(k))[0], os.path.realpath(k)) - for k in protocols - ) - self.root_path = root_path - self.loader = loader - - def check(self): - """For each protocol, check all files are available on the filesystem - - Returns - ------- - - errors : int - Number of errors found - - """ - - logger.info(f"Checking dataset at '{self.root_path}'...") - - errors = 0 - for proto in self.protocols: - logger.info(f"Checking protocol '{proto}'...") - for name, samples in self.subsets(proto).items(): - logger.info(f"Checking subset '{name}'...") - for sample in samples: - try: - sample.data # triggers loading - logger.info(f"{sample.key}: OK") - except Exception as e: - logger.error(f"{sample.key}: {e}") - errors += 1 - return errors - - def subsets(self, protocol): - """Returns all subsets in a protocol - - This method will load JSON information for a given protocol and return - all subsets of the given protocol after converting each entry into a - :py:class:`bob.ip.binseg.data.sample.DelayedSample`. - - Parameters - ---------- - - protocol : str - Name of the protocol data to load - - - Returns - ------- - - subsets : dict - A dictionary mapping subset names to lists of - :py:class:`bob.ip.binseg.data.sample.DelayedSample` objects, with - the proper loading implemented. Each delayed sample also carries a - ``key`` parameter, that contains the relative path of the sample, - without its extension. This parameter can be used for recording - sample transforms during check-pointing. 
- - """ - - with open(self.protocols[protocol], "r") as f: - data = json.load(f) - - # returns a fixed sample representations as a DelayedSamples - retval = {} - - for subset, samples in data.items(): - delayeds = [] - context = dict(protocol=protocol, subset=subset) - for k in samples: - - if isinstance(k, dict): - item = k - - elif isinstance(k, list): - item = {"data": k[0]} - if len(k) > 1: item["label"] = k[1] - if len(k) > 2: item["mask"] = k[2] - - elif isinstance(k, str): - item = {"data": k} - - key = os.path.splitext(item["data"])[0] - - # make paths absolute - abs_item = copy.deepcopy(item) - for k,v in item.items(): - if not os.path.isabs(v): - abs_item[k] = os.path.join(self.root_path, v) - - load = functools.partial(self.loader, context, abs_item) - delayeds.append(DelayedSample(load, key=key)) - - retval[subset] = delayeds - - return retval diff --git a/bob/ip/binseg/data/loader.py b/bob/ip/binseg/data/loader.py index c5a235ceb24a46d80bdc95237824a05cacb75639..84928b7d39376bc180fd89d3bb89bc175b7f7774 100644 --- a/bob/ip/binseg/data/loader.py +++ b/bob/ip/binseg/data/loader.py @@ -5,6 +5,7 @@ """Data loading code""" +import os import PIL.Image @@ -48,3 +49,34 @@ def load_pil_1(path): """ return PIL.Image.open(path).convert(mode="1", dither=None) + + +def data_path_keymaker(context, sample): + """Returns a path without extension as a key + + This method assumes ``sample`` contains at least one entry named ``path``, + that contains a path to the sample raw data, without extension. It will + return the said path without its extension. + + + Parameters + ---------- + + context : dict + Context dictionary with entries (``protocol``, ``subset``), depending + on the context + + sample : dict + A dictionary that maps field names to sample entries from the original + dataset. 
+ + + Returns + ------- + + key : str + A string that uniquely identifies the sample within a given context + + """ + + return os.path.splitext(sample["data"])[0] diff --git a/bob/ip/binseg/data/refuge/__init__.py b/bob/ip/binseg/data/refuge/__init__.py index 766d0d0eeceb482ae46bf8c2c47de843b7e55f07..328dec359fdbaed8fcd19545a95c5e78676941e9 100644 --- a/bob/ip/binseg/data/refuge/__init__.py +++ b/bob/ip/binseg/data/refuge/__init__.py @@ -37,8 +37,8 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, data_path_keymaker _protocols = { "optic-disc": pkg_resources.resource_filename(__name__, "default.json"), @@ -50,8 +50,8 @@ _root_path = bob.extension.rc.get('bob.ip.binseg.refuge.datadir', def _loader(context, sample): retval = dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_rgb(sample["label"]), + data=load_pil_rgb(os.path.join(_root_path, sample["data"])), + label=load_pil_rgb(os.path.join(_root_path, sample["label"])), ) if context["subset"] == "train": @@ -77,5 +77,6 @@ def _loader(context, sample): return retval -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) +dataset = JSONDataset(protocols=_protocols, fieldnames=("data", "label"), + loader=_loader, keymaker=data_path_keymaker) """REFUGE dataset object""" diff --git a/bob/ip/binseg/data/rimoner3/__init__.py b/bob/ip/binseg/data/rimoner3/__init__.py index d251bc4f0d27a66eebfc452017fbac3da79865aa..b60faddab0f3e9747d6544a447cc915d546e1653 100644 --- a/bob/ip/binseg/data/rimoner3/__init__.py +++ b/bob/ip/binseg/data/rimoner3/__init__.py @@ -25,28 +25,40 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb, load_pil_1 +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker _protocols = [ - pkg_resources.resource_filename(__name__, "optic-disc-exp1.json"), - pkg_resources.resource_filename(__name__, "optic-cup-exp1.json"), - pkg_resources.resource_filename(__name__, "optic-disc-exp2.json"), - pkg_resources.resource_filename(__name__, "optic-cup-exp2.json"), - pkg_resources.resource_filename(__name__, "optic-disc-avg.json"), - pkg_resources.resource_filename(__name__, "optic-cup-avg.json"), - ] + pkg_resources.resource_filename(__name__, "optic-disc-exp1.json"), + pkg_resources.resource_filename(__name__, "optic-cup-exp1.json"), + pkg_resources.resource_filename(__name__, "optic-disc-exp2.json"), + pkg_resources.resource_filename(__name__, "optic-cup-exp2.json"), + pkg_resources.resource_filename(__name__, "optic-disc-avg.json"), + pkg_resources.resource_filename(__name__, "optic-cup-avg.json"), +] + +_root_path = bob.extension.rc.get( + "bob.ip.binseg.rimoner3.datadir", os.path.realpath(os.curdir) +) -_root_path = bob.extension.rc.get('bob.ip.binseg.rimoner3.datadir', - os.path.realpath(os.curdir)) def _loader(context, sample): # RIM-ONE r3 provides stereo images - we clip them here to get only the # left part of the image, which is also annotated return dict( - data=load_pil_rgb(sample["data"]).crop((0, 0, 1072, 1424)), - label=load_pil_1(sample["label"]).crop((0, 0, 1072, 1424)), - ) + data=load_pil_rgb(os.path.join(_root_path, sample["data"])).crop( + (0, 0, 1072, 1424) + ), + label=load_pil_1(os.path.join(_root_path, sample["label"])).crop( + (0, 0, 1072, 1424) + ), + ) + -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, 
loader=_loader) +dataset = JSONDataset( + protocols=_protocols, + fieldnames=("data", "label"), + loader=_loader, + keymaker=data_path_keymaker, +) """RIM-ONE r3 dataset object""" diff --git a/bob/ip/binseg/data/stare/__init__.py b/bob/ip/binseg/data/stare/__init__.py index 6885fc4d8821386cee0129c3b233467b68a6ab95..1ad911552da3afc6044bc3ac9d7feededdd0c9f3 100644 --- a/bob/ip/binseg/data/stare/__init__.py +++ b/bob/ip/binseg/data/stare/__init__.py @@ -29,23 +29,35 @@ import pkg_resources import bob.extension -from ..jsondataset import JSONDataset -from ..loader import load_pil_rgb, load_pil_1 +from ..dataset import JSONDataset +from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker _protocols = [ - pkg_resources.resource_filename(__name__, "default.json"), - pkg_resources.resource_filename(__name__, "second-annotation.json"), - ] + pkg_resources.resource_filename(__name__, "default.json"), + pkg_resources.resource_filename(__name__, "second-annotation.json"), +] -_root_path = bob.extension.rc.get('bob.ip.binseg.stare.datadir', - os.path.realpath(os.curdir)) +_fieldnames = ("data", "label") -def _loader(context, sample): - #"context" is ignore in this case - database is homogeneous - return dict( - data=load_pil_rgb(sample["data"]), - label=load_pil_1(sample["label"]), - ) +_root_path = bob.extension.rc.get( + "bob.ip.binseg.stare.datadir", os.path.realpath(os.curdir) +) -dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader) +def _make_loader(root_path): + + def _loader(context, sample): + # "context" is ignore in this case - database is homogeneous + return dict( + data=load_pil_rgb(os.path.join(root_path, sample["data"])), + label=load_pil_1(os.path.join(root_path, sample["label"])), + ) + return _loader + + +dataset = JSONDataset( + protocols=_protocols, + fieldnames=_fieldnames, + loader=_make_loader(_root_path), + keymaker=data_path_keymaker, +) """STARE dataset object""" diff --git a/bob/ip/binseg/test/__init__.py b/bob/ip/binseg/test/__init__.py index 428bd154ba23e7516809b9bc147f2caf2db953a5..2e507ed77cc08ad5f73b8c579defe79d5759b202 100644 --- a/bob/ip/binseg/test/__init__.py +++ b/bob/ip/binseg/test/__init__.py @@ -5,10 +5,13 @@ import tempfile import logging + logger = logging.getLogger(__name__) TESTDB_TMPDIR = None -_URL = "http://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/_testdb.zip" +_URL = ( + "http://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/_testdb.zip" +) _RCKEY = "bob.ip.binseg.stare.datadir" @@ -24,40 +27,54 @@ def _mock_test_skipper(name): Dummary decorator that does nothing """ import functools + def wrapped_function(test): @functools.wraps(test) def wrapper(*args, **kwargs): return test(*args, **kwargs) + return wrapper + return wrapped_function def mock_dataset(): global TESTDB_TMPDIR from bob.extension import rc + if (TESTDB_TMPDIR is not None) or (_RCKEY in rc): logger.info("Test database already set up - not downloading") else: logger.info("Test database not available, downloading...") import zipfile import urllib.request + # Download the file from `url` and save it locally under `file_name`: with urllib.request.urlopen(_URL) as r, tempfile.TemporaryFile() as f: f.write(r.read()) f.flush() f.seek(0) - TESTDB_TMPDIR = \ - tempfile.TemporaryDirectory(prefix=__name__ + '-') + TESTDB_TMPDIR = tempfile.TemporaryDirectory(prefix=__name__ + "-") print(f"Creating test database at {TESTDB_TMPDIR.name}...") logger.info(f"Creating test database at {TESTDB_TMPDIR.name}...") - with zipfile.ZipFile(f) as zf: 
zf.extractall(TESTDB_TMPDIR.name) + with zipfile.ZipFile(f) as zf: + zf.extractall(TESTDB_TMPDIR.name) from ..data import stare + if TESTDB_TMPDIR is None: # if the user has the STARE directory ready, then we do a normal return from .utils import rc_variable_set + return stare.dataset, rc_variable_set # else, we do a "mock" return - return stare.JSONDataset(stare._protocols, TESTDB_TMPDIR.name, - stare._loader), _mock_test_skipper + return ( + stare.JSONDataset( + stare._protocols, + stare._fieldnames, + stare._make_loader(TESTDB_TMPDIR.name), + stare.data_path_keymaker, + ), + _mock_test_skipper, + ) diff --git a/doc/api.rst b/doc/api.rst index 4e70a851f03a4af60faa1e24fbbb1360190088ca..0062dc3fb263282391799916ecc8e9898354007a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -15,9 +15,7 @@ Data Manipulation .. autosummary:: :toctree: api/data - bob.ip.binseg.data.folderdataset - bob.ip.binseg.data.csvdataset - bob.ip.binseg.data.jsondataset + bob.ip.binseg.data.dataset bob.ip.binseg.data.loader bob.ip.binseg.data.sample bob.ip.binseg.data.utils @@ -133,7 +131,6 @@ Datasets :template: config.rst bob.ip.binseg.configs.datasets.csv - bob.ip.binseg.configs.datasets.folder bob.ip.binseg.configs.datasets.utils bob.ip.binseg.configs.datasets.chasedb1 diff --git a/setup.py b/setup.py index 269925af9ff99faecbeeb48a0bc96f0eea00d3d8..c97566905eadcd5c4c9d312f8c88c8ff197d1451 100644 --- a/setup.py +++ b/setup.py @@ -53,9 +53,8 @@ setup( "unet = bob.ip.binseg.configs.models.unet", "resunet = bob.ip.binseg.configs.models.resunet", - # datasets + # example datasets "csv-dataset-example = bob.ip.binseg.configs.datasets.csv", - "folder-dataset-example = bob.ip.binseg.configs.datasets.folder", # drive dataset "drive = bob.ip.binseg.configs.datasets.drive",
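With these changes, all supported raw datasets expose the same sample-list
interface.  The sketch below illustrates the access pattern using the DRIVE
module converted above; the subset name ``train`` is an assumption (it depends
on the contents of ``default.json``), everything else follows the code in this
patch:

.. code-block:: python

   from bob.ip.binseg.data.drive import dataset

   subsets = dataset.subsets("default")  # protocol name defaults to the JSON basename
   sample = subsets["train"][0]          # "train" assumed to exist in default.json
   print(sample.key)                     # path-derived key from data_path_keymaker
   image = sample.data["data"]           # PIL RGB image, loaded lazily on first access
   label = sample.data["label"]          # PIL mode "1" ground-truth image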