Commit 9627759f authored by André Anjos

[data] Clean-up and re-organization of base code

parent 066c7244
Merge request !12: Streamlining
Pipeline #38962 failed
Showing changed files with 602 additions and 409 deletions
@@ -4,15 +4,12 @@
"""Example CSV-based filelist dataset

In case you have your own dataset that is organized on your filesystem, this
configuration shows an example setup so you can feed such files (and
ground-truth data) to train one of the available network models or to
evaluate it.

You must write a CSV-based file (e.g. using comma as separator) that
describes the image and ground-truth locations for each image pair on your
dataset.  So, for example, if you have a structure like this:

.. code-block:: text
@@ -26,8 +23,7 @@ for example, if you have a structure like this:
   ├── ...
   └── gt_n.png

Then create a file with the following contents:

.. code-block:: text
@@ -38,13 +34,13 @@ the following contents:
To create a dataset without ground-truth (e.g., for prediction purposes),
omit the second column of the CSV file.

Use the path leading to the CSV file and carefully read the comments in this
configuration.  **Copy it locally to make changes**:

.. code-block:: sh

   $ bob binseg config copy csv-dataset-example mydataset.py
   # edit mydataset.py as explained here, follow the comments

Fine-tune the transformations for your particular purpose:
@@ -58,20 +54,79 @@ certain restrictions (input dimensions, image centering, etc.). Check the
configuration that was used to train models and try to match it as well as
possible.

Finally, you must create a connector that will act as a "dataset" for
pytorch.  The connector makes the list of samples returned by your raw
dataset look like something our pytorch setup can digest (tuples of data with
a certain organisation).

More information:

* :py:class:`bob.ip.binseg.data.dataset.CSVDataset` for operational details.
* :py:class:`bob.ip.binseg.data.dataset.JSONDataset` for an alternative for
  multi-protocol datasets (all of our supported raw datasets are implemented
  using this)
* :py:class:`bob.ip.binseg.data.utils.SampleList2TorchDataset` for extra
  information on the sample-list-to-pytorch connector
"""
# First, define how to access and load the raw data.  Our package provides
# some stock loaders we use for other datasets.  You may have a look at the
# documentation of that module for details.
from bob.ip.binseg.data.loader import (
    load_pil_rgb,
    load_pil_1,
    data_path_keymaker,
)


# How we use the loaders - "sample" is a dictionary where keys are defined
# below and map to the columns of the CSV files you input.
def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous.  It is a
    # dictionary that passes, e.g., the name of the subset being loaded, so
    # you can take contextual decisions on the loading.

    # Using the path leading to the various data files stored on disk allows
    # the CSV file to contain only relative paths and is, therefore, more
    # compact.  Of course, you can make those paths absolute and then
    # simplify it here.
    import os

    root_path = "/path/where/raw/files/sit"

    return dict(
        data=load_pil_rgb(os.path.join(root_path, sample["data"])),
        label=load_pil_1(os.path.join(root_path, sample["label"])),
    )


# This is just a class that puts everything together: the CSV file, how to
# load each sample defined in the dataset, names for the various columns of
# the CSV file and how to make unique keys for each sample (keymaker).  Once
# created, this object can be called to generate sample lists.
from bob.ip.binseg.data.dataset import CSVDataset

raw_dataset = CSVDataset(
    # path to the CSV file(s) - you may add as many subsets as you want, each
    # with a unique name you will use later to generate sample lists
    subsets=dict(data="<path/to/train.csv>"),
    fieldnames=("data", "label"),  # these are the column names
    loader=_loader,
    keymaker=data_path_keymaker,
)

# Finally, we build a connector that passes our dataset to the pytorch
# framework so we can, for example, evaluate a trained pytorch model.
# Add/tune your transforms below - these are just examples compatible with a
# model that requires image inputs of 544 x 544 pixels.
from bob.ip.binseg.data.transforms import CenterCrop

# from bob.ip.binseg.configs.datasets.utils import DATA_AUGMENTATION as _DA
_transforms = [
    CenterCrop((544, 544)),
]  # + _DA
# This class will simply trigger data loading and re-arrange the data so that
# data is fed in the right order to pytorch: (key, image[, label[, mask]]).
# This class also inherits from pytorch Dataset and respects its required
# API.  See the documentation for details.
from bob.ip.binseg.data.utils import SampleList2TorchDataset

dataset = SampleList2TorchDataset(raw_dataset.subset("data"), _transforms)
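As an aside (not part of the configuration file itself), a minimal sketch of
how the resulting ``dataset`` object could be consumed, assuming the
``(key, image[, label[, mask]])`` sample layout described in the comments
above and a working pytorch installation:

.. code-block:: python

   import torch.utils.data

   # standard pytorch data loader on top of the connector built above
   loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=False)

   for batch in loader:
       keys, images, labels = batch[0], batch[1], batch[2]
       print(keys, images.shape, labels.shape)
       break  # only peek at the first batch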
@@ -36,8 +36,8 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker

_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),

@@ -48,11 +48,12 @@ _root_path = bob.extension.rc.get('bob.ip.binseg.chasedb1.datadir',
                                  os.path.realpath(os.curdir))

def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_1(os.path.join(_root_path, sample["label"])),
    )

dataset = JSONDataset(protocols=_protocols, fieldnames=("data", "label"),
                      loader=_loader, keymaker=data_path_keymaker)
"""CHASE-DB1 dataset object"""
#!/usr/bin/env python
# coding=utf-8
import os
import csv
from PIL import Image
from torch.utils.data import Dataset
import torch
import torchvision.transforms.functional as VF
import bob.io.base
from .transforms import Compose, ToTensor
import logging
logger = logging.getLogger(__name__)
class CSVDataset(Dataset):
"""
Generic filelist dataset
To create a new dataset, you only need to provide a CSV formatted filelist
using any separator (e.g. comma, space, semi-colon) with the following
information:
.. code-block:: text
image[,label[,mask]]
Where:
* ``image``: absolute or relative path leading to original image
* ``label``: (optional) absolute or relative path with manual segmentation
information
* ``mask``: (optional) absolute or relative path with a mask that indicates
valid regions in the image where automatic segmentation should occur
Relative paths are interpreted with respect to the location where the CSV
file is or to an optional ``root_path`` parameter, that can be provided.
There are no requirements concerning image or ground-truth homogeneity.
Anything that can be loaded by our image and data loaders is OK.  Use
a non-whitespace character as separator.  Example:
.. code-block:: text
image1.jpg,gt1.tif,mask1.png
image2.png,gt2.png,mask2.png
...
Notice that all rows must have the same number of entries.
.. important::
Images are converted to RGB after readout via PIL. Ground-truth data is
loaded using the same technique, but converted to mode ``1`` instead of
``RGB``.  If ground-truth data is encoded as an HDF5 file, we use
:py:func:`bob.io.base.load` instead, and then convert it to 32-bit
float data.
To generate a dataset without ground-truth (e.g. for prediction tasks),
then omit the second and third columns.
Parameters
----------
path : str
Full path to the file containing the dataset description, in CSV
format as described above
root_path : :py:class:`str`, Optional
Path to a common filesystem root where files with relative paths should
be sitting. If not set, then we use the absolute path leading to the
CSV file as ``root_path``
check_available : :py:class:`bool`, Optional
If set to ``True``, then checks if files in the file list are
available. Otherwise does not.
transforms : :py:class:`list`, Optional
a list of transformations to be applied to **both** image and
ground-truth data. Notice that image changing transformations such as
:py:class:`.transforms.ColorJitter` are only applied to the image and
**not** to ground-truth. Also notice a last transform
(:py:class:`bob.ip.binseg.data.transforms.ToTensor`) is always applied.
"""
def __init__(
self, path, root_path=None, check_available=True, transforms=[]
):
self.root_path = root_path or os.path.dirname(path)
self.transform = Compose(transforms + [ToTensor()])
def _make_abs_path(root, s):
    # makes relative paths absolute with respect to ``root``;
    # absolute paths are kept as they are
    retval = []
    for p in s:
        if os.path.isabs(p):
            retval.append(p)
        else:
            retval.append(os.path.join(root, p))
    return retval
with open(path, newline="") as f:
reader = csv.reader(f)
self.data = [_make_abs_path(self.root_path, k) for k in reader]
# check if all files are readable, warn otherwise
if check_available:
errors = 0
for s in self.data:
for p in s:
if not os.path.exists(p):
errors += 1
logger.error(f"Cannot find {p}")
assert errors == 0, (
    f"There are {errors} files which cannot be "
    f"found on your filelist ({path}) dataset"
)
# check all data entries have the same size
assert all(len(k) == len(self.data[0]) for k in self.data), (
    f"There is an inconsistency in your dataset - not all "
    f"entries have length=={len(self.data[0])}"
)
def __len__(self):
"""
Returns
-------
length : int
size of the dataset
"""
return len(self.data)
def __getitem__(self, index):
"""
Parameters
----------
index : int
Returns
-------
sample : list
``[name, img, gt, mask]``, ``[name, img, gt]`` or ``[name, img]``
depending on whether this dataset has or not ground-truth
annotations and masks. The value of ``name`` is relative to
``root_path``, in case it starts with ``root_path``.
"""
sample_paths = self.data[index]
img_path = sample_paths[0]
meta_data = sample_paths[1:]
# images are converted to RGB mode automatically
sample = [Image.open(img_path).convert(mode="RGB")]
# ground-truth annotations and masks are treated the same
for path in meta_data:
if path is not None:
if path.endswith(".hdf5"):
data = bob.io.base.load(str(path)).astype("float32")
# a bit hackish, but will get what we need
data = VF.to_pil_image(torch.from_numpy(data))
else:
data = Image.open(path)
sample += [data.convert(mode="1", dither=None)]
if self.transform:
sample = self.transform(*sample)
# make paths relative if necessary
stem = img_path
if stem.startswith(self.root_path):
stem = os.path.relpath(stem, self.root_path)
elif stem.startswith(os.sep):
    stem = stem[len(os.sep):]
return [stem] + sample
@@ -7,6 +7,7 @@ import json
import functools
import logging

logger = logging.getLogger(__name__)

from .sample import DelayedSample

@@ -14,81 +15,32 @@ from .sample import DelayedSample
class JSONDataset:
    """
    Generic multi-protocol/subset filelist dataset that yields samples

    To create a new dataset, you need to provide one or more JSON formatted
    filelists (one per protocol) with the following contents:

    .. code-block:: json

       {
           "subset1": [
               [
                   "value1",
                   "value2",
                   "value3"
               ],
               [
                   "value4",
                   "value5",
                   "value6"
               ]
           ],
           "subset2": [
           ]
       }

    Your dataset may contain any number of subsets, but all sample entries
    must contain the same number of fields.


    Parameters
@@ -100,35 +52,42 @@ class JSONDataset:
        protocol names to paths of JSON files.  Internally, we save a
        dictionary where keys default to the basename of paths.

    fieldnames : list, tuple
        An iterable over the field names (strings) to assign to each entry in
        the JSON file.  It should have as many items as fields in each entry
        of the JSON file.

    loader : object
        A function that receives, as input, a context dictionary (with
        "protocol" and "subset" keys indicating which protocol and subset are
        being served), and a dictionary with ``{key: path}`` entries, and
        returns a dictionary with the loaded data.

    keymaker : object
        A function that receives as input the same input from the ``loader``,
        but outputs a single string that uniquely identifies a sample within
        a given protocol.  It is typically the path, without extension, of
        one of the file entries for the sample, but you can tune it as you
        like.
    """
    def __init__(self, protocols, fieldnames, loader, keymaker):

        if isinstance(protocols, dict):
            self.protocols = dict(
                (k, os.path.realpath(v)) for k, v in protocols.items()
            )
        else:
            self.protocols = dict(
                (os.path.splitext(os.path.basename(k))[0], os.path.realpath(k))
                for k in protocols
            )
        self.fieldnames = fieldnames
        self.loader = loader
        self.keymaker = keymaker

    def check(self):
        """For each protocol, check if all data can be correctly accessed

        Returns
        -------

@@ -138,8 +97,7 @@ class JSONDataset:
        """

        logger.info("Checking dataset...")
        errors = 0
        for proto in self.protocols:
            logger.info(f"Checking protocol '{proto}'...")

@@ -154,6 +112,21 @@ class JSONDataset:
                    errors += 1
        return errors
    def _make_delayed(self, pos, sample, context):
        """Checks consistency and builds a delayed loading sample
        """
        assert len(sample) == len(self.fieldnames), (
            f"Entry {pos} in subset {context['subset']} of protocol "
            f"{context['protocol']} has {len(sample)} entries instead of "
            f"{len(self.fieldnames)} (expected).  Fix file "
            f"{self.protocols[context['protocol']]}"
        )
        item = dict(zip(self.fieldnames, sample))
        return DelayedSample(
            functools.partial(self.loader, context, item),
            key=self.keymaker(context, item),
        )
    def subsets(self, protocol):
        """Returns all subsets in a protocol

@@ -175,45 +148,152 @@
            A dictionary mapping subset names to lists of
            :py:class:`bob.ip.binseg.data.sample.DelayedSample` objects, with
            the proper loading implemented.  Each delayed sample also carries
            a ``key`` parameter, that contains the output of the sample
            contextual data after passing through the ``keymaker``.  This
            parameter can be used for recording sample transforms during
            check-pointing.
        """

        with open(self.protocols[protocol], "r") as f:
            data = json.load(f)

        retval = {}
        for subset, samples in data.items():
            context = dict(protocol=protocol, subset=subset)
            retval[subset] = [
                self._make_delayed(k, v, context)
                for (k, v) in enumerate(samples)
            ]
        return retval
class CSVDataset:
    """
    Generic single-subset filelist dataset that yields samples

    To create a new dataset, you only need to provide a CSV formatted
    filelist using any separator (e.g. comma, space, semi-colon) with the
    following information:

    .. code-block:: text

       value1,value2,value3
       value4,value5,value6
       ...

    Notice that all rows must have the same number of entries.


    Parameters
    ----------

    subsets : list, dict
        Paths to one or more CSV formatted files containing the various
        subsets to be recognized by this dataset, or a dictionary, mapping
        subset names to paths of CSV files.  Internally, we save a
        dictionary where keys default to the basename of paths.

    fieldnames : list, tuple
        An iterable over the field names (strings) to assign to each column
        in the CSV file.  It should have as many items as fields in each row
        of the CSV file(s).

    loader : object
        A function that receives, as input, a context dictionary (with a
        "subset" key indicating which subset is being served), and a
        dictionary with ``{name: value}`` entries (one for each field in the
        input CSV file), and returns a dictionary with the loaded data.

    keymaker : object
        A function that receives as input the same input from the ``loader``,
        but outputs a single string that uniquely identifies a sample within
        a given protocol.  It is typically the path, without extension, of
        one of the file entries for the sample, but you can tune it as you
        like.
    """
    def __init__(self, subsets, fieldnames, loader, keymaker):

        if isinstance(subsets, dict):
            self.subsets = dict(
                (k, os.path.realpath(v)) for k, v in subsets.items()
            )
        else:
            self.subsets = dict(
                (os.path.splitext(os.path.basename(k))[0], os.path.realpath(k))
                for k in subsets
            )
        self.fieldnames = fieldnames
        self.loader = loader
        self.keymaker = keymaker

    def check(self):
        """For each subset, check if all data can be correctly accessed

        Returns
        -------

        errors : int
            Number of errors found

        """

        logger.info("Checking dataset...")
        errors = 0
        for name in self.subsets.keys():
            logger.info(f"Checking subset '{name}'...")
            for sample in self.samples(name):
                try:
                    sample.data  # triggers loading
                    logger.info(f"{sample.key}: OK")
                except Exception as e:
                    logger.error(f"{sample.key}: {e}")
                    errors += 1
        return errors
    def _make_delayed(self, pos, sample, context):
        """Checks consistency and builds a delayed loading sample
        """
        assert len(sample) == len(self.fieldnames), (
            f"Entry {pos} in subset {context['subset']} has {len(sample)} "
            f"entries instead of {len(self.fieldnames)} (expected).  Fix "
            f"file {self.subsets[context['subset']]}"
        )
        item = dict(zip(self.fieldnames, sample))
        return DelayedSample(
            functools.partial(self.loader, context, item),
            key=self.keymaker(context, item),
        )
    def samples(self, subset):
        """Returns all samples in a subset

        This method will load CSV information for a given subset and return
        all samples of the given subset after converting each entry into a
        :py:class:`bob.ip.binseg.data.sample.DelayedSample`.


        Parameters
        ----------

        subset : str
            Name of the subset data to load


        Returns
        -------

        subset : list
            A list of :py:class:`bob.ip.binseg.data.sample.DelayedSample`
            objects, with the proper loading implemented.  Each delayed
            sample also carries a ``key`` parameter, that contains the output
            of the sample contextual data after passing through the
            ``keymaker``.  This parameter can be used for recording sample
            transforms during check-pointing.

        """

        with open(self.subsets[subset], newline="") as f:
            cf = csv.reader(f)
            samples = [k for k in cf]

        context = dict(subset=subset)
        return [
            self._make_delayed(k, v, context)
            for (k, v) in enumerate(samples)
        ]
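As a quick illustration of the API above, a sketch only: the CSV path is a
placeholder and ``my_loader``/``my_keymaker`` are hypothetical user-provided
callables:

.. code-block:: python

   from bob.ip.binseg.data.dataset import CSVDataset

   ds = CSVDataset(
       subsets=["/path/to/train.csv"],  # subset name defaults to "train"
       fieldnames=("data", "label"),
       loader=my_loader,                # hypothetical loader function
       keymaker=my_keymaker,            # hypothetical keymaker function
   )
   assert ds.check() == 0               # tries to load every sample once
   samples = ds.samples("train")        # list of DelayedSample objects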
@@ -30,8 +30,8 @@ import PIL.ImageDraw
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, data_path_keymaker

_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),

@@ -63,8 +63,8 @@ def _pad_right(img):
def _loader(context, sample):
    data = load_pil_rgb(os.path.join(_root_path, sample["data"]))
    label = _txt_to_pil_1(os.path.join(_root_path, sample["label"]), data.size)
    if sample["data"].endswith("_101.jpg"):
        # pads the image on the right side to account for a difference in

@@ -74,5 +74,6 @@ def _loader(context, sample):
    return dict(data=data, label=label)

dataset = JSONDataset(protocols=_protocols, fieldnames=("data", "label"),
                      loader=_loader, keymaker=data_path_keymaker)
"""DRIONSDB dataset object"""
@@ -27,33 +27,49 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, data_path_keymaker

_protocols = {
    "optic-disc-all": pkg_resources.resource_filename(
        __name__, "optic-disc.json"
    ),
    "optic-cup-all": pkg_resources.resource_filename(
        __name__, "optic-cup.json"
    ),
    "optic-disc-any": pkg_resources.resource_filename(
        __name__, "optic-disc.json"
    ),
    "optic-cup-any": pkg_resources.resource_filename(
        __name__, "optic-cup.json"
    ),
}

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.drishtigs1.datadir", os.path.realpath(os.curdir)
)

def _loader(context, sample):
    retval = dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_rgb(os.path.join(_root_path, sample["label"])).convert(
            "L"
        ),
    )
    # Drishti-GS provides softmaps of multiple annotators
    # we threshold to get gt where all/any of the annotators overlap
    if context["protocol"].endswith("-all"):
        retval["label"] = retval["label"].point(lambda p: p > 254, mode="1")
    elif context["protocol"].endswith("-any"):
        retval["label"] = retval["label"].point(lambda p: p > 0, mode="1")
    else:
        raise RuntimeError(f"Unknown protocol {context['protocol']}")
    return retval

dataset = JSONDataset(
    protocols=_protocols, fieldnames=("data", "label"), loader=_loader,
    keymaker=data_path_keymaker
)
"""Drishti-GS1 dataset object"""
@@ -25,24 +25,32 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker

_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
    pkg_resources.resource_filename(__name__, "second-annotation.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.drive.datadir", os.path.realpath(os.curdir)
)

def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_1(os.path.join(_root_path, sample["label"])),
        mask=load_pil_1(os.path.join(_root_path, sample["mask"])),
    )

dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label", "mask"),
    loader=_loader,
    keymaker=data_path_keymaker,
)
"""DRIVE dataset object"""
#!/usr/bin/env python
# coding=utf-8
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset
from .transforms import Compose, ToTensor
def _find_files(data_path, glob):
"""
Recursively retrieves file lists from a given path, matching a given glob
This function will use :py:meth:`pathlib.Path.rglob`, together with the
provided glob pattern, to search for files matching the desired filename
pattern.
"""
data_path = Path(data_path)
return sorted(list(data_path.rglob(glob)))
class FolderDataset(Dataset):
"""
Generic image folder containing images for prediction
.. important::
This implementation, contrary to its sister
:py:class:`.csvdataset.CSVDataset`, does not *automatically* convert
the input image to RGB, before passing it to the transforms, so it is
possible to accommodate a wider range of input types (e.g. 16-bit PNG
images).
Parameters
----------
path : str
full path to root of dataset
glob : str
glob that can be used to filter-down files to be loaded on the provided
path
transforms : :py:class:`list`, Optional
a list of transformations to be applied to **both** image and
ground-truth data. Notice that image changing transformations such as
:py:class:`.transforms.ColorJitter` are only applied to the image and
**not** to ground-truth. Also notice a last transform
(:py:class:`bob.ip.binseg.data.transforms.ToTensor`) is always applied.
"""
def __init__(self, path, glob="*", transforms=[]):
self.transform = Compose(transforms + [ToTensor()])
self.path = path
self.data = _find_files(path, glob)
def __len__(self):
"""
Returns
-------
int
size of the dataset
"""
return len(self.data)
def __getitem__(self, index):
"""
Parameters
----------
index : int
Returns
-------
sample : list
[name, img]
"""
sample = [Image.open(self.data[index])]
if self.transform:
sample = self.transform(*sample)
return [self.data[index].relative_to(self.path).as_posix()] + sample
@@ -24,23 +24,31 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker

_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.hrf.datadir", os.path.realpath(os.curdir)
)

def _loader(context, sample):
    # "context" is ignored in this case - database is homogeneous
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_1(os.path.join(_root_path, sample["label"])),
        mask=load_pil_1(os.path.join(_root_path, sample["mask"])),
    )

dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label", "mask"),
    loader=_loader,
    keymaker=data_path_keymaker,
)
"""HRF dataset object"""
@@ -27,35 +27,44 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker
from ..utils import invert_mode1_image, subtract_mode1_images

_protocols = [
    pkg_resources.resource_filename(__name__, "vessel.json"),
    pkg_resources.resource_filename(__name__, "optic-disc.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.iostar.datadir", os.path.realpath(os.curdir)
)

def _loader(context, sample):
    retval = dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_1(os.path.join(_root_path, sample["label"])),
        mask=load_pil_1(os.path.join(_root_path, sample["mask"])),
    )
    if context["protocol"] == "optic-disc":
        # For optic-disc analysis, the label provided by IOSTAR raw data is the
        # "inverted" (negative) label, and does not consider the mask region,
        # which must be subtracted.  We do this special manipulation here.
        retval["label"] = subtract_mode1_images(
            invert_mode1_image(retval["label"]),
            invert_mode1_image(retval["mask"]),
        )
        return retval
    elif context["protocol"] == "vessel":
        return retval
    raise RuntimeError(f"Unknown protocol {context['protocol']}")

dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label", "mask"),
    loader=_loader,
    keymaker=data_path_keymaker,
)
"""IOSTAR dataset object"""
@@ -5,6 +5,7 @@
"""Data loading code"""

import os

import PIL.Image

@@ -48,3 +49,34 @@ def load_pil_1(path):
    """
    return PIL.Image.open(path).convert(mode="1", dither=None)


def data_path_keymaker(context, sample):
    """Returns a path without extension as a key

    This method assumes ``sample`` contains at least one entry named
    ``data``, holding the path to the sample's raw data.  It returns that
    path, stripped of its extension.


    Parameters
    ----------

    context : dict
        Context dictionary with entries (``protocol``, ``subset``), depending
        on the context

    sample : dict
        A dictionary that maps field names to sample entries from the
        original dataset.


    Returns
    -------

    key : str
        A string that uniquely identifies the sample within a given context

    """

    return os.path.splitext(sample["data"])[0]
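A small illustration of the expected behaviour of this keymaker (the paths
are made up):

.. code-block:: python

   context = dict(protocol="default", subset="train")
   sample = dict(data="images/img_01.png", label="ground-truth/gt_01.png")
   assert data_path_keymaker(context, sample) == "images/img_01"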
@@ -37,8 +37,8 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, data_path_keymaker

_protocols = {
    "optic-disc": pkg_resources.resource_filename(__name__, "default.json"),

@@ -50,8 +50,8 @@ _root_path = bob.extension.rc.get('bob.ip.binseg.refuge.datadir',
def _loader(context, sample):
    retval = dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])),
        label=load_pil_rgb(os.path.join(_root_path, sample["label"])),
    )
    if context["subset"] == "train":

@@ -77,5 +77,6 @@ def _loader(context, sample):
    return retval

dataset = JSONDataset(protocols=_protocols, fieldnames=("data", "label"),
                      loader=_loader, keymaker=data_path_keymaker)
"""REFUGE dataset object"""
@@ -25,28 +25,40 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker

_protocols = [
    pkg_resources.resource_filename(__name__, "optic-disc-exp1.json"),
    pkg_resources.resource_filename(__name__, "optic-cup-exp1.json"),
    pkg_resources.resource_filename(__name__, "optic-disc-exp2.json"),
    pkg_resources.resource_filename(__name__, "optic-cup-exp2.json"),
    pkg_resources.resource_filename(__name__, "optic-disc-avg.json"),
    pkg_resources.resource_filename(__name__, "optic-cup-avg.json"),
]

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.rimoner3.datadir", os.path.realpath(os.curdir)
)

def _loader(context, sample):
    # RIM-ONE r3 provides stereo images - we clip them here to get only the
    # left part of the image, which is also annotated
    return dict(
        data=load_pil_rgb(os.path.join(_root_path, sample["data"])).crop(
            (0, 0, 1072, 1424)
        ),
        label=load_pil_1(os.path.join(_root_path, sample["label"])).crop(
            (0, 0, 1072, 1424)
        ),
    )

dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label"),
    loader=_loader,
    keymaker=data_path_keymaker,
)
"""RIM-ONE r3 dataset object"""
@@ -29,23 +29,35 @@ import pkg_resources
import bob.extension

from ..dataset import JSONDataset
from ..loader import load_pil_rgb, load_pil_1, data_path_keymaker

_protocols = [
    pkg_resources.resource_filename(__name__, "default.json"),
    pkg_resources.resource_filename(__name__, "second-annotation.json"),
]

_fieldnames = ("data", "label")

_root_path = bob.extension.rc.get(
    "bob.ip.binseg.stare.datadir", os.path.realpath(os.curdir)
)

def _make_loader(root_path):
    def _loader(context, sample):
        # "context" is ignored in this case - database is homogeneous
        return dict(
            data=load_pil_rgb(os.path.join(root_path, sample["data"])),
            label=load_pil_1(os.path.join(root_path, sample["label"])),
        )
    return _loader

dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=_fieldnames,
    loader=_make_loader(_root_path),
    keymaker=data_path_keymaker,
)
"""STARE dataset object"""
@@ -5,10 +5,13 @@
import tempfile
import logging

logger = logging.getLogger(__name__)

TESTDB_TMPDIR = None
_URL = (
    "http://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/_testdb.zip"
)
_RCKEY = "bob.ip.binseg.stare.datadir"

@@ -24,40 +27,54 @@ def _mock_test_skipper(name):
    Dummy decorator that does nothing
    """
    import functools

    def wrapped_function(test):
        @functools.wraps(test)
        def wrapper(*args, **kwargs):
            return test(*args, **kwargs)
        return wrapper

    return wrapped_function


def mock_dataset():
    global TESTDB_TMPDIR
    from bob.extension import rc

    if (TESTDB_TMPDIR is not None) or (_RCKEY in rc):
        logger.info("Test database already set up - not downloading")
    else:
        logger.info("Test database not available, downloading...")
        import zipfile
        import urllib.request

        # Download the file from `url` and save it locally under `file_name`:
        with urllib.request.urlopen(_URL) as r, tempfile.TemporaryFile() as f:
            f.write(r.read())
            f.flush()
            f.seek(0)
            TESTDB_TMPDIR = tempfile.TemporaryDirectory(prefix=__name__ + "-")
            logger.info(f"Creating test database at {TESTDB_TMPDIR.name}...")
            with zipfile.ZipFile(f) as zf:
                zf.extractall(TESTDB_TMPDIR.name)

    from ..data import stare

    if TESTDB_TMPDIR is None:
        # if the user has the STARE directory ready, then we do a normal
        # return
        from .utils import rc_variable_set

        return stare.dataset, rc_variable_set

    # else, we do a "mock" return
    return (
        stare.JSONDataset(
            stare._protocols,
            stare._fieldnames,
            stare._make_loader(TESTDB_TMPDIR.name),
            stare.data_path_keymaker,
        ),
        _mock_test_skipper,
    )
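A sketch of how this helper is typically consumed from a test module; the
import location and test body below are illustrative only:

.. code-block:: python

   from .mock_module import mock_dataset  # hypothetical module defining mock_dataset()

   stare_dataset, stare_skipper = mock_dataset()

   @stare_skipper("bob.ip.binseg.stare.datadir")
   def test_stare_protocols():
       subsets = stare_dataset.subsets("default")
       assert len(subsets) > 0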
@@ -15,9 +15,7 @@ Data Manipulation
.. autosummary::
   :toctree: api/data

   bob.ip.binseg.data.dataset
   bob.ip.binseg.data.loader
   bob.ip.binseg.data.sample
   bob.ip.binseg.data.utils
@@ -133,7 +131,6 @@ Datasets
   :template: config.rst

   bob.ip.binseg.configs.datasets.csv
   bob.ip.binseg.configs.datasets.utils
   bob.ip.binseg.configs.datasets.chasedb1
@@ -53,9 +53,8 @@ setup(
            "unet = bob.ip.binseg.configs.models.unet",
            "resunet = bob.ip.binseg.configs.models.resunet",
            # example datasets
            "csv-dataset-example = bob.ip.binseg.configs.datasets.csv",
            # drive dataset
            "drive = bob.ip.binseg.configs.datasets.drive",