diff --git a/bob/ip/binseg/configs/datasets/__init__.py b/bob/ip/binseg/configs/datasets/__init__.py
index 436796ffb94e196dc4cfe3e4af2164f0c6fca3c2..cd6d9b34db1ac30aff072a947465a016d413b37f 100644
--- a/bob/ip/binseg/configs/datasets/__init__.py
+++ b/bob/ip/binseg/configs/datasets/__init__.py
@@ -20,9 +20,17 @@ RANDOM_FLIP_JITTER = [_hflip(), _vflip(), _jitter()]
 """Shared data augmentation transforms without random rotation"""
 
 
-def make_subset(l, transforms, prefixes, suffixes):
+def make_subset(l, transforms, prefixes=[], suffixes=[]):
     """Creates a new data set, applying transforms
 
+    .. note::
+
+       This is a convenience function for our own dataset definitions inside
+       this module, guaranteeing homogeneity between dataset definitions
+       provided in this package.  It assumes certain strategies for data
+       augmentation that may not be translatable to other applications.
+
+
     Parameters
     ----------
 
@@ -44,24 +52,31 @@ def make_subset(l, transforms, prefixes, suffixes):
     Returns
     -------
 
-    subset : :py:class:`torch.utils.data.Dataset`
+    subset : :py:class:`bob.ip.binseg.data.utils.SampleListDataset`
         A pre-formatted dataset that can be fed to one of our engines
 
     """
 
-    from ...data.utils import SampleList2TorchDataset as wrapper
+    from ...data.utils import SampleListDataset as wrapper
 
-    return wrapper(l, transforms, prefixes, suffixes)
+    return wrapper(l, prefixes + transforms + suffixes)
 
 
 def make_trainset(l, transforms, rotation_before=False):
-    """Creates a new training set, with data augmentation
+    """Creates a new training set, **with data augmentation**
 
     Typically, the transforms are chained to a default set of data augmentation
     operations (random rotation, horizontal and vertical flips, and color
     jitter), but flag allows prefixing the rotation specially (useful for some
     COVD training sets).
 
+    .. note::
+
+       This is a convenience function for our own dataset definitions inside
+       this module, guaranteeing homogeneity between dataset definitions
+       provided in this package.  It assumes certain strategies for data
+       augmentation that may not be translatable to other applications.
+
 
     Parameters
     ----------
@@ -76,7 +91,7 @@ def make_trainset(l, transforms, rotation_before=False):
     Returns
     -------
 
-    subset : :py:class:`torch.utils.data.Dataset`
+    subset : :py:class:`bob.ip.binseg.data.utils.SampleListDataset`
         A pre-formatted dataset that can be fed to one of our engines
 
     """
@@ -91,8 +106,7 @@ def make_trainset(l, transforms, rotation_before=False):
 
     return make_subset(
         l,
-        transforms,
-        prefixes=[],
+        transforms=transforms,
         suffixes=(RANDOM_ROTATION + RANDOM_FLIP_JITTER),
     )
 
@@ -101,13 +115,32 @@ def make_dataset(subsets, transforms):
     """Creates a new configuration dataset from dictionary and transforms
 
     This function takes as input a dictionary as those that can be returned by
-    :py:meth:`bob.ip.binseg.data.dataset.JSONDataset.subsets`, mapping protocol
+    :py:meth:`bob.ip.binseg.data.dataset.JSONDataset.subsets`,  or
+    :py:meth:`bob.ip.binseg.data.dataset.CSVDataset.subsets`, mapping protocol
     names (such as ``train``, ``dev`` and ``test``) to
     :py:class:`bob.ip.binseg.data.sample.DelayedSample` lists, and a set of
     transforms, and returns a dictionary applying
-    :py:class:`bob.ip.binseg.data.utils.SampleList2TorchDataset` to these
+    :py:class:`bob.ip.binseg.data.utils.SampleListDataset` to these
     lists, and our standard data augmentation if a ``train`` set exists.
 
+    For example, if ``subsets`` is composed of two sets named ``train`` and
+    ``test``, this function will yield a dictionary with the following entries:
+
+    * ``__train__``: Wraps the ``train`` subset, includes data augmentation
+      (note: datasets with names starting with ``_`` (underscore) are excluded
+      from prediction and evaluation by default, as they contain data
+      augmentation transformations.)
+    * ``train``: Wraps the ``train`` subset, **without** data augmentation
+    * ``test``: Wraps the ``test`` subset, **without** data augmentation
+
+    .. note::
+
+       This is a convenience function for our own dataset definitions inside
+       this module, guaranteeing homogeneity between dataset definitions
+       provided in this package.  It assumes certain strategies for data
+       augmentation that may not be translatable to other applications.
+
+
     Parameters
     ----------
 
@@ -126,20 +159,18 @@ def make_dataset(subsets, transforms):
 
     dataset : dict
         A pre-formatted dataset that can be fed to one of our engines. It maps
-        string names to :py:class:`torch.utils.data.Dataset`'s.
+        string names to
+        :py:class:`bob.ip.binseg.data.utils.SampleListDataset`'s.
 
     """
 
     retval = {}
 
     for key in subsets.keys():
+        retval[key] = make_subset(subsets[key], transforms=transforms)
         if key == "train":
-            retval[key] = make_trainset(
+            retval["__train__"] = make_trainset(
                 subsets[key], transforms=transforms, rotation_before=False
             )
-        else:
-            retval[key] = make_subset(
-                subsets[key], transforms=transforms, prefixes=[], suffixes=[]
-            )
 
     return retval
diff --git a/bob/ip/binseg/configs/datasets/chasedb1/covd.py b/bob/ip/binseg/configs/datasets/chasedb1/covd.py
index e9cd82e20f6d2faa2189833ca34b7105e716cf83..ed8c37aff14036c88fc5949820efb733e802e4ae 100644
--- a/bob/ip/binseg/configs/datasets/chasedb1/covd.py
+++ b/bob/ip/binseg/configs/datasets/chasedb1/covd.py
@@ -20,6 +20,7 @@ from bob.ip.binseg.data.transforms import CenterCrop, Pad, Resize
 from bob.ip.binseg.configs.datasets import make_trainset as _maker
 
 from bob.ip.binseg.data.drive import dataset as _raw_drive
+
 _drive = _maker(
     _raw_drive.subsets("default")["train"],
     [CenterCrop((544, 544)), Resize(960)],
@@ -27,6 +28,7 @@ _drive = _maker(
 )
 
 from bob.ip.binseg.data.stare import dataset as _raw_stare
+
 # n.b.: not the best fit, but what was there for Tim's work
 _stare = _maker(
     _raw_stare.subsets("ah")["train"],
@@ -35,20 +37,21 @@ _stare = _maker(
 )
 
 from bob.ip.binseg.data.hrf import dataset as _raw_hrf
+
 _hrf = _maker(
     _raw_hrf.subsets("default")["train"], [Pad((0, 584, 0, 584)), Resize(960)],
 )
 
 from bob.ip.binseg.data.iostar import dataset as _raw_iostar
+
 # n.b.: not the best fit, but what was there for Tim's work
 _iostar = _maker(_raw_iostar.subsets("vessel")["train"], [Resize(960)])
 
 from torch.utils.data import ConcatDataset
 from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
-    dataset as _baselines,
+    dataset as _baseline,
 )
 
-dataset = {
-    "train": ConcatDataset([_drive, _stare, _hrf, _iostar]),
-    "test": _baselines["test"],  # use the same test set always
-}
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_baseline)
+dataset["__train__"] = ConcatDataset([_drive, _stare, _hrf, _iostar])
diff --git a/bob/ip/binseg/configs/datasets/chasedb1/ssl.py b/bob/ip/binseg/configs/datasets/chasedb1/ssl.py
index d65408e7520a667c41a025b90593eac51ebd4d7a..8bd97ddad2e8a468ec78d20c99b79240deec4f11 100644
--- a/bob/ip/binseg/configs/datasets/chasedb1/ssl.py
+++ b/bob/ip/binseg/configs/datasets/chasedb1/ssl.py
@@ -19,13 +19,12 @@ For details on datasets, consult:
 * :py:mod:`bob.ip.binseg.data.hrf`
 """
 
-from bob.ip.binseg.configs.datasets.chasedb1.covd import dataset as _labelled
+from bob.ip.binseg.configs.datasets.chasedb1.covd import dataset as _covd
 from bob.ip.binseg.configs.datasets.chasedb1.first_annotator import (
-    dataset as _baselines,
+    dataset as _baseline,
 )
 from bob.ip.binseg.data.utils import SSLDataset
 
-dataset = {
-    "train": SSLDataset(_labelled["train"], _baselines["train"]),
-    "test": _baselines["test"],  # use always the same test set
-}
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_covd)
+dataset["__train__"] = SSLDataset(_covd["__train__"], _baseline["__train__"])
diff --git a/bob/ip/binseg/configs/datasets/drive/covd.py b/bob/ip/binseg/configs/datasets/drive/covd.py
index 20b4c45718bbd5efa623de13e31d52f43abd10d1..494ca9a9960d82fb3508d6a31238ef7810b58e60 100644
--- a/bob/ip/binseg/configs/datasets/drive/covd.py
+++ b/bob/ip/binseg/configs/datasets/drive/covd.py
@@ -20,34 +20,33 @@ from bob.ip.binseg.data.transforms import Resize, Pad, Crop
 from bob.ip.binseg.configs.datasets import make_trainset as _maker
 
 from bob.ip.binseg.data.stare import dataset as _raw_stare
+
 _stare = _maker(
-        _raw_stare.subsets("ah")["train"],
-        [Resize(471), Pad((0, 37, 0, 36))],
-        rotation_before=True,
-        )
+    _raw_stare.subsets("ah")["train"],
+    [Resize(471), Pad((0, 37, 0, 36))],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.chasedb1 import dataset as _raw_chase
+
 _chase = _maker(
-        _raw_chase.subsets("first-annotator")["train"],
-        [Resize(544), Crop(0, 12, 544, 544)],
-        )
+    _raw_chase.subsets("first-annotator")["train"],
+    [Resize(544), Crop(0, 12, 544, 544)],
+)
 
 from bob.ip.binseg.data.iostar import dataset as _raw_iostar
-_iostar = _maker(
-        _raw_iostar.subsets("vessel")["train"],
-        [Resize(544)],
-        )
+
+_iostar = _maker(_raw_iostar.subsets("vessel")["train"], [Resize(544)],)
 
 from bob.ip.binseg.data.hrf import dataset as _raw_hrf
+
 _hrf = _maker(
-        _raw_hrf.subsets("default")["train"],
-        [Resize((363)), Pad((0, 90, 0, 91))],
-        )
+    _raw_hrf.subsets("default")["train"], [Resize((363)), Pad((0, 90, 0, 91))],
+)
 
 from torch.utils.data import ConcatDataset
-from bob.ip.binseg.configs.datasets.drive.default import dataset as _baselines
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _baseline
 
-dataset = {
-        "train": ConcatDataset([_stare, _chase, _iostar, _hrf]),
-        "test": _baselines["test"],  #use the same test set always
-        }
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_baseline)
+dataset["__train__"] = ConcatDataset([_stare, _chase, _iostar, _hrf])
diff --git a/bob/ip/binseg/configs/datasets/drive/ssl.py b/bob/ip/binseg/configs/datasets/drive/ssl.py
index edb76e6ae26baffaedd0ce46cba81880c2952c20..23af544342f1a48a8e83f87d9041d638f58ed6cf 100644
--- a/bob/ip/binseg/configs/datasets/drive/ssl.py
+++ b/bob/ip/binseg/configs/datasets/drive/ssl.py
@@ -18,11 +18,10 @@ For details on datasets, consult:
 * :py:mod:`bob.ip.binseg.data.hrf`
 """
 
-from bob.ip.binseg.configs.datasets.drive.covd import dataset as _labelled
-from bob.ip.binseg.configs.datasets.drive.default import dataset as _baselines
+from bob.ip.binseg.configs.datasets.drive.covd import dataset as _covd
+from bob.ip.binseg.configs.datasets.drive.default import dataset as _baseline
 from bob.ip.binseg.data.utils import SSLDataset
 
-dataset = {
-        "train": SSLDataset(_labelled["train"], _baselines["train"]),
-        "test": _baselines["test"],  #use always the same test set
-        }
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_covd)
+dataset["__train__"] = SSLDataset(_covd["__train__"], _baseline["__train__"])
diff --git a/bob/ip/binseg/configs/datasets/hrf/covd.py b/bob/ip/binseg/configs/datasets/hrf/covd.py
index 06610544b85216ec4a49e362a03b38edd1072b43..792a005b6c08fee3b0dbab1cea07e1379c7cf056 100644
--- a/bob/ip/binseg/configs/datasets/hrf/covd.py
+++ b/bob/ip/binseg/configs/datasets/hrf/covd.py
@@ -20,37 +20,40 @@ from bob.ip.binseg.data.transforms import Crop, Pad, Resize
 from bob.ip.binseg.configs.datasets import make_trainset as _maker
 
 from bob.ip.binseg.data.drive import dataset as _raw_drive
+
 _drive = _maker(
-        _raw_drive.subsets("default")["train"],
-        [Crop(75, 10, 416, 544), Pad((21, 0, 22, 0)), Resize(1168)],
-        rotation_before=True,
-        )
+    _raw_drive.subsets("default")["train"],
+    [Crop(75, 10, 416, 544), Pad((21, 0, 22, 0)), Resize(1168)],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.stare import dataset as _raw_stare
+
 _stare = _maker(
-        _raw_stare.subsets("ah")["train"],
-        [Crop(50, 0, 500, 705), Resize(1168), Pad((1, 0, 1, 0))],
-        rotation_before=True,
-        )
+    _raw_stare.subsets("ah")["train"],
+    [Crop(50, 0, 500, 705), Resize(1168), Pad((1, 0, 1, 0))],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.chasedb1 import dataset as _raw_chase
+
 _chase = _maker(
-        _raw_chase.subsets("first-annotator")["train"],
-        [Crop(140, 18, 680, 960), Resize(1168)],
-        rotation_before=True,
-        )
+    _raw_chase.subsets("first-annotator")["train"],
+    [Crop(140, 18, 680, 960), Resize(1168)],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.iostar import dataset as _raw_iostar
+
 _iostar = _maker(
-        _raw_iostar.subsets("vessel")["train"],
-        [Crop(144, 0, 768, 1024), Pad((30, 0, 30, 0)), Resize(1168)],
-        rotation_before=True,
-        )
+    _raw_iostar.subsets("vessel")["train"],
+    [Crop(144, 0, 768, 1024), Pad((30, 0, 30, 0)), Resize(1168)],
+    rotation_before=True,
+)
 
 from torch.utils.data import ConcatDataset
-from bob.ip.binseg.configs.datasets.hrf.default import dataset as _baselines
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _baseline
 
-dataset = {
-        "train": ConcatDataset([_drive, _stare, _chase, _iostar]),
-        "test": _baselines["test"],  #use the same test set always
-        }
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_baseline)
+dataset["__train__"] = ConcatDataset([_drive, _stare, _chase, _iostar])
diff --git a/bob/ip/binseg/configs/datasets/hrf/ssl.py b/bob/ip/binseg/configs/datasets/hrf/ssl.py
index dea93d248109d355e1025e3195b6e16a83e314fb..7f6f369e2510bb39acee083c2caa3ffa74855fa6 100644
--- a/bob/ip/binseg/configs/datasets/hrf/ssl.py
+++ b/bob/ip/binseg/configs/datasets/hrf/ssl.py
@@ -18,11 +18,10 @@ For details on datasets, consult:
 * :py:mod:`bob.ip.binseg.data.hrf`
 """
 
-from bob.ip.binseg.configs.datasets.hrf.covd import dataset as _labelled
-from bob.ip.binseg.configs.datasets.hrf.default import dataset as _baselines
+from bob.ip.binseg.configs.datasets.hrf.covd import dataset as _covd
+from bob.ip.binseg.configs.datasets.hrf.default import dataset as _baseline
 from bob.ip.binseg.data.utils import SSLDataset
 
-dataset = {
-    "train": SSLDataset(_labelled["train"], _baselines["train"]),
-    "test": _baselines["test"],  # use always the same test set
-}
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_covd)
+dataset["__train__"] = SSLDataset(_covd["__train__"], _baseline["__train__"])
diff --git a/bob/ip/binseg/configs/datasets/iostar/covd.py b/bob/ip/binseg/configs/datasets/iostar/covd.py
index e2a1b90b9b773d4ae0576698736bc3a4f351fcd6..e2f054feaa64222cce354265969f9e7937638be0 100644
--- a/bob/ip/binseg/configs/datasets/iostar/covd.py
+++ b/bob/ip/binseg/configs/datasets/iostar/covd.py
@@ -20,36 +20,38 @@ from bob.ip.binseg.data.transforms import CenterCrop, Crop, Pad, Resize
 from bob.ip.binseg.configs.datasets import make_trainset as _maker
 
 from bob.ip.binseg.data.drive import dataset as _raw_drive
+
 _drive = _maker(
-        _raw_drive.subsets("default")["train"],
-        [CenterCrop((540, 540)), Resize(1024)],
-        rotation_before=True,
-        )
+    _raw_drive.subsets("default")["train"],
+    [CenterCrop((540, 540)), Resize(1024)],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.stare import dataset as _raw_stare
+
 _stare = _maker(
-        _raw_stare.subsets("ah")["train"],
-        [Pad((0, 32, 0, 32)), Resize(1024), CenterCrop(1024)],
-        rotation_before=True,
-        )
+    _raw_stare.subsets("ah")["train"],
+    [Pad((0, 32, 0, 32)), Resize(1024), CenterCrop(1024)],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.hrf import dataset as _raw_hrf
+
 _hrf = _maker(
-        _raw_hrf.subsets("default")["train"],
-        [Pad((0, 584, 0, 584)), Resize(1024)],
-        )
+    _raw_hrf.subsets("default")["train"], [Pad((0, 584, 0, 584)), Resize(1024)],
+)
 
 from bob.ip.binseg.data.chasedb1 import dataset as _raw_chase
+
 _chase = _maker(
-        _raw_chase.subsets("first-annotator")["train"],
-        [Crop(0, 18, 960, 960), Resize(1024)],
-        rotation_before=True,
-        )
+    _raw_chase.subsets("first-annotator")["train"],
+    [Crop(0, 18, 960, 960), Resize(1024)],
+    rotation_before=True,
+)
 
 from torch.utils.data import ConcatDataset
-from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _baselines
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _baseline
 
-dataset = {
-        "train": ConcatDataset([_drive, _stare, _hrf, _chase]),
-        "test": _baselines["test"],  #use the same test set always
-        }
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_baseline)
+dataset["__train__"] = ConcatDataset([_drive, _stare, _hrf, _chase])
diff --git a/bob/ip/binseg/configs/datasets/iostar/ssl.py b/bob/ip/binseg/configs/datasets/iostar/ssl.py
index f8666c01c3bca498321d2758fd5aef65449a25f6..2635552ee87704cd8c370c56a22431f5faa6b151 100644
--- a/bob/ip/binseg/configs/datasets/iostar/ssl.py
+++ b/bob/ip/binseg/configs/datasets/iostar/ssl.py
@@ -18,11 +18,10 @@ For details on datasets, consult:
 * :py:mod:`bob.ip.binseg.data.iostar`
 """
 
-from bob.ip.binseg.configs.datasets.iostar.covd import dataset as _labelled
-from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _baselines
+from bob.ip.binseg.configs.datasets.iostar.covd import dataset as _covd
+from bob.ip.binseg.configs.datasets.iostar.vessel import dataset as _baseline
 from bob.ip.binseg.data.utils import SSLDataset
 
-dataset = {
-    "train": SSLDataset(_labelled["train"], _baselines["train"]),
-    "test": _baselines["test"],  # use always the same test set
-}
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_covd)
+dataset["__train__"] = SSLDataset(_covd["__train__"], _baseline["__train__"])
diff --git a/bob/ip/binseg/configs/datasets/stare/covd.py b/bob/ip/binseg/configs/datasets/stare/covd.py
index ffd402288e7b811247d4b5b74bd61133a9d7658b..0abbf93441a7f70036073841786ac06b61ca6528 100644
--- a/bob/ip/binseg/configs/datasets/stare/covd.py
+++ b/bob/ip/binseg/configs/datasets/stare/covd.py
@@ -20,36 +20,38 @@ from bob.ip.binseg.data.transforms import CenterCrop, Pad, Resize
 from bob.ip.binseg.configs.datasets import make_trainset as _maker
 
 from bob.ip.binseg.data.drive import dataset as _raw_drive
+
 _drive = _maker(
-        _raw_drive.subsets("default")["train"],
-        [CenterCrop((470, 544)), Pad((10, 9, 10, 8)), Resize(608)],
-        rotation_before=True,
-        )
+    _raw_drive.subsets("default")["train"],
+    [CenterCrop((470, 544)), Pad((10, 9, 10, 8)), Resize(608)],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.chasedb1 import dataset as _raw_chase
+
 _chase = _maker(
-        _raw_chase.subsets("first-annotator")["train"],
-        [CenterCrop((829, 960)), Resize(608)],
-        rotation_before=True,
-        )
+    _raw_chase.subsets("first-annotator")["train"],
+    [CenterCrop((829, 960)), Resize(608)],
+    rotation_before=True,
+)
 
 from bob.ip.binseg.data.iostar import dataset as _raw_iostar
+
 _iostar = _maker(
-        _raw_iostar.subsets("vessel")["train"],
-        # n.b.: not the best fit, but what was there for Tim's work
-        [Pad((81, 0, 81, 0)), Resize(608)],
-        )
+    _raw_iostar.subsets("vessel")["train"],
+    # n.b.: not the best fit, but what was there for Tim's work
+    [Pad((81, 0, 81, 0)), Resize(608)],
+)
 
 from bob.ip.binseg.data.hrf import dataset as _raw_hrf
+
 _hrf = _maker(
-        _raw_hrf.subsets("default")["train"],
-        [Pad((0, 345, 0, 345)), Resize(608)],
-        )
+    _raw_hrf.subsets("default")["train"], [Pad((0, 345, 0, 345)), Resize(608)],
+)
 
 from torch.utils.data import ConcatDataset
-from bob.ip.binseg.configs.datasets.stare.ah import dataset as _baselines
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _baseline
 
-dataset = {
-        "train": ConcatDataset([_drive, _chase, _iostar, _hrf]),
-        "test": _baselines["test"],  #use the same test set always
-        }
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_baseline)
+dataset["__train__"] = ConcatDataset([_drive, _chase, _iostar, _hrf])
diff --git a/bob/ip/binseg/configs/datasets/stare/ssl.py b/bob/ip/binseg/configs/datasets/stare/ssl.py
index e3047254ec12397eb83a2062bf4c7c401cc69ba0..10440efd9bb35db7b499c860772c272b01bcc1f6 100644
--- a/bob/ip/binseg/configs/datasets/stare/ssl.py
+++ b/bob/ip/binseg/configs/datasets/stare/ssl.py
@@ -18,11 +18,10 @@ For details on datasets, consult:
 * :py:mod:`bob.ip.binseg.data.hrf`
 """
 
-from bob.ip.binseg.configs.datasets.stare.covd import dataset as _labelled
-from bob.ip.binseg.configs.datasets.stare.ah import dataset as _baselines
+from bob.ip.binseg.configs.datasets.stare.covd import dataset as _covd
+from bob.ip.binseg.configs.datasets.stare.ah import dataset as _baseline
 from bob.ip.binseg.data.utils import SSLDataset
 
-dataset = {
-        "train": SSLDataset(_labelled["train"], _baselines["train"]),
-        "test": _baselines["test"],  #use always the same test set
-        }
+# copy dictionary and replace only the augmented train dataset
+dataset = dict(**_covd)
+dataset["__train__"] = SSLDataset(_covd["__train__"], _baseline["__train__"])
diff --git a/bob/ip/binseg/data/utils.py b/bob/ip/binseg/data/utils.py
index 17332979a780978423dcadf0fbbb2cff2dfc7216..3d77e9c37e4888adcaa852ea31a7085648437140 100644
--- a/bob/ip/binseg/data/utils.py
+++ b/bob/ip/binseg/data/utils.py
@@ -116,7 +116,7 @@ def overlayed_image(
     return retval
 
 
-class SampleList2TorchDataset(torch.utils.data.Dataset):
+class SampleListDataset(torch.utils.data.Dataset):
     """PyTorch dataset wrapper around Sample lists
 
     A transform object can be passed that will be applied to the image, ground
@@ -125,13 +125,6 @@ class SampleList2TorchDataset(torch.utils.data.Dataset):
     It supports indexing such that dataset[i] can be used to get ith sample.
 
 
-    Attributes
-    ----------
-
-    augmented : bool
-        Tells if this set has data augmentation prefixes or suffixes installed.
-
-
     Parameters
     ----------
 
@@ -143,28 +136,12 @@ class SampleList2TorchDataset(torch.utils.data.Dataset):
         ground-truth data.  Notice a last transform
         (:py:class:`bob.ip.binseg.data.transforms.ToTensor`) is always applied.
 
-    prefixes : :py:class:`list`, Optional
-        a list of data augmentation transformations to be applied to **both**
-        image and ground-truth data and **before** ``transforms`` above.
-        Notice that transforms like
-        :py:class:`bob.ip.binseg.data.transforms.ColorJitter` are only applied
-        to the input image.
-
-    suffixes : :py:class:`list`, Optional
-        a list of data augmentation transformations to be applied to **both**
-        image and ground-truth data and **after** ``transforms`` above.
-        Notice that transforms like
-        :py:class:`bob.ip.binseg.data.transforms.ColorJitter` are only applied
-        to the input image.
-
     """
 
-    def __init__(self, samples, transforms=[], prefixes=[], suffixes=[]):
+    def __init__(self, samples, transforms=[]):
 
         self._samples = samples
-        self._middle = transforms
-        self._transforms = Compose(prefixes + transforms + suffixes + [ToTensor()])
-        self.augmented = bool(prefixes or suffixes)
+        self._transforms = Compose(transforms + [ToTensor()])
 
     def __len__(self):
         """
@@ -178,18 +155,6 @@ class SampleList2TorchDataset(torch.utils.data.Dataset):
         """
         return len(self._samples)
 
-    @contextlib.contextmanager
-    def not_augmented(self):
-        """Context to avoid data augmentation to be applied to self"""
-
-        backup = (self.augmented, self._transforms)
-        self.augmented = False
-        self._transforms = Compose(self._middle + [ToTensor()])
-        try:
-            yield self
-        finally:
-            self.augmented, self._transforms = backup
-
     def __getitem__(self, key):
         """
 
diff --git a/bob/ip/binseg/engine/ssltrainer.py b/bob/ip/binseg/engine/ssltrainer.py
index d7310ed057de296eadaffd6833891dd296290257..2448782cc6b1b00965a4974af13f58e36a0dd0fc 100644
--- a/bob/ip/binseg/engine/ssltrainer.py
+++ b/bob/ip/binseg/engine/ssltrainer.py
@@ -335,7 +335,6 @@ def run(
             logwriter.writerow(dict(k for k in logdata))
             logger.info("|".join([f"{k}: {v}" for (k, v) in logdata]))
 
-        logger.info("End of training")
         total_training_time = time.time() - start_training_time
         logger.info(
             f"Total training time: {datetime.timedelta(seconds=total_training_time)} ({(total_training_time/max_epoch):.4f}s in average per epoch)"
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index dee2d6287dad0f4481b69ede9b25f0dfd52236b1..783d5dcb1643301fbcc862f7a5d3fcf4d1107f2f 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -176,7 +176,6 @@ def run(
             logwriter.writerow(dict(k for k in logdata))
             logger.info("|".join([f"{k}: {v}" for (k, v) in logdata]))
 
-        logger.info("End of training")
         total_training_time = time.time() - start_training_time
         logger.info(
             f"Total training time: {datetime.timedelta(seconds=total_training_time)} ({(total_training_time/max_epoch):.4f}s in average per epoch)"
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index 8c8e575fcb65e85ab588c7a622bf7f03d13ec360..5a27eaee91d28ae4e23ddea28e4ea06e79e2ff92 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -59,13 +59,11 @@ logger = logging.getLogger(__name__)
 @click.option(
     "--dataset",
     "-d",
-    help="A bob.ip.binseg.data.utils.SampleList2TorchDataset instance "
-    "implementing a dataset to be used for evaluation purposes, possibly "
-    "including all pre-processing pipelines required or, optionally, a "
-    "dictionary mapping string keys to "
-    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  In such a case, "
-    "all datasets will be used for evaluation.  Data augmentation "
-    "operations are excluded automatically in this case",
+    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
+    "to be used for evaluation purposes, possibly including all pre-processing "
+    "pipelines required or, optionally, a dictionary mapping string keys to "
+    "torch.utils.data.dataset.Dataset instances.  All keys that do not start "
+    "with an underscore (_) will be processed.",
     required=True,
     cls=ResourceOption,
 )
@@ -74,7 +72,8 @@ logger = logging.getLogger(__name__)
     "-S",
     help="A dataset or dictionary, like in --dataset, with the same "
     "sample keys, but with annotations from a different annotator that is "
-    "going to be compared to the one in --dataset",
+    "going to be compared to the one in --dataset.  The same rules regarding "
+    "dataset naming conventions apply",
     required=False,
     default=None,
     cls=ResourceOption,
@@ -145,6 +144,9 @@ def evaluate(
         }
     else:
         for k, v in dataset.items():
+            if k.startswith("_"):
+                logger.info(f"Skipping dataset '{k}' (not to be evaluated)")
+                continue
             config[k] = {
                 "dataset": v,
                 "output_folder": os.path.join(output_folder, k),
@@ -155,16 +157,17 @@ def evaluate(
             }
 
     for k, v in config.items():
-        with v["dataset"].not_augmented() as d:
-            run(
-                d,
-                predictions_folder,
-                v["output_folder"],
-                overlayed,
-                overlay_threshold,
+        run(
+            v["dataset"],
+            predictions_folder,
+            v["output_folder"],
+            overlayed,
+            overlay_threshold,
+        )
+        if v["second_annotator"] is not None:
+            compare_annotators(
+                v["dataset"],
+                v["second_annotator"],
+                v["second_annotator_folder"],
+                os.path.join(overlayed, "second-annotator"),
             )
-            if v["second_annotator"] is not None:
-                with v["second_annotator"].not_augmented() as d2:
-                    compare_annotators(
-                        d, d2, v["second_annotator_folder"], overlayed
-                    )
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index 6882d5b931bf69399d48bc83956ce78b459c4de0..a4c74d4594095d19d538bf6e916e3e879b2b139a 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -16,6 +16,38 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+def _save_sh_command(destfile):
+    """Records command-line to reproduce this experiment"""
+
+    import sys
+    import time
+    import pkg_resources
+
+    dirname = os.path.dirname(destfile)
+
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    logger.info(f"Writing command-line for reproduction at '{destfile}'...")
+
+    with open(destfile, "wt") as f:
+        f.write("#!/usr/bin/env sh\n")
+        f.write(f"# date: {time.asctime()}\n")
+        version = pkg_resources.require('bob.ip.binseg')[0].version
+        f.write(f"# version: {version} (bob.ip.binseg)\n")
+        f.write(f"# platform: {sys.platform}\n")
+        f.write("\n")
+        args = []
+        for k in sys.argv:
+            if " " in k: args.append(f'"{k}"')
+            else: args.append(k)
+        if os.environ.get('CONDA_DEFAULT_ENV') is not None:
+            f.write(f"#conda activate {os.environ['CONDA_DEFAULT_ENV']}\n")
+        f.write(f"#cd {os.path.realpath(os.curdir)}\n")
+        f.write(" ".join(args) + "\n")
+    os.chmod(destfile, 0o755)
+
+
 @click.command(
     entry_point_group="bob.ip.binseg.config",
     cls=ConfigCommand,
@@ -248,12 +280,15 @@ def experiment(
 
     """
 
+    _save_sh_command(os.path.join(output_folder, "command.sh"))
+
     ## Training
     logger.info("Started training")
 
     from .train import train
 
     train_output_folder = os.path.join(output_folder, "model")
+
     ctx.invoke(
         train,
         model=model,
@@ -283,7 +318,7 @@ def experiment(
     model_file = os.path.join(train_output_folder, "model_final.pth")
     predictions_folder = os.path.join(output_folder, "predictions")
     overlayed_folder = (
-        os.path.join(output_folder, "overlayed", "probabilities")
+        os.path.join(output_folder, "overlayed", "predictions")
         if overlayed
         else None
     )
@@ -336,9 +371,15 @@ def experiment(
 
     systems = []
     for k, v in dataset.items():
+        if k.startswith("_"):
+            logger.info(f"Skipping dataset '{k}' (not to be compared)")
+            continue
         systems += [k, os.path.join(analysis_folder, k, "metrics.csv")]
     if second_annotator is not None:
         for k, v in second_annotator.items():
+            if k.startswith("_"):
+                logger.info(f"Skipping dataset '{k}' (not to be compared)")
+                continue
             systems += [f"{k} (2nd. annot.)",
                     os.path.join(second_annotator_folder, k, "metrics.csv")]
     output_pdf = os.path.join(output_folder, "comparison.pdf")
diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index 41419ece1c3a32f41b46310e65d31b3f58ad8f5e..bf988ec4a61f8c62cb73d88d347fabd3e7608c54 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -61,13 +61,11 @@ logger = logging.getLogger(__name__)
 @click.option(
     "--dataset",
     "-d",
-    help="A bob.ip.binseg.data.utils.SampleList2TorchDataset instance "
-    "implementing a dataset to be used for running prediction, possibly "
-    "including all pre-processing pipelines required or, optionally, a "
-    "dictionary mapping string keys to "
-    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  In such a case, "
-    "all datasets will be used for running prediction.  Data augmentation "
-    "operations are excluded automatically for prediction purposes",
+    help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
+    "to be used for running prediction, possibly including all pre-processing "
+    "pipelines required or, optionally, a dictionary mapping string keys to "
+    "torch.utils.data.dataset.Dataset instances.  All keys that do not start "
+    "with an underscore (_) will be processed.",
     required=True,
     cls=ResourceOption,
 )
@@ -129,11 +127,15 @@ def predict(output_folder, model, dataset, batch_size, device, weight,
         overlayed = overlayed.strip()
 
     for k,v in dataset.items():
-        with v.not_augmented() as d:  # we remove any data augmentation
-            data_loader = DataLoader(
-                dataset=d,
-                batch_size=batch_size,
-                shuffle=False,
-                pin_memory=torch.cuda.is_available(),
-            )
-            run(model, data_loader, device, output_folder, overlayed)
+
+        if k.startswith("_"):
+            logger.info(f"Skipping dataset '{k}' (not to be evaluated)")
+            continue
+
+        data_loader = DataLoader(
+            dataset=v,
+            batch_size=batch_size,
+            shuffle=False,
+            pin_memory=torch.cuda.is_available(),
+        )
+        run(model, data_loader, device, output_folder, overlayed)
diff --git a/bob/ip/binseg/script/train.py b/bob/ip/binseg/script/train.py
index 5df8ccfb9a6a5391761dcc490415f67931832a9f..3076aae4c9796fedb8ed009aea4ee6afb89edc85 100644
--- a/bob/ip/binseg/script/train.py
+++ b/bob/ip/binseg/script/train.py
@@ -66,10 +66,12 @@ logger = logging.getLogger(__name__)
     help="A torch.utils.data.dataset.Dataset instance implementing a dataset "
     "to be used for training the model, possibly including all pre-processing "
     "pipelines required or, optionally, a dictionary mapping string keys to "
-    "bob.ip.binseg.data.utils.SampleList2TorchDataset's.  At least one key "
-    "named 'train' must be available.  This dataset will be used for training "
-    "the network model.  The dataset description include all required "
-    "pre-processing, including eventual data augmentation",
+    "torch.utils.data.dataset.Dataset instances.  At least one key "
+    "named ``train`` must be available.  This dataset will be used for "
+    "training the network model.  The dataset description must include all "
+    "required pre-processing, including eventual data augmentation.  If a "
+    "dataset named ``__train__`` is available, it is used preferentially for "
+    "training instead of ``train``.",
     required=True,
     cls=ResourceOption,
 )
@@ -224,9 +226,17 @@ def train(
 
     torch.manual_seed(seed)
 
+    use_dataset = dataset
+    if isinstance(dataset, dict):
+        if "__train__" in dataset:
+            logger.info("Found (dedicated) '__train__' set for training")
+            use_dataset = dataset["__train__"]
+        else:
+            use_dataset = dataset["train"]
+
     # PyTorch dataloader
     data_loader = DataLoader(
-        dataset=dataset["train"] if isinstance(dataset, dict) else dataset,
+        dataset=use_dataset,
         batch_size=batch_size,
         shuffle=True,
         drop_last=drop_incomplete_batch,
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index cbaeea2849a67dcf7160d071b0d965c2e2da1155..8979509553dca6bfa003a9c7246a92832e485c93 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -3,10 +3,14 @@
 
 """Tests for our CLI applications"""
 
+import os
 import re
+import fnmatch
 import tempfile
 import contextlib
 
+import nose.tools
+
 from click.testing import CliRunner
 
 from . import mock_dataset
@@ -36,7 +40,7 @@ def _assert_exit_0(result):
 
     assert (
         result.exit_code == 0
-    ), f"Exit code != 0 ({result.exit_code}); Output:\n{result.output}"
+    ), f"Exit code {result.exit_code} != 0 -- Output:\n{result.output}"
 
 
 def _check_help(entry_point):
@@ -60,17 +64,18 @@ def test_experiment_help():
 
 
 def _str_counter(substr, s):
-    return sum(1 for _ in re.finditer(r"\b%s\b" % re.escape(substr), s))
+    return sum(1 for _ in re.finditer(r"%s" % re.escape(substr), s))
 
 
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 def test_experiment_stare():
+
     from ..script.experiment import experiment
 
     runner = CliRunner()
-    with runner.isolated_filesystem(), \
-            stdout_logging() as buf, \
-            tempfile.NamedTemporaryFile(mode="wt") as config:
+    with runner.isolated_filesystem(), stdout_logging() as buf, tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config:
 
         # re-write STARE dataset configuration for test
         config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
@@ -82,16 +87,73 @@ def test_experiment_stare():
         config.write("second_annotator = _maker('vk', _raw)\n")
         config.flush()
 
+        output_folder = "results"
         result = runner.invoke(
             experiment,
-            ["m2unet", config.name, "-vv", "--epochs=1", "--batch-size=1",
-                "--overlayed"],
+            [
+                "m2unet",
+                config.name,
+                "-vv",
+                "--epochs=1",
+                "--batch-size=1",
+                "--overlayed",
+                f"--output-folder={output_folder}",
+            ],
         )
         _assert_exit_0(result)
+
+        # check command-line
+        assert os.path.exists(os.path.join(output_folder, "command.sh"))
+
+        # check model was saved
+        train_folder = os.path.join(output_folder, "model")
+        assert os.path.exists(os.path.join(train_folder, "model_final.pth"))
+        assert os.path.exists(os.path.join(train_folder, "last_checkpoint"))
+        assert os.path.exists(os.path.join(train_folder, "trainlog.csv"))
+
+        # check predictions are there
+        predict_folder = os.path.join(output_folder, "predictions")
+        assert os.path.exists(os.path.join(predict_folder, "model-info.txt"))
+        basedir = os.path.join(predict_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 20)
+
+        # check overlayed images are there (since we requested them)
+        overlay_folder = os.path.join(output_folder, "overlayed", "predictions")
+        basedir = os.path.join(overlay_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.png")), 20)
+
+        # check evaluation outputs
+        eval_folder = os.path.join(output_folder, "analysis")
+        second_folder = os.path.join(eval_folder, "second-annotator")
+        assert os.path.exists(os.path.join(eval_folder, "train", "metrics.csv"))
+        assert os.path.exists(os.path.join(eval_folder, "test", "metrics.csv"))
+        assert os.path.exists(os.path.join(second_folder, "train", "metrics.csv"))
+        assert os.path.exists(os.path.join(second_folder, "test", "metrics.csv"))
+
+        # check overlayed images are there (since we requested them)
+        overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
+        basedir = os.path.join(overlay_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.png")), 20)
+
+        # check overlayed images from first-to-second annotator comparisons are
+        # there (since we requested them)
+        overlay_folder = os.path.join(output_folder, "overlayed", "analysis",
+                "second-annotator")
+        basedir = os.path.join(overlay_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.png")), 20)
+
+        # check outcomes of the comparison phase
+        assert os.path.exists(os.path.join(output_folder, "comparison.pdf"))
+
         keywords = {  # from different logging systems
             "Started training": 1,  # logging
+            "Found (dedicated) '__train__' set for training": 1,  # logging
             "epoch: 1|total-time": 1,  # logging
-            "Saving checkpoint to results/model/model_final.pth": 1,  # logging
+            "Saving checkpoint": 1,  # logging
             "Ended training": 1,  # logging
             "Started prediction": 1,  # logging
             "Loading checkpoint from": 2,  # logging
@@ -103,7 +165,7 @@ def test_experiment_stare():
             # "Saving results/overlayed/analysis": 1,  #tqdm.write
             "Ended evaluation": 1,  # logging
             "Started comparison": 1,  # logging
-            "Loading metrics from results/analysis": 4,  # logging
+            "Loading metrics from": 4,  # logging
             "Ended comparison": 1,  # logging
         }
         buf.seek(0)
@@ -120,6 +182,231 @@ def test_experiment_stare():
             )
 
 
+def _check_train(runner):
+
+    from ..script.train import train
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
+        # single training set configuration
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)['train']\n")
+        config.flush()
+
+        output_folder = "results"
+        result = runner.invoke(
+            train,
+            ["m2unet", config.name, "-vv", "--epochs=1", "--batch-size=1",
+                f"--output-folder={output_folder}"],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists(os.path.join(output_folder, "model_final.pth"))
+        assert os.path.exists(os.path.join(output_folder, "last_checkpoint"))
+        assert os.path.exists(os.path.join(output_folder, "trainlog.csv"))
+
+        keywords = {  # from different logging systems
+            "Continuing from epoch 0": 1,  # logging
+            "epoch: 1|total-time": 1,  # logging
+            f"Saving checkpoint to {output_folder}/model_final.pth": 1,  # logging
+            "Total training time:": 1,  # logging
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            # if _str_counter(k, logging_output) != v:
+            #    print(f"Count for string '{k}' appeared " \
+            #        f"({_str_counter(k, result.output)}) " \
+            #        f"instead of the expected {v}")
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )
+
+
+def _check_predict(runner):
+
+    from ..script.predict import predict
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
+        # single training set configuration
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)['test']\n")
+        config.flush()
+
+        output_folder = "predictions"
+        overlay_folder = os.path.join("overlayed", "predictions")
+        result = runner.invoke(
+            predict,
+            [
+                "m2unet",
+                config.name,
+                "-vv",
+                "--batch-size=1",
+                "--weight=results/model_final.pth",
+                f"--output-folder={output_folder}",
+                f"--overlayed={overlay_folder}",
+            ],
+        )
+        _assert_exit_0(result)
+
+        # check predictions are there
+        assert os.path.exists(os.path.join(output_folder, "model-info.txt"))
+        basedir = os.path.join(output_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.hdf5")), 10)
+
+        # check overlayed images are there (since we requested them)
+        basedir = os.path.join(overlay_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.png")), 10)
+
+        keywords = {  # from different logging systems
+            "Loading checkpoint from": 1,  # logging
+            "Total time:": 1,  # logging
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            # if _str_counter(k, logging_output) != v:
+            #    print(f"Count for string '{k}' appeared " \
+            #        f"({_str_counter(k, result.output)}) " \
+            #        f"instead of the expected {v}")
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )
+
+
+def _check_evaluate(runner):
+
+    from ..script.evaluate import evaluate
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
+        # single training set configuration
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)['test']\n")
+        config.write("second_annotator = _maker('vk', _raw)['test']\n")
+        config.flush()
+
+        output_folder = "evaluations"
+        second_folder = "evaluations-2nd"
+        overlay_folder = os.path.join("overlayed", "analysis")
+        result = runner.invoke(
+            evaluate,
+            [
+                config.name,
+                "-vv",
+                f"--output-folder={output_folder}",
+                "--predictions-folder=predictions",
+                f"--overlayed={overlay_folder}",
+                f"--second-annotator-folder={second_folder}",
+            ],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists(os.path.join(output_folder, "metrics.csv"))
+        assert os.path.exists(os.path.join(second_folder, "metrics.csv"))
+
+        # check overlayed images are there (since we requested them)
+        basedir = os.path.join(overlay_folder, "stare-images")
+        assert os.path.exists(basedir)
+        nose.tools.eq_(len(fnmatch.filter(os.listdir(basedir), "*.png")), 10)
+
+        keywords = {  # from different logging systems
+            "Skipping dataset '__train__'": 0,  # logging
+            "Saving averages over all input images": 2,  # logging
+            "Highest F1-score": 2,  # logging
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            # if _str_counter(k, logging_output) != v:
+            #    print(f"Count for string '{k}' appeared " \
+            #        f"({_str_counter(k, result.output)}) " \
+            #        f"instead of the expected {v}")
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )
+
+
+def _check_compare(runner):
+
+    from ..script.compare import compare
+
+    with stdout_logging() as buf:
+
+        output_folder = "evaluations"
+        second_folder = "evaluations-2nd"
+        result = runner.invoke(
+            compare,
+            [
+                "-vv",
+                # label - path to metrics
+                "test", os.path.join(output_folder, "metrics.csv"),
+                "test (2nd. human)", os.path.join(second_folder, "metrics.csv"),
+            ],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists("comparison.pdf")
+
+        keywords = {  # from different logging systems
+            "Loading metrics from": 2,  # logging
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            # if _str_counter(k, logging_output) != v:
+            #    print(f"Count for string '{k}' appeared " \
+            #        f"({_str_counter(k, result.output)}) " \
+            #        f"instead of the expected {v}")
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )
+
+
+@rc_variable_set("bob.ip.binseg.stare.datadir")
+def test_discrete_experiment_stare():
+
+    runner = CliRunner()
+    with runner.isolated_filesystem():
+        _check_train(runner)
+        _check_predict(runner)
+        _check_evaluate(runner)
+        _check_compare(runner)
+
+
 def test_train_help():
     from ..script.train import train
 
diff --git a/bob/ip/binseg/test/test_config.py b/bob/ip/binseg/test/test_config.py
index 84af4ed4dab532a088feaff95176eb26b7417d53..10d08e49e8fbb88fb024736842b94bd84326c968 100644
--- a/bob/ip/binseg/test/test_config.py
+++ b/bob/ip/binseg/test/test_config.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # coding=utf-8
 
+import importlib
+
 import nose.tools
 
 import torch
@@ -16,347 +18,31 @@ N = 10
 
 
 @rc_variable_set("bob.ip.binseg.drive.datadir")
-def test_drive_default():
+def test_drive():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples:
+            nose.tools.eq_(len(s), 4)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 544, 544)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 544, 544)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+            nose.tools.eq_(s[3].shape, (1, 544, 544)) #planes, height, width
+            nose.tools.eq_(s[3].dtype, torch.float32)
 
     from ..configs.datasets.drive.default import dataset
-    nose.tools.eq_(len(dataset["train"]), 20)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 4)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 544, 544)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 544, 544)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-        nose.tools.eq_(sample[3].shape, (1, 544, 544)) #planes, height, width
-        nose.tools.eq_(sample[3].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 20)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 4)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 544, 544)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 544, 544)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-        nose.tools.eq_(sample[3].shape, (1, 544, 544)) #planes, height, width
-        nose.tools.eq_(sample[3].dtype, torch.float32)
-
-
-@stare_variable_set("bob.ip.binseg.stare.datadir")
-def test_stare_augmentation_manipulation():
-
-    # some tests to check our context management for dataset augmentation works
-    # adequately, with one example dataset
-
-    # hack to allow testing on the CI
-    from ..configs.datasets.stare import _maker
-    dataset = _maker("ah", stare_dataset)
-
-    nose.tools.eq_(dataset["train"].augmented, True)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    nose.tools.eq_(len(dataset["train"]._transforms.transforms),
-            len(dataset["test"]._transforms.transforms) + 4)
-
-    with dataset["train"].not_augmented() as d:
-        nose.tools.eq_(len(d._transforms.transforms), 2)
-        nose.tools.eq_(d.augmented, False)
-        nose.tools.eq_(dataset["train"].augmented, False)
-        nose.tools.eq_(dataset["test"].augmented, False)
-
-    nose.tools.eq_(dataset["train"].augmented, True)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    nose.tools.eq_(len(dataset["train"]._transforms.transforms),
-            len(dataset["test"]._transforms.transforms) + 4)
-
-
-@stare_variable_set("bob.ip.binseg.stare.datadir")
-def test_stare_ah():
-
-    # hack to allow testing on the CI
-    from ..configs.datasets.stare import _maker
-    dataset = _maker("ah", stare_dataset)
-
-    nose.tools.eq_(len(dataset["train"]), 10)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 10)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@stare_variable_set("bob.ip.binseg.stare.datadir")
-def test_stare_vk():
-
-    # hack to allow testing on the CI
-    from ..configs.datasets.stare import _maker
-    dataset = _maker("vk", stare_dataset)
-
-    nose.tools.eq_(len(dataset["train"]), 10)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 10)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 608, 704)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.chasedb1.datadir")
-def test_chasedb1_first_annotator():
-
-    from ..configs.datasets.chasedb1.first_annotator import dataset
-
-    nose.tools.eq_(len(dataset["train"]), 8)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 20)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.chasedb1.datadir")
-def test_chasedb1_second_annotator():
-
-    from ..configs.datasets.chasedb1.second_annotator import dataset
 
-    nose.tools.eq_(len(dataset["train"]), 8)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
+    nose.tools.eq_(len(dataset), 3)
+    _check_subset(dataset["__train__"], 20)
+    _check_subset(dataset["train"], 20)
+    _check_subset(dataset["test"], 20)
 
-    nose.tools.eq_(len(dataset["test"]), 20)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 960, 960)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.hrf.datadir")
-def test_hrf_default():
-
-    from ..configs.datasets.hrf.default import dataset
+    from ..configs.datasets.drive.second_annotator import dataset
 
-    nose.tools.eq_(len(dataset["train"]), 15)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 4)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1168, 1648)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1168, 1648)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-        nose.tools.eq_(sample[3].shape, (1, 1168, 1648)) #planes, height, width
-        nose.tools.eq_(sample[3].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 30)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 4)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1168, 1648)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1168, 1648)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-        nose.tools.eq_(sample[3].shape, (1, 1168, 1648)) #planes, height, width
-        nose.tools.eq_(sample[3].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.refuge.datadir")
-def test_refuge_disc():
-
-    from ..configs.datasets.refuge.disc import dataset
-
-    nose.tools.eq_(len(dataset["train"]), 400)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["validation"]), 400)
-    nose.tools.eq_(dataset["validation"].augmented, False)
-    for sample in dataset["validation"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 400)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.refuge.datadir")
-def test_refuge_cup():
-
-    from ..configs.datasets.refuge.cup import dataset
-
-    nose.tools.eq_(len(dataset["train"]), 400)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["validation"]), 400)
-    nose.tools.eq_(dataset["validation"].augmented, False)
-    for sample in dataset["validation"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 400)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1632, 1632)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.drishtigs1.datadir")
-def test_drishtigs1_disc_all():
-
-    from ..configs.datasets.drishtigs1.disc_all import dataset
-
-    nose.tools.eq_(len(dataset["train"]), 50)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 51)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.drishtigs1.datadir")
-def test_drishtigs1_cup_all():
-
-    from ..configs.datasets.drishtigs1.cup_all import dataset
-
-    nose.tools.eq_(len(dataset["train"]), 50)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 51)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 1760, 2048)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-
-@rc_variable_set("bob.ip.binseg.drionsdb.datadir")
-def test_drionsdb_expert1():
-
-    from ..configs.datasets.drionsdb.expert1 import dataset
-
-    nose.tools.eq_(len(dataset["train"]), 60)
-    nose.tools.eq_(dataset["train"].augmented, True)
-    for sample in dataset["train"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 416, 608)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 416, 608)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
-
-    nose.tools.eq_(len(dataset["test"]), 50)
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["test"][:N]:
-        nose.tools.eq_(len(sample), 3)
-        assert isinstance(sample[0], str)
-        nose.tools.eq_(sample[1].shape, (3, 416, 608)) #planes, height, width
-        nose.tools.eq_(sample[1].dtype, torch.float32)
-        nose.tools.eq_(sample[2].shape, (1, 416, 608)) #planes, height, width
-        nose.tools.eq_(sample[2].dtype, torch.float32)
+    nose.tools.eq_(len(dataset), 1)
+    _check_subset(dataset["test"], 20)
 
 
 @rc_variable_set("bob.ip.binseg.stare.datadir")
@@ -366,11 +52,16 @@ def test_drionsdb_expert1():
 def test_drive_covd():
 
     from ..configs.datasets.drive.covd import dataset
+    nose.tools.eq_(len(dataset), 3)
+
+    from ..configs.datasets.drive.default import dataset as baseline
+    nose.tools.eq_(dataset["train"], baseline["train"])
+    nose.tools.eq_(dataset["test"], baseline["test"])
 
-    nose.tools.eq_(len(dataset["train"]), 53)
-    #nose.tools.eq_(dataset["train"].augmented, True)  ##ConcatDataset
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["train"]:
+    # this is the only different set from the baseline
+    nose.tools.eq_(len(dataset["__train__"]), 53)
+
+    for sample in dataset["__train__"]:
         assert 3 <= len(sample) <= 4
         assert isinstance(sample[0], str)
         nose.tools.eq_(sample[1].shape, (3, 544, 544)) #planes, height, width
@@ -390,11 +81,16 @@ def test_drive_covd():
 def test_drive_ssl():
 
     from ..configs.datasets.drive.ssl import dataset
+    nose.tools.eq_(len(dataset), 3)
+
+    from ..configs.datasets.drive.default import dataset as baseline
+    nose.tools.eq_(dataset["train"], baseline["train"])
+    nose.tools.eq_(dataset["test"], baseline["test"])
 
-    nose.tools.eq_(len(dataset["train"]), 53)
-    #nose.tools.eq_(dataset["train"].augmented, True)  ##ConcatDataset
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["train"]:
+    # this is the only different set from the baseline
+    nose.tools.eq_(len(dataset["__train__"]), 53)
+
+    for sample in dataset["__train__"]:
         assert 5 <= len(sample) <= 6
         assert isinstance(sample[0], str)
         nose.tools.eq_(sample[1].shape, (3, 544, 544)) #planes, height, width
@@ -413,6 +109,47 @@ def test_drive_ssl():
             nose.tools.eq_(sample[4].dtype, torch.float32)
 
 
+@stare_variable_set("bob.ip.binseg.stare.datadir")
+def test_stare_augmentation_manipulation():
+
+    # some tests to check our context management for dataset augmentation works
+    # adequately, with one example dataset
+
+    # hack to allow testing on the CI
+    from ..configs.datasets.stare import _maker
+    dataset = _maker("ah", stare_dataset)
+
+    nose.tools.eq_(len(dataset["__train__"]._transforms.transforms),
+            len(dataset["test"]._transforms.transforms) + 4)
+
+    nose.tools.eq_(len(dataset["train"]._transforms.transforms),
+            len(dataset["test"]._transforms.transforms))
+
+
+@stare_variable_set("bob.ip.binseg.stare.datadir")
+def test_stare():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples:
+            nose.tools.eq_(len(s), 3)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 608, 704)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 608, 704)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+
+    # hack to allow testing on the CI
+    from ..configs.datasets.stare import _maker
+
+    for protocol in "ah", "vk":
+        dataset = _maker(protocol, stare_dataset)
+        nose.tools.eq_(len(dataset), 3)
+        _check_subset(dataset["__train__"], 10)
+        _check_subset(dataset["train"], 10)
+        _check_subset(dataset["test"], 10)
+
+
 @rc_variable_set("bob.ip.binseg.drive.datadir")
 @rc_variable_set("bob.ip.binseg.chasedb1.datadir")
 @rc_variable_set("bob.ip.binseg.hrf.datadir")
@@ -420,11 +157,15 @@ def test_drive_ssl():
 def test_stare_covd():
 
     from ..configs.datasets.stare.covd import dataset
+    nose.tools.eq_(len(dataset), 3)
 
-    nose.tools.eq_(len(dataset["train"]), 63)
-    #nose.tools.eq_(dataset["train"].augmented, True)  ##ConcatDataset
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["train"]:
+    from ..configs.datasets.stare.ah import dataset as baseline
+    nose.tools.eq_(dataset["train"], baseline["train"])
+    nose.tools.eq_(dataset["test"], baseline["test"])
+
+    # this is the only different set from the baseline
+    nose.tools.eq_(len(dataset["__train__"]), 63)
+    for sample in dataset["__train__"]:
         assert 3 <= len(sample) <= 4
         assert isinstance(sample[0], str)
         nose.tools.eq_(sample[1].shape, (3, 608, 704)) #planes, height, width
@@ -436,6 +177,28 @@ def test_stare_covd():
             nose.tools.eq_(sample[3].dtype, torch.float32)
 
 
+@rc_variable_set("bob.ip.binseg.chasedb1.datadir")
+def test_chasedb1():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples:
+            nose.tools.eq_(len(s), 3)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 960, 960)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 960, 960)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+
+    for m in ("first_annotator", "second_annotator"):
+        d = importlib.import_module(f"...configs.datasets.chasedb1.{m}",
+                package=__name__).dataset
+        nose.tools.eq_(len(d), 3)
+        _check_subset(d["__train__"], 8)
+        _check_subset(d["train"], 8)
+        _check_subset(d["test"], 20)
+
+
 @rc_variable_set("bob.ip.binseg.drive.datadir")
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 @rc_variable_set("bob.ip.binseg.hrf.datadir")
@@ -443,11 +206,15 @@ def test_stare_covd():
 def test_chasedb1_covd():
 
     from ..configs.datasets.chasedb1.covd import dataset
+    nose.tools.eq_(len(dataset), 3)
+
+    from ..configs.datasets.chasedb1.first_annotator import dataset as baseline
+    nose.tools.eq_(dataset["train"], baseline["train"])
+    nose.tools.eq_(dataset["test"], baseline["test"])
 
-    nose.tools.eq_(len(dataset["train"]), 65)
-    #nose.tools.eq_(dataset["train"].augmented, True)  ##ConcatDataset
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["train"]:
+    # this is the only different set from the baseline
+    nose.tools.eq_(len(dataset["__train__"]), 65)
+    for sample in dataset["__train__"]:
         assert 3 <= len(sample) <= 4
         assert isinstance(sample[0], str)
         nose.tools.eq_(sample[1].shape, (3, 960, 960)) #planes, height, width
@@ -459,6 +226,28 @@ def test_chasedb1_covd():
             nose.tools.eq_(sample[3].dtype, torch.float32)
 
 
+@rc_variable_set("bob.ip.binseg.hrf.datadir")
+def test_hrf():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples:
+            nose.tools.eq_(len(s), 4)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 1168, 1648)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 1168, 1648)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+            nose.tools.eq_(s[3].shape, (1, 1168, 1648)) #planes, height, width
+            nose.tools.eq_(s[3].dtype, torch.float32)
+
+    from ..configs.datasets.hrf.default import dataset
+    nose.tools.eq_(len(dataset), 3)
+    _check_subset(dataset["__train__"], 15)
+    _check_subset(dataset["train"], 15)
+    _check_subset(dataset["test"], 30)
+
+
 @rc_variable_set("bob.ip.binseg.drive.datadir")
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 @rc_variable_set("bob.ip.binseg.chasedb1.datadir")
@@ -466,11 +255,15 @@ def test_chasedb1_covd():
 def test_hrf_covd():
 
     from ..configs.datasets.hrf.covd import dataset
+    nose.tools.eq_(len(dataset), 3)
 
-    nose.tools.eq_(len(dataset["train"]), 58)
-    #nose.tools.eq_(dataset["train"].augmented, True)  ##ConcatDataset
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["train"]:
+    from ..configs.datasets.hrf.default import dataset as baseline
+    nose.tools.eq_(dataset["train"], baseline["train"])
+    nose.tools.eq_(dataset["test"], baseline["test"])
+
+    # this is the only different set from the baseline
+    nose.tools.eq_(len(dataset["__train__"]), 58)
+    for sample in dataset["__train__"]:
         assert 3 <= len(sample) <= 4
         assert isinstance(sample[0], str)
         nose.tools.eq_(sample[1].shape, (3, 1168, 1648)) #planes, height, width
@@ -482,6 +275,30 @@ def test_hrf_covd():
             nose.tools.eq_(sample[3].dtype, torch.float32)
 
 
+@rc_variable_set("bob.ip.binseg.iostar.datadir")
+def test_iostar():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples:
+            nose.tools.eq_(len(s), 4)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 1024, 1024)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 1024, 1024)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+            nose.tools.eq_(s[3].shape, (1, 1024, 1024)) #planes, height, width
+            nose.tools.eq_(s[3].dtype, torch.float32)
+
+    for m in ("vessel", "optic_disc"):
+        d = importlib.import_module(f"...configs.datasets.iostar.{m}",
+                package=__name__).dataset
+        nose.tools.eq_(len(d), 3)
+        _check_subset(d["__train__"], 20)
+        _check_subset(d["train"], 20)
+        _check_subset(d["test"], 10)
+
+
 @rc_variable_set("bob.ip.binseg.drive.datadir")
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 @rc_variable_set("bob.ip.binseg.chasedb1.datadir")
@@ -489,11 +306,15 @@ def test_hrf_covd():
 def test_iostar_covd():
 
     from ..configs.datasets.iostar.covd import dataset
+    nose.tools.eq_(len(dataset), 3)
 
-    nose.tools.eq_(len(dataset["train"]), 53)
-    #nose.tools.eq_(dataset["train"].augmented, True)  ##ConcatDataset
-    nose.tools.eq_(dataset["test"].augmented, False)
-    for sample in dataset["train"]:
+    from ..configs.datasets.iostar.vessel import dataset as baseline
+    nose.tools.eq_(dataset["train"], baseline["train"])
+    nose.tools.eq_(dataset["test"], baseline["test"])
+
+    # this is the only different set from the baseline
+    nose.tools.eq_(len(dataset["__train__"]), 53)
+    for sample in dataset["__train__"]:
         assert 3 <= len(sample) <= 4
         assert isinstance(sample[0], str)
         nose.tools.eq_(sample[1].shape, (3, 1024, 1024)) #planes, height, width
@@ -503,3 +324,92 @@ def test_iostar_covd():
         if len(sample) == 4:
             nose.tools.eq_(sample[3].shape, (1, 1024, 1024))
             nose.tools.eq_(sample[3].dtype, torch.float32)
+
+
+@rc_variable_set("bob.ip.binseg.refuge.datadir")
+def test_refuge():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples[:N]:
+            nose.tools.eq_(len(s), 3)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 1632, 1632)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 1632, 1632)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+
+    for m in ("disc", "cup"):
+        d = importlib.import_module(f"...configs.datasets.refuge.{m}",
+                package=__name__).dataset
+        nose.tools.eq_(len(d), 4)
+        _check_subset(d["__train__"], 400)
+        _check_subset(d["train"], 400)
+        _check_subset(d["validation"], 400)
+        _check_subset(d["test"], 400)
+
+
+@rc_variable_set("bob.ip.binseg.drishtigs1.datadir")
+def test_drishtigs1():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples[:N]:
+            nose.tools.eq_(len(s), 3)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 1760, 2048)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 1760, 2048)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+
+    for m in ("disc_all", "cup_all", "disc_any", "cup_any"):
+        d = importlib.import_module(f"...configs.datasets.drishtigs1.{m}",
+                package=__name__).dataset
+        nose.tools.eq_(len(d), 3)
+        _check_subset(d["__train__"], 50)
+        _check_subset(d["train"], 50)
+        _check_subset(d["test"], 51)
+
+
+@rc_variable_set("bob.ip.binseg.rimoner3.datadir")
+def test_rimoner3():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples[:N]:
+            nose.tools.eq_(len(s), 3)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 1440, 1088)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 1440, 1088)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+
+    for m in ("disc_exp1", "cup_exp1", "disc_exp2", "cup_exp2"):
+        d = importlib.import_module(f"...configs.datasets.rimoner3.{m}",
+                package=__name__).dataset
+        nose.tools.eq_(len(d), 3)
+        _check_subset(d["__train__"], 99)
+        _check_subset(d["train"], 99)
+        _check_subset(d["test"], 60)
+
+
+@rc_variable_set("bob.ip.binseg.drionsdb.datadir")
+def test_drionsdb():
+
+    def _check_subset(samples, size):
+        nose.tools.eq_(len(samples), size)
+        for s in samples[:N]:
+            nose.tools.eq_(len(s), 3)
+            assert isinstance(s[0], str)
+            nose.tools.eq_(s[1].shape, (3, 416, 608)) #planes, height, width
+            nose.tools.eq_(s[1].dtype, torch.float32)
+            nose.tools.eq_(s[2].shape, (1, 416, 608)) #planes, height, width
+            nose.tools.eq_(s[2].dtype, torch.float32)
+
+    for m in ("expert1", "expert2"):
+        d = importlib.import_module(f"...configs.datasets.drionsdb.{m}",
+                package=__name__).dataset
+        nose.tools.eq_(len(d), 3)
+        _check_subset(d["__train__"], 60)
+        _check_subset(d["train"], 60)
+        _check_subset(d["test"], 50)
diff --git a/doc/cli.rst b/doc/cli.rst
index 9315447d778ef1917a1f9404d677cd8cbb2e70d4..e5b261d6735221d6cfe40e35256ce734e29c4f9b 100644
--- a/doc/cli.rst
+++ b/doc/cli.rst
@@ -137,4 +137,15 @@ combined figures and tables that compare results of multiple systems.
 .. command-output:: bob binseg compare --help
 
 
+.. _bob.ip.binseg.cli.experiment:
+
+Running Complete Experiments
+----------------------------
+
+This command can run training, prediction, evaluation and comparison from a
+single, multi-step application.
+
+.. command-output:: bob binseg experiment --help
+
+
 .. include:: links.rst
diff --git a/doc/evaluation.rst b/doc/evaluation.rst
index 28a0bf65f0cffb9f3144f5990a3549f12de976e1..3646969173ddde58f56b87d2f08c718f5c34477c 100644
--- a/doc/evaluation.rst
+++ b/doc/evaluation.rst
@@ -24,8 +24,8 @@ point numbers indicating the vessel probability (``[0.0,1.0]``) for each pixel
 in the input image.
 
 
-Inference on an existing datasets
-=================================
+Inference on an existing dataset
+================================
 
 To run inference, use the sub-command :ref:`predict
 <bob.ip.binseg.cli.predict>` to run prediction on an existing dataset:
@@ -66,9 +66,9 @@ Evaluation
 
 In evaluation, we input an **annotated** dataset and predictions to generate
 performance figures that can help analysis of a trained model.  Evaluation is
-done using ``bob binseg evaluate`` followed by the model and the annotated
-dataset configuration, and the path to the pretrained model via the
-``--weight`` argument.
+done using the :ref:`evaluate command <bob.ip.binseg.cli.evaluate>` followed
+by the model and the annotated dataset configuration, and the path to the
+pretrained weights via the ``--weight`` argument.
 
 Use ``bob binseg evaluate --help`` for more information.
 
@@ -79,12 +79,15 @@ E.g. run inference on predictions from the DRIVE test set, do the following:
     # Point directly to saved model via -w argument:
     bob binseg evaluate -vv drive-test -p /predictions/folder -o /eval/results/folder
 
+If available, you may use the option ``--second-annotator`` to compare results against a second set of human annotations for the same dataset.
+
 
 Comparing Systems
 =================
 
 To compare multiple systems together and generate combined plots and tables,
-use ``bob binseg compare``.  Use ``--help`` for a quick guide.
+use the :ref:`compare command <bob.ip.binseg.cli.compare>`.  Use ``--help`` for
+a quick guide.
 
 .. code-block:: bash
 
diff --git a/doc/experiment.rst b/doc/experiment.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4ef87f9004200a0888994de58b4ac086001c57f0
--- /dev/null
+++ b/doc/experiment.rst
@@ -0,0 +1,22 @@
+.. -*- coding: utf-8 -*-
+
+.. _bob.ip.binseg.experiment:
+
+==============================
+ Running complete experiments
+==============================
+
+We provide an :ref:`aggregator command called "experiment"
+<bob.ip.binseg.cli.experiment>` that runs training, followed by prediction,
+evaluation and comparison.  After running, you will be able to find results
+from model fitting, prediction, evaluation and comparison under a single output
+directory.
+
+For example, to train a Mobile V2 U-Net architecture on the STARE dataset,
+evaluate both train and test set performances, output prediction maps and
+overlay analysis, together with a performance curve, run the following:
+
+.. code-block:: sh
+
+   $ bob binseg experiment -vv m2unet stare --batch-size=16 --overlayed
+   # check results in the "results" folder
diff --git a/doc/training.rst b/doc/training.rst
index 9a05c6685957115b8bd4bf231a1d717085f2e5ce..693561286fd8cf4a19ccddc0e75050743b155d7e 100644
--- a/doc/training.rst
+++ b/doc/training.rst
@@ -7,14 +7,10 @@
 ==========
 
 To train a new FCN, use the command-line interface (CLI) application ``bob
-binseg train``, available on your prompt.  To use this CLI, you must define
-the input dataset that will be used to train the FCN, as well as the type of
-model that will be trained.  You may issue ``bob binseg train --help`` for a
-help message containing more detailed instructions.
-
-To replicate our results, use our main application ``bob binseg train``
-followed by the model configuration, and dataset configuration files, and/or
-command-line options.  Use ``bob binseg train --help`` for more information.
+binseg train``, available on your prompt.  To use this CLI, you must define the
+input dataset that will be used to train the FCN, as well as the type of model
+that will be trained.  You may issue ``bob binseg train --help`` for a help
+message containing more detailed instructions.
 
 .. tip::
 
@@ -75,14 +71,14 @@ card, for supervised training of baselines.  Use it like this:
    customized dataset and model files.  You may :ref:`copy any of the existing
    configuration resources <bob.ip.binseg.cli.config.copy>` and change them
    locally.  Once you're happy, you may use the newly created files directly on
-   your training command line.  For example, suppose you wanted to slightly
-   change the drive pre-processing pipeline.  You could do the following:
+   your command line.  For example, suppose you wanted to slightly change the
+   DRIVE pre-processing pipeline.  You could do the following:
 
    .. code-block:: bash
 
       $ bob binseg config copy drive my_drive_remix.py
       # edit my_drive_remix.py to your needs
-      $ bob binseg train -vv <model> ./my_drive_remix.py --batch-size=<see-table> --device="cuda:0"
+      $ bob binseg train -vv <model> ./my_drive_remix.py
 
 
 .. _bob.ip.binseg.gridtk-tip:
@@ -94,14 +90,14 @@ card, for supervised training of baselines.  Use it like this:
 
    .. code-block:: sh
 
-      $ jman submit --queue=gpu --memory=24G --name=m2unet-drive -- bob binseg train --device='cuda:0' ... #paste the rest of the command-line
+      $ jman submit --queue=gpu --memory=24G --name=myjob -- bob binseg train --device='cuda:0' ... #paste the rest of the command-line
 
 
 Combined Vessel Dataset (COVD)
 ==============================
 
-The following table describes recommended batch sizes for 24Gb of RAM GPU
-card, for supervised training of COVD- systems.  Use it like this:
+The following table describes recommended batch sizes for 24Gb of RAM GPU card,
+for supervised training of COVD- systems.  Use it like this:
 
 .. code-block:: sh
 
@@ -167,11 +163,11 @@ Using your own dataset
 ======================
 
 To use your own dataset, we recommend you read our instructions at
-:py:mod:`bob.ip.binseg.configs.datasets.csv`, and setup a CSV file describing
-input data and ground-truth (segmentation maps).  Then, prepare a configuration
-file by copying our configuration example and edit it to apply the required
-transforms to your input data.  Once you are happy with the result, use it in
-place of one of our datasets:
+:py:mod:`bob.ip.binseg.configs.datasets.csv`, and setup one or more CSV file
+describing input data and ground-truth (segmentation maps).  Then, prepare a
+configuration file by copying our configuration example and edit it to apply
+the required transforms to your input data.  Once you are happy with the
+result, use it in place of one of our datasets:
 
 .. code-block:: sh
 
diff --git a/doc/usage.rst b/doc/usage.rst
index be0b7f893a4804b4caea5466f2e60c0a6503ad0d..c9967139f77c1e64bd634b641bf795ffdea73c55 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -49,6 +49,7 @@ modifying one of our configuration resources.
    training
    models
    evaluation
+   experiment
 
 
 .. include:: links.rst