From c840367fc0b26c2f67471295f4e8541eb6e73bb7 Mon Sep 17 00:00:00 2001
From: Tiago Freitas Pereira <tiagofrepereira@gmail.com>
Date: Thu, 20 Feb 2020 18:54:48 +0100
Subject: [PATCH] Reorganizing the vanilla_biometrics

Porting code

Implementing the Vanilla Algorithm

Developing the VanillaBiometrics Algorithm

Score computation of the vanilla pipelines

Finished PCA scoring
---
 bob/bio/base/algorithm/Algorithm.py           |   7 +
 bob/bio/base/algorithm/PCA.py                 | 230 +++------
 bob/bio/base/config/algorithm/pca.py          |  10 +-
 bob/bio/base/config/baselines/pca_atnt.py     |  20 +-
 bob/bio/base/config/extractor/linearize.py    |   2 +-
 .../pipelines/vanilla_biometrics/__init__.py  |   4 +
 .../annotated_legacy.py}                      |   3 +-
 .../pipelines/vanilla_biometrics/blocks.py    | 451 ++++++++++++++++++
 .../pipelines/vanilla_biometrics/database.py  |   0
 .../legacy.py}                                | 210 +-------
 .../pipeline.py}                              |   8 +-
 bob/bio/base/script/vanilla_biometrics.py     |   4 +-
 doc/experiments.rst                           |  44 +-
 doc/implementation.rst                        |  13 +-
 14 files changed, 584 insertions(+), 422 deletions(-)
 create mode 100644 bob/bio/base/pipelines/vanilla_biometrics/__init__.py
 rename bob/bio/base/pipelines/{annotated_blocks.py => vanilla_biometrics/annotated_legacy.py} (99%)
 create mode 100644 bob/bio/base/pipelines/vanilla_biometrics/blocks.py
 create mode 100644 bob/bio/base/pipelines/vanilla_biometrics/database.py
 rename bob/bio/base/pipelines/{blocks.py => vanilla_biometrics/legacy.py} (64%)
 rename bob/bio/base/pipelines/{vanilla_biometrics.py => vanilla_biometrics/pipeline.py} (97%)

diff --git a/bob/bio/base/algorithm/Algorithm.py b/bob/bio/base/algorithm/Algorithm.py
index c15cc9e0..696e602d 100644
--- a/bob/bio/base/algorithm/Algorithm.py
+++ b/bob/bio/base/algorithm/Algorithm.py
@@ -6,6 +6,8 @@
 import numpy
 import os
 from .. import utils
+import warnings
+
 
 class Algorithm (object):
   """This is the base class for all biometric recognition algorithms.
@@ -87,6 +89,11 @@ class Algorithm (object):
       min_t_model_file_size=1000,
       **kwargs                            # parameters from the derived class that should be reported in the __str__() function
   ):
+
+    
+    warnings.warn("bob.bio.base.Algorithm is Deprecated", DeprecationWarning)
+
+
     self.performs_projection = performs_projection
     self.requires_projector_training = performs_projection and requires_projector_training
     self.split_training_features_by_client = split_training_features_by_client
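
The deprecation notice above goes through the standard :py:mod:`warnings` machinery, so downstream scripts decide its visibility. A minimal sketch using only the standard library (nothing here beyond what the patch adds):

.. code-block:: python

    import warnings

    # DeprecationWarning is hidden by default outside of __main__; opt in to
    # see every occurrence while porting code away from the legacy base class
    warnings.simplefilter("always", DeprecationWarning)

    # constructing any legacy Algorithm subclass now prints a
    # DeprecationWarning mentioning bob.bio.base.Algorithm
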
diff --git a/bob/bio/base/algorithm/PCA.py b/bob/bio/base/algorithm/PCA.py
index a856292d..425e0986 100644
--- a/bob/bio/base/algorithm/PCA.py
+++ b/bob/bio/base/algorithm/PCA.py
@@ -1,20 +1,20 @@
 #!/usr/bin/env python
 # vim: set fileencoding=utf-8 :
-# Manuel Guenther <Manuel.Guenther@idiap.ch>
+# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
 
-import bob.learn.linear
-import bob.io.base
 
+from bob.bio.base.pipelines.vanilla_biometrics.blocks import VanillaBiometricsAlgorithm
+import sklearn.decomposition
+from scipy.spatial.distance import euclidean
 import numpy
-import scipy.spatial
-
-from .Algorithm import Algorithm
 
 import logging
+
 logger = logging.getLogger("bob.bio.base")
 
-class PCA (Algorithm):
-  """Performs a principal component analysis (PCA) on the given data.
+
+class PCA(VanillaBiometricsAlgorithm):
+    """Performs a principal component analysis (PCA) on the given data.
 
   This algorithm computes a PCA projection (:py:class:`bob.learn.linear.PCATrainer`) on the given training features, projects the features to eigenspace and computes the distance of two projected features in eigenspace.
   For example, the eigenface algorithm as proposed by [TP91]_ can be run with this class.
@@ -29,181 +29,95 @@ class PCA (Algorithm):
     A function taking two parameters and returns a float.
     If ``uses_variances`` is set to ``True``, the function is provided with a third parameter, which is the vector of variances (aka. eigenvalues).
 
-  is_distance_function : bool
-    Set this flag to ``False`` if the given ``distance_function`` computes a similarity value (i.e., higher values are better)
+  svd_solver: str
+    The SVD solver used to solve the eigenvalue problem (passed to
+    :py:class:`sklearn.decomposition.PCA`)
 
-  use_variances : bool
-    If set to ``True``, the ``distance_function`` is provided with a third argument, which is the vector of variances (aka. eigenvalues).
+  factor: float
+    Multiplication factor applied to the distance at scoring time (e.g. ``-1``
+    turns a distance into a similarity score)
 
   kwargs : ``key=value`` pairs
     A list of keyword arguments directly passed to the :py:class:`Algorithm` base class constructor.
   """
 
-  def __init__(
-      self,
-      subspace_dimension,  # if int, number of subspace dimensions; if float, percentage of variance to keep
-      distance_function = scipy.spatial.distance.euclidean,
-      is_distance_function = True,
-      uses_variances = False,
-      **kwargs  # parameters directly sent to the base class
-  ):
-
-    # call base class constructor and register that the algorithm performs a projection
-    super(PCA, self).__init__(
-        performs_projection = True,
-
-        subspace_dimension = subspace_dimension,
-        distance_function = str(distance_function),
-        is_distance_function = is_distance_function,
-        uses_variances = uses_variances,
-
-        **kwargs
-    )
-
-    self.subspace_dim = subspace_dimension
-    self.machine = None
-    self.distance_function = distance_function
-    self.factor = -1. if is_distance_function else 1.
-    self.uses_variances = uses_variances
-
-
-  def _check_feature(self, feature, projected=False):
-    """Checks that the features are appropriate"""
-    if not isinstance(feature, numpy.ndarray) or feature.ndim != 1 or feature.dtype != numpy.float64:
-      raise ValueError("The given feature is not appropriate")
-    index = 1 if projected else 0
-    if self.machine is not None and feature.shape[0] != self.machine.shape[index]:
-      raise ValueError("The given feature is expected to have %d elements, but it has %d" % (self.machine.shape[index], feature.shape[0]))
-
-
-  def train_projector(self, training_features, projector_file):
-    """Generates the PCA covariance matrix and writes it into the given projector_file.
-
-    **Parameters:**
-
-    training_features : [1D :py:class:`numpy.ndarray`]
-      A list of 1D training arrays (vectors) to train the PCA projection matrix with.
-
-    projector_file : str
-      A writable file, into which the PCA projection matrix (as a :py:class:`bob.learn.linear.Machine`) and the eigenvalues will be written.
-    """
-    # Assure that all data are 1D
-    [self._check_feature(feature) for feature in training_features]
-
-    # Initializes the data
-    data = numpy.vstack(training_features)
-    logger.info("  -> Training LinearMachine using PCA")
-    t = bob.learn.linear.PCATrainer()
-    self.machine, self.variances = t.train(data)
-    # For re-shaping, we need to copy...
-    self.variances = self.variances.copy()
-
-    # compute variance percentage, if desired
-    if isinstance(self.subspace_dim, float):
-      cummulated = numpy.cumsum(self.variances) / numpy.sum(self.variances)
-      for index in range(len(cummulated)):
-        if cummulated[index] > self.subspace_dim:
-          self.subspace_dim = index
-          break
-      self.subspace_dim = index
-    logger.info("    ... Keeping %d PCA dimensions", self.subspace_dim)
-    # re-shape machine
-    self.machine.resize(self.machine.shape[0], self.subspace_dim)
-    self.variances = numpy.resize(self.variances, (self.subspace_dim))
-
-    f = bob.io.base.HDF5File(projector_file, "w")
-    f.set("Eigenvalues", self.variances)
-    f.create_group("Machine")
-    f.cd("/Machine")
-    self.machine.save(f)
-
-
-  def load_projector(self, projector_file):
-    """Reads the PCA projection matrix and the eigenvalues from file.
-
-    **Parameters:**
-
-    projector_file : str
-      An existing file, from which the PCA projection matrix and the eigenvalues are read.
-    """
-    # read PCA projector
-    f = bob.io.base.HDF5File(projector_file)
-    self.variances = f.read("Eigenvalues")
-    f.cd("/Machine")
-    self.machine = bob.learn.linear.Machine(f)
+    def __init__(
+        self,
+        subspace_dimension,  # if int, number of subspace dimensions; if float, percentage of variance to keep
+        distance_function=euclidean,
+        svd_solver="auto",
+        factor=-1,
+        **kwargs,  # parameters directly sent to the base class
+    ):
 
+        # call base class constructor and register that the algorithm performs a projection
+        super(PCA, self).__init__(performs_projection=True)
 
-  def project(self, feature):
-    """project(feature) -> projected
+        self.subspace_dim = subspace_dimension
+        self.distance_function = distance_function
+        self.svd_solver = svd_solver
+        self.factor = factor
 
-    Projects the given feature into eigenspace.
+    def fit(self, samplesets, checkpoints):
+        """
+        This method should implement the sub-pipeline 0 of the Vanilla Biometrics Pipeline :ref:`vanilla-pipeline-0`.
 
-    **Parameters:**
-
-    feature : 1D :py:class:`numpy.ndarray`
-      The 1D feature to be projected.
+        It represents the training of background models that an algorithm may need.
 
-    **Returns:**
+        Parameters
+        ----------
 
-    projected : 1D :py:class:`numpy.ndarray`
-      The ``feature`` projected into eigenspace.
-    """
-    self._check_feature(feature)
-    # Projects the data
-    return self.machine(feature)
+            samplesets: :py:class:`bob.pipelines.sample.sample.SampleSet`
+                         Set of samples used to train a background model
 
 
-  def enroll(self, enroll_features):
-    """enroll(enroll_features) -> model
+            checkpoints: str
+                If provided, must be the path to a location where this
+                model should be saved (complete path without extension) -
+                currently, it needs to be provided because of existing
+                serialization requirements (see bob/bob.io.base#106), but
+                checkpointing will still work as expected.
+
+        """
 
-    Enrolls the model by storing all given input vectors.
+        pca = sklearn.decomposition.PCA(self.subspace_dim, svd_solver=self.svd_solver)
+        samples_array = self._stack_samples_2_ndarray(samplesets)
+        logger.info(
+            "Training PCA with samples of shape {0}".format(samples_array.shape)
+        )
+        pca.fit(samples_array)
 
-    **Parameters:**
+        # TODO: checkpoint the fitted PCA model using the ``checkpoints`` argument
 
-    enroll_features : [1D :py:class:`numpy.ndarray`]
-      The list of projected features to enroll the model from.
+        return pca
 
-    **Returns:**
+    def project_one_sample(self, background_model, data):
+        if data.ndim == 1:
+            return background_model.transform(data.reshape(1, -1))
 
-    model : 2D :py:class:`numpy.ndarray`
-      The enrolled model.
-    """
-    assert len(enroll_features)
-    [self._check_feature(feature, True) for feature in enroll_features]
-    # just store all the features
-    return numpy.vstack(enroll_features)
+        return background_model.transform(data)
 
+    def enroll_one_sample(self, data):
+        return numpy.mean(data, axis=0)
 
-  def score(self, model, probe):
-    """score(model, probe) -> float
+    def score_one_sample(self, biometric_reference, data):
+        """It handles the score computation for one sample
 
-    Computes the distance of the model to the probe using the distance function specified in the constructor.
+        Parameters
+        ----------
 
-    **Parameters:**
+            biometric_reference : list
+                Biometric reference to be compared
 
-    model : 2D :py:class:`numpy.ndarray`
-      The model storing all enrollment features.
+            data : list
+                Data to be compared
 
-    probe : 1D :py:class:`numpy.ndarray`
-      The probe feature vector in eigenspace.
+        Returns
+        -------
 
-    **Returns:**
+            score : float
+                The distance between ``biometric_reference`` and ``data``,
+                multiplied by ``self.factor`` (so that, by default, higher
+                values indicate a better match).
 
-    score : float
-      A similarity value between ``model`` and ``probe``
-    """
-    self._check_feature(probe, True)
-    # return the negative distance (as a similarity measure)
-    if len(model.shape) == 2:
-      # we have multiple models, so we use the multiple model scoring
-      return self.score_for_multiple_models(model, probe)
-    elif self.uses_variances:
-      # single model, single probe (multiple probes have already been handled)
-      return self.factor * self.distance_function(model, probe, self.variances)
-    else:
-      # single model, single probe (multiple probes have already been handled)
-      return self.factor * self.distance_function(model, probe)
+        """
 
-  # re-define unused functions, just so that they do not get documented
-  def train_enroller(*args,**kwargs): raise NotImplementedError()
-  def load_enroller(*args,**kwargs): pass
+        return self.factor * self.distance_function(biometric_reference, data)
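
Outside of the full pipeline, the rewritten class can be exercised directly. A minimal sketch, assuming no checkpointing; ``Sample``/``SampleSet`` below are namedtuple stand-ins that expose only the ``samples``/``data`` attributes the algorithm actually reads, not the real ``bob.pipelines`` classes:

.. code-block:: python

    from collections import namedtuple

    import numpy
    from bob.bio.base.algorithm import PCA

    Sample = namedtuple("Sample", ["data"])           # stand-in
    SampleSet = namedtuple("SampleSet", ["samples"])  # stand-in

    train = [SampleSet([Sample(numpy.random.rand(400)) for _ in range(20)])]

    algorithm = PCA(subspace_dimension=0.99)
    background = algorithm.fit(train, None)  # returns a fitted sklearn PCA

    projected = algorithm.project_one_sample(background, numpy.random.rand(400))
    reference = algorithm.enroll_one_sample(projected)
    score = algorithm.score_one_sample(reference, projected.flatten())
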
diff --git a/bob/bio/base/config/algorithm/pca.py b/bob/bio/base/config/algorithm/pca.py
index c1b4d7cc..d84d84f4 100644
--- a/bob/bio/base/config/algorithm/pca.py
+++ b/bob/bio/base/config/algorithm/pca.py
@@ -1,10 +1,4 @@
 #!/usr/bin/env python
 
-import bob.bio.base
-import scipy.spatial
-
-algorithm = bob.bio.base.algorithm.PCA(
-    subspace_dimension = .95,
-    distance_function = scipy.spatial.distance.euclidean,
-    is_distance_function = True
-)
+from bob.bio.base.algorithm import PCA
+algorithm = PCA(0.99)
\ No newline at end of file
diff --git a/bob/bio/base/config/baselines/pca_atnt.py b/bob/bio/base/config/baselines/pca_atnt.py
index 6ef2a1b4..7766f67f 100644
--- a/bob/bio/base/config/baselines/pca_atnt.py
+++ b/bob/bio/base/config/baselines/pca_atnt.py
@@ -1,21 +1,11 @@
-from bob.bio.base.pipelines.blocks import DatabaseConnector, AlgorithmAdaptor
-import functools
+from bob.bio.base.pipelines.vanilla_biometrics.legacy import DatabaseConnector
+
 import bob.db.atnt
 
 database = DatabaseConnector(bob.db.atnt.Database(), protocol="Default")
 
-from bob.bio.face.preprocessor import Base
-preprocessor = functools.partial(
-                Base,
-                color_channel="gray",
-                dtype="float64",
-            )
-
-
-from bob.bio.base.extractor import Linearize
-extractor = Linearize
-#extractor = 'linearize'
+preprocessor = "face-detect"
 
+extractor = 'linearize'
 
-from bob.bio.base.algorithm import PCA
-algorithm = AlgorithmAdaptor(functools.partial(PCA, 0.99))
+algorithm = 'pca'
diff --git a/bob/bio/base/config/extractor/linearize.py b/bob/bio/base/config/extractor/linearize.py
index 7d452d6e..ddc02165 100644
--- a/bob/bio/base/config/extractor/linearize.py
+++ b/bob/bio/base/config/extractor/linearize.py
@@ -3,4 +3,4 @@
 import bob.bio.base
 
 # Linearization of the data to a vector, no data type specified
-extractor = bob.bio.base.extractor.Linearize()
+extractor = bob.bio.base.extractor.Linearize
diff --git a/bob/bio/base/pipelines/vanilla_biometrics/__init__.py b/bob/bio/base/pipelines/vanilla_biometrics/__init__.py
new file mode 100644
index 00000000..edbb4090
--- /dev/null
+++ b/bob/bio/base/pipelines/vanilla_biometrics/__init__.py
@@ -0,0 +1,4 @@
+# see https://docs.python.org/3/library/pkgutil.html
+from pkgutil import extend_path
+
+__path__ = extend_path(__path__, __name__)
diff --git a/bob/bio/base/pipelines/annotated_blocks.py b/bob/bio/base/pipelines/vanilla_biometrics/annotated_legacy.py
similarity index 99%
rename from bob/bio/base/pipelines/annotated_blocks.py
rename to bob/bio/base/pipelines/vanilla_biometrics/annotated_legacy.py
index bafd959e..3013f87d 100644
--- a/bob/bio/base/pipelines/annotated_blocks.py
+++ b/bob/bio/base/pipelines/vanilla_biometrics/annotated_legacy.py
@@ -12,7 +12,8 @@ import functools
 import bob.io.base
 
 
-from .blocks import DatabaseConnector, SampleLoader
+from .legacy import DatabaseConnector
+from .blocks import SampleLoader
 from bob.pipelines.sample.sample import SampleSet, DelayedSample, Sample 
 
 
diff --git a/bob/bio/base/pipelines/vanilla_biometrics/blocks.py b/bob/bio/base/pipelines/vanilla_biometrics/blocks.py
new file mode 100644
index 00000000..c53483fb
--- /dev/null
+++ b/bob/bio/base/pipelines/vanilla_biometrics/blocks.py
@@ -0,0 +1,451 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+import copy
+import functools
+import numpy
+import os
+import bob.io.base
+from bob.pipelines.sample.sample import DelayedSample, SampleSet, Sample
+
+"""Re-usable blocks for legacy bob.bio.base algorithms"""
+
+
+class SampleLoader:
+    """Adaptor for loading, preprocessing and feature extracting samples
+
+    This adaptor class wraps around sample:
+
+    .. code-block:: text
+
+       [loading [-> preprocessing [-> extraction]]]
+
+    The input sample object must obey the following (minimal) API:
+
+        * attribute ``id``: Contains a unique (string-convertible) identifier for
+          processed samples
+        * attribute ``data``: Contains the data for this sample
+
+    Optional checkpointing is also implemented for each of the states,
+    independently.  You may check-point just the preprocessing, feature
+    extraction or both.
+
+
+    Parameters
+    ----------
+
+    pipeline : :py:class:`list` of (:py:class:`str`, callable)
+        A list of pairs in which the first entry is the name of each processing
+        step in the pipeline and the second entry is a default-constructible
+        :py:class:`bob.bio.base.preprocessor.Preprocessor` or
+        :py:class:`bob.bio.base.extractor.Extractor`, in any order.  Each
+        of these objects must be a python type that can be instantiated and
+        used through its ``__call__()`` interface to process a single entry of
+        a sample.  For python types that you may want to plug in, but do not
+        offer a default constructor that you like, pass the result of
+        :py:func:`functools.partial` instead.
+
+    """
+
+    def __init__(self, pipeline):
+        self.pipeline = copy.deepcopy(pipeline)
+
+    def _handle_step(self, sset, func, checkpoint):
+        """Handles a single step in the pipeline, with optional checkpointing
+
+        Parameters
+        ----------
+
+        sset : SampleSet
+            The original sample set to be processed (delayed or pre-loaded)
+
+        func : callable
+            The processing function to call for processing **each** sample in
+            the set, if needs be
+
+        checkpoint : str, None
+            An optional string that may point to a directory that will be used
+            for checkpointing the processing phase in question
+
+
+        Returns
+        -------
+
+        r : SampleSet
+            The processed sample set.  If no checkpointing is required, its
+            samples will be of type :py:class:`Sample`.  Otherwise, they will be
+            of type :py:class:`DelayedSample`
+
+        """
+
+        if checkpoint is not None:
+            samples = []  # processed samples
+            for s in sset.samples:
+                # there can be a checkpoint for the data to be processed
+                candidate = os.path.join(checkpoint, s.path + ".hdf5")
+                if not os.path.exists(candidate):
+                    # preprocessing is required, and checkpointing, do it now
+                    data = func(s.data)
+
+                    # notice this can be called in parallel w/o failing
+                    bob.io.base.create_directories_safe(os.path.dirname(candidate))
+                    # bob.bio.base standard interface for preprocessor
+                    # has a read/write_data methods
+                    writer = (
+                        getattr(func, "write_data")
+                        if hasattr(func, "write_data")
+                        else getattr(func, "write_feature")
+                    )
+                    writer(data, candidate)
+
+                # because we are checkpointing, we return a DelayedSample
+                # instead of a normal (preloaded) sample. This allows the next
+                # phase to avoid loading it when that is unnecessary (e.g. the
+                # next phase is already check-pointed)
+                reader = (
+                    getattr(func, "read_data")
+                    if hasattr(func, "read_data")
+                    else getattr(func, "read_feature")
+                )
+                samples.append(
+                    DelayedSample(functools.partial(reader, candidate), parent=s)
+                )
+        else:
+            # if checkpointing is not required, load the data and preprocess it
+            # as we would normally do
+            samples = [Sample(func(s.data), parent=s) for s in sset.samples]
+
+        r = SampleSet(samples, parent=sset)
+        return r
+
+    def _handle_sample(self, sset, pipeline):
+        """Handles a single sampleset through a pipelien
+
+        Parameters
+        ----------
+
+        sset : SampleSet
+            The original sample set to be processed (delayed or pre-loaded)
+
+        pipeline : :py:class:`list` of :py:class:`tuple`
+            A list of tuples, each comprising one processing function and
+            one checkpoint directory (:py:class:`str` or ``None``, to avoid
+            checkpointing that phase), respectively
+
+
+        Returns
+        -------
+
+        r : SampleSet
+            The processed sample set
+
+        """
+
+        r = sset
+        for func, checkpoint in pipeline:
+            r = r if func is None else self._handle_step(r, func, checkpoint)
+        return r
+
+    def __call__(self, samples, checkpoints):
+        """Applies the pipeline chaining with optional checkpointing
+
+        Our implementation is optimized to minimize disk I/O as much as
+        possible.  It yields :py:class:`DelayedSample`'s instead of
+        :py:class:`Sample`'s if checkpointing is enabled.
+
+
+        Parameters
+        ----------
+
+        samples : list
+            List of :py:class:`SampleSet` to be treated by this pipeline
+
+        checkpoints : dict
+            A dictionary (with any number of entries) that may contain as many
+            keys as those defined when you constructed this class with the
+            pipeline tuple list.  Upon execution, if an entry exists for a given
+            phase, that phase of the pipeline will be checkpointed.  Notice that
+            you are in control of checkpointing.  If you miss an intermediary
+            step, this loader will load the relevant sample, even if the next
+            phase is supposed to be checkpointed.  This strategy keeps the
+            implementation as simple as possible.
+
+
+        Returns
+        -------
+
+        samplesets : list
+            Loaded samplesets, after optional preprocessing and extraction
+
+        """
+
+        pipe = [(v(), checkpoints.get(k)) for k, v in self.pipeline]
+        return [self._handle_sample(k, pipe) for k in samples]
+
+
+class VanillaBiometricsAlgorithm(object):
+    """Describes a base biometric algorithm for the Vanilla Biometrics Pipeline :ref:`bob.bio.base.struct_bio_rec_sys`.
+
+    The model can optionally be fitted.  Besides that, it executes
+    biometric model enrollment, via ``enroll()``, and scoring, with
+    ``score()``.
+
+    """
+
+    def __init__(self, performs_projection=False):
+        self.performs_projection = performs_projection
+        pass
+
+    def _stack_samples_2_ndarray(self, samplesets, stack_per_sampleset=False):
+        """
+        Stack a set of :py:class:`bob.pipelines.sample.sample.SampleSet`
+        and convert them to :py:class:`numpy.ndarray`
+
+        Parameters
+        ----------
+
+            samplesets: :py:class:`bob.pipelines.sample.sample.SampleSet`
+                         Set of samples to be stacked
+
+            stack_per_sampleset: bool
+                If ``True``, returns a list of :py:class:`numpy.ndarray`, one per sample set
+
+        """
+
+        if stack_per_sampleset:
+            # TODO: Make it more efficient
+            all_data = []
+            for sampleset in samplesets:
+                all_data.append(
+                    numpy.array([sample.data for sample in sampleset.samples])
+                )
+            return all_data
+        else:
+            return numpy.array(
+                [
+                    sample.data
+                    for sampleset in samplesets
+                    for sample in sampleset.samples
+                ]
+            )
+
+    def fit(self, samplesets, checkpoint):
+        """
+        This method should implement the sub-pipeline 0 of the Vanilla Biometrics Pipeline :ref:`vanilla-pipeline-0`.
+
+        It represents the training of background models that an algorithm may need.
+
+        Parameters
+        ----------
+
+            samplesets: :py:class:`bob.pipelines.sample.sample.SampleSet`
+                         Set of samples used to train a background model
+
+
+            checkpoint: str
+                If provided, must be the path to a location where this
+                model should be saved (complete path without extension) -
+                currently, it needs to be provided because of existing
+                serialization requirements (see bob/bob.io.base#106), but
+                checkpointing will still work as expected.
+
+        """
+        raise NotImplemented("Please implement me")
+
+    def enroll(
+        self, references, background_model=None, checkpoint=None, *args, **kwargs
+    ):
+        """This method should implement the sub-pipeline 1 of the Vanilla Biometrics Pipeline :ref:`_vanilla-pipeline-1`.
+
+        It handles the creation of biometric references
+
+        Parameters
+        ----------
+            references : list
+                A list of :py:class:`SampleSet` objects to be used for
+                creating biometric references.  The sets must be identified
+                with a unique id and a path, for eventual checkpointing.
+
+            background_model : 
+                Object containing the background model
+
+            checkpoint : str, None
+                If passed and not ``None``, then it is considered to be the
+                path of a directory containing possible cached values for each
+                of the references in this experiment.  If that is the case, the
+                values are loaded from there and not recomputed.
+
+            *args, **kwargs :
+                Extra parameters that can be used to hook-up processing graph
+                dependencies, but are currently ignored
+
+        """
+
+        def _project(k):
+            return (
+                self.project_one_sample(background_model, k.data)
+                if self.performs_projection
+                else k.data
+            )
+
+        retval = []
+        for k in references:
+            if checkpoint is not None:
+                candidate = os.path.join(os.path.join(checkpoint, k.path + ".hdf5"))
+                if not os.path.exists(candidate):
+                    # create new checkpoint
+                    bob.io.base.create_directories_safe(os.path.dirname(candidate))
+                    data = numpy.vstack([_project(s) for s in k.samples])
+                    enrolled = self.enroll_one_sample(data)
+                    self.write_biometric_reference(enrolled, candidate)
+
+                retval.append(
+                    DelayedSample(
+                        functools.partial(self.read_biometric_reference, candidate),
+                        parent=k,
+                    )
+                )
+            else:
+                # compute on-the-fly
+                data = _project(k)
+                retval.append(Sample(self.enroll_one_sample(data), parent=k))
+
+        return retval
+
+    def write_biometric_reference(self, biometric_reference, filename):
+        """Writes the enrolled model to the given file.
+        In this base class implementation:
+
+        - If the given model has a 'save' attribute, it calls ``model.save(bob.io.base.HDF5File(model_file), 'w')``.
+          In this case, the given model_file might be either a file name or a :py:class:`bob.io.base.HDF5File`.
+        - Otherwise, it uses :py:func:`bob.io.base.save` to do that.
+
+        If you have a different format, please overwrite this function.
+
+        **Parameters:**
+
+        model : object
+          A model as returned by the :py:meth:`enroll` function, which should be written.
+
+        model_file : str or :py:class:`bob.io.base.HDF5File`
+          The file open for writing, or the file name to write to.
+        """
+        import h5py
+
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("biometric_reference", data=biometric_reference)
+
+    def read_biometric_reference(self, filename):
+        import h5py
+
+        with h5py.File(filename, "r") as f:
+            data = f["biometric_reference"].value
+        return data
+
+    def enroll_one_sample(self, data):
+        """
+        It handles the creation of ONE biometric reference for the vanilla pipeline
+
+        Parameters
+        ----------
+
+            data:
+                Data used for the creation of ONE BIOMETRIC REFERENCE        
+
+        """
+
+        raise NotImplemented("Please, implement me")
+
+    def project_one_sample(self, background_model, data):
+        """
+        If your method performs projection, it runs the projection
+
+        Parameters
+        ----------
+
+            background_model:
+                Background model (as returned by :py:meth:`fit`) used for the projection
+
+            data:
+                Data used for the projection of ONE BIOMETRIC REFERENCE
+
+        """
+
+        raise NotImplementedError("Please, implement me")
+
+    def score(self, probes, references, background_model=None, *args, **kwargs):
+        """Scores a new sample against multiple (potential) references
+
+        Parameters
+        ----------
+
+            probes : list
+                A list of :py:class:`SampleSet` objects to be used for
+                scoring the input references
+
+            references : list
+                A list of :py:class:`Sample` objects to be used for
+                scoring the input probes, must have an ``id`` attribute that
+                will be used to cross-reference which probes need to be scored.
+
+            background_model :
+                Object containing the background model (as returned by :py:meth:`fit`)
+
+            *args, **kwargs :
+                Extra parameters that can be used to hook-up processing graph
+                dependencies, but are currently ignored
+
+
+        Returns
+        -------
+
+            scores : list
+                For each sample in a probe, returns as many scores as there are
+                samples in the probe, together with the probe's and the
+                relevant reference's subject identifiers.
+
+        """
+
+        def _project(k):
+            return (
+                self.project_one_sample(background_model, k.data)
+                if self.performs_projection
+                else k.data
+            )
+
+        retval = []
+        for p in probes:
+            data = numpy.vstack([_project(s) for s in p.samples])
+
+            for subprobe_id, (s, parent) in enumerate(zip(data, p.samples)):
+                # each sub-probe in the probe needs to be checked
+                subprobe_scores = []
+                for ref in [r for r in references if r.id in p.references]:
+                    subprobe_scores.append(
+                        Sample(self.score_one_sample(ref.data, s), parent=ref)
+                    )
+                subprobe = SampleSet(subprobe_scores, parent=p)
+                subprobe.subprobe_id = subprobe_id
+                retval.append(subprobe)
+        return retval
+
+    def score_one_sample(self, biometric_reference, data):
+        """It handles the score computation for one sample
+
+        Parameters
+        ----------
+
+            biometric_reference : list
+                Biometric reference to be compared
+
+            data : list
+                Data to be compared
+
+        Returns
+        -------
+
+            score : float
+                A score expressing the similarity between ``biometric_reference``
+                and ``data``.
+
+        """
+        raise NotImplemented("Please, implement me")
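
A concrete algorithm only has to fill in the hooks above. A minimal sketch of such a subclass — a toy averaging model with no projection; everything except the base class itself is illustrative:

.. code-block:: python

    import numpy
    from scipy.spatial.distance import euclidean

    from bob.bio.base.pipelines.vanilla_biometrics.blocks import (
        VanillaBiometricsAlgorithm,
    )

    class AverageTemplate(VanillaBiometricsAlgorithm):
        """Toy algorithm: no background model, averaged enrollment and
        negative euclidean distance as the similarity score."""

        def __init__(self):
            super(AverageTemplate, self).__init__(performs_projection=False)

        def fit(self, samplesets, checkpoint):
            return None  # no background model is needed

        def enroll_one_sample(self, data):
            # ``data`` is the stacked 2D array built by ``enroll()``
            return numpy.mean(data, axis=0)

        def score_one_sample(self, biometric_reference, data):
            return -euclidean(biometric_reference, data)
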
diff --git a/bob/bio/base/pipelines/vanilla_biometrics/database.py b/bob/bio/base/pipelines/vanilla_biometrics/database.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bob/bio/base/pipelines/blocks.py b/bob/bio/base/pipelines/vanilla_biometrics/legacy.py
similarity index 64%
rename from bob/bio/base/pipelines/blocks.py
rename to bob/bio/base/pipelines/vanilla_biometrics/legacy.py
index 5882a3d9..961e2c27 100644
--- a/bob/bio/base/pipelines/blocks.py
+++ b/bob/bio/base/pipelines/vanilla_biometrics/legacy.py
@@ -1,10 +1,8 @@
 #!/usr/bin/env python
 # vim: set fileencoding=utf-8 :
 
-
 """Re-usable blocks for legacy bob.bio.base algorithms"""
 
-
 import os
 import copy
 import functools
@@ -12,6 +10,7 @@ import functools
 import bob.io.base
 from bob.pipelines.sample.sample import DelayedSample, SampleSet, Sample
 
+
 class DatabaseConnector:
     """Wraps a bob.bio.base database and generates conforming samples
 
@@ -100,10 +99,7 @@ class DatabaseConnector:
         for m in self.database.model_ids(protocol=self.protocol, groups=group):
 
             objects = self.database.objects(
-                protocol=self.protocol,
-                groups=group,
-                model_ids=(m,),
-                purposes="enroll",
+                protocol=self.protocol, groups=group, model_ids=(m,), purposes="enroll"
             )
 
             retval.append(
@@ -156,10 +152,7 @@ class DatabaseConnector:
             # Getting all the probe objects from a particular biometric
             # reference
             objects = self.database.objects(
-                protocol=self.protocol,
-                groups=group,
-                model_ids=(m,),
-                purposes="probe",
+                protocol=self.protocol, groups=group, model_ids=(m,), purposes="probe"
             )
 
             # Creating probe samples
@@ -184,185 +177,8 @@ class DatabaseConnector:
                     )
                 else:
                     probes[o.id].references.append(m)
-     
-        return list(probes.values())
-
-
-class SampleLoader:
-    """Adaptor for loading, preprocessing and feature extracting samples
-
-    This adaptor class wraps around sample:
-
-    .. code-block:: text
-
-       [loading [-> preprocessing [-> extraction]]]
-
-    The input sample object must obbey the following (minimal) API:
-
-        * attribute ``id``: Contains an unique (string-fiable) identifier for
-          processed samples
-        * attribute ``data``: Contains the data for this sample
-
-    Optional checkpointing is also implemented for each of the states,
-    independently.  You may check-point just the preprocessing, feature
-    extraction or both.
-
-
-    Parameters
-    ----------
-
-    pipeline : :py:class:`list` of (:py:class:`str`, callable)
-        A list of doubles in which the first entry are names of each processing
-        step in the pipeline and second entry must be default-constructible
-        :py:class:`bob.bio.base.preprocessor.Preprocessor` or
-        :py:class:`bob.bio.base.preprocessor.Extractor` in any order.  Each
-        of these objects must be a python type, that can be instantiated and
-        used through its ``__call__()`` interface to process a single entry of
-        a sample.  For python types that you may want to plug-in, but do not
-        offer a default constructor that you like, pass the result of
-        :py:func:`functools.partial` instead.
-
-    """
-
-    def __init__(self, pipeline):
-        self.pipeline = copy.deepcopy(pipeline)
 
-    def _handle_step(self, sset, func, checkpoint):
-        """Handles a single step in the pipeline, with optional checkpointing
-
-        Parameters
-        ----------
-
-        sset : SampleSet
-            The original sample set to be processed (delayed or pre-loaded)
-
-        func : callable
-            The processing function to call for processing **each** sample in
-            the set, if needs be
-
-        checkpoint : str, None
-            An optional string that may point to a directory that will be used
-            for checkpointing the processing phase in question
-
-
-        Returns
-        -------
-
-        r : SampleSet
-            The prototype processed sample.  If no checkpointing required, this
-            will be of type :py:class:`Sample`.  Otherwise, it will be a
-            :py:class:`DelayedSample`
-
-        """
-
-        if checkpoint is not None:
-            samples = []  # processed samples
-            for s in sset.samples:
-                # there can be a checkpoint for the data to be processed
-                candidate = os.path.join(checkpoint, s.path + ".hdf5")
-                if not os.path.exists(candidate):
-                    # preprocessing is required, and checkpointing, do it now
-                    data = func(s.data)
-
-                    # notice this can be called in parallel w/o failing
-                    bob.io.base.create_directories_safe(
-                        os.path.dirname(candidate)
-                    )
-                    # bob.bio.base standard interface for preprocessor
-                    # has a read/write_data methods
-                    writer = (
-                        getattr(func, "write_data")
-                        if hasattr(func, "write_data")
-                        else getattr(func, "write_feature")
-                    )
-                    writer(data, candidate)
-
-                # because we are checkpointing, we return a DelayedSample
-                # instead of normal (preloaded) sample. This allows the next
-                # phase to avoid loading it would it be unnecessary (e.g. next
-                # phase is already check-pointed)
-                reader = (
-                    getattr(func, "read_data")
-                    if hasattr(func, "read_data")
-                    else getattr(func, "read_feature")
-                )
-                samples.append(
-                    DelayedSample(
-                        functools.partial(reader, candidate), parent=s
-                    )
-                )
-        else:
-            # if checkpointing is not required, load the data and preprocess it
-            # as we would normally do
-            samples = [Sample(func(s.data), parent=s) for s in sset.samples]
-
-        r = SampleSet(samples, parent=sset)
-        return r
-
-    def _handle_sample(self, sset, pipeline):
-        """Handles a single sampleset through a pipelien
-
-        Parameters
-        ----------
-
-        sset : SampleSet
-            The original sample set to be processed (delayed or pre-loaded)
-
-        pipeline : :py:class:`list` of :py:class:`tuple`
-            A list of tuples, each comprising of one processing function and
-            one checkpoint directory (:py:class:`str` or ``None``, to avoid
-            checkpointing that phase), respectively
-
-
-        Returns
-        -------
-
-        r : Sample
-            The processed sample
-
-        """
-
-        r = sset
-        for func, checkpoint in pipeline:
-            r = r if func is None else self._handle_step(r, func, checkpoint)
-        return r
-
-    def __call__(self, samples, checkpoints):
-        """Applies the pipeline chaining with optional checkpointing
-
-        Our implementation is optimized to minimize disk I/O to the most.  It
-        yields :py:class:`DelayedSample`'s instead of :py:class:`Sample` if
-        checkpointing is enabled.
-
-
-        Parameters
-        ----------
-
-        samples : list
-            List of :py:class:`SampleSet` to be treated by this pipeline
-
-        checkpoints : dict
-            A dictionary (with any number of entries) that may contain as many
-            keys as those defined when you constructed this class with the
-            pipeline tuple list.  Upon execution, the existance of an entry
-            that defines checkpointing, this phase of the pipeline will be
-            checkpointed.  Notice that you are in the control of checkpointing.
-            If you miss an intermediary step, it will trigger this loader to
-            load the relevant sample, even if the next phase is supposed to be
-            checkpointed.  This strategy keeps the implementation as simple as
-            possible.
-
-
-        Returns
-        -------
-
-        samplesets : list
-            Loaded samplesets, after optional preprocessing and extraction
-
-        """
-
-        pipe = [(v(), checkpoints.get(k)) for k, v in self.pipeline]
-        return [self._handle_sample(k, pipe) for k in samples]
+        return list(probes.values())
 
 
 class AlgorithmAdaptor:
@@ -513,20 +329,15 @@ class AlgorithmAdaptor:
         retval = []
         for k in references:
             if checkpoint is not None:
-                candidate = os.path.join(
-                    os.path.join(checkpoint, k.path + ".hdf5")
-                )
+                candidate = os.path.join(os.path.join(checkpoint, k.path + ".hdf5"))
                 if not os.path.exists(candidate):
                     # create new checkpoint
-                    bob.io.base.create_directories_safe(
-                        os.path.dirname(candidate)
-                    )
+                    bob.io.base.create_directories_safe(os.path.dirname(candidate))
                     enrolled = model.enroll(k)
                     model.model.write_model(enrolled, candidate)
                 retval.append(
                     DelayedSample(
-                        functools.partial(model.model.read_model, candidate),
-                        parent=k,
+                        functools.partial(model.model.read_model, candidate), parent=k
                     )
                 )
             else:
@@ -576,16 +387,13 @@ class AlgorithmAdaptor:
                 data = [model.project(s.data) for s in p.samples]
             else:
                 data = [s.data for s in p.samples]
-                
+
             for subprobe_id, (s, parent) in enumerate(zip(data, p.samples)):
                 # each sub-probe in the probe needs to be checked
                 subprobe_scores = []
                 for ref in [r for r in references if r.id in p.references]:
-                    subprobe_scores.append(
-                        Sample(model.score(ref.data, s), parent=ref)
-                    )
+                    subprobe_scores.append(Sample(model.score(ref.data, s), parent=ref))
                 subprobe = SampleSet(subprobe_scores, parent=p)
                 subprobe.subprobe_id = subprobe_id
                 retval.append(subprobe)
         return retval
-
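
Both the legacy adaptor above and the new base class score a probe only against the references its sample set declares. A small stand-alone sketch of that cross-referencing contract, using plain stand-in objects rather than the real ``Sample``/``SampleSet`` classes:

.. code-block:: python

    from collections import namedtuple

    # stand-ins: an enrolled reference carries an ``id``; a probe set lists,
    # in ``references``, the ids of the models it must be scored against
    Reference = namedtuple("Reference", ["id", "data"])
    ProbeSet = namedtuple("ProbeSet", ["references", "samples"])

    references = [Reference("model-1", None), Reference("model-2", None)]
    probe = ProbeSet(references=["model-2"], samples=[])

    to_score = [r for r in references if r.id in probe.references]
    assert [r.id for r in to_score] == ["model-2"]
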
diff --git a/bob/bio/base/pipelines/vanilla_biometrics.py b/bob/bio/base/pipelines/vanilla_biometrics/pipeline.py
similarity index 97%
rename from bob/bio/base/pipelines/vanilla_biometrics.py
rename to bob/bio/base/pipelines/vanilla_biometrics/pipeline.py
index 5ef4d815..da9cb4ff 100644
--- a/bob/bio/base/pipelines/vanilla_biometrics.py
+++ b/bob/bio/base/pipelines/vanilla_biometrics/pipeline.py
@@ -2,7 +2,7 @@
 # vim: set fileencoding=utf-8 :
 
 """
-Biometric "blocks"
+Implementation of the Vanilla Biometrics pipeline using Dask :ref:`bob.bio.base.struct_bio_rec_sys`
 
 This file contains simple processing blocks meant to be used
 for bob.bio experiments
@@ -120,7 +120,7 @@ def train_background_model(
     checkpoints,
 ):
     """
-    Train background model (without labels)
+    Train background model (without labels) :ref:`vanilla-pipeline-1`
 
     Parameters
     ----------
@@ -196,7 +196,7 @@ def create_biometric_reference(
     checkpoints,
 ):
     """
-    Create biometric references
+    Create biometric references :ref:`vanilla-pipeline-2`
 
     Parameters
     ----------
@@ -281,7 +281,7 @@ def compute_scores(
     npartitions,
     checkpoints,
 ):
-    """ Compute biometric scores
+    """ Compute biometric scores :ref:`_vanilla-pipeline-2`
 
     Parameters
     ----------
diff --git a/bob/bio/base/script/vanilla_biometrics.py b/bob/bio/base/script/vanilla_biometrics.py
index 85f067d1..d6c3111c 100644
--- a/bob/bio/base/script/vanilla_biometrics.py
+++ b/bob/bio/base/script/vanilla_biometrics.py
@@ -173,7 +173,7 @@ def vanilla_biometrics(
     checkpointing = True
 
     # Chooses the pipeline to run
-    from bob.bio.base.pipelines.vanilla_biometrics import biometric_pipeline
+    from bob.bio.base.pipelines.vanilla_biometrics.pipeline import biometric_pipeline
 
     if not os.path.exists(output):
         os.makedirs(output)
@@ -205,7 +205,7 @@ def vanilla_biometrics(
 
     # Mechanism that loads samples
     # from ..bob_bio.blocks import SampleLoader
-    from bob.bio.base.pipelines.annotated_blocks import SampleLoaderAnnotated as SampleLoader
+    from bob.bio.base.pipelines.vanilla_biometrics.annotated_legacy import SampleLoaderAnnotated as SampleLoader
     loader = SampleLoader(pipeline)
 
     for g in group:
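
The ``pipeline`` fed to the loader is the named-tuple list documented in ``SampleLoader``. A sketch of how such a pipeline and its checkpoint map could be assembled — the step names and output directories are illustrative, and the preprocessor/extractor classes are the ones used in the former ``pca_atnt.py`` configuration (so ``bob.bio.face`` must be installed):

.. code-block:: python

    import functools

    from bob.bio.base.extractor import Linearize
    from bob.bio.base.pipelines.vanilla_biometrics.blocks import SampleLoader
    from bob.bio.face.preprocessor import Base

    # (name, default-constructible callable) pairs, in processing order
    pipeline = [
        ("preprocessor", functools.partial(Base, color_channel="gray",
                                           dtype="float64")),
        ("extractor", Linearize),
    ]
    loader = SampleLoader(pipeline)

    # checkpoints maps the step names above to (optional) output directories
    checkpoints = {
        "preprocessor": "/tmp/experiment/preprocessed",
        "extractor": "/tmp/experiment/extracted",
    }

    # sample_sets would normally come from a DatabaseConnector
    sample_sets = []
    processed = loader(sample_sets, checkpoints)
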
diff --git a/doc/experiments.rst b/doc/experiments.rst
index 3bed99da..e288b2b9 100644
--- a/doc/experiments.rst
+++ b/doc/experiments.rst
@@ -21,7 +21,7 @@ The previous section described the :ref:`bob.bio.base.struct_bio_rec_sys` using
 This section will describe in detail such sub-pipelines and its relation with biometric experiments.
 
 These sub-pipelines were built using `Dask delayed <https://docs.dask.org/en/latest/delayed.html>`_ ; please follow the Dask documentation for more information about it.
-Another source of information is the `TAM tutorial given at Idiap <https://github.com/tiagofrepereira2012/tam->`_
+Another source of information is the `TAM tutorial given at Idiap <https://github.com/tiagofrepereira2012/tam-dask>`_
 
 
 To run biometric experiments, we provide a generic CLI command called ``bob pipelines``.
@@ -104,21 +104,20 @@ So, a minimal configuration file (say: ``pca_atnt.py``) would look something lik
 
     extractor = 'linearize'
 
-    from bob.bio.base.algorithm import PCA
-    algorithm = AlgorithmAdaptor(functools.partial(PCA, 0.99))
+    algorithm = 'pca'
 
 
 Running the experiment is then as simple as:
 
 .. code-block:: sh
 
-   $ bob pipelines vanilla-biometrics pca_atnt.py local_parallel.py
+   $ bob pipelines vanilla-biometrics pca_atnt.py local_parallel.py -o atnt-experiment
 
 .. note::
    To be able to run exactly the command line from above, it requires to have :ref:`bob.bio.face <bob.bio.face>` installed.
 
 .. note::
-   The 'dask_client' variable is defined in the configuration file `local_parallel.py`. Check it out the package `bob.pipelines <http://gitlab.idiap.ch/bob/bob.pipelines>`_.
+   The ``dask_client`` variable is defined in the configuration file ``local_parallel.py``. Check out the package `bob.pipelines <http://gitlab.idiap.ch/bob/bob.pipelines>`_.
 
 
 .. note::
@@ -158,7 +157,7 @@ The exact same experiment as above can, hence, be executed using:
 
 .. code-block:: sh
 
-   $ bob pipelines vanilla-biometrics --database mobio-image --preprocessor face-crop-eyes --extractor linearize --algorithm pca --output pca-experiment -vv
+   $ bob pipelines vanilla-biometrics --database atnt --preprocessor face-crop-eyes --extractor linearize --algorithm pca --output atnt-experiment -vv
 
 .. note::
    When running an experiment twice, you might realize that the second execution of the same experiment is much faster than the first one.
@@ -181,12 +180,6 @@ specified in the documentation of
 :py:func:`bob.bio.base.score.load.four_column` or
 :py:func:`bob.bio.base.score.load.five_column`.
 
-Please note that there exists another file called ``Experiment.info`` inside
-the result directory. This file is a pure text file and contains the complete
-configuration of the experiment. With this configuration it is possible to
-inspect all default parameters of the algorithms, and even to re-run the exact
-same experiment.
-
 Metrics
 =======
 
@@ -195,19 +188,20 @@ min.HTER) on a development set and apply it on an evaluation set, just do:
 
 .. code-block:: sh
 
-    $ bob bio metrics -e {dev,test}-4col.txt --legends ExpA --criterion min-hter
+    $ bob bio metrics -v ./atnt-experiment/scores-dev --criterion min-hter
+
+    [Min. criterion: MIN-HTER ] Threshold on Development set `./atnt-experiment/scores-dev`: -1.756157e+03
+    =====================  ===============
+    ..                     Development
+    =====================  ===============
+    Failure to Acquire     0.0%
+    False Match Rate       9.7% (184/1900)
+    False Non Match Rate   9.0% (9/100)
+    False Accept Rate      9.7%
+    False Reject Rate      9.0%
+    Half Total Error Rate  9.3%
+    =====================  ===============
 
-    [Min. criterion: MIN-HTER ] Threshold on Development set `ExpA`: -4.830500e-03
-    ======  ======================  =================
-    ExpA    Development dev-4col    Eval. test-4col
-    ======  ======================  =================
-    FtA     0.0%                    0.0%
-    FMR     6.7% (35/520)           2.5% (13/520)
-    FNMR    6.7% (26/390)           6.2% (24/390)
-    FAR     6.7%                    2.5%
-    FRR     6.7%                    6.2%
-    HTER    6.7%                    4.3%
-    ======  ======================  =================
 
 .. note::
     When evaluation scores are provided, ``--eval`` option must be passed.
@@ -219,7 +213,7 @@ For example:
 
 .. code-block:: sh
 
-    bob bio metrics -e {dev,test}-4col.txt --legends ExpA --criterion cllr
+    bob bio metrics -v ./atnt-experiment/scores-dev --criterion cllr
 
     ======  ======================  ================
     Computing  Cllr and minCllr...
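
The ``local_parallel.py`` file referenced throughout this page only needs to expose a ``dask_client`` variable; see the ``bob.pipelines`` package for the files actually shipped. A minimal sketch built directly on ``dask.distributed`` (worker counts are illustrative):

.. code-block:: python

    # local_parallel.py
    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    dask_client = Client(cluster)
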
diff --git a/doc/implementation.rst b/doc/implementation.rst
index de9e7be2..73437534 100644
--- a/doc/implementation.rst
+++ b/doc/implementation.rst
@@ -3,9 +3,9 @@
 .. author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
 .. Mon 23 04 2012
 
-======================
-Implementation Details
-======================
+===========================================
+Vanilla Biometrics - Implementation Details
+===========================================
 
 The ``bob.bio`` module is specifically designed to be as flexible as possible while trying to keep things simple.
 Therefore, it uses python to implement tools such as preprocessors, feature extractors and recognition algorithms.
@@ -21,11 +21,10 @@ Most of the functionality is provided in the base classes, but any function can
 
 In the derived class constructors, the base class constructor needs to be called.
 For automatically tracing the algorithms, all parameters that are passed to the derived class constructor should be passed to the base class constructor as a list of keyword arguments (which is indicated by ``...`` below).
-This will assure that all parameters of the experiments are stored into the ``Experiment.info`` file.
 
 .. note::
    All tools are based on reading, processing and writing files.
-   By default, any type of file is allowed to be handled, and file names are provided to the ``read_...`` and ``write_...`` functions as strings.
+   By default, any type of file is allowed to be handled, and file names are provided to the ``read_data`` and ``write_data`` functions as strings.
    However, some of the extensions -- particularly the :ref:`bob.bio.video <bob.bio.video>` extension -- requires the read and write functions to handle files of type :py:class:`bob.io.base.HDF5File`.
 
 If you plan to write your own tools, please assure that you are following the following structure.
@@ -33,8 +32,8 @@ If you plan to write your own tools, please assure that you are following the fo
 
 .. _bob.bio.base.preprocessors:
 
-Preprocessors
-~~~~~~~~~~~~~
+Preprocessors and Extractors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 All preprocessor classes are derived from :py:class:`bob.bio.base.preprocessor.Preprocessor`.
 All of them implement the following two functions:
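
As an illustration of that structure, a minimal preprocessor sketch — assuming the conventional ``bob.bio.base`` interface of a ``__call__(data, annotations)`` method plus the ``read_data``/``write_data`` helpers mentioned above:

.. code-block:: python

    from bob.bio.base.preprocessor import Preprocessor

    class Scale(Preprocessor):
        """Toy preprocessor: casts to float64 and rescales to [0, 1]."""

        def __init__(self, **kwargs):
            super(Scale, self).__init__(**kwargs)

        def __call__(self, data, annotations=None):
            data = data.astype("float64")
            return data / data.max()
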
-- 
GitLab