Commit c840367f authored by Tiago de Freitas Pereira

Reorganizing the vanilla_biometrics

Porting code

Implementing the Vanilla Algorithm

Developing the VanillaBiometrics Algorithm

Score computation of the vanilla pipelines

Finished PCA scoring
parent ce83a2ef
Pipeline #37612 failed with stage in 7 minutes and 25 seconds
@@ -6,6 +6,8 @@
import numpy
import os
from .. import utils
import warnings
class Algorithm (object):
"""This is the base class for all biometric recognition algorithms.
@@ -87,6 +89,11 @@ class Algorithm (object):
min_t_model_file_size=1000,
**kwargs # parameters from the derived class that should be reported in the __str__() function
):
warnings.warn("bob.bio.base.Algorithm is Deprecated", DeprecationWarning)
self.performs_projection = performs_projection
self.requires_projector_training = performs_projection and requires_projector_training
self.split_training_features_by_client = split_training_features_by_client
......
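The deprecation warning above is raised from the base constructor, so any legacy algorithm now triggers it on construction. A minimal sketch of how it surfaces (the ``Dummy`` subclass is hypothetical, only for illustration):

import warnings
from bob.bio.base.algorithm import Algorithm

class Dummy(Algorithm):  # hypothetical subclass, only to reach the base constructor
    pass

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Dummy()  # the base __init__ now emits "bob.bio.base.Algorithm is Deprecated"
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)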
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
import bob.learn.linear
import bob.io.base
from bob.bio.base.pipelines.vanilla_biometrics.blocks import VanillaBiometricsAlgoritm
import sklearn.decomposition
from scipy.spatial.distance import euclidean
import numpy
import scipy.spatial
from .Algorithm import Algorithm
import logging
logger = logging.getLogger("bob.bio.base")
class PCA (Algorithm):
"""Performs a principal component analysis (PCA) on the given data.
class PCA(VanillaBiometricsAlgoritm):
"""Performs a principal component analysis (PCA) on the given data.
This algorithm computes a PCA projection (:py:class:`bob.learn.linear.PCATrainer`) on the given training features, projects the features to eigenspace and computes the distance of two projected features in eigenspace.
For example, the eigenface algorithm as proposed by [TP91]_ can be run with this class.
@@ -29,181 +29,95 @@ class PCA (Algorithm):
A function taking two parameters and returning a float.
If ``uses_variances`` is set to ``True``, the function is provided with a third parameter, which is the vector of variances (aka. eigenvalues).
is_distance_function : bool
Set this flag to ``False`` if the given ``distance_function`` computes a similarity value (i.e., higher values are better)
svd_solver : str
The SVD solver used to resolve the eigenvalue problem (forwarded to :py:class:`sklearn.decomposition.PCA`)
uses_variances : bool
If set to ``True``, the ``distance_function`` is provided with a third argument, which is the vector of variances (aka. eigenvalues).
factor : float
Multiplication factor applied to the distance in the scoring stage (``-1`` turns a distance into a similarity)
kwargs : ``key=value`` pairs
A list of keyword arguments directly passed to the :py:class:`Algorithm` base class constructor.
"""
def __init__(
self,
subspace_dimension, # if int, number of subspace dimensions; if float, percentage of variance to keep
distance_function = scipy.spatial.distance.euclidean,
is_distance_function = True,
uses_variances = False,
**kwargs # parameters directly sent to the base class
):
# call base class constructor and register that the algorithm performs a projection
super(PCA, self).__init__(
performs_projection = True,
subspace_dimension = subspace_dimension,
distance_function = str(distance_function),
is_distance_function = is_distance_function,
uses_variances = uses_variances,
**kwargs
)
self.subspace_dim = subspace_dimension
self.machine = None
self.distance_function = distance_function
self.factor = -1. if is_distance_function else 1.
self.uses_variances = uses_variances
def _check_feature(self, feature, projected=False):
"""Checks that the features are appropriate"""
if not isinstance(feature, numpy.ndarray) or feature.ndim != 1 or feature.dtype != numpy.float64:
raise ValueError("The given feature is not appropriate")
index = 1 if projected else 0
if self.machine is not None and feature.shape[0] != self.machine.shape[index]:
raise ValueError("The given feature is expected to have %d elements, but it has %d" % (self.machine.shape[index], feature.shape[0]))
def train_projector(self, training_features, projector_file):
"""Generates the PCA covariance matrix and writes it into the given projector_file.
**Parameters:**
training_features : [1D :py:class:`numpy.ndarray`]
A list of 1D training arrays (vectors) to train the PCA projection matrix with.
projector_file : str
A writable file, into which the PCA projection matrix (as a :py:class:`bob.learn.linear.Machine`) and the eigenvalues will be written.
"""
# Assure that all data are 1D
[self._check_feature(feature) for feature in training_features]
# Initializes the data
data = numpy.vstack(training_features)
logger.info(" -> Training LinearMachine using PCA")
t = bob.learn.linear.PCATrainer()
self.machine, self.variances = t.train(data)
# For re-shaping, we need to copy...
self.variances = self.variances.copy()
# compute variance percentage, if desired
if isinstance(self.subspace_dim, float):
cummulated = numpy.cumsum(self.variances) / numpy.sum(self.variances)
for index in range(len(cummulated)):
if cummulated[index] > self.subspace_dim:
self.subspace_dim = index
break
self.subspace_dim = index
logger.info(" ... Keeping %d PCA dimensions", self.subspace_dim)
# re-shape machine
self.machine.resize(self.machine.shape[0], self.subspace_dim)
self.variances = numpy.resize(self.variances, (self.subspace_dim))
f = bob.io.base.HDF5File(projector_file, "w")
f.set("Eigenvalues", self.variances)
f.create_group("Machine")
f.cd("/Machine")
self.machine.save(f)
def load_projector(self, projector_file):
"""Reads the PCA projection matrix and the eigenvalues from file.
**Parameters:**
projector_file : str
An existing file, from which the PCA projection matrix and the eigenvalues are read.
"""
# read PCA projector
f = bob.io.base.HDF5File(projector_file)
self.variances = f.read("Eigenvalues")
f.cd("/Machine")
self.machine = bob.learn.linear.Machine(f)
# --- methods below belonged to the legacy PCA(Algorithm) implementation ---

def project(self, feature):
    """project(feature) -> projected

    Projects the given feature into eigenspace.

    **Parameters:**

    feature : 1D :py:class:`numpy.ndarray`
      The 1D feature to be projected.

    **Returns:**

    projected : 1D :py:class:`numpy.ndarray`
      The ``feature`` projected into eigenspace.
    """
    self._check_feature(feature)
    # Projects the data
    return self.machine(feature)

def enroll(self, enroll_features):
    """enroll(enroll_features) -> model

    Enrolls the model by storing all given input vectors.

    **Parameters:**

    enroll_features : [1D :py:class:`numpy.ndarray`]
      The list of projected features to enroll the model from.

    **Returns:**

    model : 2D :py:class:`numpy.ndarray`
      The enrolled model.
    """
    assert len(enroll_features)
    [self._check_feature(feature, True) for feature in enroll_features]
    # just store all the features
    return numpy.vstack(enroll_features)

def score(self, model, probe):
    """score(model, probe) -> float

    Computes the distance of the model to the probe using the distance function specified in the constructor.

    **Parameters:**

    model : 2D :py:class:`numpy.ndarray`
      The model storing all enrollment features.

    probe : 1D :py:class:`numpy.ndarray`
      The probe feature vector in eigenspace.

    **Returns:**

    score : float
      A similarity value between ``model`` and ``probe``
    """
    self._check_feature(probe, True)
    # return the negative distance (as a similarity measure)
    if len(model.shape) == 2:
        # we have multiple models, so we use the multiple model scoring
        return self.score_for_multiple_models(model, probe)
    elif self.uses_variances:
        # single model, single probe (multiple probes have already been handled)
        return self.factor * self.distance_function(model, probe, self.variances)
    else:
        # single model, single probe (multiple probes have already been handled)
        return self.factor * self.distance_function(model, probe)

# re-define unused functions, just so that they do not get documented
def train_enroller(*args, **kwargs): raise NotImplementedError()
def load_enroller(*args, **kwargs): pass

# --- new implementation, based on VanillaBiometricsAlgoritm ---

def __init__(
    self,
    subspace_dimension,  # if int, number of subspace dimensions; if float, percentage of variance to keep
    distance_function=euclidean,
    svd_solver="auto",
    factor=-1,
    **kwargs,  # parameters directly sent to the base class
):
    # call base class constructor and register that the algorithm performs a projection
    super(PCA, self).__init__(performs_projection=True)

    self.subspace_dim = subspace_dimension
    self.distance_function = distance_function
    self.svd_solver = svd_solver
    self.factor = factor

def fit(self, samplesets, checkpoints):
    """
    This method should implement sub-pipeline 0 of the Vanilla Biometrics Pipeline, :ref:`vanilla-pipeline-0`.

    It represents the training of background models that an algorithm may need.

    Parameters
    ----------

    samplesets: :py:class:`bob.pipelines.sample.sample.SampleSet`
        Set of samples used to train a background model

    checkpoint: str
        If provided, must be the path leading to a location where this
        model should be saved at (complete path without extension) -
        currently, it needs to be provided because of existing
        serialization requirements (see bob/bob.io.base#106), but
        checkpointing will still work as expected.
    """
    pca = sklearn.decomposition.PCA(self.subspace_dim, svd_solver=self.svd_solver)
    samples_array = self._stack_samples_2_ndarray(samplesets)
    logger.info(
        "Training PCA with samples of shape {0}".format(samples_array.shape)
    )
    pca.fit(samples_array)

    # TODO: checkpoint (save) the fitted background model

    return pca

def project_one_sample(self, background_model, data):
    if data.ndim == 1:
        return background_model.transform(data.reshape(1, -1))

    return background_model.transform(data)

def enroll_one_sample(self, data):
    return numpy.mean(data, axis=0)

def score_one_sample(self, biometric_reference, data):
    """It handles the score computation for one sample

    Parameters
    ----------

    biometric_reference : list
        Biometric reference to be compared

    data : list
        Data to be compared

    Returns
    -------

    score : float
        The distance between ``biometric_reference`` and ``data``, multiplied
        by ``factor`` (with the default ``factor`` of ``-1``, higher values
        indicate a better match).
    """
    return self.factor * self.distance_function(biometric_reference, data)
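To make the new scoring path concrete, here is a rough usage sketch chaining fit, projection, enrollment and scoring. The tiny ``_Sample``/``_SampleSet`` stand-ins are hypothetical and only provide the ``.samples``/``.data`` attributes that ``_stack_samples_2_ndarray`` reads; they are not the real ``bob.pipelines`` types:

import numpy
from bob.bio.base.algorithm import PCA

class _Sample:          # hypothetical stand-in carrying only the ``data`` attribute
    def __init__(self, data):
        self.data = data

class _SampleSet:       # hypothetical stand-in carrying only the ``samples`` attribute
    def __init__(self, samples):
        self.samples = samples

# five sample sets of ten 400-dimensional float64 feature vectors each
training = [
    _SampleSet([_Sample(numpy.random.rand(400)) for _ in range(10)]) for _ in range(5)
]

algorithm = PCA(0.99)                                    # keep 99% of the variance
background = algorithm.fit(training, checkpoints=None)   # returns a fitted sklearn PCA

# project one probe and one enrollment feature into eigenspace
probe = algorithm.project_one_sample(background, numpy.random.rand(400)).ravel()
enrolled = algorithm.project_one_sample(background, numpy.random.rand(400))

reference = algorithm.enroll_one_sample(enrolled)        # mean of the projected features
score = algorithm.score_one_sample(reference, probe)     # factor * euclidean distance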
#!/usr/bin/env python
import bob.bio.base
import scipy.spatial
algorithm = bob.bio.base.algorithm.PCA(
subspace_dimension = .95,
distance_function = scipy.spatial.distance.euclidean,
is_distance_function = True
)
from bob.bio.base.algorithm import PCA
algorithm = PCA(0.99)
\ No newline at end of file
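For reference, a spelled-out variant of the same new-style configuration, using the defaults of the constructor shown above (the keyword names follow the new ``__init__`` signature):

#!/usr/bin/env python
from scipy.spatial.distance import euclidean
from bob.bio.base.algorithm import PCA

algorithm = PCA(
    0.99,                          # keep 99% of the variance
    distance_function=euclidean,   # negated via ``factor`` to yield a similarity
    svd_solver="auto",             # forwarded to sklearn.decomposition.PCA
)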
from bob.bio.base.pipelines.blocks import DatabaseConnector, AlgorithmAdaptor
import functools
from bob.bio.base.pipelines.vanilla_biometrics.legacy import DatabaseConnector, AlgorithmAdaptor
import bob.db.atnt
database = DatabaseConnector(bob.db.atnt.Database(), protocol="Default")
from bob.bio.face.preprocessor import Base
preprocessor = functools.partial(
Base,
color_channel="gray",
dtype="float64",
)
from bob.bio.base.extractor import Linearize
extractor = Linearize
#extractor = 'linearize'
preprocessor = "face-detect"
extractor = 'linearize'
from bob.bio.base.algorithm import PCA
algorithm = AlgorithmAdaptor(functools.partial(PCA, 0.99))
algorithm = 'pca'
@@ -3,4 +3,4 @@
import bob.bio.base
# Linearization of the data to a vector, no data type specified
extractor = bob.bio.base.extractor.Linearize()
extractor = bob.bio.base.extractor.Linearize
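Presumably the parentheses are dropped here because the new SampleLoader (introduced later in this commit) instantiates each pipeline entry itself, so configurations are expected to hand over a default-constructible type rather than an instance. A sketch of the two equivalent spellings under that assumption:

import functools
from bob.bio.base.extractor import Linearize

extractor = Linearize                       # a type, instantiated later by the loader
# extractor = functools.partial(Linearize)  # equivalent, handy when constructor arguments are needed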
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
@@ -12,7 +12,8 @@ import functools
import bob.io.base
from .blocks import DatabaseConnector, SampleLoader
from .legacy import DatabaseConnector
from .blocks import SampleLoader
from bob.pipelines.sample.sample import SampleSet, DelayedSample, Sample
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
import copy
import functools
import numpy
import os
import bob.io.base
from bob.pipelines.sample.sample import DelayedSample, SampleSet, Sample
"""Re-usable blocks for legacy bob.bio.base algorithms"""
class SampleLoader:
"""Adaptor for loading, preprocessing and feature extracting samples
This adaptor class wraps the following operations around each sample:
.. code-block:: text
[loading [-> preprocessing [-> extraction]]]
The input sample object must obey the following (minimal) API:
* attribute ``id``: Contains a unique (stringifiable) identifier for
processed samples
* attribute ``data``: Contains the data for this sample
Optional checkpointing is also implemented for each of the states,
independently. You may check-point just the preprocessing, feature
extraction or both.
Parameters
----------
pipeline : :py:class:`list` of (:py:class:`str`, callable)
A list of pairs in which the first entry is the name of each processing
step in the pipeline and the second entry is a default-constructible
:py:class:`bob.bio.base.preprocessor.Preprocessor` or
:py:class:`bob.bio.base.extractor.Extractor`, in any order. Each
of these objects must be a Python type that can be instantiated and
used through its ``__call__()`` interface to process a single entry of
a sample. For Python types that you may want to plug in, but that do not
offer a suitable default constructor, pass the result of
:py:func:`functools.partial` instead.
"""
def __init__(self, pipeline):
self.pipeline = copy.deepcopy(pipeline)
def _handle_step(self, sset, func, checkpoint):
"""Handles a single step in the pipeline, with optional checkpointing
Parameters
----------
sset : SampleSet
The original sample set to be processed (delayed or pre-loaded)
func : callable
The processing function to call for processing **each** sample in
the set, if needs be
checkpoint : str, None
An optional string that may point to a directory that will be used
for checkpointing the processing phase in question
Returns
-------
r : SampleSet
The processed sample set. If no checkpointing is required, its members
will be of type :py:class:`Sample`; otherwise they will be
:py:class:`DelayedSample` objects.
"""
if checkpoint is not None:
samples = [] # processed samples
for s in sset.samples:
# there can be a checkpoint for the data to be processed
candidate = os.path.join(checkpoint, s.path + ".hdf5")
if not os.path.exists(candidate):
# preprocessing is required, and checkpointing, do it now
data = func(s.data)
# notice this can be called in parallel w/o failing
bob.io.base.create_directories_safe(os.path.dirname(candidate))
# the bob.bio.base standard interface for preprocessors
# has read_data/write_data methods
writer = (
getattr(func, "write_data")
if hasattr(func, "write_data")
else getattr(func, "write_feature")
)
writer(data, candidate)
# because we are checkpointing, we return a DelayedSample
# instead of a normal (preloaded) sample. This allows the next
# phase to avoid loading it when that is unnecessary (e.g. the next
# phase is already check-pointed)
reader = (
getattr(func, "read_data")
if hasattr(func, "read_data")
else getattr(func, "read_feature")
)
samples.append(
DelayedSample(functools.partial(reader, candidate), parent=s)
)
else:
# if checkpointing is not required, load the data and preprocess it
# as we would normally do
samples = [Sample(func(s.data), parent=s) for s in sset.samples]
r = SampleSet(samples, parent=sset)
return r
def _handle_sample(self, sset, pipeline):
"""Handles a single sampleset through a pipelien
Parameters
----------
sset : SampleSet
The original sample set to be processed (delayed or pre-loaded)
pipeline : :py:class:`list` of :py:class:`tuple`
A list of tuples, each comprising one processing function and
one checkpoint directory (:py:class:`str`, or ``None`` to avoid
checkpointing that phase)
Returns
-------
r : SampleSet
The processed sample set
"""
r = sset
for func, checkpoint in pipeline:
r = r if func is None else self._handle_step(r, func, checkpoint)
return r
def __call__(self, samples, checkpoints):
"""Applies the pipeline chaining with optional checkpointing
Our implementation is optimized to minimize disk I/O as much as possible. It
yields :py:class:`DelayedSample` objects instead of :py:class:`Sample` objects if
checkpointing is enabled.
Parameters
----------
samples : list
List of :py:class:`SampleSet` to be treated by this pipeline
checkpoints : dict
A dictionary (with any number of entries) that may contain as many
keys as those defined when you constructed this class with the
pipeline tuple list. Upon execution, if an entry defines a checkpoint
for a given phase, that phase of the pipeline will be
checkpointed. Notice that you are in control of checkpointing:
if you skip an intermediary step, this loader will
load the relevant sample, even if the next phase is supposed to be
checkpointed. This strategy keeps the implementation as simple as
possible.
Returns
-------
samplesets : list
Loaded samplesets, after optional preprocessing and extraction
"""
pipe = [(v(), checkpoints.get(k)) for k, v in self.pipeline]
return [self._handle_sample(k, pipe) for k in samples]
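A short usage sketch for the loader above (the concrete preprocessor/extractor choices and the checkpoint path are illustrative, and ``samplesets`` stands for a list of SampleSet objects coming from the database side):

import functools
from bob.bio.base.extractor import Linearize
from bob.bio.face.preprocessor import Base

loader = SampleLoader([
    ("preprocessor", functools.partial(Base, color_channel="gray", dtype="float64")),
    ("extractor", Linearize),
])

# checkpoint only the extraction phase; preprocessed data stays in memory
processed = loader(samplesets, {"extractor": "/path/to/checkpoints/extractor"})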
class VanillaBiometricsAlgoritm(object):
"""Describes a base biometric algorithm for the Vanilla Biometrics Pipeline :ref:`_bob.bio.base.struct_bio_rec_sys`_.
The model can optionally be fitted. Otherwise, it only executes
biometric model enrollment, via ``enroll()``, and scoring, via
``score()``.
"""
def __init__(self, performs_projection=False):
self.performs_projection = performs_projection
def _stack_samples_2_ndarray(self, samplesets, stack_per_sampleset=False):
"""
Stack a set of :py:class:`bob.pipelines.sample.sample.SampleSet`
and convert them to :py:class:`numpy.ndarray`
Parameters
----------
samplesets: :py:class:`bob.pipelines.sample.sample.SampleSet`
Set of samples to be stacked
stack_per_sampleset: bool
If ``True``, returns a list of :py:class:`numpy.ndarray`, one per sample set
"""
if stack_per_sampleset:
# TODO: Make it more efficient
all_data = []
for sampleset in samplesets:
all_data.append(
numpy.array([sample.data for sample in sampleset.samples])
)
return all_data
else:
return numpy.array(
[
sample.data
for sampleset in samplesets
for sample in sampleset.samples
]
)
def fit(self, samplesets, checkpoint):
"""
This method should implement sub-pipeline 0 of the Vanilla Biometrics Pipeline, :ref:`vanilla-pipeline-0`.
It represents the training of background models that an algorithm may need.
Parameters
----------
samplesets: :py:class:`bob.pipelines.sample.sample.SampleSet`
Set of samples used to train a background model
checkpoint: str
If provided, must be the path leading to a location where this
model should be saved at (complete path without extension) -
currently, it needs to be provided because of existing
serialization requirements (see bob/bob.io.base#106), but
checkpointing will still work as expected.