diff --git a/bob/bio/base/algorithm/LDA.py b/bob/bio/base/algorithm/LDA.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3940f4c6eb968bd282bbfefd0a37b47244ca31 --- /dev/null +++ b/bob/bio/base/algorithm/LDA.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : +# Manuel Guenther <Manuel.Guenther@idiap.ch> + +import bob.io.base +import bob.learn.linear + +import numpy +import scipy.spatial + +from .Algorithm import Algorithm + +import logging +logger = logging.getLogger("bob.bio.base") + +class LDA (Algorithm): + """Tool for computing linear discriminant analysis (so-called Fisher faces)""" + + def __init__( + self, + lda_subspace_dimension = 0, # if set, the LDA subspace will be truncated to the given number of dimensions; by default it is limited to the number of classes in the training set + pca_subspace_dimension = None, # if set, a PCA subspace truncation is performed before applying LDA; might be integral or float + distance_function = scipy.spatial.distance.euclidean, + is_distance_function = True, + uses_variances = False, + **kwargs # parameters directly sent to the base class + ): + """Initializes the LDA tool with the given configuration""" + + # call base class constructor and register that the LDA tool performs projection and need the training features split by client + Algorithm.__init__( + self, + performs_projection = True, + split_training_features_by_client = True, + + lda_subspace_dimension = lda_subspace_dimension, + pca_subspace_dimension = pca_subspace_dimension, + distance_function = str(distance_function), + is_distance_function = is_distance_function, + uses_variances = uses_variances, + + **kwargs + ) + + # copy information + self.pca_subspace = pca_subspace_dimension + self.lda_subspace = lda_subspace_dimension + if self.pca_subspace and isinstance(self.pca_subspace, int) and self.lda_subspace and self.pca_subspace < self.lda_subspace: + raise ValueError("The LDA subspace is larger than the PCA subspace size. This won't work properly. Please check your setup!") + + self.machine = None + self.distance_function = distance_function + self.factor = -1 if is_distance_function else 1. + self.uses_variances = uses_variances + + + def _check_feature(self, feature): + """Checks that the features are appropriate""" + if not isinstance(feature, numpy.ndarray) or len(feature.shape) != 1 or feature.dtype != numpy.float64: + raise ValueError("The given feature is not appropriate") + + + def _arrange_data(self, training_files): + """Arranges the data to train the LDA projection matrix""" + data = [] + for client_files in training_files: + # at least two files per client are required! + if len(client_files) < 2: + logger.warn("Skipping one client since the number of client files is only %d", len(client_files)) + continue + data.append(numpy.vstack([feature.flatten() for feature in client_files])) + + # Returns the list of lists of arrays + return data + + + def _train_pca(self, training_set): + """Trains and returns a LinearMachine that is trained using PCA""" + data_list = [feature for client in training_set for feature in client] + data = numpy.vstack(data_list) + + logger.info(" -> Training Linear Machine using PCA") + t = bob.learn.linear.PCATrainer() + machine, eigen_values = t.train(data) + + if isinstance(self.pca_subspace, float): + cummulated = numpy.cumsum(eigen_values) / numpy.sum(eigen_values) + for index in range(len(cummulated)): + if cummulated[index] > self.pca_subspace: + self.pca_subspace = index + break + self.pca_subspace = index + + if self.lda_subspace and self.pca_subspace <= self.lda_subspace: + logger.warn(" ... Extending the PCA subspace dimension from %d to %d", self.pca_subspace, self.lda_subspace + 1) + self.pca_subspace = self.lda_subspace + 1 + else: + logger.info(" ... Limiting PCA subspace to %d dimensions", self.pca_subspace) + + # limit number of pcs + machine.resize(machine.shape[0], self.pca_subspace) + return machine + + + def _perform_pca(self, machine, training_set): + """Perform PCA on data of the training set""" + return [numpy.vstack([machine(feature) for feature in client_features]) for client_features in training_set] + + + def train_projector(self, training_features, projector_file): + """Generates the LDA projection matrix from the given features (that are sorted by identity)""" + # check data + [self._check_feature(feature) for client_features in training_features for feature in client_features] + + # arrange LDA training data + data = self._arrange_data(training_features) + + # train PCA of wanted + if self.pca_subspace: + # train on all training features + pca_machine = self._train_pca(training_features) + # project only the features that are used for training + logger.info(" -> Projecting training data to PCA subspace") + data = self._perform_pca(pca_machine, data) + + logger.info(" -> Training Linear Machine using LDA") + trainer = bob.learn.linear.FisherLDATrainer(strip_to_rank = (self.lda_subspace == 0)) + self.machine, self.variances = trainer.train(data) + if self.lda_subspace: + self.machine.resize(self.machine.shape[0], self.lda_subspace) + self.variances = self.variances.copy() + self.variances.resize(self.lda_subspace) + + if self.pca_subspace: + # compute combined PCA/LDA projection matrix + combined_matrix = numpy.dot(pca_machine.weights, self.machine.weights) + # set new weight matrix (and new mean vector) of novel machine + self.machine = bob.learn.linear.Machine(combined_matrix) + self.machine.input_subtract = pca_machine.input_subtract + + hdf5 = bob.io.base.HDF5File(projector_file, "w") + hdf5.set("Eigenvalues", self.variances) + hdf5.create_group("/Machine") + hdf5.cd("/Machine") + self.machine.save(hdf5) + + + def load_projector(self, projector_file): + """Reads the LDA projection matrix from file""" + # read LDA projector + hdf5 = bob.io.base.HDF5File(projector_file) + self.variances = hdf5.read("Eigenvalues") + hdf5.cd("/Machine") + self.machine = bob.learn.linear.Machine(hdf5) + + + def project(self, feature): + """Projects the data using the stored covariance matrix""" + self._check_feature(feature) + # Projects the data + return self.machine(feature) + + + def enroll(self, enroll_features): + """Enrolls the model by storing all given input vectors""" + assert len(enroll_features) + [self._check_feature(feature) for feature in enroll_features] + # just store all the features + return numpy.vstack(enroll_features) + + + def score(self, model, probe): + """Computes the distance of the model to the probe using the distance function""" + # return the negative distance (as a similarity measure) + if len(model.shape) == 2: + # we have multiple models, so we use the multiple model scoring + return self.score_for_multiple_models(model, probe) + elif self.uses_variances: + # single model, single probe (multiple probes have already been handled) + return self.factor * self.distance_function(model, probe, self.variances) + else: + # single model, single probe (multiple probes have already been handled) + return self.factor * self.distance_function(model, probe) diff --git a/bob/bio/base/algorithm/PCA.py b/bob/bio/base/algorithm/PCA.py index f9141a17518f643c0345ac0c24a581e1522d397c..0866f31985068edd126ec6b8d4349b301c3d6fbd 100644 --- a/bob/bio/base/algorithm/PCA.py +++ b/bob/bio/base/algorithm/PCA.py @@ -47,8 +47,8 @@ class PCA (Algorithm): def _check_feature(self, feature): - """Checks that the features are apropriate""" - if not isinstance(feature, numpy.ndarray) or len(feature.shape) != 1: + """Checks that the features are appropriate""" + if not isinstance(feature, numpy.ndarray) or len(feature.shape) != 1 or feature.dtype != numpy.float64: raise ValueError("The given feature is not appropriate") @@ -103,8 +103,8 @@ class PCA (Algorithm): def enroll(self, enroll_features): """Enrolls the model by storing all given input vectors""" - [self._check_feature(feature) for feature in enroll_features] assert len(enroll_features) + [self._check_feature(feature) for feature in enroll_features] # just store all the features return numpy.vstack(enroll_features) diff --git a/bob/bio/base/algorithm/__init__.py b/bob/bio/base/algorithm/__init__.py index e58905972785d83297046a8247f782185bacf0ea..0866bd1e1cae8c8149e479e2618ae278a1771a58 100644 --- a/bob/bio/base/algorithm/__init__.py +++ b/bob/bio/base/algorithm/__init__.py @@ -1,2 +1,3 @@ from .Algorithm import Algorithm from .PCA import PCA +from .LDA import LDA diff --git a/bob/bio/base/config/algorithm/lda.py b/bob/bio/base/config/algorithm/lda.py new file mode 100644 index 0000000000000000000000000000000000000000..58dff6eda17e65add925dac00e0770b188b0ef3d --- /dev/null +++ b/bob/bio/base/config/algorithm/lda.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python + +import bob.bio.base +import scipy.spatial + +algorithm = bob.bio.base.algorithm.LDA( + subspace_dimension = 50, + distance_function = scipy.spatial.distance.euclidean, + is_distance_function = True +) diff --git a/bob/bio/base/config/algorithm/pca_lda.py b/bob/bio/base/config/algorithm/pca_lda.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf7c4a369693662732954ad3ed1133d3f5a379d --- /dev/null +++ b/bob/bio/base/config/algorithm/pca_lda.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +import bob.bio.base +import scipy.spatial + +algorithm = bob.bio.base.algorithm.LDA( + subspace_dimension = 50, + pca_subspace_dimension = 100, + distance_function = scipy.spatial.distance.euclidean, + is_distance_function = True +) diff --git a/bob/bio/base/test/data/lda_model.hdf5 b/bob/bio/base/test/data/lda_model.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..55b34d75582aac2d9b51898ca1dd5fc7c705a4c1 Binary files /dev/null and b/bob/bio/base/test/data/lda_model.hdf5 differ diff --git a/bob/bio/base/test/data/lda_projected.hdf5 b/bob/bio/base/test/data/lda_projected.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..c5a800eb1fa3f2c8d0aa9abef1b30a86c6edc343 Binary files /dev/null and b/bob/bio/base/test/data/lda_projected.hdf5 differ diff --git a/bob/bio/base/test/data/lda_projector.hdf5 b/bob/bio/base/test/data/lda_projector.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..f3ad8115a6a505d8f2375c5b7d140ca4d38a813c Binary files /dev/null and b/bob/bio/base/test/data/lda_projector.hdf5 differ diff --git a/bob/bio/base/test/test_algorithms.py b/bob/bio/base/test/test_algorithms.py index 609c332ed9dfa0f688fc513f2f5f6e8ad8189387..b9525276b405bb7b0f344ce877c48cb0be20763d 100644 --- a/bob/bio/base/test/test_algorithms.py +++ b/bob/bio/base/test/test_algorithms.py @@ -33,6 +33,8 @@ import sys _mac_os = sys.platform == 'darwin' +import scipy.spatial + import bob.io.base import bob.learn.linear import bob.io.base.test_utils @@ -101,7 +103,7 @@ def test_pca(): assert numpy.allclose(pca1.machine.weights[:,i], pca2.machine.weights[:,i], atol=1e-5) or numpy.allclose(pca1.machine.weights[:,i], - pca2.machine.weights[:,i], atol=1e-5) finally: - os.remove(temp_file) + if os.path.exists(temp_file): os.remove(temp_file) # generate and project random feature feature = utils.random_array(200, 0., 255., seed=84) @@ -120,7 +122,6 @@ def test_pca(): assert abs(pca1.score(model, probe) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (pca1.score(model, probe), reference_score) assert abs(pca1.score_for_multiple_probes(model, [probe, probe]) - reference_score) < 1e-5 - # test the calculation of the subspace dimension based on percentage of variance pca3 = bob.bio.base.algorithm.PCA(.9) try: @@ -131,7 +132,81 @@ def test_pca(): pca3.load_projector(temp_file) assert pca3.machine.shape[1] == 140 finally: - os.remove(temp_file) + if os.path.exists(temp_file): os.remove(temp_file) + + +def test_lda(): + temp_file = bob.io.base.test_utils.temporary_filename() + # assure that the configurations are loadable + lda1 = bob.bio.base.load_resource("lda", "algorithm") + assert isinstance(lda1, bob.bio.base.algorithm.LDA) + assert isinstance(lda1, bob.bio.base.algorithm.Algorithm) + lda2 = bob.bio.base.load_resource("pca+lda", "algorithm") + assert isinstance(lda2, bob.bio.base.algorithm.LDA) + assert isinstance(lda2, bob.bio.base.algorithm.Algorithm) + + assert lda1.performs_projection + assert lda1.requires_projector_training + assert lda1.use_projected_features_for_enrollment + assert lda1.split_training_features_by_client + assert not lda1.requires_enroller_training + + # generate a smaller PCA subspcae + lda3 = bob.bio.base.algorithm.LDA(5, 10, scipy.spatial.distance.seuclidean, True, True) + + # create random training set + train_set = utils.random_training_set_by_id(200, count=20, minimum=0., maximum=255.) + # train the projector + reference_file = pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projector.hdf5') + try: + # train projector + lda3.train_projector(train_set, temp_file) + assert os.path.exists(temp_file) + + if regenerate_refs: shutil.copy(temp_file, reference_file) + + # check projection matrix + lda1.load_projector(reference_file) + lda3.load_projector(temp_file) + + assert numpy.allclose(lda1.variances, lda3.variances, atol=1e-5) + assert lda3.machine.shape == (200, 5) + assert lda1.machine.shape == lda3.machine.shape + # ... rotation direction might change, hence either the sum or the difference should be 0 + for i in range(5): + assert numpy.allclose(lda1.machine.weights[:,i], lda3.machine.weights[:,i], atol=1e-5) or numpy.allclose(lda1.machine.weights[:,i], - lda3.machine.weights[:,i], atol=1e-5) + + finally: + if os.path.exists(temp_file): os.remove(temp_file) + + # generate and project random feature + feature = utils.random_array(200, 0., 255., seed=84) + projected = lda1.project(feature) + assert projected.shape == (5,) + _compare(projected, pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projected.hdf5'), lda1.write_feature, lda1.read_feature) + + # enroll model from random features + enroll = utils.random_training_set(5, 5, 0., 255., seed=21) + model = lda1.enroll(enroll) + _compare(model, pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_model.hdf5'), lda1.write_model, lda1.read_model) + + # compare model with probe + probe = lda1.read_probe(pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projected.hdf5')) + reference_score = -233.30450012 + assert abs(lda1.score(model, probe) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (lda1.score(model, probe), reference_score) + assert abs(lda1.score_for_multiple_probes(model, [probe, probe]) - reference_score) < 1e-5 + + # test the calculation of the subspace dimension based on percentage of variance + lda4 = bob.bio.base.algorithm.LDA(pca_subspace_dimension=.9) + try: + # train projector + lda4.train_projector(train_set, temp_file) + assert os.path.exists(temp_file) + assert lda4.pca_subspace == 132 + lda4.load_projector(temp_file) + assert lda4.machine.shape[1] == 19 + finally: + if os.path.exists(temp_file): os.remove(temp_file) """ @@ -200,72 +275,6 @@ def test_pca(): - def test04_lda(self): - # read input - feature = facereclib.utils.load(self.input_dir('linearize.hdf5')) - # assure that the config file is loadable - tool = self.config('lda') - self.assertTrue(isinstance(tool, facereclib.tools.LDA)) - # assure that the config file is loadable - tool = self.config('pca+lda') - self.assertTrue(isinstance(tool, facereclib.tools.LDA)) - - # here we use a reduced tool, using the scaled Euclidean distance (mahalanobis) from scipy - import scipy.spatial - tool = facereclib.tools.LDA(5, 10, scipy.spatial.distance.seuclidean, True, True) - self.assertTrue(tool.performs_projection) - self.assertTrue(tool.requires_projector_training) - self.assertTrue(tool.use_projected_features_for_enrollment) - self.assertTrue(tool.split_training_features_by_client) - - # train the projector - t = tempfile.mkstemp('pca+lda.hdf5', prefix='frltest_')[1] - tool.train_projector(facereclib.utils.tests.random_training_set_by_id(feature.shape, count=20, minimum=0., maximum=255.), t) - if regenerate_refs: - import shutil - shutil.copy2(t, self.reference_dir('pca+lda_projector.hdf5')) - - # load the projector file - tool.load_projector(self.reference_dir('pca+lda_projector.hdf5')) - # compare the resulting machines - f = bob.io.base.HDF5File(t) - new_variances = f.read("Eigenvalues") - f.cd("/Machine") - new_machine = bob.learn.linear.Machine(f) - del f - self.assertEqual(tool.m_machine.shape, new_machine.shape) - self.assertTrue(numpy.abs(tool.m_variances - new_variances < 1e-5).all()) - # ... rotation direction might change, hence either the sum or the difference should be 0 - for i in range(5): - self.assertTrue(numpy.abs(tool.m_machine.weights[:,i] - new_machine.weights[:,i] < 1e-5).all() or numpy.abs(tool.m_machine.weights[:,i] + new_machine.weights[:,i] < 1e-5).all()) - os.remove(t) - - # project feature - projected = tool.project(feature) - self.compare(projected, 'pca+lda_feature.hdf5') - self.assertTrue(len(projected.shape) == 1) - - # enroll model - model = tool.enroll([projected]) - self.compare(model, 'pca+lda_model.hdf5') - self.assertTrue(model.shape == (1,5)) - - # score - sim = tool.score(model, projected) - self.assertAlmostEqual(sim, 0.) - - # test the calculation of the subspace dimension based on percentage of variance, - # and the usage of a different way to compute the final score in case of multiple features per model - tool = facereclib.tools.LDA(5, .9, multiple_model_scoring = 'median') - tool.train_projector(facereclib.utils.tests.random_training_set_by_id(feature.shape, count=20, minimum=0., maximum=255.), t) - self.assertEqual(tool.m_pca_subspace, 334) - tool.load_projector(t) - os.remove(t) - projected = tool.project(feature) - model = tool.enroll([projected, projected]) - self.assertTrue(model.shape == (2,5)) - self.assertAlmostEqual(tool.score(model, projected), 0.) - self.assertAlmostEqual(tool.score_for_multiple_probes(model, [projected, projected]), 0.) def test05_bic(self): diff --git a/setup.py b/setup.py index 90f3419bde53b6811cc96de83b84522e5dd8b7d5..6a0ee5e782373b23d21c925ad5667f22e8338269 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,8 @@ setup( 'bob.bio.algorithm': [ 'dummy = bob.bio.base.test.dummy.algorithm:algorithm', # for test purposes only 'pca = bob.bio.base.config.algorithm.pca:algorithm', + 'lda = bob.bio.base.config.algorithm.lda:algorithm', + 'pca+lda = bob.bio.base.config.algorithm.lda:algorithm', ], },