Skip to content
Snippets Groups Projects
Commit 6b585852 authored by Manuel Günther's avatar Manuel Günther
Browse files

Added LDA algorithm and tests

parent 8665d114
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
import bob.io.base
import bob.learn.linear
import numpy
import scipy.spatial
from .Algorithm import Algorithm
import logging
logger = logging.getLogger("bob.bio.base")
class LDA (Algorithm):
"""Tool for computing linear discriminant analysis (so-called Fisher faces)"""
def __init__(
self,
lda_subspace_dimension = 0, # if set, the LDA subspace will be truncated to the given number of dimensions; by default it is limited to the number of classes in the training set
pca_subspace_dimension = None, # if set, a PCA subspace truncation is performed before applying LDA; might be integral or float
distance_function = scipy.spatial.distance.euclidean,
is_distance_function = True,
uses_variances = False,
**kwargs # parameters directly sent to the base class
):
"""Initializes the LDA tool with the given configuration"""
# call base class constructor and register that the LDA tool performs projection and need the training features split by client
Algorithm.__init__(
self,
performs_projection = True,
split_training_features_by_client = True,
lda_subspace_dimension = lda_subspace_dimension,
pca_subspace_dimension = pca_subspace_dimension,
distance_function = str(distance_function),
is_distance_function = is_distance_function,
uses_variances = uses_variances,
**kwargs
)
# copy information
self.pca_subspace = pca_subspace_dimension
self.lda_subspace = lda_subspace_dimension
if self.pca_subspace and isinstance(self.pca_subspace, int) and self.lda_subspace and self.pca_subspace < self.lda_subspace:
raise ValueError("The LDA subspace is larger than the PCA subspace size. This won't work properly. Please check your setup!")
self.machine = None
self.distance_function = distance_function
self.factor = -1 if is_distance_function else 1.
self.uses_variances = uses_variances
def _check_feature(self, feature):
"""Checks that the features are appropriate"""
if not isinstance(feature, numpy.ndarray) or len(feature.shape) != 1 or feature.dtype != numpy.float64:
raise ValueError("The given feature is not appropriate")
def _arrange_data(self, training_files):
"""Arranges the data to train the LDA projection matrix"""
data = []
for client_files in training_files:
# at least two files per client are required!
if len(client_files) < 2:
logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
continue
data.append(numpy.vstack([feature.flatten() for feature in client_files]))
# Returns the list of lists of arrays
return data
def _train_pca(self, training_set):
"""Trains and returns a LinearMachine that is trained using PCA"""
data_list = [feature for client in training_set for feature in client]
data = numpy.vstack(data_list)
logger.info(" -> Training Linear Machine using PCA")
t = bob.learn.linear.PCATrainer()
machine, eigen_values = t.train(data)
if isinstance(self.pca_subspace, float):
cummulated = numpy.cumsum(eigen_values) / numpy.sum(eigen_values)
for index in range(len(cummulated)):
if cummulated[index] > self.pca_subspace:
self.pca_subspace = index
break
self.pca_subspace = index
if self.lda_subspace and self.pca_subspace <= self.lda_subspace:
logger.warn(" ... Extending the PCA subspace dimension from %d to %d", self.pca_subspace, self.lda_subspace + 1)
self.pca_subspace = self.lda_subspace + 1
else:
logger.info(" ... Limiting PCA subspace to %d dimensions", self.pca_subspace)
# limit number of pcs
machine.resize(machine.shape[0], self.pca_subspace)
return machine
def _perform_pca(self, machine, training_set):
"""Perform PCA on data of the training set"""
return [numpy.vstack([machine(feature) for feature in client_features]) for client_features in training_set]
def train_projector(self, training_features, projector_file):
"""Generates the LDA projection matrix from the given features (that are sorted by identity)"""
# check data
[self._check_feature(feature) for client_features in training_features for feature in client_features]
# arrange LDA training data
data = self._arrange_data(training_features)
# train PCA of wanted
if self.pca_subspace:
# train on all training features
pca_machine = self._train_pca(training_features)
# project only the features that are used for training
logger.info(" -> Projecting training data to PCA subspace")
data = self._perform_pca(pca_machine, data)
logger.info(" -> Training Linear Machine using LDA")
trainer = bob.learn.linear.FisherLDATrainer(strip_to_rank = (self.lda_subspace == 0))
self.machine, self.variances = trainer.train(data)
if self.lda_subspace:
self.machine.resize(self.machine.shape[0], self.lda_subspace)
self.variances = self.variances.copy()
self.variances.resize(self.lda_subspace)
if self.pca_subspace:
# compute combined PCA/LDA projection matrix
combined_matrix = numpy.dot(pca_machine.weights, self.machine.weights)
# set new weight matrix (and new mean vector) of novel machine
self.machine = bob.learn.linear.Machine(combined_matrix)
self.machine.input_subtract = pca_machine.input_subtract
hdf5 = bob.io.base.HDF5File(projector_file, "w")
hdf5.set("Eigenvalues", self.variances)
hdf5.create_group("/Machine")
hdf5.cd("/Machine")
self.machine.save(hdf5)
def load_projector(self, projector_file):
"""Reads the LDA projection matrix from file"""
# read LDA projector
hdf5 = bob.io.base.HDF5File(projector_file)
self.variances = hdf5.read("Eigenvalues")
hdf5.cd("/Machine")
self.machine = bob.learn.linear.Machine(hdf5)
def project(self, feature):
"""Projects the data using the stored covariance matrix"""
self._check_feature(feature)
# Projects the data
return self.machine(feature)
def enroll(self, enroll_features):
"""Enrolls the model by storing all given input vectors"""
assert len(enroll_features)
[self._check_feature(feature) for feature in enroll_features]
# just store all the features
return numpy.vstack(enroll_features)
def score(self, model, probe):
"""Computes the distance of the model to the probe using the distance function"""
# return the negative distance (as a similarity measure)
if len(model.shape) == 2:
# we have multiple models, so we use the multiple model scoring
return self.score_for_multiple_models(model, probe)
elif self.uses_variances:
# single model, single probe (multiple probes have already been handled)
return self.factor * self.distance_function(model, probe, self.variances)
else:
# single model, single probe (multiple probes have already been handled)
return self.factor * self.distance_function(model, probe)
......@@ -47,8 +47,8 @@ class PCA (Algorithm):
def _check_feature(self, feature):
"""Checks that the features are apropriate"""
if not isinstance(feature, numpy.ndarray) or len(feature.shape) != 1:
"""Checks that the features are appropriate"""
if not isinstance(feature, numpy.ndarray) or len(feature.shape) != 1 or feature.dtype != numpy.float64:
raise ValueError("The given feature is not appropriate")
......@@ -103,8 +103,8 @@ class PCA (Algorithm):
def enroll(self, enroll_features):
"""Enrolls the model by storing all given input vectors"""
[self._check_feature(feature) for feature in enroll_features]
assert len(enroll_features)
[self._check_feature(feature) for feature in enroll_features]
# just store all the features
return numpy.vstack(enroll_features)
......
from .Algorithm import Algorithm
from .PCA import PCA
from .LDA import LDA
#!/usr/bin/env python
import bob.bio.base
import scipy.spatial
algorithm = bob.bio.base.algorithm.LDA(
subspace_dimension = 50,
distance_function = scipy.spatial.distance.euclidean,
is_distance_function = True
)
#!/usr/bin/env python
import bob.bio.base
import scipy.spatial
algorithm = bob.bio.base.algorithm.LDA(
subspace_dimension = 50,
pca_subspace_dimension = 100,
distance_function = scipy.spatial.distance.euclidean,
is_distance_function = True
)
File added
File added
File added
......@@ -33,6 +33,8 @@ import sys
_mac_os = sys.platform == 'darwin'
import scipy.spatial
import bob.io.base
import bob.learn.linear
import bob.io.base.test_utils
......@@ -101,7 +103,7 @@ def test_pca():
assert numpy.allclose(pca1.machine.weights[:,i], pca2.machine.weights[:,i], atol=1e-5) or numpy.allclose(pca1.machine.weights[:,i], - pca2.machine.weights[:,i], atol=1e-5)
finally:
os.remove(temp_file)
if os.path.exists(temp_file): os.remove(temp_file)
# generate and project random feature
feature = utils.random_array(200, 0., 255., seed=84)
......@@ -120,7 +122,6 @@ def test_pca():
assert abs(pca1.score(model, probe) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (pca1.score(model, probe), reference_score)
assert abs(pca1.score_for_multiple_probes(model, [probe, probe]) - reference_score) < 1e-5
# test the calculation of the subspace dimension based on percentage of variance
pca3 = bob.bio.base.algorithm.PCA(.9)
try:
......@@ -131,7 +132,81 @@ def test_pca():
pca3.load_projector(temp_file)
assert pca3.machine.shape[1] == 140
finally:
os.remove(temp_file)
if os.path.exists(temp_file): os.remove(temp_file)
def test_lda():
temp_file = bob.io.base.test_utils.temporary_filename()
# assure that the configurations are loadable
lda1 = bob.bio.base.load_resource("lda", "algorithm")
assert isinstance(lda1, bob.bio.base.algorithm.LDA)
assert isinstance(lda1, bob.bio.base.algorithm.Algorithm)
lda2 = bob.bio.base.load_resource("pca+lda", "algorithm")
assert isinstance(lda2, bob.bio.base.algorithm.LDA)
assert isinstance(lda2, bob.bio.base.algorithm.Algorithm)
assert lda1.performs_projection
assert lda1.requires_projector_training
assert lda1.use_projected_features_for_enrollment
assert lda1.split_training_features_by_client
assert not lda1.requires_enroller_training
# generate a smaller PCA subspcae
lda3 = bob.bio.base.algorithm.LDA(5, 10, scipy.spatial.distance.seuclidean, True, True)
# create random training set
train_set = utils.random_training_set_by_id(200, count=20, minimum=0., maximum=255.)
# train the projector
reference_file = pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projector.hdf5')
try:
# train projector
lda3.train_projector(train_set, temp_file)
assert os.path.exists(temp_file)
if regenerate_refs: shutil.copy(temp_file, reference_file)
# check projection matrix
lda1.load_projector(reference_file)
lda3.load_projector(temp_file)
assert numpy.allclose(lda1.variances, lda3.variances, atol=1e-5)
assert lda3.machine.shape == (200, 5)
assert lda1.machine.shape == lda3.machine.shape
# ... rotation direction might change, hence either the sum or the difference should be 0
for i in range(5):
assert numpy.allclose(lda1.machine.weights[:,i], lda3.machine.weights[:,i], atol=1e-5) or numpy.allclose(lda1.machine.weights[:,i], - lda3.machine.weights[:,i], atol=1e-5)
finally:
if os.path.exists(temp_file): os.remove(temp_file)
# generate and project random feature
feature = utils.random_array(200, 0., 255., seed=84)
projected = lda1.project(feature)
assert projected.shape == (5,)
_compare(projected, pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projected.hdf5'), lda1.write_feature, lda1.read_feature)
# enroll model from random features
enroll = utils.random_training_set(5, 5, 0., 255., seed=21)
model = lda1.enroll(enroll)
_compare(model, pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_model.hdf5'), lda1.write_model, lda1.read_model)
# compare model with probe
probe = lda1.read_probe(pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projected.hdf5'))
reference_score = -233.30450012
assert abs(lda1.score(model, probe) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (lda1.score(model, probe), reference_score)
assert abs(lda1.score_for_multiple_probes(model, [probe, probe]) - reference_score) < 1e-5
# test the calculation of the subspace dimension based on percentage of variance
lda4 = bob.bio.base.algorithm.LDA(pca_subspace_dimension=.9)
try:
# train projector
lda4.train_projector(train_set, temp_file)
assert os.path.exists(temp_file)
assert lda4.pca_subspace == 132
lda4.load_projector(temp_file)
assert lda4.machine.shape[1] == 19
finally:
if os.path.exists(temp_file): os.remove(temp_file)
"""
......@@ -200,72 +275,6 @@ def test_pca():
def test04_lda(self):
# read input
feature = facereclib.utils.load(self.input_dir('linearize.hdf5'))
# assure that the config file is loadable
tool = self.config('lda')
self.assertTrue(isinstance(tool, facereclib.tools.LDA))
# assure that the config file is loadable
tool = self.config('pca+lda')
self.assertTrue(isinstance(tool, facereclib.tools.LDA))
# here we use a reduced tool, using the scaled Euclidean distance (mahalanobis) from scipy
import scipy.spatial
tool = facereclib.tools.LDA(5, 10, scipy.spatial.distance.seuclidean, True, True)
self.assertTrue(tool.performs_projection)
self.assertTrue(tool.requires_projector_training)
self.assertTrue(tool.use_projected_features_for_enrollment)
self.assertTrue(tool.split_training_features_by_client)
# train the projector
t = tempfile.mkstemp('pca+lda.hdf5', prefix='frltest_')[1]
tool.train_projector(facereclib.utils.tests.random_training_set_by_id(feature.shape, count=20, minimum=0., maximum=255.), t)
if regenerate_refs:
import shutil
shutil.copy2(t, self.reference_dir('pca+lda_projector.hdf5'))
# load the projector file
tool.load_projector(self.reference_dir('pca+lda_projector.hdf5'))
# compare the resulting machines
f = bob.io.base.HDF5File(t)
new_variances = f.read("Eigenvalues")
f.cd("/Machine")
new_machine = bob.learn.linear.Machine(f)
del f
self.assertEqual(tool.m_machine.shape, new_machine.shape)
self.assertTrue(numpy.abs(tool.m_variances - new_variances < 1e-5).all())
# ... rotation direction might change, hence either the sum or the difference should be 0
for i in range(5):
self.assertTrue(numpy.abs(tool.m_machine.weights[:,i] - new_machine.weights[:,i] < 1e-5).all() or numpy.abs(tool.m_machine.weights[:,i] + new_machine.weights[:,i] < 1e-5).all())
os.remove(t)
# project feature
projected = tool.project(feature)
self.compare(projected, 'pca+lda_feature.hdf5')
self.assertTrue(len(projected.shape) == 1)
# enroll model
model = tool.enroll([projected])
self.compare(model, 'pca+lda_model.hdf5')
self.assertTrue(model.shape == (1,5))
# score
sim = tool.score(model, projected)
self.assertAlmostEqual(sim, 0.)
# test the calculation of the subspace dimension based on percentage of variance,
# and the usage of a different way to compute the final score in case of multiple features per model
tool = facereclib.tools.LDA(5, .9, multiple_model_scoring = 'median')
tool.train_projector(facereclib.utils.tests.random_training_set_by_id(feature.shape, count=20, minimum=0., maximum=255.), t)
self.assertEqual(tool.m_pca_subspace, 334)
tool.load_projector(t)
os.remove(t)
projected = tool.project(feature)
model = tool.enroll([projected, projected])
self.assertTrue(model.shape == (2,5))
self.assertAlmostEqual(tool.score(model, projected), 0.)
self.assertAlmostEqual(tool.score_for_multiple_probes(model, [projected, projected]), 0.)
def test05_bic(self):
......
......@@ -121,6 +121,8 @@ setup(
'bob.bio.algorithm': [
'dummy = bob.bio.base.test.dummy.algorithm:algorithm', # for test purposes only
'pca = bob.bio.base.config.algorithm.pca:algorithm',
'lda = bob.bio.base.config.algorithm.lda:algorithm',
'pca+lda = bob.bio.base.config.algorithm.lda:algorithm',
],
},
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment