diff --git a/bob/bio/gmm/__init__.py b/bob/bio/gmm/__init__.py index c020e482b2a73cb760aed3551e44f9aa4076c950..24f76d933c43724495ba261ca9288fb3300dae98 100644 --- a/bob/bio/gmm/__init__.py +++ b/bob/bio/gmm/__init__.py @@ -1,4 +1,5 @@ from . import algorithm # noqa: F401 +from . import bioalgorithm # noqa: F401 from . import test # noqa: F401 diff --git a/bob/bio/gmm/algorithm/__init__.py b/bob/bio/gmm/algorithm/__init__.py index 046970ffbfb3d835b81a43ff05ce51ca55bd1d53..fc5f4fecb72afd9081e3de2354ef24dc28d04885 100644 --- a/bob/bio/gmm/algorithm/__init__.py +++ b/bob/bio/gmm/algorithm/__init__.py @@ -1,5 +1,5 @@ -from .GMM import GMM -from .GMM import GMMRegular +# from .GMM import GMM +# from .GMM import GMMRegular from .ISV import ISV from .IVector import IVector from .JFA import JFA @@ -22,8 +22,8 @@ def __appropriate__(*args): __appropriate__( - GMM, - GMMRegular, + # GMM, + # GMMRegular, JFA, ISV, IVector, diff --git a/bob/bio/gmm/bioalgorithm/GMM.py b/bob/bio/gmm/bioalgorithm/GMM.py new file mode 100644 index 0000000000000000000000000000000000000000..23d9ebc519e03f02425ad74d474fdd496062b237 --- /dev/null +++ b/bob/bio/gmm/bioalgorithm/GMM.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : +# Manuel Guenther <Manuel.Guenther@idiap.ch> + +"""Interface between the lower level GMM classes and the Algorithm Transformer. + +Implements the enroll and score methods using the low level GMM implementation. + +This adds the notions of models, probes, enrollment, and scores to GMM. +""" + + +import logging + +from typing import Callable + +import numpy + +from sklearn.base import BaseEstimator + +import bob.core +import bob.io.base + +from bob.bio.base.pipelines.vanilla_biometrics.abstract_classes import BioAlgorithm +from bob.learn.em.mixture import GMMMachine +from bob.learn.em.mixture import GMMStats +from bob.learn.em.mixture import linear_scoring + +logger = logging.getLogger(__name__) + + +class GMM(BioAlgorithm, BaseEstimator): + """Algorithm for computing UBM and Gaussian Mixture Models of the features. + + Features must be normalized to zero mean and unit standard deviation. + + Models are MAP GMM machines trained from a UBM on the enrollment feature set. + + The UBM is a ML GMM machine trained on the training feature set. + + Probes are GMM statistics of features projected on the UBM. + """ + + def __init__( + self, + # parameters for the GMM + number_of_gaussians: int, + # parameters of UBM training + kmeans_training_iterations: int = 25, # Maximum number of iterations for K-Means + ubm_training_iterations: int = 25, # Maximum number of iterations for GMM Training + training_threshold: float = 5e-4, # Threshold to end the ML training + variance_threshold: float = 5e-4, # Minimum value that a variance can reach + update_weights: bool = True, + update_means: bool = True, + update_variances: bool = True, + # parameters of the GMM enrollment + relevance_factor: float = 4, # Relevance factor as described in Reynolds paper + gmm_enroll_iterations: int = 1, # Number of iterations for the enrollment phase + responsibility_threshold: float = 0, # If set, the weight of a particular Gaussian will at least be greater than this threshold. In the case the real weight is lower, the prior mean value will be used to estimate the current mean and variance. + init_seed: int = 5489, + # scoring + scoring_function: Callable = linear_scoring, + # n_threads=None, + ): + """Initializes the local UBM-GMM tool chain. + + Parameters + ---------- + number_of_gaussians + The number of Gaussians used in the UBM and the models. + kmeans_training_iterations + Number of e-m iterations to train k-means initializing the UBM. + ubm_training_iterations + Number of e-m iterations for training the UBM. + training_threshold + Convergence threshold to halt the GMM training early. + variance_threshold + Minimum value a variance of the Gaussians can reach. + update_weights + Decides wether the weights of the Gaussians are updated while training. + update_means + Decides wether the means of the Gaussians are updated while training. + update_variances + Decides wether the variancess of the Gaussians are updated while training. + relevance_factor + Relevance factor as described in Reynolds paper. + gmm_enroll_iterations + Number of iterations for the MAP GMM used for enrollment. + responsibility_threshold + If set, the weight of a particular Gaussian will at least be greater than + this threshold. In the case where the real weight is lower, the prior mean + value will be used to estimate the current mean and variance. + init_seed + Seed for the random number generation. + scoring_function + Function returning a score from a model, a UBM, and a probe. + """ + + # call base class constructor and register that this tool performs projection + # super().__init__(score_reduction_operation=??) + + # copy parameters + self.number_of_gaussians = number_of_gaussians + self.kmeans_training_iterations = kmeans_training_iterations + self.ubm_training_iterations = ubm_training_iterations + self.training_threshold = training_threshold + self.variance_threshold = variance_threshold + self.update_weights = update_weights + self.update_means = update_means + self.update_variances = update_variances + self.relevance_factor = relevance_factor + self.gmm_enroll_iterations = gmm_enroll_iterations + self.init_seed = init_seed + self.rng = bob.core.random.mt19937(self.init_seed) # TODO + self.responsibility_threshold = responsibility_threshold + self.scoring_function = scoring_function + + self.ubm = None + + def _check_feature(self, feature): + """Checks that the features are appropriate""" + if ( + not isinstance(feature, numpy.ndarray) + or feature.ndim != 2 + or feature.dtype != numpy.float64 + ): + raise ValueError("The given feature is not appropriate") + if self.ubm is not None and feature.shape[1] != self.ubm.shape[1]: + raise ValueError( + "The given feature is expected to have %d elements, but it has %d" + % (self.ubm.shape[1], feature.shape[1]) + ) + + ####################################################### + # UBM training # + + def train_ubm(self, array): + + logger.debug(" .... Training UBM with %d feature vectors", array.shape[0]) + + logger.debug(" .... Creating UBM machine") + self.ubm = GMMMachine( + n_gaussians=self.number_of_gaussians, + trainer="ml", + max_fitting_steps=self.ubm_training_iterations, + convergence_threshold=self.training_threshold, + update_means=self.update_means, + update_variances=self.update_variances, + update_weights=self.update_weights, + # TODO more params? + ) + + # Trains the GMM + logger.info(" -> Training UBM GMM") + # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution. + # self.rng = bob.core.random.mt19937(self.init_seed) + self.ubm.fit(array) + + def save_ubm(self, projector_file): + """Saves the projector to file""" + # Saves the UBM to file + logger.debug(" .... Saving model to file '%s'", projector_file) + + hdf5 = ( + projector_file + if isinstance(projector_file, bob.io.base.HDF5File) + else bob.io.base.HDF5File(projector_file, "w") + ) + self.ubm.save(hdf5) + + def train_projector(self, train_features, projector_file): + """Computes the Universal Background Model from the training ("world") data""" + [self._check_feature(feature) for feature in train_features] + + logger.info( + " -> Training UBM model with %d training files", len(train_features) + ) + + # Loads the data into an array + array = numpy.vstack(train_features) + + self.train_ubm(array) + + self.save_ubm(projector_file) + + ####################################################### + # GMM training using UBM # + + def load_ubm(self, ubm_file): + hdf5file = bob.io.base.HDF5File(ubm_file) + # read UBM + self.ubm = GMMMachine.from_hdf5(hdf5file) + self.ubm.variance_thresholds = self.variance_threshold + + def load_projector(self, projector_file): + """Reads the UBM model from file""" + # read UBM + self.load_ubm(projector_file) + # prepare MAP_GMM_Trainer + # kwargs = ( + # dict( + # mean_var_update_responsibilities_threshold=self.responsibility_threshold + # ) + # if self.responsibility_threshold > 0.0 + # else dict() + # ) + # self.enroll_trainer = bob.learn.em.MAP_GMMTrainer( + # self.ubm, + # relevance_factor=self.relevance_factor, + # update_means=True, + # update_variances=False, + # **kwargs + # ) + self.rng = bob.core.random.mt19937(self.init_seed) + + def project_ubm(self, array): + logger.debug(" .... Projecting %d feature vectors", array.shape[0]) + # Accumulates statistics + gmm_stats = GMMStats(self.ubm.shape[0], self.ubm.shape[1]) + self.ubm.acc_statistics(array, gmm_stats) + + # return the resulting statistics + return gmm_stats + + def project(self, feature): + """Computes GMM statistics against a UBM, given an input 2D numpy.ndarray of feature vectors""" + self._check_feature(feature) + return self.project_ubm(feature) + + def read_gmm_stats(self, gmm_stats_file): + """Reads GMM stats from file.""" + return GMMStats.from_hdf5(bob.io.base.HDF5File(gmm_stats_file)) + + def read_feature(self, feature_file): + """Read the type of features that we require, namely GMM_Stats""" + return self.read_gmm_stats(feature_file) + + def write_feature(self, feature, feature_file): + """Write the features (GMM_Stats)""" + return feature.save(feature_file) + + def enroll_gmm(self, array): + logger.debug(" .... Enrolling with %d feature vectors", array.shape[0]) + # TODO responsibility_threshold + gmm = GMMMachine( + n_gaussians=self.number_of_gaussians, + trainer="map", + ubm=self.ubm, + convergence_threshold=self.training_threshold, + max_fitting_steps=self.gmm_enroll_iterations, + random_state=self.rng, # TODO + update_means=True, + update_variances=True, # TODO default? + update_weights=True, # TODO default? + ) + gmm.variance_thresholds = self.variance_threshold + gmm = gmm.fit(array) + return gmm + + def enroll(self, data): + """Enrolls a GMM using MAP adaptation, given a list of 2D numpy.ndarray's of feature vectors""" + [self._check_feature(feature) for feature in data] + array = numpy.vstack(data) + # Use the array to train a GMM and return it + return self.enroll_gmm(array) + + ###################################################### + # Feature comparison # + def read_model(self, model_file): + """Reads the model, which is a GMM machine""" + return GMMMachine.from_hdf5(bob.io.base.HDF5File(model_file)) + + def score(self, biometric_reference: GMMMachine, data: GMMStats): + """Computes the score for the given model and the given probe. + + Uses the scoring function passed during initialization. + + Parameters + ---------- + biometric_reference: + The model to score against. + data: + The probe data to compare to the model. + """ + + assert isinstance(biometric_reference, GMMMachine) # TODO is it a list? + assert isinstance(data, GMMStats) + return self.scoring_function( + models_means=[biometric_reference], + ubm=self.ubm, + test_stats=data, + frame_length_normalisation=True, + )[0, 0] + + def score_multiple_biometric_references( + self, biometric_references: "list[GMMMachine]", data: GMMStats + ): + """Computes the score between multiple models and one probe. + + Uses the scoring function passed during initialization. + + Parameters + ---------- + biometric_references: + The models to score against. + data: + The probe data to compare to the models. + """ + + assert isinstance(biometric_references, GMMMachine) # TODO is it a list? + assert isinstance(data, GMMStats) + return self.scoring_function( + models_means=biometric_references, + ubm=self.ubm, + test_stats=data, + frame_length_normalisation=True, + ) + + # def score_for_multiple_probes(self, model, probes): + # """This function computes the score between the given model and several given probe files.""" + # assert isinstance(model, GMMMachine) + # for probe in probes: + # assert isinstance(probe, GMMStats) + # # logger.warn("Please verify that this function is correct") + # return self.probe_fusion_function( + # self.scoring_function( + # model.means, self.ubm, probes, [], frame_length_normalisation=True + # ) + # ) + + def fit(self, X, y=None, **kwargs): + """Trains the UBM.""" + self.train_ubm(X) + return self + + def transform(self, X, **kwargs): + """Passthrough. Enroll applies a different transform as score.""" + return X + + +class GMMRegular(GMM): + """Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features""" + + def __init__(self, **kwargs): + """Initializes the local UBM-GMM tool chain with the given file selector object""" + # logger.warn("This class must be checked. Please verify that I didn't do any mistake here. I had to rename 'train_projector' into a 'train_enroller'!") + # initialize the UBMGMM base class + GMM.__init__(self, **kwargs) + # register a different set of functions in the Tool base class + BioAlgorithm.__init__( + self, requires_enroller_training=True, performs_projection=False + ) + + ####################################################### + # UBM training # + + def train_enroller(self, train_features, enroller_file): + """Computes the Universal Background Model from the training ("world") data""" + train_features = [feature for client in train_features for feature in client] + return self.train_projector(train_features, enroller_file) + + ####################################################### + # GMM training using UBM # + + def load_enroller(self, enroller_file): + """Reads the UBM model from file""" + return self.load_projector(enroller_file) + + ###################################################### + # Feature comparison # + def score(self, model, probe): + """Computes the score for the given model and the given probe. + The score are Log-Likelihood. + Therefore, the log of the likelihood ratio is obtained by computing the following difference.""" + + assert isinstance(model, GMMMachine) + self._check_feature(probe) + score = sum( + model.log_likelihood(probe[i, :]) - self.ubm.log_likelihood(probe[i, :]) + for i in range(probe.shape[0]) + ) + return score / probe.shape[0] + + def score_for_multiple_probes(self, model, probes): + raise NotImplementedError("Implement Me!") diff --git a/bob/bio/gmm/bioalgorithm/__init__.py b/bob/bio/gmm/bioalgorithm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1b44bce225c6f7fb9b1669d84026827fda7d9e9 --- /dev/null +++ b/bob/bio/gmm/bioalgorithm/__init__.py @@ -0,0 +1,25 @@ +from .GMM import GMM +from .GMM import GMMRegular + + +# gets sphinx autodoc done right - don't remove it +def __appropriate__(*args): + """Says object was actually declared here, and not in the import module. + Fixing sphinx warnings of not being able to find classes, when path is shortened. + Parameters: + + *args: An iterable of objects to modify + + Resolves `Sphinx referencing issues + <https://github.com/sphinx-doc/sphinx/issues/3048>` + """ + + for obj in args: + obj.__module__ = __name__ + + +__appropriate__( + GMM, + GMMRegular, +) +__all__ = [_ for _ in dir() if not _.startswith("_")] diff --git a/bob/bio/gmm/config/bioalgorithm/__init__.py b/bob/bio/gmm/config/bioalgorithm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/bob/bio/gmm/config/bioalgorithm/gmm.py b/bob/bio/gmm/config/bioalgorithm/gmm.py new file mode 100644 index 0000000000000000000000000000000000000000..58aeddd0af491641508afa2770e0d34463103f34 --- /dev/null +++ b/bob/bio/gmm/config/bioalgorithm/gmm.py @@ -0,0 +1,3 @@ +import bob.bio.gmm + +bioalgorithm = bob.bio.gmm.bioalgorithm.GMM(number_of_gaussians=512) diff --git a/bob/bio/gmm/config/bioalgorithm/gmm_regular.py b/bob/bio/gmm/config/bioalgorithm/gmm_regular.py new file mode 100644 index 0000000000000000000000000000000000000000..f7166b534f393405c78422e6a9b97aba4c1107be --- /dev/null +++ b/bob/bio/gmm/config/bioalgorithm/gmm_regular.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python + +import bob.bio.gmm + +bioalgorithm = bob.bio.gmm.bioalgorithm.GMMRegular(number_of_gaussians=512) diff --git a/bob/bio/gmm/test/data/gmm_projected.hdf5 b/bob/bio/gmm/test/data/gmm_projected.hdf5 index 31d930b955098e3ae990c1e2509d2c232d1a86be..ba17796e7d8d0a7374edc3a9ae067447043feedc 100644 Binary files a/bob/bio/gmm/test/data/gmm_projected.hdf5 and b/bob/bio/gmm/test/data/gmm_projected.hdf5 differ diff --git a/bob/bio/gmm/test/data/gmm_projector.hdf5 b/bob/bio/gmm/test/data/gmm_projector.hdf5 index 4c47be97a009e963d25301904a7420eced1b55e9..d39d6920f1e7d335eddb7ce2c2be0ba892aa0541 100644 Binary files a/bob/bio/gmm/test/data/gmm_projector.hdf5 and b/bob/bio/gmm/test/data/gmm_projector.hdf5 differ diff --git a/bob/bio/gmm/test/test_algorithms.py b/bob/bio/gmm/test/test_algorithms.py index 7cb0bb5b5c5052d936285fd1ee49cf510dbf6665..e0375ec2f645077c574c137ba34654561531e435 100644 --- a/bob/bio/gmm/test/test_algorithms.py +++ b/bob/bio/gmm/test/test_algorithms.py @@ -24,6 +24,7 @@ import sys import numpy import pkg_resources +import pytest import bob.bio.gmm import bob.io.base @@ -32,9 +33,9 @@ import bob.learn.linear from bob.bio.base.test import utils -logger = logging.getLogger("bob.bio.gmm") +logger = logging.getLogger(__name__) -regenerate_refs = False +regenerate_refs = True seed_value = 5489 @@ -72,25 +73,30 @@ def _compare_complex( assert numpy.allclose(d, r, atol=1e-5) +@pytest.mark.isolated_gmm def test_gmm(): - temp_file = bob.io.base.test_utils.temporary_filename() + temp_file = ( + "./temptest/test_file" # TODO bob.io.base.test_utils.temporary_filename() + ) gmm1 = bob.bio.base.load_resource( - "gmm", "algorithm", preferred_package="bob.bio.gmm" + "gmm", "bioalgorithm", preferred_package="bob.bio.gmm" ) - assert isinstance(gmm1, bob.bio.gmm.algorithm.GMM) - assert isinstance(gmm1, bob.bio.base.algorithm.Algorithm) - assert gmm1.performs_projection - assert gmm1.requires_projector_training - assert not gmm1.use_projected_features_for_enrollment - assert not gmm1.split_training_features_by_client - assert not gmm1.requires_enroller_training + assert isinstance(gmm1, bob.bio.gmm.bioalgorithm.GMM) + assert isinstance( + gmm1, bob.bio.base.pipelines.vanilla_biometrics.abstract_classes.BioAlgorithm + ) + # assert gmm1.performs_projection + # assert gmm1.requires_projector_training + # assert not gmm1.use_projected_features_for_enrollment + # assert not gmm1.split_training_features_by_client + # assert not gmm1.requires_enroller_training # create smaller GMM object - gmm2 = bob.bio.gmm.algorithm.GMM( + gmm2 = bob.bio.gmm.bioalgorithm.GMM( number_of_gaussians=2, kmeans_training_iterations=1, - gmm_training_iterations=1, - INIT_SEED=seed_value, + ubm_training_iterations=1, + init_seed=seed_value, ) train_data = utils.random_training_set( @@ -120,7 +126,7 @@ def test_gmm(): # generate and project random feature feature = utils.random_array((20, 45), -5.0, 5.0, seed=84) projected = gmm1.project(feature) - assert isinstance(projected, bob.learn.em.GMMStats) + assert isinstance(projected, bob.learn.em.mixture.GMMStats) _compare( projected, pkg_resources.resource_filename("bob.bio.gmm.test", "data/gmm_projected.hdf5"), @@ -131,7 +137,7 @@ def test_gmm(): # enroll model from random features enroll = utils.random_training_set((20, 45), 5, -5.0, 5.0, seed=21) model = gmm1.enroll(enroll) - assert isinstance(model, bob.learn.em.GMMMachine) + assert isinstance(model, bob.learn.em.mixture.GMMMachine) _compare( model, pkg_resources.resource_filename("bob.bio.gmm.test", "data/gmm_model.hdf5"), @@ -159,16 +165,18 @@ def test_gmm_regular(): gmm1 = bob.bio.base.load_resource( "gmm-regular", "algorithm", preferred_package="bob.bio.gmm" ) - assert isinstance(gmm1, bob.bio.gmm.algorithm.GMMRegular) - assert isinstance(gmm1, bob.bio.gmm.algorithm.GMM) - assert isinstance(gmm1, bob.bio.base.algorithm.Algorithm) + assert isinstance(gmm1, bob.bio.gmm.bioalgorithm.GMMRegular) + assert isinstance(gmm1, bob.bio.gmm.bioalgorithm.GMM) + assert isinstance( + gmm1, bob.bio.base.pipelines.vanilla_biometrics.abstract_classes.BioAlgorithm + ) assert not gmm1.performs_projection assert not gmm1.requires_projector_training assert not gmm1.use_projected_features_for_enrollment assert gmm1.requires_enroller_training # create smaller GMM object - gmm2 = bob.bio.gmm.algorithm.GMMRegular( + gmm2 = bob.bio.gmm.bioalgorithm.GMMRegular( number_of_gaussians=2, kmeans_training_iterations=1, gmm_training_iterations=1, @@ -202,7 +210,7 @@ def test_gmm_regular(): # enroll model from random features enroll = utils.random_training_set((20, 45), 5, -5.0, 5.0, seed=21) model = gmm1.enroll(enroll) - assert isinstance(model, bob.learn.em.GMMMachine) + assert isinstance(model, bob.learn.em.mixture.GMMMachine) _compare( model, pkg_resources.resource_filename("bob.bio.gmm.test", "data/gmm_model.hdf5"), diff --git a/setup.py b/setup.py index 3b512f4b9e9c7560b7bca408126910b36e2168b4..a69ee9539aed07b4c1142d3d8e12f4f11791218b 100644 --- a/setup.py +++ b/setup.py @@ -110,6 +110,10 @@ setup( "ivector-plda = bob.bio.gmm.config.algorithm.ivector_plda:algorithm", "ivector-lda-wccn-plda = bob.bio.gmm.config.algorithm.ivector_lda_wccn_plda:algorithm", ], + "bob.bio.bioalgorithm": [ + "gmm = bob.bio.gmm.config.bioalgorithm.gmm:bioalgorithm", + "gmm-regular = bob.bio.gmm.config.bioalgorithm.gmm_regular:bioalgorithm", + ], }, # Classifiers are important if you plan to distribute this package through # PyPI. You can find the complete list of classifiers that are valid and