From d779abda5702b5b4194aa44b2a0625d61da05a84 Mon Sep 17 00:00:00 2001 From: Yannick DAYER <yannick.dayer@idiap.ch> Date: Mon, 8 Nov 2021 13:32:27 +0100 Subject: [PATCH] Removed unused method left from old API. --- bob/bio/gmm/bioalgorithm/GMM.py | 251 +++++++++++---------------- bob/bio/gmm/bioalgorithm/__init__.py | 2 - 2 files changed, 103 insertions(+), 150 deletions(-) diff --git a/bob/bio/gmm/bioalgorithm/GMM.py b/bob/bio/gmm/bioalgorithm/GMM.py index 7b6d7cb..05110e7 100644 --- a/bob/bio/gmm/bioalgorithm/GMM.py +++ b/bob/bio/gmm/bioalgorithm/GMM.py @@ -14,7 +14,8 @@ import logging from typing import Callable -import numpy +import dask.array as da +import numpy as np from sklearn.base import BaseEstimator @@ -28,6 +29,8 @@ from bob.learn.em.mixture import linear_scoring logger = logging.getLogger(__name__) +# from bob.pipelines import ToDaskBag # Used when switching from samples to da.Array + class GMM(BioAlgorithm, BaseEstimator): """Algorithm for computing UBM and Gaussian Mixture Models of the features. @@ -111,109 +114,50 @@ class GMM(BioAlgorithm, BaseEstimator): self.relevance_factor = relevance_factor self.gmm_enroll_iterations = gmm_enroll_iterations self.init_seed = init_seed - self.rng = bob.core.random.mt19937(self.init_seed) # TODO + self.rng = self.init_seed # TODO verify if rng object needed self.responsibility_threshold = responsibility_threshold self.scoring_function = scoring_function self.ubm = None + super().__init__() + def _check_feature(self, feature): """Checks that the features are appropriate""" if ( - not isinstance(feature, numpy.ndarray) + not isinstance(feature, np.ndarray) or feature.ndim != 2 - or feature.dtype != numpy.float64 + or feature.dtype != np.float64 ): - raise ValueError("The given feature is not appropriate") + raise ValueError(f"The given feature is not appropriate: \n{feature}") if self.ubm is not None and feature.shape[1] != self.ubm.shape[1]: raise ValueError( "The given feature is expected to have %d elements, but it has %d" % (self.ubm.shape[1], feature.shape[1]) ) - ####################################################### - # UBM training # - - def train_ubm(self, array): - - logger.debug(" .... Training UBM with %d feature vectors", array.shape[0]) - - logger.debug(" .... Creating UBM machine") - self.ubm = GMMMachine( - n_gaussians=self.number_of_gaussians, - trainer="ml", - max_fitting_steps=self.ubm_training_iterations, - convergence_threshold=self.training_threshold, - update_means=self.update_means, - update_variances=self.update_variances, - update_weights=self.update_weights, - # TODO more params? - ) - - # Trains the GMM - logger.info(" -> Training UBM GMM") - # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution. - # self.rng = bob.core.random.mt19937(self.init_seed) - self.ubm.fit(array) - - def save_ubm(self, projector_file): + def save_ubm(self, ubm_file): """Saves the projector to file""" # Saves the UBM to file - logger.debug(" .... Saving model to file '%s'", projector_file) + logger.debug("Saving model to file '%s'", ubm_file) hdf5 = ( - projector_file - if isinstance(projector_file, bob.io.base.HDF5File) - else bob.io.base.HDF5File(projector_file, "w") + ubm_file + if isinstance(ubm_file, bob.io.base.HDF5File) + else bob.io.base.HDF5File(ubm_file, "w") ) self.ubm.save(hdf5) - def train_projector(self, train_features, projector_file): - """Computes the Universal Background Model from the training ("world") data""" - [self._check_feature(feature) for feature in train_features] - - logger.info( - " -> Training UBM model with %d training files", len(train_features) - ) - - # Loads the data into an array - array = numpy.vstack(train_features) - - self.train_ubm(array) - - self.save_ubm(projector_file) - - ####################################################### - # GMM training using UBM # - def load_ubm(self, ubm_file): hdf5file = bob.io.base.HDF5File(ubm_file) + logger.debug("Loading model from file '%s'", ubm_file) # read UBM self.ubm = GMMMachine.from_hdf5(hdf5file) self.ubm.variance_thresholds = self.variance_threshold - def load_projector(self, projector_file): - """Reads the UBM model from file""" - # read UBM - self.load_ubm(projector_file) - # prepare MAP_GMM_Trainer - # kwargs = ( - # dict( - # mean_var_update_responsibilities_threshold=self.responsibility_threshold - # ) - # if self.responsibility_threshold > 0.0 - # else dict() - # ) - # self.enroll_trainer = bob.learn.em.MAP_GMMTrainer( - # self.ubm, - # relevance_factor=self.relevance_factor, - # update_means=True, - # update_variances=False, - # **kwargs - # ) - self.rng = bob.core.random.mt19937(self.init_seed) - - def project_ubm(self, array): + def project(self, array): + """Computes GMM statistics against a UBM, given a 2D array of feature vectors""" + self._check_feature(array) logger.debug(" .... Projecting %d feature vectors", array.shape[0]) # Accumulates statistics gmm_stats = GMMStats(self.ubm.shape[0], self.ubm.shape[1]) @@ -222,25 +166,21 @@ class GMM(BioAlgorithm, BaseEstimator): # return the resulting statistics return gmm_stats - def project(self, feature): - """Computes GMM statistics against a UBM, given an input 2D numpy.ndarray of feature vectors""" - self._check_feature(feature) - return self.project_ubm(feature) - - def read_gmm_stats(self, gmm_stats_file): - """Reads GMM stats from file.""" - return GMMStats.from_hdf5(bob.io.base.HDF5File(gmm_stats_file)) - def read_feature(self, feature_file): """Read the type of features that we require, namely GMM_Stats""" - return self.read_gmm_stats(feature_file) + return GMMStats.from_hdf5(bob.io.base.HDF5File(feature_file)) def write_feature(self, feature, feature_file): """Write the features (GMM_Stats)""" return feature.save(feature_file) - def enroll_gmm(self, array): + def enroll(self, data): + """Enrolls a GMM using MAP adaptation, given a list of 2D np.ndarray's of feature vectors""" + [self._check_feature(feature) for feature in data] + array = np.vstack(data) + # Use the array to train a GMM and return it logger.debug(" .... Enrolling with %d feature vectors", array.shape[0]) + # TODO responsibility_threshold gmm = GMMMachine( n_gaussians=self.number_of_gaussians, @@ -248,7 +188,7 @@ class GMM(BioAlgorithm, BaseEstimator): ubm=self.ubm, convergence_threshold=self.training_threshold, max_fitting_steps=self.gmm_enroll_iterations, - random_state=self.rng, # TODO + random_state=self.rng, update_means=True, update_variances=True, # TODO default? update_weights=True, # TODO default? @@ -257,15 +197,6 @@ class GMM(BioAlgorithm, BaseEstimator): gmm = gmm.fit(array) return gmm - def enroll(self, data): - """Enrolls a GMM using MAP adaptation, given a list of 2D numpy.ndarray's of feature vectors""" - [self._check_feature(feature) for feature in data] - array = numpy.vstack(data) - # Use the array to train a GMM and return it - return self.enroll_gmm(array) - - ###################################################### - # Feature comparison # def read_model(self, model_file): """Reads the model, which is a GMM machine""" return GMMMachine.from_hdf5(bob.io.base.HDF5File(model_file), ubm=self.ubm) @@ -287,12 +218,13 @@ class GMM(BioAlgorithm, BaseEstimator): The probe data to compare to the model. """ + # import ipdb; ipdb.set_trace() assert isinstance(biometric_reference, GMMMachine) - assert isinstance(data, GMMStats) + stats = self.project(data) return self.scoring_function( models_means=[biometric_reference], ubm=self.ubm, - test_stats=data, + test_stats=stats, frame_length_normalization=True, )[0, 0] @@ -311,12 +243,14 @@ class GMM(BioAlgorithm, BaseEstimator): The probe data to compare to the models. """ - assert isinstance(biometric_references, GMMMachine) - assert isinstance(data, GMMStats) + assert isinstance(biometric_references[0], GMMMachine), type( + biometric_references[0] + ) + stats = self.project(data) return self.scoring_function( models_means=biometric_references, ubm=self.ubm, - test_stats=data, + test_stats=stats, frame_length_normalization=True, ) @@ -326,65 +260,86 @@ class GMM(BioAlgorithm, BaseEstimator): for probe in probes: assert isinstance(probe, GMMStats) # logger.warn("Please verify that this function is correct") - return self.scoring_function( - models_means=model.means, - ubm=self.ubm, - test_stats=probes, - frame_length_normalization=True, - ).mean() + return ( + self.scoring_function( + models_means=model.means, + ubm=self.ubm, + test_stats=probes, + frame_length_normalization=True, + ) + .mean() + .reshape((-1,)) + ) def fit(self, X, y=None, **kwargs): """Trains the UBM.""" - self.train_ubm(X) - return self + # TODO: Delayed to dask array - def transform(self, X, **kwargs): - """Passthrough. Enroll applies a different transform as score.""" - return X + # def delayed_to_xr_dataset(delayed, meta=None): + # """Converts one dask.delayed object to a dask.array""" + # if meta is None: + # meta = np.array(delayed.data.compute()) + # print(meta.shape) + # darray = da.from_delayed(delayed.data, meta.shape, dtype=meta.dtype, name=False) + # return darray, meta -class GMMRegular(GMM): - """Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features""" + # def delayed_samples_to_dask_arrays(delayed_samples, meta=None): + # output = [] + # for ds in delayed_samples: + # d_array, meta = delayed_to_xr_dataset(ds, meta) + # output.append(d_array) + # return output, meta - def __init__(self, **kwargs): - """Initializes the local UBM-GMM tool chain with the given file selector object""" - # logger.warn("This class must be checked. Please verify that I didn't do any mistake here. I had to rename 'train_projector' into a 'train_enroller'!") - # initialize the UBMGMM base class - GMM.__init__(self, **kwargs) - # register a different set of functions in the Tool base class - BioAlgorithm.__init__( - self, requires_enroller_training=True, performs_projection=False - ) + # def delayeds_to_xr_dataset(delayeds, meta=None): + # """Converts a set of dask.delayed to a list of dask.array""" + # output = [] + # for d in delayeds: + # d_array, meta = delayed_samples_to_dask_arrays(d, meta) + # output.extend(d_array) + # return output - ####################################################### - # UBM training # + # import ipdb; ipdb.set_trace() - def train_enroller(self, train_features, enroller_file): - """Computes the Universal Background Model from the training ("world") data""" - train_features = [feature for client in train_features for feature in client] - return self.train_projector(train_features, enroller_file) + # bags = ToDaskBag(npartitions=10).transform(X) - ####################################################### - # GMM training using UBM # + # delayeds = bags.to_delayed() + # lengths = bags.map_partitions(lambda samples: [len(samples)]).compute() + # for l, d in zip(lengths, delayeds): + # d._length = l + # array_data = da.from_delayed(delayeds, shape=(2,-1,60)) + # array_data = da.stack(delayeds_to_xr_dataset(delayeds)) - def load_enroller(self, enroller_file): - """Reads the UBM model from file""" - return self.load_projector(enroller_file) + # Stack all the samples in a 2D array of features + array = da.vstack(X) - ###################################################### - # Feature comparison # - def score(self, model, probe): - """Computes the score for the given model and the given probe. - The score are Log-Likelihood. - Therefore, the log of the likelihood ratio is obtained by computing the following difference.""" + logger.debug("UBM with %d feature vectors", array.shape[0]) - assert isinstance(model, GMMMachine) - self._check_feature(probe) - score = sum( - model.log_likelihood(probe[i, :]) - self.ubm.log_likelihood(probe[i, :]) - for i in range(probe.shape[0]) + logger.debug(f"Creating UBM machine with {self.number_of_gaussians} gaussians") + + self.ubm = GMMMachine( + n_gaussians=self.number_of_gaussians, + trainer="ml", + max_fitting_steps=self.ubm_training_iterations, + convergence_threshold=self.training_threshold, + update_means=self.update_means, + update_variances=self.update_variances, + update_weights=self.update_weights, + # TODO more params? ) - return score / probe.shape[0] - def score_for_multiple_probes(self, model, probes): - raise NotImplementedError("Implement Me!") + # Trains the GMM + logger.info("Training UBM GMM") + # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution. + # self.rng = bob.core.random.mt19937(self.init_seed) + self.ubm.fit(array) + + return self + + def transform(self, X, **kwargs): + """Passthrough. Enroll applies a different transform as score.""" + # The idea would be to apply the projection in Transform (going from extracted + # to GMMStats), but we must not apply this during the training (fit requires + # extracted data directly). + # `project` is applied in the score function directly. + return X diff --git a/bob/bio/gmm/bioalgorithm/__init__.py b/bob/bio/gmm/bioalgorithm/__init__.py index e1b44bc..cf76a6b 100644 --- a/bob/bio/gmm/bioalgorithm/__init__.py +++ b/bob/bio/gmm/bioalgorithm/__init__.py @@ -1,5 +1,4 @@ from .GMM import GMM -from .GMM import GMMRegular # gets sphinx autodoc done right - don't remove it @@ -20,6 +19,5 @@ def __appropriate__(*args): __appropriate__( GMM, - GMMRegular, ) __all__ = [_ for _ in dir() if not _.startswith("_")] -- GitLab