Commit d779abda authored by Yannick DAYER

Removed unused method left from old API.

parent a0ec1e4f
1 merge request: !26 Python implementation of GMM
@@ -14,7 +14,8 @@ import logging
 from typing import Callable
-import numpy
+import dask.array as da
+import numpy as np
 from sklearn.base import BaseEstimator
@@ -28,6 +29,8 @@ from bob.learn.em.mixture import linear_scoring
 logger = logging.getLogger(__name__)
+# from bob.pipelines import ToDaskBag  # Used when switching from samples to da.Array
 
 class GMM(BioAlgorithm, BaseEstimator):
     """Algorithm for computing UBM and Gaussian Mixture Models of the features.
@@ -111,109 +114,50 @@ class GMM(BioAlgorithm, BaseEstimator):
         self.relevance_factor = relevance_factor
         self.gmm_enroll_iterations = gmm_enroll_iterations
         self.init_seed = init_seed
-        self.rng = bob.core.random.mt19937(self.init_seed)  # TODO
+        self.rng = self.init_seed  # TODO verify if rng object needed
         self.responsibility_threshold = responsibility_threshold
         self.scoring_function = scoring_function
         self.ubm = None
+        super().__init__()
 
     def _check_feature(self, feature):
         """Checks that the features are appropriate"""
         if (
-            not isinstance(feature, numpy.ndarray)
+            not isinstance(feature, np.ndarray)
             or feature.ndim != 2
-            or feature.dtype != numpy.float64
+            or feature.dtype != np.float64
         ):
-            raise ValueError("The given feature is not appropriate")
+            raise ValueError(f"The given feature is not appropriate: \n{feature}")
         if self.ubm is not None and feature.shape[1] != self.ubm.shape[1]:
             raise ValueError(
                 "The given feature is expected to have %d elements, but it has %d"
                 % (self.ubm.shape[1], feature.shape[1])
             )
 
-    #######################################################
-    # UBM training #
-    def train_ubm(self, array):
-        logger.debug(" .... Training UBM with %d feature vectors", array.shape[0])
-        logger.debug(" .... Creating UBM machine")
-        self.ubm = GMMMachine(
-            n_gaussians=self.number_of_gaussians,
-            trainer="ml",
-            max_fitting_steps=self.ubm_training_iterations,
-            convergence_threshold=self.training_threshold,
-            update_means=self.update_means,
-            update_variances=self.update_variances,
-            update_weights=self.update_weights,
-            # TODO more params?
-        )
-        # Trains the GMM
-        logger.info(" -> Training UBM GMM")
-        # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution.
-        # self.rng = bob.core.random.mt19937(self.init_seed)
-        self.ubm.fit(array)
-
-    def save_ubm(self, projector_file):
+    def save_ubm(self, ubm_file):
         """Saves the projector to file"""
         # Saves the UBM to file
-        logger.debug(" .... Saving model to file '%s'", projector_file)
+        logger.debug("Saving model to file '%s'", ubm_file)
         hdf5 = (
-            projector_file
-            if isinstance(projector_file, bob.io.base.HDF5File)
-            else bob.io.base.HDF5File(projector_file, "w")
+            ubm_file
+            if isinstance(ubm_file, bob.io.base.HDF5File)
+            else bob.io.base.HDF5File(ubm_file, "w")
         )
         self.ubm.save(hdf5)
 
-    def train_projector(self, train_features, projector_file):
-        """Computes the Universal Background Model from the training ("world") data"""
-        [self._check_feature(feature) for feature in train_features]
-        logger.info(
-            " -> Training UBM model with %d training files", len(train_features)
-        )
-        # Loads the data into an array
-        array = numpy.vstack(train_features)
-        self.train_ubm(array)
-        self.save_ubm(projector_file)
-
-    #######################################################
-    # GMM training using UBM #
     def load_ubm(self, ubm_file):
         hdf5file = bob.io.base.HDF5File(ubm_file)
+        logger.debug("Loading model from file '%s'", ubm_file)
         # read UBM
         self.ubm = GMMMachine.from_hdf5(hdf5file)
         self.ubm.variance_thresholds = self.variance_threshold
 
-    def load_projector(self, projector_file):
-        """Reads the UBM model from file"""
-        # read UBM
-        self.load_ubm(projector_file)
-        # prepare MAP_GMM_Trainer
-        # kwargs = (
-        #     dict(
-        #         mean_var_update_responsibilities_threshold=self.responsibility_threshold
-        #     )
-        #     if self.responsibility_threshold > 0.0
-        #     else dict()
-        # )
-        # self.enroll_trainer = bob.learn.em.MAP_GMMTrainer(
-        #     self.ubm,
-        #     relevance_factor=self.relevance_factor,
-        #     update_means=True,
-        #     update_variances=False,
-        #     **kwargs
-        # )
-        self.rng = bob.core.random.mt19937(self.init_seed)
-
-    def project_ubm(self, array):
+    def project(self, array):
+        """Computes GMM statistics against a UBM, given a 2D array of feature vectors"""
+        self._check_feature(array)
         logger.debug(" .... Projecting %d feature vectors", array.shape[0])
         # Accumulates statistics
         gmm_stats = GMMStats(self.ubm.shape[0], self.ubm.shape[1])
@@ -222,25 +166,21 @@ class GMM(BioAlgorithm, BaseEstimator):
         # return the resulting statistics
         return gmm_stats
 
-    def project(self, feature):
-        """Computes GMM statistics against a UBM, given an input 2D numpy.ndarray of feature vectors"""
-        self._check_feature(feature)
-        return self.project_ubm(feature)
-
-    def read_gmm_stats(self, gmm_stats_file):
-        """Reads GMM stats from file."""
-        return GMMStats.from_hdf5(bob.io.base.HDF5File(gmm_stats_file))
-
     def read_feature(self, feature_file):
         """Read the type of features that we require, namely GMM_Stats"""
-        return self.read_gmm_stats(feature_file)
+        return GMMStats.from_hdf5(bob.io.base.HDF5File(feature_file))
 
     def write_feature(self, feature, feature_file):
         """Write the features (GMM_Stats)"""
         return feature.save(feature_file)
 
-    def enroll_gmm(self, array):
+    def enroll(self, data):
+        """Enrolls a GMM using MAP adaptation, given a list of 2D np.ndarray's of feature vectors"""
+        [self._check_feature(feature) for feature in data]
+        array = np.vstack(data)
+        # Use the array to train a GMM and return it
         logger.debug(" .... Enrolling with %d feature vectors", array.shape[0])
         # TODO responsibility_threshold
         gmm = GMMMachine(
             n_gaussians=self.number_of_gaussians,
@@ -248,7 +188,7 @@ class GMM(BioAlgorithm, BaseEstimator):
             ubm=self.ubm,
             convergence_threshold=self.training_threshold,
             max_fitting_steps=self.gmm_enroll_iterations,
-            random_state=self.rng,  # TODO
+            random_state=self.rng,
             update_means=True,
             update_variances=True,  # TODO default?
             update_weights=True,  # TODO default?
@@ -257,15 +197,6 @@ class GMM(BioAlgorithm, BaseEstimator):
         gmm = gmm.fit(array)
         return gmm
 
-    def enroll(self, data):
-        """Enrolls a GMM using MAP adaptation, given a list of 2D numpy.ndarray's of feature vectors"""
-        [self._check_feature(feature) for feature in data]
-        array = numpy.vstack(data)
-        # Use the array to train a GMM and return it
-        return self.enroll_gmm(array)
-
-    ######################################################
-    # Feature comparison #
     def read_model(self, model_file):
         """Reads the model, which is a GMM machine"""
         return GMMMachine.from_hdf5(bob.io.base.HDF5File(model_file), ubm=self.ubm)
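For context, a minimal sketch of how the refactored enrollment API is meant to be driven, based only on what the hunks above show (a list of 2D float64 feature arrays, with the UBM already trained or loaded). The helper name and the way the GMM instance is obtained are assumptions, not part of this commit.

import numpy as np


def enroll_client(algo, client_features):
    """Hypothetical helper: MAP-enroll one client with the refactored API.

    `algo` is assumed to be an instance of the GMM class from this diff whose
    UBM is already available (via algo.fit(...) or algo.load_ubm(...)).
    """
    # enroll() checks each array (2D, float64), stacks them with np.vstack and
    # MAP-adapts a GMMMachine initialized from the UBM.
    features = [np.asarray(f, dtype=np.float64) for f in client_features]
    return algo.enroll(features)  # returns the adapted GMMMachine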
@@ -287,12 +218,13 @@ class GMM(BioAlgorithm, BaseEstimator):
             The probe data to compare to the model.
         """
+        # import ipdb; ipdb.set_trace()
         assert isinstance(biometric_reference, GMMMachine)
-        assert isinstance(data, GMMStats)
+        stats = self.project(data)
         return self.scoring_function(
             models_means=[biometric_reference],
             ubm=self.ubm,
-            test_stats=data,
+            test_stats=stats,
             frame_length_normalization=True,
         )[0, 0]
@@ -311,12 +243,14 @@ class GMM(BioAlgorithm, BaseEstimator):
             The probe data to compare to the models.
         """
-        assert isinstance(biometric_references, GMMMachine)
-        assert isinstance(data, GMMStats)
+        assert isinstance(biometric_references[0], GMMMachine), type(
+            biometric_references[0]
+        )
+        stats = self.project(data)
         return self.scoring_function(
             models_means=biometric_references,
             ubm=self.ubm,
-            test_stats=data,
+            test_stats=stats,
             frame_length_normalization=True,
         )
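The two scoring hunks above change the probe handling: instead of asserting that the probe is already a GMMStats object, the raw probe features are projected internally (stats = self.project(data)). A hedged sketch of the resulting call pattern follows; the name of the enclosing method (the BioAlgorithm-style score()) is not visible in these hunks and is an assumption.

import numpy as np


def score_probe(algo, enrolled_model, probe_features):
    """Hypothetical helper: score one raw probe against one enrolled GMMMachine."""
    # The algorithm now projects the 2D float64 probe to GMMStats itself and
    # passes the statistics, together with the UBM, to self.scoring_function.
    probe = np.asarray(probe_features, dtype=np.float64)
    return algo.score(enrolled_model, probe)  # single comparison score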
@@ -326,65 +260,86 @@ class GMM(BioAlgorithm, BaseEstimator):
         for probe in probes:
             assert isinstance(probe, GMMStats)
         # logger.warn("Please verify that this function is correct")
-        return self.scoring_function(
-            models_means=model.means,
-            ubm=self.ubm,
-            test_stats=probes,
-            frame_length_normalization=True,
-        ).mean()
+        return (
+            self.scoring_function(
+                models_means=model.means,
+                ubm=self.ubm,
+                test_stats=probes,
+                frame_length_normalization=True,
+            )
+            .mean()
+            .reshape((-1,))
+        )
 
     def fit(self, X, y=None, **kwargs):
         """Trains the UBM."""
-        self.train_ubm(X)
-        return self
-
-    def transform(self, X, **kwargs):
-        """Passthrough. Enroll applies a different transform as score."""
-        return X
-
-
-class GMMRegular(GMM):
-    """Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features"""
-
-    def __init__(self, **kwargs):
-        """Initializes the local UBM-GMM tool chain with the given file selector object"""
-        # logger.warn("This class must be checked. Please verify that I didn't do any mistake here. I had to rename 'train_projector' into a 'train_enroller'!")
-        # initialize the UBMGMM base class
-        GMM.__init__(self, **kwargs)
-        # register a different set of functions in the Tool base class
-        BioAlgorithm.__init__(
-            self, requires_enroller_training=True, performs_projection=False
-        )
-
-    #######################################################
-    # UBM training #
-    def train_enroller(self, train_features, enroller_file):
-        """Computes the Universal Background Model from the training ("world") data"""
-        train_features = [feature for client in train_features for feature in client]
-        return self.train_projector(train_features, enroller_file)
-
-    #######################################################
-    # GMM training using UBM #
-    def load_enroller(self, enroller_file):
-        """Reads the UBM model from file"""
-        return self.load_projector(enroller_file)
-
-    ######################################################
-    # Feature comparison #
-    def score(self, model, probe):
-        """Computes the score for the given model and the given probe.
-        The score are Log-Likelihood.
-        Therefore, the log of the likelihood ratio is obtained by computing the following difference."""
-        assert isinstance(model, GMMMachine)
-        self._check_feature(probe)
-        score = sum(
-            model.log_likelihood(probe[i, :]) - self.ubm.log_likelihood(probe[i, :])
-            for i in range(probe.shape[0])
+        # TODO: Delayed to dask array
+        # def delayed_to_xr_dataset(delayed, meta=None):
+        #     """Converts one dask.delayed object to a dask.array"""
+        #     if meta is None:
+        #         meta = np.array(delayed.data.compute())
+        #         print(meta.shape)
+        #     darray = da.from_delayed(delayed.data, meta.shape, dtype=meta.dtype, name=False)
+        #     return darray, meta
+
+        # def delayed_samples_to_dask_arrays(delayed_samples, meta=None):
+        #     output = []
+        #     for ds in delayed_samples:
+        #         d_array, meta = delayed_to_xr_dataset(ds, meta)
+        #         output.append(d_array)
+        #     return output, meta
+
+        # def delayeds_to_xr_dataset(delayeds, meta=None):
+        #     """Converts a set of dask.delayed to a list of dask.array"""
+        #     output = []
+        #     for d in delayeds:
+        #         d_array, meta = delayed_samples_to_dask_arrays(d, meta)
+        #         output.extend(d_array)
+        #     return output
+
+        # import ipdb; ipdb.set_trace()
+
+        # bags = ToDaskBag(npartitions=10).transform(X)
+
+        # delayeds = bags.to_delayed()
+        # lengths = bags.map_partitions(lambda samples: [len(samples)]).compute()
+        # for l, d in zip(lengths, delayeds):
+        #     d._length = l
+        # array_data = da.from_delayed(delayeds, shape=(2,-1,60))
+        # array_data = da.stack(delayeds_to_xr_dataset(delayeds))
+
+        # Stack all the samples in a 2D array of features
+        array = da.vstack(X)
+
+        logger.debug("UBM with %d feature vectors", array.shape[0])
+
+        logger.debug(f"Creating UBM machine with {self.number_of_gaussians} gaussians")
+
+        self.ubm = GMMMachine(
+            n_gaussians=self.number_of_gaussians,
+            trainer="ml",
+            max_fitting_steps=self.ubm_training_iterations,
+            convergence_threshold=self.training_threshold,
+            update_means=self.update_means,
+            update_variances=self.update_variances,
+            update_weights=self.update_weights,
+            # TODO more params?
         )
-        return score / probe.shape[0]
-
-    def score_for_multiple_probes(self, model, probes):
-        raise NotImplementedError("Implement Me!")
+        # Trains the GMM
+        logger.info("Training UBM GMM")
+        # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution.
+        # self.rng = bob.core.random.mt19937(self.init_seed)
+        self.ubm.fit(array)
+        return self
+
+    def transform(self, X, **kwargs):
+        """Passthrough. Enroll applies a different transform as score."""
+        # The idea would be to apply the projection in Transform (going from extracted
+        # to GMMStats), but we must not apply this during the training (fit requires
+        # extracted data directly).
+        # `project` is applied in the score function directly.
+        return X
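The rewritten fit() above stacks all training arrays into one 2D dask array with da.vstack before ML-training the UBM. A minimal sketch of that stacking step, with shapes chosen purely for illustration:

import dask.array as da
import numpy as np

# Three illustrative "files" of 2D float64 features (frames x dimensions).
X = [np.random.rand(100, 60), np.random.rand(80, 60), np.random.rand(120, 60)]

# fit() performs the same stacking before handing the result to GMMMachine.fit:
array = da.vstack(X)   # lazy 2D dask array of shape (300, 60)
print(array.shape[0])  # 300 feature vectors; chunks are only computed when needed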
 from .GMM import GMM
-from .GMM import GMMRegular
 
 # gets sphinx autodoc done right - don't remove it
@@ -20,6 +19,5 @@ def __appropriate__(*args):
 __appropriate__(
     GMM,
-    GMMRegular,
 )
 __all__ = [_ for _ in dir() if not _.startswith("_")]
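The package __init__ keeps the usual bob-style __appropriate__ call so that Sphinx autodoc attributes GMM to this module. The body of __appropriate__ is not part of this diff; the conventional implementation in bob packages looks roughly like the following sketch (an assumption based on the comment and the call, not on this commit):

def __appropriate__(*args):
    """Assumed conventional helper: mark the given objects as declared in this
    module so Sphinx autodoc documents them here rather than in the submodule
    they were imported from."""
    for obj in args:
        obj.__module__ = __name__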