From d779abda5702b5b4194aa44b2a0625d61da05a84 Mon Sep 17 00:00:00 2001
From: Yannick DAYER <yannick.dayer@idiap.ch>
Date: Mon, 8 Nov 2021 13:32:27 +0100
Subject: [PATCH] Removed unused methods left over from the old API.
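
The legacy projector/enroller entry points (train_ubm, train_projector,
load_projector, project_ubm, read_gmm_stats, enroll_gmm) and the GMMRegular
class are removed. UBM training now happens in fit(), MAP adaptation in
enroll(), and the score functions project raw probe features to GMMStats
themselves, since transform() is a passthrough.

Rough usage sketch of the remaining API (the constructor arguments and the toy
feature shapes are illustrative assumptions, not part of this patch):

    import numpy as np
    from bob.bio.gmm.bioalgorithm import GMM

    gmm = GMM(number_of_gaussians=16)
    train = [np.random.rand(50, 60) for _ in range(10)]  # toy 2D float64 features
    gmm.fit(train)                                        # ML training of the UBM
    model = gmm.enroll([np.random.rand(50, 60)])          # MAP adaptation against the UBM
    score = gmm.score(model, np.random.rand(50, 60))      # probe is projected internally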

---
 bob/bio/gmm/bioalgorithm/GMM.py      | 251 +++++++++++----------------
 bob/bio/gmm/bioalgorithm/__init__.py |   2 -
 2 files changed, 103 insertions(+), 150 deletions(-)

diff --git a/bob/bio/gmm/bioalgorithm/GMM.py b/bob/bio/gmm/bioalgorithm/GMM.py
index 7b6d7cb..05110e7 100644
--- a/bob/bio/gmm/bioalgorithm/GMM.py
+++ b/bob/bio/gmm/bioalgorithm/GMM.py
@@ -14,7 +14,8 @@ import logging
 
 from typing import Callable
 
-import numpy
+import dask.array as da
+import numpy as np
 
 from sklearn.base import BaseEstimator
 
@@ -28,6 +29,8 @@ from bob.learn.em.mixture import linear_scoring
 
 logger = logging.getLogger(__name__)
 
+# from bob.pipelines import ToDaskBag  # Used when switching from samples to da.Array
+
 
 class GMM(BioAlgorithm, BaseEstimator):
     """Algorithm for computing UBM and Gaussian Mixture Models of the features.
@@ -111,109 +114,50 @@ class GMM(BioAlgorithm, BaseEstimator):
         self.relevance_factor = relevance_factor
         self.gmm_enroll_iterations = gmm_enroll_iterations
         self.init_seed = init_seed
-        self.rng = bob.core.random.mt19937(self.init_seed)  # TODO
+        self.rng = self.init_seed  # TODO verify if rng object needed
         self.responsibility_threshold = responsibility_threshold
         self.scoring_function = scoring_function
 
         self.ubm = None
 
+        super().__init__()
+
     def _check_feature(self, feature):
         """Checks that the features are appropriate"""
         if (
-            not isinstance(feature, numpy.ndarray)
+            not isinstance(feature, np.ndarray)
             or feature.ndim != 2
-            or feature.dtype != numpy.float64
+            or feature.dtype != np.float64
         ):
-            raise ValueError("The given feature is not appropriate")
+            raise ValueError(f"The given feature is not appropriate: \n{feature}")
         if self.ubm is not None and feature.shape[1] != self.ubm.shape[1]:
             raise ValueError(
                 "The given feature is expected to have %d elements, but it has %d"
                 % (self.ubm.shape[1], feature.shape[1])
             )
 
-    #######################################################
-    #                UBM training                         #
-
-    def train_ubm(self, array):
-
-        logger.debug(" .... Training UBM with %d feature vectors", array.shape[0])
-
-        logger.debug(" .... Creating UBM machine")
-        self.ubm = GMMMachine(
-            n_gaussians=self.number_of_gaussians,
-            trainer="ml",
-            max_fitting_steps=self.ubm_training_iterations,
-            convergence_threshold=self.training_threshold,
-            update_means=self.update_means,
-            update_variances=self.update_variances,
-            update_weights=self.update_weights,
-            # TODO more params?
-        )
-
-        # Trains the GMM
-        logger.info("  -> Training UBM GMM")
-        # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution.
-        # self.rng = bob.core.random.mt19937(self.init_seed)
-        self.ubm.fit(array)
-
-    def save_ubm(self, projector_file):
+    def save_ubm(self, ubm_file):
         """Saves the projector to file"""
         # Saves the UBM to file
-        logger.debug(" .... Saving model to file '%s'", projector_file)
+        logger.debug("Saving model to file '%s'", ubm_file)
 
         hdf5 = (
-            projector_file
-            if isinstance(projector_file, bob.io.base.HDF5File)
-            else bob.io.base.HDF5File(projector_file, "w")
+            ubm_file
+            if isinstance(ubm_file, bob.io.base.HDF5File)
+            else bob.io.base.HDF5File(ubm_file, "w")
         )
         self.ubm.save(hdf5)
 
-    def train_projector(self, train_features, projector_file):
-        """Computes the Universal Background Model from the training ("world") data"""
-        [self._check_feature(feature) for feature in train_features]
-
-        logger.info(
-            "  -> Training UBM model with %d training files", len(train_features)
-        )
-
-        # Loads the data into an array
-        array = numpy.vstack(train_features)
-
-        self.train_ubm(array)
-
-        self.save_ubm(projector_file)
-
-    #######################################################
-    #              GMM training using UBM                 #
-
     def load_ubm(self, ubm_file):
         hdf5file = bob.io.base.HDF5File(ubm_file)
+        logger.debug("Loading model from file '%s'", ubm_file)
         # read UBM
         self.ubm = GMMMachine.from_hdf5(hdf5file)
         self.ubm.variance_thresholds = self.variance_threshold
 
-    def load_projector(self, projector_file):
-        """Reads the UBM model from file"""
-        # read UBM
-        self.load_ubm(projector_file)
-        # prepare MAP_GMM_Trainer
-        # kwargs = (
-        #     dict(
-        #         mean_var_update_responsibilities_threshold=self.responsibility_threshold
-        #     )
-        #     if self.responsibility_threshold > 0.0
-        #     else dict()
-        # )
-        # self.enroll_trainer = bob.learn.em.MAP_GMMTrainer(
-        #     self.ubm,
-        #     relevance_factor=self.relevance_factor,
-        #     update_means=True,
-        #     update_variances=False,
-        #     **kwargs
-        # )
-        self.rng = bob.core.random.mt19937(self.init_seed)
-
-    def project_ubm(self, array):
+    def project(self, array):
+        """Computes GMM statistics against a UBM, given a 2D array of feature vectors"""
+        self._check_feature(array)
         logger.debug(" .... Projecting %d feature vectors", array.shape[0])
         # Accumulates statistics
         gmm_stats = GMMStats(self.ubm.shape[0], self.ubm.shape[1])
@@ -222,25 +166,21 @@ class GMM(BioAlgorithm, BaseEstimator):
         # return the resulting statistics
         return gmm_stats
 
-    def project(self, feature):
-        """Computes GMM statistics against a UBM, given an input 2D numpy.ndarray of feature vectors"""
-        self._check_feature(feature)
-        return self.project_ubm(feature)
-
-    def read_gmm_stats(self, gmm_stats_file):
-        """Reads GMM stats from file."""
-        return GMMStats.from_hdf5(bob.io.base.HDF5File(gmm_stats_file))
-
     def read_feature(self, feature_file):
         """Read the type of features that we require, namely GMM_Stats"""
-        return self.read_gmm_stats(feature_file)
+        return GMMStats.from_hdf5(bob.io.base.HDF5File(feature_file))
 
     def write_feature(self, feature, feature_file):
         """Write the features (GMM_Stats)"""
         return feature.save(feature_file)
 
-    def enroll_gmm(self, array):
+    def enroll(self, data):
+        """Enrolls a GMM using MAP adaptation, given a list of 2D arrays (np.ndarray) of feature vectors"""
+        [self._check_feature(feature) for feature in data]
+        array = np.vstack(data)
+        # Use the array to train a GMM and return it
         logger.debug(" .... Enrolling with %d feature vectors", array.shape[0])
+
         # TODO responsibility_threshold
         gmm = GMMMachine(
             n_gaussians=self.number_of_gaussians,
@@ -248,7 +188,7 @@ class GMM(BioAlgorithm, BaseEstimator):
             ubm=self.ubm,
             convergence_threshold=self.training_threshold,
             max_fitting_steps=self.gmm_enroll_iterations,
-            random_state=self.rng,  # TODO
+            random_state=self.rng,
             update_means=True,
             update_variances=True,  # TODO default?
             update_weights=True,  # TODO default?
@@ -257,15 +197,6 @@ class GMM(BioAlgorithm, BaseEstimator):
         gmm = gmm.fit(array)
         return gmm
 
-    def enroll(self, data):
-        """Enrolls a GMM using MAP adaptation, given a list of 2D numpy.ndarray's of feature vectors"""
-        [self._check_feature(feature) for feature in data]
-        array = numpy.vstack(data)
-        # Use the array to train a GMM and return it
-        return self.enroll_gmm(array)
-
-    ######################################################
-    #                Feature comparison                  #
     def read_model(self, model_file):
         """Reads the model, which is a GMM machine"""
         return GMMMachine.from_hdf5(bob.io.base.HDF5File(model_file), ubm=self.ubm)
@@ -287,12 +218,13 @@ class GMM(BioAlgorithm, BaseEstimator):
             The probe data to compare to the model.
         """
 
         assert isinstance(biometric_reference, GMMMachine)
-        assert isinstance(data, GMMStats)
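+        # transform() is a passthrough, so `data` arrives here as raw features;
+        # project it against the UBM to obtain the GMMStats expected by the scoring function.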
+        stats = self.project(data)
         return self.scoring_function(
             models_means=[biometric_reference],
             ubm=self.ubm,
-            test_stats=data,
+            test_stats=stats,
             frame_length_normalization=True,
         )[0, 0]
 
@@ -311,12 +243,14 @@ class GMM(BioAlgorithm, BaseEstimator):
             The probe data to compare to the models.
         """
 
-        assert isinstance(biometric_references, GMMMachine)
-        assert isinstance(data, GMMStats)
+        assert isinstance(biometric_references[0], GMMMachine), type(
+            biometric_references[0]
+        )
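+        # As in `score`, the probe arrives as raw features and is projected to GMMStats here.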
+        stats = self.project(data)
         return self.scoring_function(
             models_means=biometric_references,
             ubm=self.ubm,
-            test_stats=data,
+            test_stats=stats,
             frame_length_normalization=True,
         )
 
@@ -326,65 +260,86 @@ class GMM(BioAlgorithm, BaseEstimator):
         for probe in probes:
             assert isinstance(probe, GMMStats)
         #    logger.warn("Please verify that this function is correct")
-        return self.scoring_function(
-            models_means=model.means,
-            ubm=self.ubm,
-            test_stats=probes,
-            frame_length_normalization=True,
-        ).mean()
+        return (
+            self.scoring_function(
+                models_means=model.means,
+                ubm=self.ubm,
+                test_stats=probes,
+                frame_length_normalization=True,
+            )
+            .mean()
+            .reshape((-1,))
+        )
 
     def fit(self, X, y=None, **kwargs):
         """Trains the UBM."""
-        self.train_ubm(X)
-        return self
+        # TODO: convert dask Delayed samples to a dask array (see the commented-out sketch below)
 
-    def transform(self, X, **kwargs):
-        """Passthrough. Enroll applies a different transform as score."""
-        return X
+        # def delayed_to_xr_dataset(delayed, meta=None):
+        #     """Converts one dask.delayed object to a dask.array"""
+        #     if meta is None:
+        #         meta = np.array(delayed.data.compute())
+        #         print(meta.shape)
 
+        #     darray = da.from_delayed(delayed.data, meta.shape, dtype=meta.dtype, name=False)
+        #     return darray, meta
 
-class GMMRegular(GMM):
-    """Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features"""
+        # def delayed_samples_to_dask_arrays(delayed_samples, meta=None):
+        #     output = []
+        #     for ds in delayed_samples:
+        #         d_array, meta = delayed_to_xr_dataset(ds, meta)
+        #         output.append(d_array)
+        #     return output, meta
 
-    def __init__(self, **kwargs):
-        """Initializes the local UBM-GMM tool chain with the given file selector object"""
-        #    logger.warn("This class must be checked. Please verify that I didn't do any mistake here. I had to rename 'train_projector' into a 'train_enroller'!")
-        # initialize the UBMGMM base class
-        GMM.__init__(self, **kwargs)
-        # register a different set of functions in the Tool base class
-        BioAlgorithm.__init__(
-            self, requires_enroller_training=True, performs_projection=False
-        )
+        # def delayeds_to_xr_dataset(delayeds, meta=None):
+        #     """Converts a set of dask.delayed to a list of dask.array"""
+        #     output = []
+        #     for d in delayeds:
+        #         d_array, meta = delayed_samples_to_dask_arrays(d, meta)
+        #         output.extend(d_array)
+        #     return output
 
-    #######################################################
-    #                UBM training                         #
 
-    def train_enroller(self, train_features, enroller_file):
-        """Computes the Universal Background Model from the training ("world") data"""
-        train_features = [feature for client in train_features for feature in client]
-        return self.train_projector(train_features, enroller_file)
+        # bags = ToDaskBag(npartitions=10).transform(X)
 
-    #######################################################
-    #              GMM training using UBM                 #
+        # delayeds = bags.to_delayed()
+        # lengths = bags.map_partitions(lambda samples: [len(samples)]).compute()
+        # for l, d in zip(lengths, delayeds):
+        #     d._length = l
+        # array_data = da.from_delayed(delayeds, shape=(2,-1,60))
+        # array_data = da.stack(delayeds_to_xr_dataset(delayeds))
 
-    def load_enroller(self, enroller_file):
-        """Reads the UBM model from file"""
-        return self.load_projector(enroller_file)
+        # Stack all the samples in a 2D array of features
+        array = da.vstack(X)
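+        # Note: da.vstack builds a lazy dask array; GMMMachine.fit below is expected
+        # to handle (and compute) it when training runs.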
 
-    ######################################################
-    #                Feature comparison                  #
-    def score(self, model, probe):
-        """Computes the score for the given model and the given probe.
-        The score are Log-Likelihood.
-        Therefore, the log of the likelihood ratio is obtained by computing the following difference."""
+        logger.debug("Training UBM with %d feature vectors", array.shape[0])
 
-        assert isinstance(model, GMMMachine)
-        self._check_feature(probe)
-        score = sum(
-            model.log_likelihood(probe[i, :]) - self.ubm.log_likelihood(probe[i, :])
-            for i in range(probe.shape[0])
+        logger.debug("Creating UBM machine with %d Gaussians", self.number_of_gaussians)
+
+        self.ubm = GMMMachine(
+            n_gaussians=self.number_of_gaussians,
+            trainer="ml",
+            max_fitting_steps=self.ubm_training_iterations,
+            convergence_threshold=self.training_threshold,
+            update_means=self.update_means,
+            update_variances=self.update_variances,
+            update_weights=self.update_weights,
+            # TODO more params?
         )
-        return score / probe.shape[0]
 
-    def score_for_multiple_probes(self, model, probes):
-        raise NotImplementedError("Implement Me!")
+        # Trains the GMM
+        logger.info("Training UBM GMM")
+        # Resetting the pseudo random number generator so we can have the same initialization for serial and parallel execution.
+        # self.rng = bob.core.random.mt19937(self.init_seed)
+        self.ubm.fit(array)
+
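+        # Returning self follows the scikit-learn estimator convention, so fit() can be chained in pipelines.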
+        return self
+
+    def transform(self, X, **kwargs):
+        """Passthrough. Enroll applies a different transform than score."""
+        # The idea would be to apply the projection in Transform (going from extracted
+        # to GMMStats), but we must not apply this during the training (fit requires
+        # extracted data directly).
+        # `project` is applied in the score function directly.
+        return X
diff --git a/bob/bio/gmm/bioalgorithm/__init__.py b/bob/bio/gmm/bioalgorithm/__init__.py
index e1b44bc..cf76a6b 100644
--- a/bob/bio/gmm/bioalgorithm/__init__.py
+++ b/bob/bio/gmm/bioalgorithm/__init__.py
@@ -1,5 +1,4 @@
 from .GMM import GMM
-from .GMM import GMMRegular
 
 
 # gets sphinx autodoc done right - don't remove it
@@ -20,6 +19,5 @@ def __appropriate__(*args):
 
 __appropriate__(
     GMM,
-    GMMRegular,
 )
 __all__ = [_ for _ in dir() if not _.startswith("_")]
-- 
GitLab