From ded8a6e9bdca810f34eeedcd44071461c9e50ef7 Mon Sep 17 00:00:00 2001
From: Yannick DAYER <yannick.dayer@idiap.ch>
Date: Tue, 30 Nov 2021 14:29:06 +0100
Subject: [PATCH] Adapt GMMMachine to new KMeansMachine.
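
KMeansMachine now carries its own training configuration, so the
separate KMeansTrainer class is gone. The `k_means_trainer` argument
of GMMMachine (name unchanged) now expects a pre-configured
KMeansMachine, or None to let `fit` create one with `n_gaussians`
clusters and the machine's `random_state`.

Related adjustments:

- `variance_thresholds` returns EPSILON instead of raising when it was
  never set, and setting it no longer re-clamps variances that do not
  exist yet.
- `fit` falls back to unit variances (with a warning) when variances
  were never set, instead of requiring `initialize_gaussians` first.
- The fitting loop breaks on convergence instead of returning early,
  so `compute()` always runs before `fit` returns.

A sketch of an updated call site, mirroring the updated tests
(`n_clusters` and `init_method` are the KMeansMachine keywords the
tests use):

    import numpy as np

    from bob.learn.em.cluster import KMeansMachine
    from bob.learn.em.mixture import GMMMachine

    data = np.array([[1.5, 1], [1, 1.5], [-1, 0.5], [-1.5, 0], [2, 2], [2.5, 2.5]])

    n_gaussians = 3
    # Before: GMMMachine(n_gaussians, k_means_trainer=KMeansTrainer("k-means++"))
    machine = GMMMachine(
        n_gaussians,
        k_means_trainer=KMeansMachine(n_clusters=n_gaussians, init_method="k-means++"),
    )
    machine = machine.fit(data)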

---
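Note for reviewers (below the ---, so not part of the commit message):
the test updates also encode a behavioral change in
`variance_thresholds`: a scalar or 1D value is now stored as-is rather
than broadcast to shape (n_gaussians, n_features). A minimal sketch of
the new behavior on the 2-gaussian, 3-feature machine used in
test_GMMMachine_1:

    gmm.variance_thresholds = 0.005
    # Previously broadcast to np.full((2, 3), 0.005); now kept scalar.
    np.testing.assert_equal(gmm.variance_thresholds, 0.005)
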
 bob/learn/em/data/gmm_ML.hdf5 | Bin 12920 -> 12920 bytes
 bob/learn/em/mixture/gmm.py   |  38 +++++++++++++++++++-------------------
 bob/learn/em/test/test_gmm.py |  28 ++++++++++++++++++++--------
 3 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/bob/learn/em/data/gmm_ML.hdf5 b/bob/learn/em/data/gmm_ML.hdf5
index 0326c186f11ad38387b3a4c4b1e3cc7a66d32b80..20362881b1661a826a8773d1658a8df559099d46 100644
GIT binary patch
delta 186
zcmey7@*`!#9A;*1*=3vOGlw#P7?bBSPn@j4tTcHQTM&r9`9IrIK@bBZvDrdWhY`jB
ztCEt2sLGdCg-HC@Y@A%6nKijdRtc;S#-Ahy<9ANH@M5xtLgVJ;9QEvz4OrMV>uH<=
lYu@}{b1n-=*XH?pFxTi6PF7&nn7qNzYVspCDUc+{ApqybL4p7P

delta 194
zcmey7@*`!#9A;+yxl1<BXAWfoF(%Jvo;X>7S!wbrwjdCH^MAIbf*=M+VzY&$4kL^M
zRwX43QI#*P3X%A)**LjCGi!2_tP)ruj6X>Z#_ybX;l*SPg+{IoHVj~}!DjPpg;Ms(
s1}tow^)$|b&Ds24b1n<WfX(yuU=GtOoTOL9ID2xno+3zQ@_fBQ05Ake9RL6T

diff --git a/bob/learn/em/mixture/gmm.py b/bob/learn/em/mixture/gmm.py
index d17c7ae..65c8137 100644
--- a/bob/learn/em/mixture/gmm.py
+++ b/bob/learn/em/mixture/gmm.py
@@ -13,7 +13,6 @@ import numpy as np
 from sklearn.base import BaseEstimator
 
 from bob.learn.em.cluster import KMeansMachine
-from bob.learn.em.cluster import KMeansTrainer
 
 from h5py import File as HDF5File
 
@@ -274,7 +273,7 @@ class GMMMachine(BaseEstimator):
         max_fitting_steps: Union[int, None] = 200,
         random_state: Union[int, np.random.RandomState] = 0,
         weights: "Union[np.ndarray[('n_gaussians',), float], None]" = None,
-        k_means_trainer: Union[KMeansTrainer, None] = None,
+        k_means_trainer: Union[KMeansMachine, None] = None,
         update_means: bool = True,
         update_variances: bool = False,
         update_weights: bool = False,
@@ -317,7 +316,7 @@ class GMMMachine(BaseEstimator):
-            Ratio for MAP adaptation. Used when `trainer == "map"` and
-            `relevance_factor is None`)
+            Ratio for MAP adaptation. (Used when `trainer == "map"` and
+            `relevance_factor is None`)
         relevance_factor:
-            Factor for the computation of alpha with Reyolds adaptation. (Used when
+            Factor for the computation of alpha with Reynolds adaptation. (Used when
             `trainer == "map"`)
         variance_thresholds:
             The variance flooring thresholds, i.e. the minimum allowed value of variance in each dimension.
@@ -395,14 +394,14 @@ class GMMMachine(BaseEstimator):
     def variances(self, variances: "np.ndarray[('n_gaussians', 'n_features'), float]"):
         self._variances = np.maximum(self.variance_thresholds, variances)
         # Recompute g_norm for each gaussian [array of shape (n_gaussians,)]
-        n_log_2pi = self.variances.shape[-1] * np.log(2 * np.pi)
+        n_log_2pi = self._variances.shape[-1] * np.log(2 * np.pi)
         self._g_norms = np.array(n_log_2pi + np.log(self._variances).sum(axis=-1))
 
     @property
     def variance_thresholds(self):
         """Threshold below which variances are clamped to prevent precision losses."""
         if self._variance_thresholds is None:
-            raise ValueError("GMMMachine variance thresholds were never set.")
+            return EPSILON
         return self._variance_thresholds
 
     @variance_thresholds.setter
@@ -411,7 +410,8 @@ class GMMMachine(BaseEstimator):
         threshold: "Union[float, np.ndarray[('n_gaussians', 'n_features'), float]]",
     ):
         self._variance_thresholds = threshold
-        self.variances = np.maximum(threshold, self.variances)
+        if self._variances is not None:
+            self.variances = np.maximum(threshold, self._variances)
 
     @property
     def g_norms(self):
@@ -542,12 +542,11 @@ class GMMMachine(BaseEstimator):
             if data is None:
                 raise ValueError("Data is required when training with k-means.")
             logger.info("Initializing GMM with k-means.")
-            kmeans_trainer = self.k_means_trainer or KMeansTrainer(
+            kmeans_machine = self.k_means_trainer or KMeansMachine(
+                self.n_gaussians,
                 random_state=self.random_state,
             )
-            kmeans_machine = KMeansMachine(self.n_gaussians).fit(
-                data, trainer=kmeans_trainer
-            )
+            kmeans_machine = kmeans_machine.fit(data)
 
             (
                 variances,
@@ -706,6 +705,10 @@ class GMMMachine(BaseEstimator):
         else:
             logger.debug("GMM means already set. Initialization was not run!")
 
+        if self._variances is None:
+            logger.warning("Variances were not defined before fit. Using variance=1")
+            self.variances = np.ones_like(self.means)
+
         average_output = 0
         logger.info("Training GMM...")
         step = 0
@@ -713,11 +716,7 @@ class GMMMachine(BaseEstimator):
             step += 1
             logger.info(
                 f"Iteration {step:3d}"
-                + (
-                    f"/{self.max_fitting_steps:3d}"
-                    if self.max_fitting_steps is not None
-                    else ""
-                )
+                + (f"/{self.max_fitting_steps:3d}" if self.max_fitting_steps else "")
             )
 
             average_output_previous = average_output
@@ -750,8 +749,9 @@ class GMMMachine(BaseEstimator):
                     and convergence_value <= self.convergence_threshold
                 ):
                     logger.info("Reached convergence threshold. Training stopped.")
-                    return self
-        logger.info("Reached maximum step. Training stopped without convergence.")
+                    break
+        else:
+            logger.info("Reached maximum step. Training stopped without convergence.")
         self.compute()
         return self
 
diff --git a/bob/learn/em/test/test_gmm.py b/bob/learn/em/test/test_gmm.py
index 43a9449..c11d5e1 100644
--- a/bob/learn/em/test/test_gmm.py
+++ b/bob/learn/em/test/test_gmm.py
@@ -22,7 +22,7 @@ from bob.io.base import load as load_array
 from bob.learn.em.mixture import GMMMachine
 from bob.learn.em.mixture import GMMStats
 
-from bob.learn.em.cluster import KMeansTrainer
+from bob.learn.em.cluster import KMeansMachine
 
 def test_GMMStats():
   # Test a GMMStats
@@ -133,11 +133,10 @@ def test_GMMMachine_1():
   # Checks particular varianceThresholds-related methods
   varianceThresholds1D = np.array([0.3, 1, 0.5], "float64")
   gmm.variance_thresholds = varianceThresholds1D
-  np.testing.assert_equal(gmm.variance_thresholds[0,:], varianceThresholds1D)
-  np.testing.assert_equal(gmm.variance_thresholds[1,:], varianceThresholds1D)
+  np.testing.assert_equal(gmm.variance_thresholds, varianceThresholds1D)
 
   gmm.variance_thresholds = 0.005
-  np.testing.assert_equal(gmm.variance_thresholds, np.full((2,3), 0.005))
+  np.testing.assert_equal(gmm.variance_thresholds, 0.005)
 
   gmm.means     = newMeans
   gmm.variances = newVariances
@@ -251,6 +250,7 @@ def test_GMMStats_2():
     machine = GMMMachine(n_gaussians)
 
     machine.means = np.array([[0, 0, 0], [8, 8, 8]])
+    machine.variances = np.ones_like(machine.means)
 
     # Populate the GMMStats
     stats = machine.acc_statistics(data)
@@ -314,6 +314,7 @@ def test_machine_parameters():
     n_features = 2
     machine = GMMMachine(n_gaussians)
     machine.means = np.repeat([[0], [1], [-1]], n_features, 1)
+    machine.variances = np.ones_like(machine.means)
     np.testing.assert_equal(machine.means, np.repeat([[0], [1], [-1]], n_features, 1))
     np.testing.assert_equal(machine.variances, np.ones((n_gaussians, n_features)))
 
@@ -331,7 +332,10 @@ def test_machine_parameters():
 
 def test_kmeans_plusplus_init():
     n_gaussians = 3
-    machine = GMMMachine(n_gaussians, k_means_trainer=KMeansTrainer("k-means++"))
+    machine = GMMMachine(
+        n_gaussians,
+        k_means_trainer=KMeansMachine(n_clusters=n_gaussians, init_method="k-means++"),
+    )
     data = np.array([[1.5, 1], [1, 1.5], [-1, 0.5], [-1.5, 0], [2, 2], [2.5, 2.5]])
     machine = machine.fit(data)
     expected_means = np.array([[2.25, 2.25], [-1.25, 0.25], [1.25, 1.25]])
@@ -342,7 +346,10 @@ def test_kmeans_plusplus_init():
 
 def test_kmeans_parallel_init():
     n_gaussians = 3
-    machine = GMMMachine(n_gaussians, k_means_trainer=KMeansTrainer("k-means||"))
+    machine = GMMMachine(
+        n_gaussians,
+        k_means_trainer=KMeansMachine(n_clusters=n_gaussians, init_method="k-means||"),
+    )
     data = np.array([[1.5, 1], [1, 1.5], [-1, 0.5], [-1.5, 0], [2, 2], [2.5, 2.5]])
     machine = machine.fit(data)
     expected_means = np.array([[1.25, 1.25], [-1.25, 0.25], [2.25, 2.25]])
@@ -356,6 +363,7 @@ def test_likelihood():
     n_gaussians = 3
     machine = GMMMachine(n_gaussians)
     machine.means = np.repeat([[0], [1], [-1]], 3, 1)
+    machine.variances = np.ones_like(machine.means)
     log_likelihood = machine.log_likelihood(data)
     expected_ll = np.array(
         [-3.6519900964986527, -3.83151883210222, -3.83151883210222, -5.344374066745753]
@@ -390,6 +398,7 @@ def test_likelihood_weight():
     n_gaussians = 3
     machine = GMMMachine(n_gaussians)
     machine.means = np.repeat([[0], [1], [-1]], 3, 1)
+    machine.variances = np.ones_like(machine.means)
     machine.weights = [0.6, 0.1, 0.3]
     log_likelihood = machine.log_likelihood(data)
     expected_ll = np.array(
@@ -429,7 +438,7 @@ def test_ml_em():
 
     machine = GMMMachine(n_gaussians, update_means=True, update_variances=True, update_weights=True)
     machine.means = np.repeat([[2], [8]], n_features, 1)
-    machine.initialize_gaussians(None)
+    machine.variances = np.ones_like(machine.means)
 
     stats = machine.e_step( data)
     machine.m_step(stats)
@@ -447,6 +456,7 @@ def test_map_em():
     n_gaussians = 2
     prior_machine = GMMMachine(n_gaussians)
     prior_machine.means = np.array([[2, 2, 2], [8, 8, 8]])
+    prior_machine.variances = np.ones_like(prior_machine.means)
     prior_machine.weights = np.array([0.5, 0.5])
 
     machine = GMMMachine(n_gaussians, trainer="map", ubm=prior_machine,  update_means=True, update_variances=True, update_weights=True)
@@ -483,6 +493,7 @@ def test_ml_transformer():
 
     machine = GMMMachine(n_gaussians, update_means=True, update_variances=True, update_weights=True)
     machine.means = np.array([[2, 2, 2], [8, 8, 8]])
+    machine.variances = np.ones_like(machine.means)
 
     machine = machine.fit(data)
 
@@ -514,6 +525,7 @@ def test_map_transformer():
     n_features = 3
     prior_machine = GMMMachine(n_gaussians)
     prior_machine.means = np.array([[2, 2, 2], [8, 8, 8]])
+    prior_machine.variances = np.ones_like(prior_machine.means)
     prior_machine.weights = np.array([0.5, 0.5])
 
     machine = GMMMachine(n_gaussians, trainer="map", ubm=prior_machine,  update_means=True, update_variances=True, update_weights=True)
@@ -586,7 +598,7 @@ def test_gmm_ML_1():
 
 
 def test_gmm_ML_2():
-    """Trains a GMMMachine with ML_GMMTrainer; compares to an old reference"""
+    """Trains a GMMMachine with ML_GMMTrainer; compares to a reference"""
     ar = load_array(resource_filename("bob.learn.em", "data/dataNormalized.hdf5"))
 
     # Initialize GMMMachine
-- 
GitLab