From 665363a76b426732e20ec15978fd8b834e5b94a3 Mon Sep 17 00:00:00 2001
From: Tiago Freitas Pereira <tiagofrepereira@gmail.com>
Date: Mon, 14 Feb 2022 16:04:25 +0100
Subject: [PATCH] Making linear models work with numpy and dask

---
 bob/learn/em/linear/wccn.py      | 33 ++++++++++++-------
 bob/learn/em/linear/whitening.py | 20 ++++++++----
 bob/learn/em/test/test_linear.py | 54 +++++++++++++++++++++++++-------
 3 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/bob/learn/em/linear/wccn.py b/bob/learn/em/linear/wccn.py
index b0c86f4..5d0dbaa 100644
--- a/bob/learn/em/linear/wccn.py
+++ b/bob/learn/em/linear/wccn.py
@@ -1,11 +1,11 @@
 from sklearn.base import BaseEstimator
 from sklearn.base import TransformerMixin
 
-# import numpy as np
-from scipy.linalg import inv, cholesky
+
+import dask
+
+# Dask doesn't have an implementation for `pinv`
 from scipy.linalg import pinv
-import dask.array as da
-import numpy as np
 
 
 class WCCN(TransformerMixin, BaseEstimator):
@@ -37,23 +37,34 @@ class WCCN(TransformerMixin, BaseEstimator):
     def __init__(self, pinv=False):
         self.pinv = pinv
 
-    def fit(self, X: da.Array, y: list):
+    def fit(self, X, y):
+
+        # Backend follows X's type. NOTE(review): `import dask` may not load dask.array
+        if isinstance(X, dask.array.Array):
+            import dask.array as numerical_module
+            from dask.array.linalg import inv, cholesky
+        else:
+            import numpy as numerical_module
+            from scipy.linalg import inv, cholesky
 
         possible_labels = set(y)
-        y_ = np.array(y)
+        y_ = numerical_module.array(y)
 
         n_classes = len(possible_labels)
 
         # 1. compute the means for each label
-        mu_l = da.array(
-            [da.mean(X[np.where(y_ == l)[0]], axis=0) for l in possible_labels]
+        mu_l = numerical_module.array(
+            [
+                numerical_module.mean(X[numerical_module.where(y_ == l)[0]], axis=0)
+                for l in possible_labels
+            ]
         )
 
         # 2. Compute Sw
-        Sw = da.zeros((X.shape[1], X.shape[1]), dtype=float)
+        Sw = numerical_module.zeros((X.shape[1], X.shape[1]), dtype=float)
 
         for l in possible_labels:
-            indexes = np.where(y_ == l)[0]
+            indexes = numerical_module.where(y_ == l)[0]
             X_l_mu_l = X[indexes] - mu_l[l]
 
             Sw += X_l_mu_l.T @ X_l_mu_l
@@ -71,7 +82,7 @@ class WCCN(TransformerMixin, BaseEstimator):
 
         return self
 
-    def transform(self, X: da.Array):
+    def transform(self, X):
 
         return ((X - self.input_subtract) / self.input_divide) @ self.weights
 
diff --git a/bob/learn/em/linear/whitening.py b/bob/learn/em/linear/whitening.py
index b44084b..dd10460 100644
--- a/bob/learn/em/linear/whitening.py
+++ b/bob/learn/em/linear/whitening.py
@@ -1,9 +1,8 @@
 from sklearn.base import BaseEstimator
 from sklearn.base import TransformerMixin
 import numpy as np
-from dask.array.linalg import inv, cholesky
 from scipy.linalg import pinv
-import dask.array as da
+import dask
 
 
 class Whitening(TransformerMixin, BaseEstimator):
@@ -43,10 +42,19 @@ class Whitening(TransformerMixin, BaseEstimator):
     def __init__(self, pinv: bool = False):
         self.pinv = pinv
 
-    def fit(self, X: da.Array, y=None):
+    def fit(self, X, y=None):
+        # Backend follows X's type. NOTE(review): `import dask` may not load dask.array
+        if isinstance(X, dask.array.Array):
+            import dask.array as numerical_module
+            from dask.array.linalg import inv, cholesky
+
+        else:
+            import numpy as numerical_module
+            from scipy.linalg import inv, cholesky
+
         # 1. Computes the mean vector and the covariance matrix of the training set
-        mu = da.mean(X, axis=0)
-        cov = da.cov(X.T)
+        mu = numerical_module.mean(X, axis=0)
+        cov = numerical_module.cov(X.T)
 
         # 2. Computes the inverse of the covariance matrix
         inv_cov = pinv(cov) if self.pinv else inv(cov)
@@ -60,7 +68,7 @@ class Whitening(TransformerMixin, BaseEstimator):
 
         return self
 
-    def transform(self, X: da.Array):
+    def transform(self, X):
         return ((X - self.input_subtract) / self.input_divide) @ self.weights
 
     def _more_tags(self):
diff --git a/bob/learn/em/test/test_linear.py b/bob/learn/em/test/test_linear.py
index f858af0..98ad1fb 100644
--- a/bob/learn/em/test/test_linear.py
+++ b/bob/learn/em/test/test_linear.py
@@ -18,10 +18,16 @@ from bob.learn.em.linear import (
 )
 
 
-def test_whitening_py():
+def run_whitening(with_dask):
+
+    # Pick the array backend (dask or numpy) requested via `with_dask`
+    if with_dask:
+        import dask.array as numerical_module
+    else:
+        import numpy as numerical_module
 
     # Tests our Whitening extractor.
-    data = da.array(
+    data = numerical_module.array(
         [
             [1.2622, -1.6443, 0.1889],
             [0.4286, -0.8922, 1.3020],
@@ -31,18 +37,20 @@ def test_whitening_py():
             [0.4301, 0.4886, -0.1456],
         ]
     )
-    sample = da.array([1, 2, 3.0])
+    sample = numerical_module.array([1, 2, 3.0])
 
     # Expected results (from matlab)
-    mean_ref = da.array([0.096324163333333, -0.465965438333333, 0.366839091666667])
-    whit_ref = da.array(
+    mean_ref = numerical_module.array(
+        [0.096324163333333, -0.465965438333333, 0.366839091666667]
+    )
+    whit_ref = numerical_module.array(
         [
             [1.608410253685985, 0, 0],
             [1.079813355720326, 1.411083365535711, 0],
             [0.693459921529905, 0.571417184139332, 1.800117179839927],
         ]
     )
-    sample_whitened_ref = da.array(
+    sample_whitened_ref = numerical_module.array(
         [5.942255453628436, 4.984316201643742, 4.739998188373740]
     )
 
@@ -70,10 +78,16 @@ def test_whitening_py():
     assert np.allclose(s2, sample_whitened_ref, eps, eps)
 
 
-def test_wccn_py():
+def run_wccn(with_dask):
+
+    # Pick the array backend (dask or numpy) requested via `with_dask`
+    if with_dask:
+        import dask.array as numerical_module
+    else:
+        import numpy as numerical_module
 
     # Tests our Whitening extractor.
-    X = da.array(
+    X = numerical_module.array(
         [
             [1.2622, -1.6443, 0.1889],
             [0.4286, -0.8922, 1.3020],
@@ -85,18 +99,18 @@ def test_wccn_py():
     )
     y = [0, 0, 1, 1, 2, 2]
 
-    sample = da.array([1, 2, 3.0])
+    sample = numerical_module.array([1, 2, 3.0])
 
     # Expected results
-    mean_ref = da.array([0.0, 0.0, 0.0])
-    weight_ref = da.array(
+    mean_ref = numerical_module.array([0.0, 0.0, 0.0])
+    weight_ref = numerical_module.array(
         [
             [15.8455444, 0.0, 0.0],
             [-10.7946764, 2.87942129, 0.0],
             [18.76762201, -2.19719292, 2.1505817],
         ]
     )
-    sample_wccn_ref = da.array([50.55905765, -0.83273618, 6.45174511])
+    sample_wccn_ref = numerical_module.array([50.55905765, -0.83273618, 6.45174511])
 
     # Runs WCCN (first method)
     t = WCCN()
@@ -118,3 +132,19 @@ def test_wccn_py():
     assert np.allclose(t.input_subtract, mean_ref, eps, eps)
     assert np.allclose(t.weights, weight_ref, eps, eps)
     assert np.allclose(s2, sample_wccn_ref, eps, eps)
+
+
+def test_wccn_numpy():
+    run_wccn(with_dask=False)
+
+
+def test_wccn_dask():
+    run_wccn(with_dask=True)
+
+
+def test_whitening_numpy():
+    run_whitening(with_dask=False)
+
+
+def test_whitening_dask():
+    run_whitening(with_dask=True)
-- 
GitLab