diff --git a/bob/learn/em/__init__.py b/bob/learn/em/__init__.py index 0c1ebcb69060792fa85da77ac7572330821e6133..879f33b9d854ad026bea3dad175dd52599a93a3e 100644 --- a/bob/learn/em/__init__.py +++ b/bob/learn/em/__init__.py @@ -1,4 +1,5 @@ import bob.extension +from .mixture import linear_scoring def get_config(): diff --git a/bob/learn/em/cluster/k_means.py b/bob/learn/em/cluster/k_means.py index 2969aa6b2dcc7467ee5f0240d625cd75fa57e0a2..72da1f18a9527fffede1fddfa43eb5862f618629 100644 --- a/bob/learn/em/cluster/k_means.py +++ b/bob/learn/em/cluster/k_means.py @@ -157,12 +157,13 @@ class KMeansMachine(BaseEstimator): weights = weights_count / weights_count.sum() # Accumulate + dask_compatible_eye = np.eye(n_cluster) * np.array(1, like=data) means_sum = np.sum( - np.eye(n_cluster)[closest_centroid_indices][:, :, None] * data[:, None], + dask_compatible_eye[closest_centroid_indices][:, :, None] * data[:, None], axis=0, ) variances_sum = np.sum( - np.eye(n_cluster)[closest_centroid_indices][:, :, None] + dask_compatible_eye[closest_centroid_indices][:, :, None] * (data[:, None] ** 2), axis=0, ) diff --git a/bob/learn/em/test/test_kmeans.py b/bob/learn/em/test/test_kmeans.py index b6a63ba15cf3868c84fe17215b4dc864c09727d3..426b3e540da9990859155f9bcb671d31a6858f3a 100644 --- a/bob/learn/em/test/test_kmeans.py +++ b/bob/learn/em/test/test_kmeans.py @@ -14,55 +14,82 @@ import numpy as np from bob.learn.em.cluster import KMeansMachine +def to_numpy(*args): + result = [] + for x in args: + result.append(np.array(x)) + if len(result) == 1: + return result[0] + return result + + +def to_dask_array(*args): + result = [] + for x in args: + result.append(da.from_array(np.array(x))) + if len(result) == 1: + return result[0] + return result + + def test_KMeansMachine(): # Test a KMeansMachine means = np.array([[3, 70, 0], [4, 72, 0]], "float64") mean = np.array([3, 70, 1], "float64") - # Initializes a KMeansMachine - km = KMeansMachine(2) - km.centroids_ = means + for transform in (to_numpy, to_dask_array): + means, mean = transform(means, mean) + + # Initializes a KMeansMachine + km = KMeansMachine(2) + km.centroids_ = means - # Distance and closest mean - np.testing.assert_almost_equal(km.transform(mean)[0], 1) - np.testing.assert_almost_equal(km.transform(mean)[1], 6) + # Distance and closest mean + np.testing.assert_almost_equal(km.transform(mean)[0], 1) + np.testing.assert_almost_equal(km.transform(mean)[1], 6) - (index, dist) = km.get_closest_centroid(mean) + (index, dist) = km.get_closest_centroid(mean) - assert index == 0, index - np.testing.assert_almost_equal(dist, 1.0) - np.testing.assert_almost_equal(km.get_min_distance(mean), 1) + assert index == 0, index + np.testing.assert_almost_equal(dist, 1.0) + np.testing.assert_almost_equal(km.get_min_distance(mean), 1) def test_KMeansMachine_var_and_weight(): - kmeans = KMeansMachine(2) - kmeans.centroids_ = np.array([[1.2, 1.3], [0.2, -0.3]]) + for transform in (to_numpy, to_dask_array): + kmeans = KMeansMachine(2) + kmeans.centroids_ = transform(np.array([[1.2, 1.3], [0.2, -0.3]])) - data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]]) - variances, weights = kmeans.get_variances_and_weights_for_each_cluster(data) + data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]]) + data = transform(data) + variances, weights = kmeans.get_variances_and_weights_for_each_cluster(data) - variances_result = np.array([[0.01, 1.0], [0.01555556, 0.00888889]]) - weights_result = np.array([0.4, 0.6]) + variances_result = np.array([[0.01, 1.0], [0.01555556, 0.00888889]]) + weights_result = np.array([0.4, 0.6]) + + np.testing.assert_almost_equal(variances, variances_result) + np.testing.assert_almost_equal(weights, weights_result) - np.testing.assert_almost_equal(variances, variances_result) - np.testing.assert_almost_equal(weights, weights_result) np.set_printoptions(precision=9) + def test_kmeans_fit(): np.random.seed(0) data1 = np.random.normal(loc=1, size=(2000, 3)) data2 = np.random.normal(loc=-1, size=(2000, 3)) data = np.concatenate([data1, data2], axis=0) - machine = KMeansMachine(2, random_state=0).fit(data) - centroids = machine.centroids_[np.argsort(machine.centroids_[:,0])] - expected = [ - [-1.07173464, -1.06200356, -1.00724920], - [ 0.99479125, 0.99665564, 0.97689017], - ] - print(centroids) - np.testing.assert_almost_equal(centroids, expected) + + for transform in (to_numpy, to_dask_array): + data = transform(data) + machine = KMeansMachine(2, random_state=0).fit(data) + centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])] + expected = [ + [-1.07173464, -1.06200356, -1.00724920], + [0.99479125, 0.99665564, 0.97689017], + ] + np.testing.assert_almost_equal(centroids, expected) def test_kmeans_fit_init_pp(): @@ -70,14 +97,16 @@ def test_kmeans_fit_init_pp(): data1 = np.random.normal(loc=1, size=(2000, 3)) data2 = np.random.normal(loc=-1, size=(2000, 3)) data = np.concatenate([data1, data2], axis=0) - machine = KMeansMachine(2, init_method="k-means++", random_state=0).fit(data) - centroids = machine.centroids_[np.argsort(machine.centroids_[:,0])] - expected = [ - [-1.07173464, -1.06200356, -1.00724920], - [ 0.99479125, 0.99665564, 0.97689017], - ] - print(centroids) - np.testing.assert_almost_equal(centroids, expected, decimal=7) + + for transform in (to_numpy, to_dask_array): + data = transform(data) + machine = KMeansMachine(2, init_method="k-means++", random_state=0).fit(data) + centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])] + expected = [ + [-1.07173464, -1.06200356, -1.00724920], + [0.99479125, 0.99665564, 0.97689017], + ] + np.testing.assert_almost_equal(centroids, expected, decimal=7) def test_kmeans_fit_init_random(): @@ -85,11 +114,12 @@ def test_kmeans_fit_init_random(): data1 = np.random.normal(loc=1, size=(2000, 3)) data2 = np.random.normal(loc=-1, size=(2000, 3)) data = np.concatenate([data1, data2], axis=0) - machine = KMeansMachine(2, init_method="random", random_state=0).fit(data) - centroids = machine.centroids_[np.argsort(machine.centroids_[:,0])] - expected = [ - [-1.07329460, -1.06207104, -1.00714365], - [ 0.99529015, 0.99570570, 0.97580858], - ] - print(centroids) - np.testing.assert_almost_equal(centroids, expected, decimal=7) + for transform in (to_numpy, to_dask_array): + data = transform(data) + machine = KMeansMachine(2, init_method="random", random_state=0).fit(data) + centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])] + expected = [ + [-1.07329460, -1.06207104, -1.00714365], + [0.99529015, 0.99570570, 0.97580858], + ] + np.testing.assert_almost_equal(centroids, expected, decimal=7)