Skip to content
Snippets Groups Projects
Commit 33ac375d authored by Amir MOHAMMADI
Browse files

test kmeans on dask arrays

parent ded8a6e9
No related branches found
No related tags found
2 merge requests: !42 GMM implementation in Python, !40 Transition to a pure python implementation
Pipeline #56794 failed
import bob.extension import bob.extension
from .mixture import linear_scoring
def get_config(): def get_config():
......
...@@ -157,12 +157,13 @@ class KMeansMachine(BaseEstimator): ...@@ -157,12 +157,13 @@ class KMeansMachine(BaseEstimator):
weights = weights_count / weights_count.sum() weights = weights_count / weights_count.sum()
# Accumulate # Accumulate
dask_compatible_eye = np.eye(n_cluster) * np.array(1, like=data)
means_sum = np.sum( means_sum = np.sum(
np.eye(n_cluster)[closest_centroid_indices][:, :, None] * data[:, None], dask_compatible_eye[closest_centroid_indices][:, :, None] * data[:, None],
axis=0, axis=0,
) )
variances_sum = np.sum( variances_sum = np.sum(
np.eye(n_cluster)[closest_centroid_indices][:, :, None] dask_compatible_eye[closest_centroid_indices][:, :, None]
* (data[:, None] ** 2), * (data[:, None] ** 2),
axis=0, axis=0,
) )
......
...@@ -14,55 +14,82 @@ import numpy as np ...@@ -14,55 +14,82 @@ import numpy as np
from bob.learn.em.cluster import KMeansMachine from bob.learn.em.cluster import KMeansMachine
def to_numpy(*args):
    """Convert every positional argument to a NumPy array.

    Parameters
    ----------
    *args
        Array-like objects; each one is passed through ``np.array``.

    Returns
    -------
    numpy.ndarray or list
        The converted array itself when exactly one argument is given,
        otherwise a list with one converted array per argument (an empty
        list when called without arguments).
    """
    converted = [np.array(x) for x in args]
    # A single input is returned bare so callers can write
    # ``x = to_numpy(x)`` without unpacking a one-element list.
    if len(converted) == 1:
        return converted[0]
    return converted
def to_dask_array(*args):
    """Convert every positional argument to a Dask array.

    Each argument is first materialised with ``np.array`` and then wrapped
    with ``da.from_array``.

    Parameters
    ----------
    *args
        Array-like objects to convert.

    Returns
    -------
    object or list
        The converted array itself when exactly one argument is given,
        otherwise a list with one converted array per argument (an empty
        list when called without arguments).
    """
    # NOTE(review): ``da`` is not defined in this block; presumably it is
    # ``dask.array`` imported at module level -- confirm against the imports.
    converted = [da.from_array(np.array(x)) for x in args]
    # A single input is returned bare so callers can write
    # ``x = to_dask_array(x)`` without unpacking a one-element list.
    if len(converted) == 1:
        return converted[0]
    return converted
def test_KMeansMachine():
    """Check distances and closest-centroid lookup on a two-centroid machine.

    The same assertions are run once per entry of
    ``(to_numpy, to_dask_array)``.
    """
    # Test a KMeansMachine
    means = np.array([[3, 70, 0], [4, 72, 0]], "float64")
    mean = np.array([3, 70, 1], "float64")

    for transform in (to_numpy, to_dask_array):
        # Convert from the pristine NumPy inputs on every pass so that one
        # backend's conversion never feeds into the next one.
        centroids, sample = transform(means, mean)

        # Initializes a KMeansMachine
        km = KMeansMachine(2)
        km.centroids_ = centroids

        # Distance and closest mean
        np.testing.assert_almost_equal(km.transform(sample)[0], 1)
        np.testing.assert_almost_equal(km.transform(sample)[1], 6)

        (index, dist) = km.get_closest_centroid(sample)

        assert index == 0, index
        np.testing.assert_almost_equal(dist, 1.0)
        np.testing.assert_almost_equal(km.get_min_distance(sample), 1)
def test_KMeansMachine_var_and_weight():
    """Check per-cluster variances and weights against fixed reference values.

    The same assertions are run once per entry of
    ``(to_numpy, to_dask_array)``.
    """
    # Fixtures are built once; only their container type changes per pass.
    centroids = np.array([[1.2, 1.3], [0.2, -0.3]])
    data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]])

    variances_result = np.array([[0.01, 1.0], [0.01555556, 0.00888889]])
    weights_result = np.array([0.4, 0.6])

    for transform in (to_numpy, to_dask_array):
        kmeans = KMeansMachine(2)
        kmeans.centroids_ = transform(centroids)

        variances, weights = kmeans.get_variances_and_weights_for_each_cluster(
            transform(data)
        )

        np.testing.assert_almost_equal(variances, variances_result)
        np.testing.assert_almost_equal(weights, weights_result)
# Print floats with 9 significant decimals in any array output below.
np.set_printoptions(precision=9)
def test_kmeans_fit():
    """Fit two clusters on seeded Gaussian data and compare the centroids.

    The data is two blobs of 2000 three-dimensional samples drawn around
    +1 and -1. The fit and comparison are run once per entry of
    ``(to_numpy, to_dask_array)``.
    """
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    data = np.concatenate([data1, data2], axis=0)

    expected = [
        [-1.07173464, -1.06200356, -1.00724920],
        [0.99479125, 0.99665564, 0.97689017],
    ]

    for transform in (to_numpy, to_dask_array):
        # ``data`` is left untouched so every pass converts the same array.
        machine = KMeansMachine(2, random_state=0).fit(transform(data))
        # Order centroids by their first coordinate so the comparison does
        # not depend on which cluster index each blob was assigned.
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        np.testing.assert_almost_equal(centroids, expected)
def test_kmeans_fit_init_pp(): def test_kmeans_fit_init_pp():
...@@ -70,14 +97,16 @@ def test_kmeans_fit_init_pp(): ...@@ -70,14 +97,16 @@ def test_kmeans_fit_init_pp():
data1 = np.random.normal(loc=1, size=(2000, 3)) data1 = np.random.normal(loc=1, size=(2000, 3))
data2 = np.random.normal(loc=-1, size=(2000, 3)) data2 = np.random.normal(loc=-1, size=(2000, 3))
data = np.concatenate([data1, data2], axis=0) data = np.concatenate([data1, data2], axis=0)
machine = KMeansMachine(2, init_method="k-means++", random_state=0).fit(data)
centroids = machine.centroids_[np.argsort(machine.centroids_[:,0])] for transform in (to_numpy, to_dask_array):
expected = [ data = transform(data)
[-1.07173464, -1.06200356, -1.00724920], machine = KMeansMachine(2, init_method="k-means++", random_state=0).fit(data)
[ 0.99479125, 0.99665564, 0.97689017], centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
] expected = [
print(centroids) [-1.07173464, -1.06200356, -1.00724920],
np.testing.assert_almost_equal(centroids, expected, decimal=7) [0.99479125, 0.99665564, 0.97689017],
]
np.testing.assert_almost_equal(centroids, expected, decimal=7)
def test_kmeans_fit_init_random(): def test_kmeans_fit_init_random():
...@@ -85,11 +114,12 @@ def test_kmeans_fit_init_random(): ...@@ -85,11 +114,12 @@ def test_kmeans_fit_init_random():
data1 = np.random.normal(loc=1, size=(2000, 3)) data1 = np.random.normal(loc=1, size=(2000, 3))
data2 = np.random.normal(loc=-1, size=(2000, 3)) data2 = np.random.normal(loc=-1, size=(2000, 3))
data = np.concatenate([data1, data2], axis=0) data = np.concatenate([data1, data2], axis=0)
machine = KMeansMachine(2, init_method="random", random_state=0).fit(data) for transform in (to_numpy, to_dask_array):
centroids = machine.centroids_[np.argsort(machine.centroids_[:,0])] data = transform(data)
expected = [ machine = KMeansMachine(2, init_method="random", random_state=0).fit(data)
[-1.07329460, -1.06207104, -1.00714365], centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
[ 0.99529015, 0.99570570, 0.97580858], expected = [
] [-1.07329460, -1.06207104, -1.00714365],
print(centroids) [0.99529015, 0.99570570, 0.97580858],
np.testing.assert_almost_equal(centroids, expected, decimal=7) ]
np.testing.assert_almost_equal(centroids, expected, decimal=7)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment