Commit 80ceaa89 authored by Tiago Pereira's avatar Tiago Pereira Committed by Amir MOHAMMADI
Browse files

Added a shallow input test. Issue #21

parent a2ea2eec
......@@ -15,159 +15,179 @@ from bob.io.base.test_utils import datafile
from bob.learn.em import KMeansMachine, KMeansTrainer
def equals(x, y, epsilon):
return (abs(x - y) < epsilon).all()
return (abs(x - y) < epsilon).all()
def kmeans_plus_plus(machine, data, seed):
"""Python implementation of K-Means++ (initialization)"""
n_data = data.shape[0]
rng = bob.core.random.mt19937(seed)
u = bob.core.random.uniform('int32', 0, n_data-1)
index = u(rng)
machine.set_mean(0, data[index,:])
weights = numpy.zeros(shape=(n_data,), dtype=numpy.float64)
for m in range(1,machine.dim_c):
for s in range(n_data):
s_cur = data[s,:]
w_cur = machine.get_distance_from_mean(s_cur, 0)
for i in range(m):
w_cur = min(machine.get_distance_from_mean(s_cur, i), w_cur)
weights[s] = w_cur
weights *= weights
weights /= numpy.sum(weights)
d = bob.core.random.discrete('int32', weights)
index = d(rng)
machine.set_mean(m, data[index,:])
"""Python implementation of K-Means++ (initialization)"""
n_data = data.shape[0]
rng = bob.core.random.mt19937(seed)
u = bob.core.random.uniform('int32', 0, n_data - 1)
index = u(rng)
machine.set_mean(0, data[index, :])
weights = numpy.zeros(shape=(n_data,), dtype=numpy.float64)
for m in range(1, machine.dim_c):
for s in range(n_data):
s_cur = data[s, :]
w_cur = machine.get_distance_from_mean(s_cur, 0)
for i in range(m):
w_cur = min(machine.get_distance_from_mean(s_cur, i), w_cur)
weights[s] = w_cur
weights *= weights
weights /= numpy.sum(weights)
d = bob.core.random.discrete('int32', weights)
index = d(rng)
machine.set_mean(m, data[index, :])
def NormalizeStdArray(path):
array = bob.io.base.load(path).astype('float64')
std = array.std(axis=0)
return (array/std, std)
array = bob.io.base.load(path).astype('float64')
std = array.std(axis=0)
return (array / std, std)
def multiplyVectorsByFactors(matrix, vector):
for i in range(0, matrix.shape[0]):
for j in range(0, matrix.shape[1]):
matrix[i, j] *= vector[j]
for i in range(0, matrix.shape[0]):
for j in range(0, matrix.shape[1]):
matrix[i, j] *= vector[j]
def flipRows(array):
if len(array.shape) == 2:
return numpy.array([numpy.array(array[1, :]), numpy.array(array[0, :])], 'float64')
elif len(array.shape) == 1:
return numpy.array([array[1], array[0]], 'float64')
else:
raise Exception('Input type not supportd by flipRows')
if len(array.shape) == 2:
return numpy.array([numpy.array(array[1, :]), numpy.array(array[0, :])], 'float64')
elif len(array.shape) == 1:
return numpy.array([array[1], array[0]], 'float64')
else:
raise Exception('Input type not supportd by flipRows')
if hasattr(KMeansTrainer, 'KMEANS_PLUS_PLUS'):
def test_kmeans_plus_plus():
def test_kmeans_plus_plus():
# Tests the K-Means++ initialization
dim_c = 5
dim_d = 7
n_samples = 150
data = numpy.random.randn(n_samples, dim_d)
seed = 0
# C++ implementation
machine = KMeansMachine(dim_c, dim_d)
trainer = KMeansTrainer()
trainer.rng = bob.core.random.mt19937(seed)
trainer.initialization_method = 'KMEANS_PLUS_PLUS'
trainer.initialize(machine, data)
# Python implementation
py_machine = KMeansMachine(dim_c, dim_d)
kmeans_plus_plus(py_machine, data, seed)
assert equals(machine.means, py_machine.means, 1e-8)
# Tests the K-Means++ initialization
dim_c = 5
dim_d = 7
n_samples = 150
data = numpy.random.randn(n_samples,dim_d)
seed = 0
# C++ implementation
def test_kmeans_noduplicate():
# Data/dimensions
dim_c = 2
dim_d = 3
seed = 0
data = numpy.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [4, 5, 6.]])
# Defines machine and trainer
machine = KMeansMachine(dim_c, dim_d)
trainer = KMeansTrainer()
trainer.rng = bob.core.random.mt19937(seed)
trainer.initialization_method = 'KMEANS_PLUS_PLUS'
trainer.initialize(machine, data)
# Python implementation
py_machine = KMeansMachine(dim_c, dim_d)
kmeans_plus_plus(py_machine, data, seed)
assert equals(machine.means, py_machine.means, 1e-8)
def test_kmeans_noduplicate():
# Data/dimensions
dim_c = 2
dim_d = 3
seed = 0
data = numpy.array([[1,2,3],[1,2,3],[1,2,3],[4,5,6.]])
# Defines machine and trainer
machine = KMeansMachine(dim_c, dim_d)
trainer = KMeansTrainer()
rng = bob.core.random.mt19937(seed)
trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
trainer.initialize(machine, data, rng)
# Makes sure that the two initial mean vectors selected are different
assert equals(machine.get_mean(0), machine.get_mean(1), 1e-8) == False
rng = bob.core.random.mt19937(seed)
trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
trainer.initialize(machine, data, rng)
# Makes sure that the two initial mean vectors selected are different
assert equals(machine.get_mean(0), machine.get_mean(1), 1e-8) == False
def test_kmeans_a():
# Trains a KMeansMachine
# This files contains draws from two 1D Gaussian distributions:
# * 100 samples from N(-10,1)
# * 100 samples from N(10,1)
data = bob.io.base.load(datafile("samplesFrom2G_f64.hdf5", __name__, path="../data/"))
# Trains a KMeansMachine
# This files contains draws from two 1D Gaussian distributions:
# * 100 samples from N(-10,1)
# * 100 samples from N(10,1)
data = bob.io.base.load(datafile("samplesFrom2G_f64.hdf5", __name__, path="../data/"))
machine = KMeansMachine(2, 1)
trainer = KMeansTrainer()
#trainer.train(machine, data)
bob.learn.em.train(trainer,machine,data)
machine = KMeansMachine(2, 1)
[variances, weights] = machine.get_variances_and_weights_for_each_cluster(data)
variances_b = numpy.ndarray(shape=(2,1), dtype=numpy.float64)
weights_b = numpy.ndarray(shape=(2,), dtype=numpy.float64)
machine.__get_variances_and_weights_for_each_cluster_init__(variances_b, weights_b)
machine.__get_variances_and_weights_for_each_cluster_acc__(data, variances_b, weights_b)
machine.__get_variances_and_weights_for_each_cluster_fin__(variances_b, weights_b)
m1 = machine.get_mean(0)
m2 = machine.get_mean(1)
trainer = KMeansTrainer()
# trainer.train(machine, data)
bob.learn.em.train(trainer, machine, data)
[variances, weights] = machine.get_variances_and_weights_for_each_cluster(data)
variances_b = numpy.ndarray(shape=(2, 1), dtype=numpy.float64)
weights_b = numpy.ndarray(shape=(2,), dtype=numpy.float64)
machine.__get_variances_and_weights_for_each_cluster_init__(variances_b, weights_b)
machine.__get_variances_and_weights_for_each_cluster_acc__(data, variances_b, weights_b)
machine.__get_variances_and_weights_for_each_cluster_fin__(variances_b, weights_b)
m1 = machine.get_mean(0)
m2 = machine.get_mean(1)
## Check means [-10,10] / variances [1,1] / weights [0.5,0.5]
if (m1 < m2):
means = numpy.array(([m1[0], m2[0]]), 'float64')
else:
means = numpy.array(([m2[0], m1[0]]), 'float64')
assert equals(means, numpy.array([-10., 10.]), 2e-1)
assert equals(variances, numpy.array([1., 1.]), 2e-1)
assert equals(weights, numpy.array([0.5, 0.5]), 1e-3)
assert equals(variances, variances_b, 1e-8)
assert equals(weights, weights_b, 1e-8)
## Check means [-10,10] / variances [1,1] / weights [0.5,0.5]
if(m1<m2): means=numpy.array(([m1[0],m2[0]]), 'float64')
else: means=numpy.array(([m2[0],m1[0]]), 'float64')
assert equals(means, numpy.array([-10.,10.]), 2e-1)
assert equals(variances, numpy.array([1.,1.]), 2e-1)
assert equals(weights, numpy.array([0.5,0.5]), 1e-3)
assert equals(variances, variances_b, 1e-8)
assert equals(weights, weights_b, 1e-8)
def test_kmeans_b():
# Trains a KMeansMachine
(arStd, std) = NormalizeStdArray(datafile("faithful.torch3.hdf5", __name__, path="../data/"))
machine = KMeansMachine(2, 2)
trainer = KMeansTrainer()
# trainer.seed = 1337
bob.learn.em.train(trainer, machine, arStd, convergence_threshold=0.001)
def test_kmeans_b():
[variances, weights] = machine.get_variances_and_weights_for_each_cluster(arStd)
# Trains a KMeansMachine
(arStd,std) = NormalizeStdArray(datafile("faithful.torch3.hdf5", __name__, path="../data/"))
means = numpy.array(machine.means)
variances = numpy.array(variances)
machine = KMeansMachine(2, 2)
multiplyVectorsByFactors(means, std)
multiplyVectorsByFactors(variances, std ** 2)
trainer = KMeansTrainer()
#trainer.seed = 1337
bob.learn.em.train(trainer,machine, arStd, convergence_threshold=0.001)
gmmWeights = bob.io.base.load(datafile('gmm.init_weights.hdf5', __name__, path="../data/"))
gmmMeans = bob.io.base.load(datafile('gmm.init_means.hdf5', __name__, path="../data/"))
gmmVariances = bob.io.base.load(datafile('gmm.init_variances.hdf5', __name__, path="../data/"))
[variances, weights] = machine.get_variances_and_weights_for_each_cluster(arStd)
if (means[0, 0] < means[1, 0]):
means = flipRows(means)
variances = flipRows(variances)
weights = flipRows(weights)
means = numpy.array(machine.means)
variances = numpy.array(variances)
assert equals(means, gmmMeans, 1e-3)
assert equals(weights, gmmWeights, 1e-3)
assert equals(variances, gmmVariances, 1e-3)
multiplyVectorsByFactors(means, std)
multiplyVectorsByFactors(variances, std ** 2)
# Check that there is no duplicate means during initialization
machine = KMeansMachine(2, 1)
trainer = KMeansTrainer()
trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
data = numpy.array([[1.], [1.], [1.], [1.], [1.], [1.], [2.], [3.]])
bob.learn.em.train(trainer, machine, data)
assert (numpy.isnan(machine.means).any()) == False
gmmWeights = bob.io.base.load(datafile('gmm.init_weights.hdf5', __name__, path="../data/"))
gmmMeans = bob.io.base.load(datafile('gmm.init_means.hdf5', __name__, path="../data/"))
gmmVariances = bob.io.base.load(datafile('gmm.init_variances.hdf5', __name__, path="../data/"))
if (means[0, 0] < means[1, 0]):
means = flipRows(means)
variances = flipRows(variances)
weights = flipRows(weights)
def test_trainer_execption():
from nose.tools import assert_raises
assert equals(means, gmmMeans, 1e-3)
assert equals(weights, gmmWeights, 1e-3)
assert equals(variances, gmmVariances, 1e-3)
# Testing Inf
machine = KMeansMachine(2, 2)
data = numpy.array([[1.0, 2.0], [2, 3.], [1, 1.], [2, 5.], [numpy.inf, 1.0]])
trainer = KMeansTrainer()
assert_raises(ValueError, bob.learn.em.train, trainer, machine, data, 0.001)
# Check that there is no duplicate means during initialization
machine = KMeansMachine(2, 1)
trainer = KMeansTrainer()
trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
data = numpy.array([[1.], [1.], [1.], [1.], [1.], [1.], [2.], [3.]])
bob.learn.em.train(trainer, machine, data)
assert (numpy.isnan(machine.means).any()) == False
# Testing Nan
machine = KMeansMachine(2, 2)
data = numpy.array([[1.0, 2.0], [2, 3.], [1, numpy.nan], [2, 5.], [2.0, 1.0]])
trainer = KMeansTrainer()
assert_raises(ValueError, bob.learn.em.train, trainer, machine, data, 0.001)
......@@ -7,112 +7,123 @@
import numpy
import bob.learn.em
import logging
logger = logging.getLogger('bob.learn.em')
def train(trainer, machine, data, max_iterations = 50, convergence_threshold=None, initialize=True, rng=None):
"""
Trains a machine given a trainer and the proper data
**Parameters**:
trainer : one of :py:class:`KMeansTrainer`, :py:class:`MAP_GMMTrainer`, :py:class:`ML_GMMTrainer`, :py:class:`ISVTrainer`, :py:class:`IVectorTrainer`, :py:class:`PLDATrainer`, :py:class:`EMPCATrainer`
A trainer mechanism
machine : one of :py:class:`KMeansMachine`, :py:class:`GMMMachine`, :py:class:`ISVBase`, :py:class:`IVectorMachine`, :py:class:`PLDAMachine`, :py:class:`bob.learn.linear.Machine`
A container machine
data : array_like <float, 2D>
The data to be trained
max_iterations : int
The maximum number of iterations to train a machine
convergence_threshold : float
The convergence threshold to train a machine. If None, the training procedure will stop with the iterations criteria
initialize : bool
If True, runs the initialization procedure
rng : :py:class:`bob.core.random.mt19937`
The Mersenne Twister mt19937 random generator used for the initialization of subspaces/arrays before the EM loop
"""
#Initialization
if initialize:
if rng is not None:
trainer.initialize(machine, data, rng)
else:
trainer.initialize(machine, data)
trainer.e_step(machine, data)
average_output = 0
average_output_previous = 0
if hasattr(trainer,"compute_likelihood"):
average_output = trainer.compute_likelihood(machine)
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
average_output_previous = average_output
trainer.m_step(machine, data)
def train(trainer, machine, data, max_iterations=50, convergence_threshold=None, initialize=True, rng=None,
check_inputs=True):
"""
Trains a machine given a trainer and the proper data
**Parameters**:
trainer : one of :py:class:`KMeansTrainer`, :py:class:`MAP_GMMTrainer`, :py:class:`ML_GMMTrainer`, :py:class:`ISVTrainer`, :py:class:`IVectorTrainer`, :py:class:`PLDATrainer`, :py:class:`EMPCATrainer`
A trainer mechanism
machine : one of :py:class:`KMeansMachine`, :py:class:`GMMMachine`, :py:class:`ISVBase`, :py:class:`IVectorMachine`, :py:class:`PLDAMachine`, :py:class:`bob.learn.linear.Machine`
A container machine
data : array_like <float, 2D>
The data to be trained
max_iterations : int
The maximum number of iterations to train a machine
convergence_threshold : float
The convergence threshold to train a machine. If None, the training procedure will stop with the iterations criteria
initialize : bool
If True, runs the initialization procedure
rng : :py:class:`bob.core.random.mt19937`
The Mersenne Twister mt19937 random generator used for the initialization of subspaces/arrays before the EM loop
check_inputs: Shallow checks in the inputs. Check for inf and NaN
"""
if check_inputs:
if numpy.isinf(numpy.sum(data)):
raise ValueError("Please, check your inputs; numpy.inf detected in `data` ")
if numpy.isnan(numpy.sum(data)):
raise ValueError("Please, check your inputs; numpy.nan detected in `data` ")
# Initialization
if initialize:
if rng is not None:
trainer.initialize(machine, data, rng)
else:
trainer.initialize(machine, data)
trainer.e_step(machine, data)
if hasattr(trainer,"compute_likelihood"):
average_output = trainer.compute_likelihood(machine)
if type(machine) is bob.learn.em.KMeansMachine:
logger.info("average euclidean distance = %f", average_output)
else:
logger.info("log likelihood = %f", average_output)
convergence_value = abs((average_output_previous - average_output)/average_output_previous)
logger.info("convergence value = %f",convergence_value)
#Terminates if converged (and likelihood computation is set)
if convergence_threshold!=None and convergence_value <= convergence_threshold:
break
if hasattr(trainer,"finalize"):
trainer.finalize(machine, data)
average_output = 0
average_output_previous = 0
if hasattr(trainer, "compute_likelihood"):
average_output = trainer.compute_likelihood(machine)
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
average_output_previous = average_output
trainer.m_step(machine, data)
trainer.e_step(machine, data)
if hasattr(trainer, "compute_likelihood"):
average_output = trainer.compute_likelihood(machine)
if type(machine) is bob.learn.em.KMeansMachine:
logger.info("average euclidean distance = %f", average_output)
else:
logger.info("log likelihood = %f", average_output)
convergence_value = abs((average_output_previous - average_output) / average_output_previous)
logger.info("convergence value = %f", convergence_value)
# Terminates if converged (and likelihood computation is set)
if convergence_threshold != None and convergence_value <= convergence_threshold:
break
if hasattr(trainer, "finalize"):
trainer.finalize(machine, data)
def train_jfa(trainer, jfa_base, data, max_iterations=10, initialize=True, rng=None):
"""
Trains a :py:class:`bob.learn.em.JFABase` given a :py:class:`bob.learn.em.JFATrainer` and the proper data
**Parameters**:
trainer : :py:class:`bob.learn.em.JFATrainer`
A JFA trainer mechanism
jfa_base : :py:class:`bob.learn.em.JFABase`
A container machine
data : [[:py:class:`bob.learn.em.GMMStats`]]
The data to be trained
max_iterations : int
The maximum number of iterations to train a machine
initialize : bool
If True, runs the initialization procedure
rng : :py:class:`bob.core.random.mt19937`
The Mersenne Twister mt19937 random generator used for the initialization of subspaces/arrays before the EM loops
"""
if initialize:
if rng is not None:
trainer.initialize(jfa_base, data, rng)
else:
trainer.initialize(jfa_base, data)
#V Subspace
logger.info("V subspace estimation...")
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
trainer.e_step_v(jfa_base, data)
trainer.m_step_v(jfa_base, data)
trainer.finalize_v(jfa_base, data)
#U subspace
logger.info("U subspace estimation...")
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
trainer.e_step_u(jfa_base, data)
trainer.m_step_u(jfa_base, data)
trainer.finalize_u(jfa_base, data)
# D subspace
logger.info("D subspace estimation...")
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
trainer.e_step_d(jfa_base, data)
trainer.m_step_d(jfa_base, data)
trainer.finalize_d(jfa_base, data)
"""
Trains a :py:class:`bob.learn.em.JFABase` given a :py:class:`bob.learn.em.JFATrainer` and the proper data
**Parameters**:
trainer : :py:class:`bob.learn.em.JFATrainer`
A JFA trainer mechanism
jfa_base : :py:class:`bob.learn.em.JFABase`
A container machine
data : [[:py:class:`bob.learn.em.GMMStats`]]
The data to be trained
max_iterations : int
The maximum number of iterations to train a machine
initialize : bool
If True, runs the initialization procedure
rng : :py:class:`bob.core.random.mt19937`
The Mersenne Twister mt19937 random generator used for the initialization of subspaces/arrays before the EM loops
"""
if initialize:
if rng is not None:
trainer.initialize(jfa_base, data, rng)
else:
trainer.initialize(jfa_base, data)
# V Subspace
logger.info("V subspace estimation...")
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
trainer.e_step_v(jfa_base, data)
trainer.m_step_v(jfa_base, data)
trainer.finalize_v(jfa_base, data)
# U subspace
logger.info("U subspace estimation...")
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
trainer.e_step_u(jfa_base, data)
trainer.m_step_u(jfa_base, data)
trainer.finalize_u(jfa_base, data)
# D subspace
logger.info("D subspace estimation...")
for i in range(max_iterations):
logger.info("Iteration = %d/%d", i, max_iterations)
trainer.e_step_d(jfa_base, data)
trainer.m_step_d(jfa_base, data)
trainer.finalize_d(jfa_base, data)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment