Commit 2f93ed06 authored by Manuel Günther

Updated GMM algorithm and implemented parallel UBM training

parent 94aaedea
Showing with 4798 additions and 76 deletions
@@ -15,21 +15,21 @@ import logging
logger = logging.getLogger("bob.bio.gmm")
class GMM (Algorithm):
"""Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features"""
"""Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features.
Features must be normalized to zero mean and unit standard deviation."""
def __init__(
self,
# parameters for the GMM
number_of_gaussians,
# parameters of UBM training
-k_means_training_iterations = 500, # Maximum number of iterations for K-Means
-gmm_training_iterations = 500, # Maximum number of iterations for ML GMM Training
+kmeans_training_iterations = 25, # Maximum number of iterations for K-Means
+gmm_training_iterations = 25, # Maximum number of iterations for ML GMM Training
training_threshold = 5e-4, # Threshold to end the ML training
variance_threshold = 5e-4, # Minimum value that a variance can reach
update_weights = True,
update_means = True,
update_variances = True,
-normalize_before_k_means = True, # Normalize the input features before running K-Means
# parameters of the GMM enrollment
relevance_factor = 4, # Relevance factor as described in Reynolds paper
gmm_enroll_iterations = 1, # Number of iterations for the enrollment phase
@@ -47,14 +47,13 @@ class GMM (Algorithm):
use_projected_features_for_enrollment = False,
number_of_gaussians = number_of_gaussians,
-k_means_training_iterations = k_means_training_iterations,
+kmeans_training_iterations = kmeans_training_iterations,
gmm_training_iterations = gmm_training_iterations,
training_threshold = training_threshold,
variance_threshold = variance_threshold,
update_weights = update_weights,
update_means = update_means,
update_variances = update_variances,
-normalize_before_k_means = normalize_before_k_means,
relevance_factor = relevance_factor,
gmm_enroll_iterations = gmm_enroll_iterations,
responsibility_threshold = responsibility_threshold,
@@ -67,14 +66,13 @@ class GMM (Algorithm):
# copy parameters
self.gaussians = number_of_gaussians
-self.k_means_training_iterations = k_means_training_iterations
+self.kmeans_training_iterations = kmeans_training_iterations
self.gmm_training_iterations = gmm_training_iterations
self.training_threshold = training_threshold
self.variance_threshold = variance_threshold
self.update_weights = update_weights
self.update_means = update_means
self.update_variances = update_variances
-self.normalize_before_k_means = normalize_before_k_means
self.relevance_factor = relevance_factor
self.gmm_enroll_iterations = gmm_enroll_iterations
self.init_seed = INIT_SEED
@@ -83,6 +81,8 @@
self.scoring_function = scoring_function
self.ubm = None
+self.kmeans_trainer = bob.learn.em.KMeansTrainer()
+self.ubm_trainer = bob.learn.em.ML_GMMTrainer(self.update_means, self.update_variances, self.update_weights, self.responsibility_threshold)
def _check_feature(self, feature):
@@ -94,42 +94,6 @@
#######################################################
################ UBM training #########################
-def _normalize_std_array(self, array):
-"""Applies a unit variance normalization to an array"""
-# Initializes variables
-n_samples = array.shape[0]
-length = array.shape[1]
-mean = numpy.zeros((length,))
-std = numpy.zeros((length,))
-# Computes mean and variance
-for k in range(n_samples):
-x = array[k,:].astype('float64')
-mean += x
-std += (x ** 2)
-mean /= n_samples
-std /= n_samples
-std -= (mean ** 2)
-std = std ** 0.5 # sqrt(std)
-ar_std_list = []
-for k in range(n_samples):
-ar_std_list.append(array[k,:].astype('float64') / std)
-ar_std = numpy.vstack(ar_std_list)
-return (ar_std,std)
-def _multiply_vectors_by_factors(self, matrix, vector):
-"""Used to unnormalize some data"""
-for i in range(0, matrix.shape[0]):
-for j in range(0, matrix.shape[1]):
-matrix[i, j] *= vector[j]
#######################################################
################ UBM training #########################
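For reference, the removed _normalize_std_array helper is equivalent to the following vectorized numpy sketch (not part of this commit; numpy's default std computes the same population estimate as the explicit loops above):

import numpy

def normalize_std_array(array):
    """Unit-variance normalization; a vectorized equivalent of the removed helper."""
    array = array.astype('float64')
    std = array.std(axis=0)  # population std, i.e. sqrt(E[x^2] - mean^2)
    return array / std, std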
@@ -141,35 +105,18 @@
# Computes input size
input_size = array.shape[1]
-# Normalizes the array if required
-logger.debug(" .... Normalizing the array")
-if not self.normalize_before_k_means:
-normalized_array = array
-else:
-normalized_array, std_array = self._normalize_std_array(array)
# Creates the machines (KMeans and GMM)
logger.debug(" .... Creating machines")
kmeans = bob.learn.em.KMeansMachine(self.gaussians, input_size)
self.ubm = bob.learn.em.GMMMachine(self.gaussians, input_size)
-# Creates the KMeansTrainer
-kmeans_trainer = bob.learn.em.KMeansTrainer()
# Trains using the KMeansTrainer
logger.info(" -> Training K-Means")
-bob.learn.em.train(kmeans_trainer, kmeans, normalized_array, self.gmm_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
+bob.learn.em.train(self.kmeans_trainer, kmeans, array, self.kmeans_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
-variances, weights = kmeans.get_variances_and_weights_for_each_cluster(normalized_array)
+variances, weights = kmeans.get_variances_and_weights_for_each_cluster(array)
means = kmeans.means
-# Undoes the normalization
-if self.normalize_before_k_means:
-logger.debug(" .... Undoing normalization")
-self._multiply_vectors_by_factors(means, std_array)
-self._multiply_vectors_by_factors(variances, std_array ** 2)
# Initializes the GMM
self.ubm.means = means
self.ubm.variances = variances
@@ -178,8 +125,7 @@
# Trains the GMM
logger.info(" -> Training GMM")
-trainer = bob.learn.em.ML_GMMTrainer(self.update_means, self.update_variances, self.update_weights)
-bob.learn.em.train(trainer, self.ubm, array, self.gmm_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
+bob.learn.em.train(self.ubm_trainer, self.ubm, array, self.gmm_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
def _save_projector(self, projector_file):
@@ -219,7 +165,7 @@
self.load_ubm(projector_file)
# prepare MAP_GMM_Trainer
kwargs = dict(mean_var_update_responsibilities_threshold=self.responsibility_threshold) if self.responsibility_threshold > 0. else dict()
-self.trainer = bob.learn.em.MAP_GMMTrainer(self.ubm, relevance_factor = self.relevance_factor, update_means = True, update_variances = False, **kwargs)
+self.enroll_trainer = bob.learn.em.MAP_GMMTrainer(self.ubm, relevance_factor = self.relevance_factor, update_means = True, update_variances = False, **kwargs)
self.rng = bob.core.random.mt19937(self.init_seed)
@@ -252,7 +198,7 @@
gmm = bob.learn.em.GMMMachine(self.ubm)
gmm.set_variance_thresholds(self.variance_threshold)
-bob.learn.em.train(self.trainer, gmm, array, self.gmm_enroll_iterations, self.training_threshold, self.rng)
+bob.learn.em.train(self.enroll_trainer, gmm, array, self.gmm_enroll_iterations, self.training_threshold, self.rng)
return gmm
def enroll(self, feature_arrays):
......
@@ -5,6 +5,4 @@ import numpy
algorithm = bob.bio.gmm.algorithm.GMM(
number_of_gaussians = 512,
-# by default, features are expected to be normalized and, hence, we don't need to re-normalize them
-normalize_before_k_means = False
)
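Since normalize_before_k_means is gone, features must already arrive normalized, as the updated GMM docstring states. A minimal sketch of such a preprocessing step (illustrative only; the function name and the zero-division guard are assumptions, not part of this package):

import numpy

def normalize_features(features):
    """Scale a 2D feature array to zero mean and unit standard deviation per dimension."""
    features = numpy.asarray(features, dtype='float64')
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    std[std == 0.] = 1.  # avoid division by zero for constant dimensions
    return (features - mean) / std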
@@ -7,7 +7,5 @@ algorithm = bob.bio.gmm.algorithm.ISV(
# ISV parameters
subspace_dimension_of_u = 160,
# GMM parameters
-number_of_gaussians = 512,
-# by default, our features are normalized, so it does not need to be done here
-normalize_before_k_means = False
+number_of_gaussians = 512
)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
import bob.bio.base
from .. import tools, algorithm
from bob.bio.base import tools as base_tools
def parse_arguments(command_line_parameters, exclude_resources_from = []):
"""This function parses the given options (which by default are the command line options). If exclude_resources_from is specified (as a list), the resources from the given packages are not listed in the help message."""
# set up command line parser
parsers = base_tools.command_line_parser(exclude_resources_from = exclude_resources_from)
# add GMM-related options
tools.add_parallel_gmm_options(parsers)
# override some parameters
parsers['config'].add_argument('-g', '--grid', metavar = 'x', nargs = '+', required=True,
help = 'Configuration for the grid setup; required for the parallel execution script.')
parsers['config'].add_argument('-a', '--algorithm', metavar = 'x', nargs = '+', default = ['gmm'],
help = 'The recognition algorithm; only GMM-related algorithms are allowed')
# Add sub-tasks that can be executed by this script
parser = parsers['main']
parser.add_argument('--sub-task',
choices = ('preprocess', 'train-extractor', 'extract', 'normalize-features', 'kmeans-init', 'kmeans-e-step', 'kmeans-m-step', 'gmm-init', 'gmm-e-step', 'gmm-m-step', 'project', 'enroll', 'compute-scores', 'concatenate'),
help = argparse.SUPPRESS) #'Executes a subtask (FOR INTERNAL USE ONLY!!!)'
parser.add_argument('--iteration', type = int,
help = argparse.SUPPRESS) #'The iteration of the KMeans or GMM training to execute'
parser.add_argument('--model-type', choices = ['N', 'T'],
help = argparse.SUPPRESS) #'Which type of models to generate (Normal or TModels)'
parser.add_argument('--score-type', choices = ['A', 'B', 'C', 'D', 'Z'],
help = argparse.SUPPRESS) #'The type of scores that should be computed'
parser.add_argument('--group',
help = argparse.SUPPRESS) #'The group for which the current action should be performed'
# now that we have set up everything, get the command line arguments
args = base_tools.initialize(parsers, command_line_parameters,
skips = ['preprocessing', 'extractor-training', 'extraction', 'normalization', 'kmeans', 'gmm', 'projection', 'enroller-training', 'enrollment', 'score-computation', 'concatenation', 'calibration']
)
args.skip_projector_training = True
# and add the GMM-related parameters
tools.initialize_parallel_gmm(args)
# assert that the algorithm is a GMM
if args.algorithm.__class__ not in (algorithm.GMM, algorithm.GMMRegular):
raise ValueError("The given algorithm %s is not a (pure) GMM algorithm" % type(args.algorithm))
return args
def add_gmm_jobs(args, job_ids, deps, submitter):
"""Adds all GMM-related jobs."""
# KMeans
if not args.skip_kmeans:
# initialization
if not args.kmeans_start_iteration:
job_ids['kmeans-init'] = submitter.submit(
'--sub-task kmeans-init',
name = 'k-init',
dependencies = deps,
**args.grid.training_queue)
deps.append(job_ids['kmeans-init'])
# several iterations of E and M steps
for iteration in range(args.kmeans_start_iteration, args.algorithm.kmeans_training_iterations):
# E-step
job_ids['kmeans-e-step'] = submitter.submit(
'--sub-task kmeans-e-step --iteration %d' % iteration,
name='k-e-%d' % iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
dependencies = [job_ids['kmeans-m-step']] if iteration != args.kmeans_start_iteration else deps,
**args.grid.projection_queue)
# M-step
job_ids['kmeans-m-step'] = submitter.submit(
'--sub-task kmeans-m-step --iteration %d' % iteration,
name='k-m-%d' % iteration,
dependencies = [job_ids['kmeans-e-step']],
**args.grid.training_queue)
# add dependence to the last m step
deps.append(job_ids['kmeans-m-step'])
# GMM
if not args.skip_gmm:
# initialization
if not args.gmm_start_iteration:
job_ids['gmm-init'] = submitter.submit(
'--sub-task gmm-init',
name = 'g-init',
dependencies = deps,
**args.grid.training_queue)
deps.append(job_ids['gmm-init'])
# several iterations of E and M steps
for iteration in range(args.gmm_start_iteration, args.algorithm.gmm_training_iterations):
# E-step
job_ids['gmm-e-step'] = submitter.submit(
'--sub-task gmm-e-step --iteration %d' % iteration,
name='g-e-%d' % iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
dependencies = [job_ids['gmm-m-step']] if iteration != args.gmm_start_iteration else deps,
**args.grid.projection_queue)
# M-step
job_ids['gmm-m-step'] = submitter.submit(
'--sub-task gmm-m-step --iteration %d' % iteration,
name='g-m-%d' % iteration,
dependencies = [job_ids['gmm-e-step']],
**args.grid.training_queue)
# add dependence to the last m step
deps.append(job_ids['gmm-m-step'])
return job_ids, deps
def add_jobs(args, submitter):
"""Adds all (desired) jobs of the tool chain to the grid, or to the local list to be executed."""
assert args.grid is not None
# Here, we use the default bob.bio.base add_jobs function, but intercept it for adding the training
SKIPS = ['preprocessing', 'extractor_training', 'extraction', 'projector_training', 'projection', 'enroller_training', 'enrollment', 'score_computation', 'concatenation', 'calibration']
original_skips = {key : args.__dict__["skip_%s" % key] for key in SKIPS}
# first, submit preprocessing and feature extraction; skip all others
for key in SKIPS[3:]:
setattr(args, "skip_%s" % key, True)
job_ids = bob.bio.base.script.verify.add_jobs(args, submitter)
for key in SKIPS[3:]:
setattr(args, "skip_%s" % key, original_skips[key])
# reset skips
args.skip_preprocessing = original_skips['preprocessing']
args.skip_extractor_training = original_skips['extractor_training']
args.skip_extraction = original_skips['extraction']
# if there are any external dependencies, we need to respect them
deps = args.external_dependencies[:]
# also, we depend on all previous steps
for n in ['preprocessing', 'extractor-training', 'extraction']:
if n in job_ids:
deps.append(job_ids[n])
# now, add our jobs
job_ids, deps = add_gmm_jobs(args, job_ids, deps, submitter)
# alright, finish the remaining bits
for key in SKIPS[:4]:
setattr(args, "skip_%s" % key, True)
args.external_dependencies = deps
job_ids.update(bob.bio.base.script.verify.add_jobs(args, submitter))
# finally, restore the original skip settings
for key in SKIPS[:4]:
setattr(args, "skip_%s" % key, original_skips[key])
return job_ids
def execute(args):
"""Run the desired job of the tool chain that is specified on command line.
This job might be executed either in the grid, or locally."""
# first, let the base script decide if it knows how to execute the job
if bob.bio.base.script.verify.execute(args):
return True
# now, check what we can do
# the file selector object
fs = tools.FileSelector.instance()
# train the feature projector
if args.sub_task == 'kmeans-init':
tools.kmeans_initialize(
args.algorithm,
args.extractor,
args.limit_training_data,
force = args.force)
# train the feature projector
elif args.sub_task == 'kmeans-e-step':
tools.kmeans_estep(
args.algorithm,
args.extractor,
args.iteration,
indices = base_tools.indices(fs.training_list('extracted', 'train_projector'), args.grid.number_of_projection_jobs),
force = args.force)
# train the feature projector
elif args.sub_task == 'kmeans-m-step':
tools.kmeans_mstep(
args.algorithm,
args.iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
clean = args.clean_intermediate,
force = args.force)
elif args.sub_task == 'gmm-init':
tools.gmm_initialize(
args.algorithm,
args.extractor,
args.limit_training_data,
force = args.force)
# train the feature projector
elif args.sub_task == 'gmm-e-step':
tools.gmm_estep(
args.algorithm,
args.extractor,
args.iteration,
indices = base_tools.indices(fs.training_list('extracted', 'train_projector'), args.grid.number_of_projection_jobs),
force = args.force)
# train the feature projector
elif args.sub_task == 'gmm-m-step':
tools.gmm_mstep(
args.algorithm,
args.iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
clean = args.clean_intermediate,
force = args.force)
else:
# Not our keyword...
return False
return True
def verify(args, command_line_parameters, external_fake_job_id = 0):
"""This is the main entry point for computing verification experiments.
You just have to specify configurations for any of the steps of the toolchain, which are:
-- the database
-- the preprocessing
-- feature extraction
-- the recognition algorithm
-- and the grid configuration.
Additionally, you can skip parts of the toolchain by selecting proper --skip-... parameters.
If your probe files are not too big, you can also specify the --preload-probes switch to speed up the score computation.
If files should be re-generated, please specify the --force option (might be combined with the --skip-... options)."""
# as the main entry point, check whether the sub-task is specified
if args.sub_task is not None:
# execute the desired sub-task
if not execute(args):
raise ValueError("The specified --sub-task '%s' is not known to the system" % args.sub_task)
return {}
else:
# add jobs
submitter = base_tools.GridSubmission(args, command_line_parameters, executable = 'verify_gmm.py', first_fake_job_id = 0) if args.grid else None
retval = add_jobs(args, submitter)
base_tools.write_info(args, command_line_parameters)
if args.grid.is_local() and args.run_local_scheduler:
if args.dry_run:
print ("Would have started the local scheduler to run the experiments with parallel jobs")
else:
# start the jman local daemon
submitter.execute_local()
return {}
else:
# return job ids as a dictionary
return retval
def main(command_line_parameters = sys.argv):
"""Executes the main function"""
try:
# do the command line parsing
args = parse_arguments(command_line_parameters[1:])
# perform face verification test
verify(args, command_line_parameters)
except Exception as e:
# track any exceptions as error logs (i.e., to get a time stamp)
logger.error("During the execution, an exception was raised: %s" % e)
raise
if __name__ == "__main__":
main()
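For illustration, this entry point can also be invoked programmatically, as the parallel test further below does; a minimal sketch (all resource names here are placeholders, not actual configurations shipped with the package):

from bob.bio.gmm.script.verify_gmm import main

main([
    'verify_gmm.py',
    '-d', 'my-database',      # placeholder database resource
    '-p', 'my-preprocessor',  # placeholder preprocessor resource
    '-e', 'my-extractor',     # placeholder extractor resource
    '-a', 'gmm',              # the default GMM algorithm
    '-g', 'grid',             # placeholder grid configuration (required)
    '-s', 'gmm_experiment',   # sub-directory for this experiment
])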
This diff is collapsed.
This diff is collapsed.
from . import extractor
import numpy
import bob.io.base
from bob.bio.base.extractor import Extractor
_data = [0., 1., 2., 3., 4.]
class DummyExtractor (Extractor):
def __init__(self):
Extractor.__init__(self, requires_training=True)
self.model = False
def train(self, train_data, extractor_file):
assert isinstance(train_data, list)
bob.io.base.save(_data, extractor_file)
def load(self, extractor_file):
data = bob.io.base.load(extractor_file)
assert (_data == data).all()
self.model = True
def __call__(self, data):
"""Does nothing, simply converts the data type of the data, ignoring any annotation."""
assert self.model
return data.astype(numpy.float)
extractor = DummyExtractor()
@@ -81,7 +81,7 @@ def test_gmm():
# create smaller GMM object
gmm2 = bob.bio.gmm.algorithm.GMM(
number_of_gaussians = 2,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
INIT_SEED = seed_value,
)
@@ -138,7 +138,7 @@ def test_gmm_regular():
# create smaller GMM object
gmm2 = bob.bio.gmm.algorithm.GMMRegular(
number_of_gaussians = 2,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
INIT_SEED = seed_value,
)
@@ -193,7 +193,7 @@ def test_isv():
isv2 = bob.bio.gmm.algorithm.ISV(
number_of_gaussians = 2,
subspace_dimension_of_u = 10,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
isv_training_iterations = 1,
INIT_SEED = seed_value
@@ -260,7 +260,7 @@ def test_jfa():
number_of_gaussians = 2,
subspace_dimension_of_u = 2,
subspace_dimension_of_v = 2,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
jfa_training_iterations = 1,
INIT_SEED = seed_value
......
from __future__ import print_function
import bob.measure
import os
import sys
import shutil
import tempfile
import numpy
import bob.io.base.test_utils
import bob.io.image
import bob.bio.base
import bob.bio.gmm
from . import utils
from nose.plugins.skip import SkipTest
import pkg_resources
regenerate_reference = False
from bob.bio.base.script.verify import main
data_dir = pkg_resources.resource_filename('bob.bio.gmm', 'test/data')
def _verify(parameters, test_dir, sub_dir, ref_modifier="", score_modifier=('scores',''), executable = main):
try:
executable([sys.argv[0]] + parameters)
# assert that the score file exists
score_files = [os.path.join(test_dir, sub_dir, 'Default', norm, '%s-dev%s'%score_modifier) for norm in ('nonorm', 'ztnorm')]
assert os.path.exists(score_files[0]), "Score file %s does not exist" % score_files[0]
assert os.path.exists(score_files[1]), "Score file %s does not exist" % score_files[1]
# also assert that the scores are still the same -- though they have no real meaning
reference_files = [os.path.join(data_dir, 'scores-%s%s-dev'%(norm, ref_modifier)) for norm in ('nonorm', 'ztnorm')]
if regenerate_reference:
for i in (0,1):
shutil.copy(score_files[i], reference_files[i])
for i in (0,1):
d = []
# read reference and new data
for score_file in (score_files[i], reference_files[i]):
f = bob.measure.load.open_file(score_file)
d_ = []
for line in f:
if isinstance(line, bytes): line = line.decode('utf-8')
d_.append(line.rstrip().split())
d.append(numpy.array(d_))
assert d[0].shape == d[1].shape
# assert that the data order is still correct
assert (d[0][:,0:3] == d[1][:, 0:3]).all()
# assert that the values are OK
assert numpy.allclose(d[0][:,3].astype(float), d[1][:,3].astype(float), 1e-5)
finally:
shutil.rmtree(test_dir)
def test_gmm_base():
test_dir = tempfile.mkdtemp(prefix='frltest_')
# define dummy parameters
parameters = [
'-d', 'dummy',
'-p', 'dummy',
'-e', 'dummy',
'-a', 'bob.bio.gmm.algorithm.GMM(2, 2, 2)', '--import', 'bob.bio.gmm',
'--zt-norm',
'-s', 'test_gmm_sequential', '-vv',
'--temp-directory', test_dir,
'--result-directory', test_dir
]
print (bob.bio.base.tools.command_line(parameters))
_verify(parameters, test_dir, 'test_gmm_sequential', ref_modifier='-gmm')
def test_gmm_parallel():
from bob.bio.gmm.script.verify_gmm import main
test_dir = tempfile.mkdtemp(prefix='frltest_')
test_database = os.path.join(test_dir, "submitted.sql3")
# define dummy parameters
parameters = [
'-d', 'dummy',
'-p', 'dummy',
'-e', 'dummy',
'-a', 'bob.bio.gmm.algorithm.GMM(2, 2, 2)', '--import', 'bob.bio.gmm', 'bob.io.image',
'-g', 'bob.bio.base.grid.Grid(grid = "local", number_of_parallel_processes = 2, scheduler_sleep_time = 0.1)', '-G', test_database, '--run-local-scheduler', '-R',
'--clean-intermediate',
'--zt-norm',
'-s', 'test_gmm_parallel', '-vv',
'--temp-directory', test_dir,
'--result-directory', test_dir,
]
print (bob.bio.base.tools.command_line(parameters))
_verify(parameters, test_dir, 'test_gmm_parallel', executable=main, ref_modifier='-gmm')
from .command_line import *
from .gmm import *
import os
import sys
import types
import bob.core
logger = bob.core.log.setup("bob.bio.gmm")
from bob.bio.base.tools import FileSelector
def add_parallel_gmm_options(parsers, additional_functions = ['gmm']):
"""Add the options for parallel UBM training to the given parsers."""
flag_group = parsers['flag']
flag_group.add_argument('-l', '--limit-training-data', type=int,
help = 'Limit the number of training examples used for KMeans initialization and the GMM initialization')
flag_group.add_argument('-k', '--kmeans-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the KMeans training (i.e. to restart from there)')
flag_group.add_argument('-m', '--gmm-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the GMM training (i.e. to restart from there)')
flag_group.add_argument('-C', '--clean-intermediate', action='store_true',
help = 'Clean up temporary files of older iterations?')
sub_dir_group = parsers['sub-dir']
sub_dir_group.add_argument('--kmeans-directory', default = 'kmeans_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate kmeans files should be stored')
sub_dir_group.add_argument('--gmm-directory', default = 'gmm_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate gmm files should be stored')
# Functions to be added to the FileSelector class, once it is instantiated
def _kmeans_intermediate_file(self, round):
return os.path.join(self.kmeans_temp_directory, 'round_%05d' % round, 'kmeans.hdf5')
def _kmeans_stats_file(self, round, start_index, end_index):
return os.path.join(self.kmeans_temp_directory, 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def _gmm_intermediate_file(self, round):
return os.path.join(self.gmm_temp_directory, 'round_%05d' % round, 'gmm.hdf5')
def _gmm_stats_file(self, round, start_index, end_index):
return os.path.join(self.gmm_temp_directory, 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def initialize_parallel_gmm(args):
# get the relevant sub-directory, which depends on the database and the protocol
protocol = 'None' if args.database.protocol is None else args.database.protocol
extractor_sub_dir = protocol if args.database.training_depends_on_protocol and args.extractor.requires_training else '.'
sub_dir = protocol if args.database.training_depends_on_protocol else '.'
fs = FileSelector.instance()
# add relevant directories to file selector object
fs.kmeans_temp_directory = os.path.join(args.temp_directory, sub_dir, args.kmeans_directory)
fs.kmeans_file = os.path.join(args.temp_directory, sub_dir, "kmeans.hdf5")
fs.gmm_temp_directory = os.path.join(args.temp_directory, sub_dir, args.gmm_directory)
# fs.gmm_file = os.path.join(args.temp_directory, sub_dir, "gmm.hdf5")
fs.gmm_file = fs.projector_file
# add relevant **functions** to file selector object
fs.kmeans_intermediate_file = types.MethodType(_kmeans_intermediate_file, fs)
fs.kmeans_stats_file = types.MethodType(_kmeans_stats_file, fs)
fs.gmm_intermediate_file = types.MethodType(_gmm_intermediate_file, fs)
fs.gmm_stats_file = types.MethodType(_gmm_stats_file, fs)
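The types.MethodType calls above bind module-level functions as methods of the FileSelector singleton at runtime. A self-contained sketch of this binding pattern (names are illustrative):

import types

class Selector(object):
    pass

def intermediate_file(self, round):
    # once bound, 'self' refers to the instance the function was attached to
    return 'round_%05d/machine.hdf5' % round

selector = Selector()
selector.intermediate_file = types.MethodType(intermediate_file, selector)
assert selector.intermediate_file(3) == 'round_00003/machine.hdf5'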
import bob.io.base
import bob.learn.em
import shutil
import numpy
import os
import logging
logger = logging.getLogger("bob.bio.gmm")
from bob.bio.base.tools.FileSelector import FileSelector
from bob.bio.base import utils, tools
def kmeans_initialize(algorithm, extractor, limit_data = None, force = False):
"""Initializes the K-Means training (non-parallel)."""
fs = FileSelector.instance()
output_file = fs.kmeans_intermediate_file(0)
if utils.check_file(output_file, force, 1000):
logger.info("UBM training: Skipping KMeans initialization since the file '%s' already exists", output_file)
else:
# read data
logger.info("UBM training: initializing kmeans")
training_list = utils.selected_elements(fs.training_list('extracted', 'train_projector'), limit_data)
data = numpy.vstack([extractor.read_feature(feature_file) for feature_file in training_list])
# Perform KMeans initialization
kmeans_machine = bob.learn.em.KMeansMachine(algorithm.gaussians, data.shape[1])
# Uses the algorithm's KMeansTrainer to run the initialization procedure
algorithm.kmeans_trainer.initialize(kmeans_machine, data)
bob.io.base.create_directories_safe(os.path.dirname(output_file))
kmeans_machine.save(bob.io.base.HDF5File(output_file, 'w'))
logger.info("UBM training: saved initial KMeans machine to '%s'", output_file)
def kmeans_estep(algorithm, extractor, iteration, indices, force=False):
"""Performs a single E-step of the K-Means algorithm (parallel)"""
if indices[0] > indices[1]:
return
fs = FileSelector.instance()
# check if we need to compute this step
stats_file = fs.kmeans_stats_file(iteration, indices[0], indices[1])
new_machine_file = fs.kmeans_intermediate_file(iteration + 1)
if utils.check_file(stats_file, force, 1000) or utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping KMeans E-Step since the file '%s' or '%s' already exists", stats_file, new_machine_file)
else:
training_list = fs.training_list('extracted', 'train_projector')
last_machine_file = fs.kmeans_intermediate_file(iteration)
kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(last_machine_file))
logger.info("UBM training: KMeans E-Step round %d from range(%d, %d)", iteration, *indices)
# read data
data = numpy.vstack([extractor.read_feature(training_list[index]) for index in range(indices[0], indices[1])])
# Performs the E-step
trainer = algorithm.kmeans_trainer
trainer.e_step(kmeans_machine, data)
# write results to file
dist = numpy.array(trainer.average_min_distance)
nsamples = numpy.array([indices[1] - indices[0]], dtype=numpy.float64)
# write statistics
bob.io.base.create_directories_safe(os.path.dirname(stats_file))
hdf5 = bob.io.base.HDF5File(stats_file, 'w')
hdf5.set('zeros', trainer.zeroeth_order_statistics)
hdf5.set('first', trainer.first_order_statistics)
hdf5.set('dist', dist * nsamples)
hdf5.set('nsamples', nsamples)
logger.info("UBM training: Wrote Stats file '%s'", stats_file)
def _read_stats(filename):
"""Reads accumulated K-Means statistics from file"""
logger.debug("UBM training: Reading stats file '%s'", filename)
hdf5 = bob.io.base.HDF5File(filename)
zeroeth = hdf5.read('zeros')
first = hdf5.read('first')
nsamples = hdf5.read('nsamples')
dist = hdf5.read('dist')
return (zeroeth, first, nsamples, dist)
def _accumulate(filenames):
zeroeth, first, nsamples, dist = _read_stats(filenames[0])
for stat in filenames[1:]:
zeroeth_, first_, nsamples_, dist_ = _read_stats(stat)
zeroeth += zeroeth_
first += first_
nsamples += nsamples_
dist += dist_
return (zeroeth, first, nsamples, dist)
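This accumulation works because the zeroth- and first-order statistics are plain sums over samples, so the results of parallel jobs can simply be added. A toy numpy check of the underlying map-reduce identity (an illustration, not the bob.learn.em accumulators themselves):

import numpy

data = numpy.random.RandomState(0).randn(100, 3)
chunks = numpy.array_split(data, 4)

# a sum computed over the full data ...
full_sum = data.sum(axis=0)
# ... equals the sum of the per-chunk sums, which is what _accumulate exploits
chunk_sum = sum(chunk.sum(axis=0) for chunk in chunks)
assert numpy.allclose(full_sum, chunk_sum)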
def kmeans_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
"""Performs a single M-step of the K-Means algorithm (non-parallel)"""
fs = FileSelector.instance()
old_machine_file = fs.kmeans_intermediate_file(iteration)
new_machine_file = fs.kmeans_intermediate_file(iteration+1)
if utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping KMeans M-Step since the file '%s' already exists", new_machine_file)
else:
# get the files from e-step
training_list = fs.training_list('extracted', 'train_projector')
# check whether there is a single file containing all statistics
if os.path.exists(fs.kmeans_stats_file(iteration, 0, len(training_list))):
stats_file = fs.kmeans_stats_file(iteration, 0, len(training_list))
# load stats file
statistics = _read_stats(stats_file)
else:
# load several files
filenames = []
for job in range(number_of_parallel_jobs):
job_indices = tools.indices(training_list, number_of_parallel_jobs, job+1)
if job_indices[-1] >= job_indices[0]:
filenames.append(fs.kmeans_stats_file(iteration, job_indices[0], job_indices[-1]))
statistics = _accumulate(filenames)
# Creates the KMeansMachine
kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(old_machine_file))
trainer = algorithm.kmeans_trainer
trainer.reset_accumulators(kmeans_machine)
trainer.zeroeth_order_statistics = statistics[0]
trainer.first_order_statistics = statistics[1]
trainer.average_min_distance = statistics[3]
error = statistics[3] / statistics[2]
# Performs the M-step
trainer.m_step(kmeans_machine, None) # data is not used in M-step
logger.info("UBM training: Performed M step %d with result %f" % (iteration, error))
# Save the K-Means model
bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
kmeans_machine.save(bob.io.base.HDF5File(new_machine_file, 'w'))
# after the last iteration, copy the resulting KMeans file to its final location
if iteration == algorithm.kmeans_training_iterations-1:
shutil.copy(new_machine_file, fs.kmeans_file)
logger.info("UBM training: Wrote new KMeans machine '%s'", fs.kmeans_file)
if clean and iteration > 0:
old_dir = os.path.dirname(fs.kmeans_intermediate_file(iteration-1))
logger.info("Removing old intermediate directory '%s'", old_dir)
shutil.rmtree(old_dir)
def gmm_initialize(algorithm, extractor, limit_data = None, force = False):
"""Initializes the GMM calculation with the result of the K-Means algorithm (non-parallel).
This might require a lot of memory."""
fs = FileSelector.instance()
output_file = fs.gmm_intermediate_file(0)
if utils.check_file(output_file, force, 800):
logger.info("UBM Training: Skipping GMM initialization since '%s' already exists", output_file)
else:
logger.info("UBM Training: Initializing GMM")
# read features
training_list = utils.selected_elements(fs.training_list('extracted', 'train_projector'), limit_data)
data = numpy.vstack([extractor.read_feature(feature_file) for feature_file in training_list])
# get means and variances of kmeans result
kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(fs.kmeans_file))
[variances, weights] = kmeans_machine.get_variances_and_weights_for_each_cluster(data)
# Create initial GMM Machine
gmm_machine = bob.learn.em.GMMMachine(algorithm.gaussians, data.shape[1])
# Initializes the GMM
gmm_machine.means = kmeans_machine.means
gmm_machine.variances = variances
gmm_machine.weights = weights
gmm_machine.set_variance_thresholds(algorithm.variance_threshold)
# write gmm machine to file
bob.io.base.create_directories_safe(os.path.dirname(output_file))
gmm_machine.save(bob.io.base.HDF5File(output_file, 'w'))
logger.info("UBM Training: Wrote GMM file '%s'", output_file)
def gmm_estep(algorithm, extractor, iteration, indices, force=False):
"""Performs a single E-step of the GMM training (parallel)."""
if indices[0] > indices[1]:
return
fs = FileSelector.instance()
stats_file = fs.gmm_stats_file(iteration, indices[0], indices[1])
new_machine_file = fs.gmm_intermediate_file(iteration + 1)
if utils.check_file(stats_file, force, 1000) or utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping GMM E-Step since the file '%s' or '%s' already exists", stats_file, new_machine_file)
else:
training_list = fs.training_list('extracted', 'train_projector')
last_machine_file = fs.gmm_intermediate_file(iteration)
gmm_machine = bob.learn.em.GMMMachine(bob.io.base.HDF5File(last_machine_file))
logger.info("UBM training: GMM E-Step from range(%d, %d)", indices)
# read data
data = numpy.vstack([extractor.read_feature(training_list[index]) for index in range(indices[0], indices[1])])
trainer = algorithm.ubm_trainer
trainer.initialize(gmm_machine, None)
# Calls the E-step and extracts the GMM statistics
trainer.e_step(gmm_machine, data)
gmm_stats = trainer.gmm_statistics
# Saves the GMM statistics to the file
bob.io.base.create_directories_safe(os.path.dirname(stats_file))
gmm_stats.save(bob.io.base.HDF5File(stats_file, 'w'))
logger.info("UBM training: Wrote GMM stats '%s'", stats_file)
def gmm_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
"""Performs a single M-step of the GMM training (non-parallel)"""
fs = FileSelector.instance()
old_machine_file = fs.gmm_intermediate_file(iteration)
new_machine_file = fs.gmm_intermediate_file(iteration + 1)
if utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping GMM M-Step since the file '%s' already exists", new_machine_file)
else:
# get the files from e-step
training_list = fs.training_list('extracted', 'train_projector')
# check whether there is a single file containing all statistics
if os.path.exists(fs.gmm_stats_file(iteration, 0, len(training_list))):
stats_file = fs.gmm_stats_file(iteration, 0, len(training_list))
# load stats file
gmm_stats = bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_file))
else:
# load several files
stats_files = []
for job in range(number_of_parallel_jobs):
job_indices = tools.indices(training_list, number_of_parallel_jobs, job+1)
if job_indices[-1] >= job_indices[0]:
stats_files.append(fs.gmm_stats_file(iteration, job_indices[0], job_indices[-1]))
# read all stats files
gmm_stats = bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_files[0]))
for stats_file in stats_files[1:]:
gmm_stats += bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_file))
# load the old gmm machine
gmm_machine = bob.learn.em.GMMMachine(bob.io.base.HDF5File(old_machine_file))
# initialize the trainer
trainer = algorithm.ubm_trainer
trainer.initialize(gmm_machine)
trainer.gmm_statistics = gmm_stats
# Calls M-step (no data required)
trainer.m_step(gmm_machine)
# Saves the new GMM machine to the file
bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
gmm_machine.save(bob.io.base.HDF5File(new_machine_file, 'w'))
if iteration == algorithm.gmm_training_iterations-1:
shutil.copy(new_machine_file, fs.gmm_file)
logger.info("UBM training: Wrote new GMM machine '%s'", fs.gmm_file)
if clean and iteration > 0:
old_dir = os.path.dirname(fs.gmm_intermediate_file(iteration-1))
logger.info("Removing old intermediate directory '%s'", old_dir)
shutil.rmtree(old_dir)
This diff is collapsed.