Commit 2f93ed06 authored by Manuel Günther

Updated GMM algorithm and implemented parallel UBM training

parent 94aaedea
Showing with 4798 additions and 76 deletions
@@ -15,21 +15,21 @@ import logging
logger = logging.getLogger("bob.bio.gmm")
class GMM (Algorithm):
"""Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features"""
"""Algorithm for computing Universal Background Models and Gaussian Mixture Models of the features.
Features must be normalized to zero mean and unit standard deviation."""
def __init__(
self,
# parameters for the GMM
number_of_gaussians,
# parameters of UBM training
-k_means_training_iterations = 500, # Maximum number of iterations for K-Means
-gmm_training_iterations = 500, # Maximum number of iterations for ML GMM Training
+kmeans_training_iterations = 25, # Maximum number of iterations for K-Means
+gmm_training_iterations = 25, # Maximum number of iterations for ML GMM Training
training_threshold = 5e-4, # Threshold to end the ML training
variance_threshold = 5e-4, # Minimum value that a variance can reach
update_weights = True,
update_means = True,
update_variances = True,
-normalize_before_k_means = True, # Normalize the input features before running K-Means
# parameters of the GMM enrollment
relevance_factor = 4, # Relevance factor as described in Reynolds paper
gmm_enroll_iterations = 1, # Number of iterations for the enrollment phase
@@ -47,14 +47,13 @@ class GMM (Algorithm):
use_projected_features_for_enrollment = False,
number_of_gaussians = number_of_gaussians,
-k_means_training_iterations = k_means_training_iterations,
+kmeans_training_iterations = kmeans_training_iterations,
gmm_training_iterations = gmm_training_iterations,
training_threshold = training_threshold,
variance_threshold = variance_threshold,
update_weights = update_weights,
update_means = update_means,
update_variances = update_variances,
-normalize_before_k_means = normalize_before_k_means,
relevance_factor = relevance_factor,
gmm_enroll_iterations = gmm_enroll_iterations,
responsibility_threshold = responsibility_threshold,
@@ -67,14 +66,13 @@ class GMM (Algorithm):
# copy parameters
self.gaussians = number_of_gaussians
-self.k_means_training_iterations = k_means_training_iterations
+self.kmeans_training_iterations = kmeans_training_iterations
self.gmm_training_iterations = gmm_training_iterations
self.training_threshold = training_threshold
self.variance_threshold = variance_threshold
self.update_weights = update_weights
self.update_means = update_means
self.update_variances = update_variances
-self.normalize_before_k_means = normalize_before_k_means
self.relevance_factor = relevance_factor
self.gmm_enroll_iterations = gmm_enroll_iterations
self.init_seed = INIT_SEED
@@ -83,6 +81,8 @@
self.scoring_function = scoring_function
self.ubm = None
+self.kmeans_trainer = bob.learn.em.KMeansTrainer()
+self.ubm_trainer = bob.learn.em.ML_GMMTrainer(self.update_means, self.update_variances, self.update_weights, self.responsibility_threshold)
def _check_feature(self, feature):
@@ -94,42 +94,6 @@
#######################################################
################ UBM training #########################
-def _normalize_std_array(self, array):
-"""Applies a unit variance normalization to an array"""
-# Initializes variables
-n_samples = array.shape[0]
-length = array.shape[1]
-mean = numpy.zeros((length,))
-std = numpy.zeros((length,))
-# Computes mean and variance
-for k in range(n_samples):
-x = array[k,:].astype('float64')
-mean += x
-std += (x ** 2)
-mean /= n_samples
-std /= n_samples
-std -= (mean ** 2)
-std = std ** 0.5 # sqrt(std)
-ar_std_list = []
-for k in range(n_samples):
-ar_std_list.append(array[k,:].astype('float64') / std)
-ar_std = numpy.vstack(ar_std_list)
-return (ar_std,std)
-def _multiply_vectors_by_factors(self, matrix, vector):
-"""Used to unnormalize some data"""
-for i in range(0, matrix.shape[0]):
-for j in range(0, matrix.shape[1]):
-matrix[i, j] *= vector[j]
#######################################################
################ UBM training #########################
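For reference, the removed _normalize_std_array helper is equivalent to the following vectorized numpy sketch (not part of this commit; numpy's default std computes the same population estimate as the explicit loops above):

import numpy

def normalize_std_array(array):
    """Unit-variance normalization; a vectorized equivalent of the removed helper."""
    array = array.astype('float64')
    std = array.std(axis=0)  # population std, i.e. sqrt(E[x^2] - mean^2)
    return array / std, std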
@@ -141,35 +105,18 @@
# Computes input size
input_size = array.shape[1]
-# Normalizes the array if required
-logger.debug(" .... Normalizing the array")
-if not self.normalize_before_k_means:
-normalized_array = array
-else:
-normalized_array, std_array = self._normalize_std_array(array)
# Creates the machines (KMeans and GMM)
logger.debug(" .... Creating machines")
kmeans = bob.learn.em.KMeansMachine(self.gaussians, input_size)
self.ubm = bob.learn.em.GMMMachine(self.gaussians, input_size)
-# Creates the KMeansTrainer
-kmeans_trainer = bob.learn.em.KMeansTrainer()
# Trains using the KMeansTrainer
logger.info(" -> Training K-Means")
-bob.learn.em.train(kmeans_trainer, kmeans, normalized_array, self.gmm_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
+bob.learn.em.train(self.kmeans_trainer, kmeans, array, self.kmeans_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
-variances, weights = kmeans.get_variances_and_weights_for_each_cluster(normalized_array)
+variances, weights = kmeans.get_variances_and_weights_for_each_cluster(array)
means = kmeans.means
-# Undoes the normalization
-if self.normalize_before_k_means:
-logger.debug(" .... Undoing normalization")
-self._multiply_vectors_by_factors(means, std_array)
-self._multiply_vectors_by_factors(variances, std_array ** 2)
# Initializes the GMM
self.ubm.means = means
self.ubm.variances = variances
@@ -178,8 +125,7 @@
# Trains the GMM
logger.info(" -> Training GMM")
-trainer = bob.learn.em.ML_GMMTrainer(self.update_means, self.update_variances, self.update_weights)
-bob.learn.em.train(trainer, self.ubm, array, self.gmm_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
+bob.learn.em.train(self.ubm_trainer, self.ubm, array, self.gmm_training_iterations, self.training_threshold, bob.core.random.mt19937(self.init_seed))
def _save_projector(self, projector_file):
@@ -219,7 +165,7 @@
self.load_ubm(projector_file)
# prepare MAP_GMM_Trainer
kwargs = dict(mean_var_update_responsibilities_threshold=self.responsibility_threshold) if self.responsibility_threshold > 0. else dict()
-self.trainer = bob.learn.em.MAP_GMMTrainer(self.ubm, relevance_factor = self.relevance_factor, update_means = True, update_variances = False, **kwargs)
+self.enroll_trainer = bob.learn.em.MAP_GMMTrainer(self.ubm, relevance_factor = self.relevance_factor, update_means = True, update_variances = False, **kwargs)
self.rng = bob.core.random.mt19937(self.init_seed)
@@ -252,7 +198,7 @@
gmm = bob.learn.em.GMMMachine(self.ubm)
gmm.set_variance_thresholds(self.variance_threshold)
-bob.learn.em.train(self.trainer, gmm, array, self.gmm_enroll_iterations, self.training_threshold, self.rng)
+bob.learn.em.train(self.enroll_trainer, gmm, array, self.gmm_enroll_iterations, self.training_threshold, self.rng)
return gmm
def enroll(self, feature_arrays):
......
@@ -5,6 +5,4 @@ import numpy
algorithm = bob.bio.gmm.algorithm.GMM(
number_of_gaussians = 512,
-# by default, features are expected to be normalized and, hence, we don't need to re-normalize them
-normalize_before_k_means = False
)
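Since normalize_before_k_means is gone, features must already arrive normalized, as the updated GMM docstring states. A minimal sketch of such a preprocessing step (illustrative only; the function name and the zero-division guard are assumptions, not part of this package):

import numpy

def normalize_features(features):
    """Scale a 2D feature array to zero mean and unit standard deviation per dimension."""
    features = numpy.asarray(features, dtype='float64')
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    std[std == 0.] = 1.  # avoid division by zero for constant dimensions
    return (features - mean) / std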
@@ -7,7 +7,5 @@ algorithm = bob.bio.gmm.algorithm.ISV(
# ISV parameters
subspace_dimension_of_u = 160,
# GMM parameters
-number_of_gaussians = 512,
-# by default, our features are normalized, so it does not need to be done here
-normalize_before_k_means = False
+number_of_gaussians = 512
)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
import bob.bio.base
from .. import tools, algorithm
from bob.bio.base import tools as base_tools
def parse_arguments(command_line_parameters, exclude_resources_from = []):
"""This function parses the given options (which by default are the command line options). If exclude_resources_from is specified (as a list), the resources from the given packages are not listed in the help message."""
# set up command line parser
parsers = base_tools.command_line_parser(exclude_resources_from = exclude_resources_from)
# add GMM-related options
tools.add_parallel_gmm_options(parsers)
# override some parameters
parsers['config'].add_argument('-g', '--grid', metavar = 'x', nargs = '+', required=True,
help = 'Configuration for the grid setup; required for the parallel execution script.')
parsers['config'].add_argument('-a', '--algorithm', metavar = 'x', nargs = '+', default = ['gmm'],
help = 'The recognition algorithm; only GMM-related algorithms are allowed')
# Add sub-tasks that can be executed by this script
parser = parsers['main']
parser.add_argument('--sub-task',
choices = ('preprocess', 'train-extractor', 'extract', 'normalize-features', 'kmeans-init', 'kmeans-e-step', 'kmeans-m-step', 'gmm-init', 'gmm-e-step', 'gmm-m-step', 'project', 'enroll', 'compute-scores', 'concatenate'),
help = argparse.SUPPRESS) #'Executes a subtask (FOR INTERNAL USE ONLY!!!)'
parser.add_argument('--iteration', type = int,
help = argparse.SUPPRESS) #'The iteration of the KMeans or GMM training to execute'
parser.add_argument('--model-type', choices = ['N', 'T'],
help = argparse.SUPPRESS) #'Which type of models to generate (Normal or TModels)'
parser.add_argument('--score-type', choices = ['A', 'B', 'C', 'D', 'Z'],
help = argparse.SUPPRESS) #'The type of scores that should be computed'
parser.add_argument('--group',
help = argparse.SUPPRESS) #'The group for which the current action should be performed'
# now that we have set up everything, get the command line arguments
args = base_tools.initialize(parsers, command_line_parameters,
skips = ['preprocessing', 'extractor-training', 'extraction', 'normalization', 'kmeans', 'gmm', 'projection', 'enroller-training', 'enrollment', 'score-computation', 'concatenation', 'calibration']
)
args.skip_projector_training = True
# and add the GMM-related parameters
tools.initialize_parallel_gmm(args)
# assert that the algorithm is a GMM
if args.algorithm.__class__ not in (algorithm.GMM, algorithm.GMMRegular):
raise ValueError("The given algorithm %s is not a (pure) GMM algorithm" % type(args.algorithm))
return args
def add_gmm_jobs(args, job_ids, deps, submitter):
"""Adds all GMM-related jobs."""
# KMeans
if not args.skip_kmeans:
# initialization
if not args.kmeans_start_iteration:
job_ids['kmeans-init'] = submitter.submit(
'--sub-task kmeans-init',
name = 'k-init',
dependencies = deps,
**args.grid.training_queue)
deps.append(job_ids['kmeans-init'])
# several iterations of E and M steps
for iteration in range(args.kmeans_start_iteration, args.algorithm.kmeans_training_iterations):
# E-step
job_ids['kmeans-e-step'] = submitter.submit(
'--sub-task kmeans-e-step --iteration %d' % iteration,
name='k-e-%d' % iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
dependencies = [job_ids['kmeans-m-step']] if iteration != args.kmeans_start_iteration else deps,
**args.grid.projection_queue)
# M-step
job_ids['kmeans-m-step'] = submitter.submit(
'--sub-task kmeans-m-step --iteration %d' % iteration,
name='k-m-%d' % iteration,
dependencies = [job_ids['kmeans-e-step']],
**args.grid.training_queue)
# add dependence to the last m step
deps.append(job_ids['kmeans-m-step'])
# GMM
if not args.skip_gmm:
# initialization
if not args.gmm_start_iteration:
job_ids['gmm-init'] = submitter.submit(
'--sub-task gmm-init',
name = 'g-init',
dependencies = deps,
**args.grid.training_queue)
deps.append(job_ids['gmm-init'])
# several iterations of E and M steps
for iteration in range(args.gmm_start_iteration, args.algorithm.gmm_training_iterations):
# E-step
job_ids['gmm-e-step'] = submitter.submit(
'--sub-task gmm-e-step --iteration %d' % iteration,
name='g-e-%d' % iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
dependencies = [job_ids['gmm-m-step']] if iteration != args.gmm_start_iteration else deps,
**args.grid.projection_queue)
# M-step
job_ids['gmm-m-step'] = submitter.submit(
'--sub-task gmm-m-step --iteration %d' % iteration,
name='g-m-%d' % iteration,
dependencies = [job_ids['gmm-e-step']],
**args.grid.training_queue)
# add dependence to the last m step
deps.append(job_ids['gmm-m-step'])
return job_ids, deps
def add_jobs(args, submitter):
"""Adds all (desired) jobs of the tool chain to the grid, or to the local list to be executed."""
assert args.grid is not None
# Here, we use the default bob.bio.base add_jobs function, but intercept it for adding the training
SKIPS = ['preprocessing', 'extractor_training', 'extraction', 'projector_training', 'projection', 'enroller_training', 'enrollment', 'score_computation', 'concatenation', 'calibration']
original_skips = {key : args.__dict__["skip_%s" % key] for key in SKIPS}
# first, submit preprocessing and feature extraction; skip all others
for key in SKIPS[3:]:
setattr(args, "skip_%s" % key, True)
job_ids = bob.bio.base.script.verify.add_jobs(args, submitter)
for key in SKIPS[3:]:
setattr(args, "skip_%s" % key, original_skips[key])
# reset skips
args.skip_preprocessing = original_skips['preprocessing']
args.skip_extractor_training = original_skips['extractor_training']
args.skip_extraction = original_skips['extraction']
# if there are any external dependencies, we need to respect them
deps = args.external_dependencies[:]
# also, we depend on all previous steps
for n in ['preprocessing', 'extractor-training', 'extraction']:
if n in job_ids:
deps.append(job_ids[n])
# now, add our jobs
job_ids, deps = add_gmm_jobs(args, job_ids, deps, submitter)
# alright, finish the remaining bits
for key in SKIPS[:4]:
setattr(args, "skip_%s" % key, True)
args.external_dependencies = deps
job_ids.update(bob.bio.base.script.verify.add_jobs(args, submitter))
# finally, restore the original skip settings
for key in SKIPS[:4]:
setattr(args, "skip_%s" % key, original_skips[key])
return job_ids
def execute(args):
"""Run the desired job of the tool chain that is specified on command line.
This job might be executed either in the grid, or locally."""
# first, let the base script decide if it knows how to execute the job
if bob.bio.base.script.verify.execute(args):
return True
# now, check what we can do
# the file selector object
fs = tools.FileSelector.instance()
# train the feature projector
if args.sub_task == 'kmeans-init':
tools.kmeans_initialize(
args.algorithm,
args.extractor,
args.limit_training_data,
force = args.force)
# train the feature projector
elif args.sub_task == 'kmeans-e-step':
tools.kmeans_estep(
args.algorithm,
args.extractor,
args.iteration,
indices = base_tools.indices(fs.training_list('extracted', 'train_projector'), args.grid.number_of_projection_jobs),
force = args.force)
# train the feature projector
elif args.sub_task == 'kmeans-m-step':
tools.kmeans_mstep(
args.algorithm,
args.iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
clean = args.clean_intermediate,
force = args.force)
elif args.sub_task == 'gmm-init':
tools.gmm_initialize(
args.algorithm,
args.extractor,
args.limit_training_data,
force = args.force)
# train the feature projector
elif args.sub_task == 'gmm-e-step':
tools.gmm_estep(
args.algorithm,
args.extractor,
args.iteration,
indices = base_tools.indices(fs.training_list('extracted', 'train_projector'), args.grid.number_of_projection_jobs),
force = args.force)
# train the feature projector
elif args.sub_task == 'gmm-m-step':
tools.gmm_mstep(
args.algorithm,
args.iteration,
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
clean = args.clean_intermediate,
force = args.force)
else:
# Not our keyword...
return False
return True
def verify(args, command_line_parameters, external_fake_job_id = 0):
"""This is the main entry point for computing verification experiments.
You just have to specify configurations for any of the steps of the toolchain, which are:
-- the database
-- the preprocessing
-- feature extraction
-- the recognition algorithm
-- and the grid configuration.
Additionally, you can skip parts of the toolchain by selecting proper --skip-... parameters.
If your probe files are not too big, you can also specify the --preload-probes switch to speed up the score computation.
If files should be re-generated, please specify the --force option (might be combined with the --skip-... options)."""
# as the main entry point, check whether the sub-task is specified
if args.sub_task is not None:
# execute the desired sub-task
if not execute(args):
raise ValueError("The specified --sub-task '%s' is not known to the system" % args.sub_task)
return {}
else:
# add jobs
submitter = base_tools.GridSubmission(args, command_line_parameters, executable = 'verify_gmm.py', first_fake_job_id = 0) if args.grid else None
retval = add_jobs(args, submitter)
base_tools.write_info(args, command_line_parameters)
if args.grid.is_local() and args.run_local_scheduler:
if args.dry_run:
print ("Would have started the local scheduler to run the experiments with parallel jobs")
else:
# start the jman local daemon
submitter.execute_local()
return {}
else:
# return job ids as a dictionary
return retval
def main(command_line_parameters = sys.argv):
"""Executes the main function"""
try:
# do the command line parsing
args = parse_arguments(command_line_parameters[1:])
# perform face verification test
verify(args, command_line_parameters)
except Exception as e:
# track any exceptions as error logs (i.e., to get a time stamp)
logger.error("During the execution, an exception was raised: %s" % e)
raise
if __name__ == "__main__":
main()
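For illustration, this entry point can also be invoked programmatically, as the parallel test further below does; a minimal sketch (all resource names here are placeholders, not actual configurations shipped with the package):

from bob.bio.gmm.script.verify_gmm import main

main([
    'verify_gmm.py',
    '-d', 'my-database',      # placeholder database resource
    '-p', 'my-preprocessor',  # placeholder preprocessor resource
    '-e', 'my-extractor',     # placeholder extractor resource
    '-a', 'gmm',              # the default GMM algorithm
    '-g', 'grid',             # placeholder grid configuration (required)
    '-s', 'gmm_experiment',   # sub-directory for this experiment
])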
This diff is collapsed.
This diff is collapsed.
from . import extractor
import numpy
import bob.io.base
from bob.bio.base.extractor import Extractor
_data = [0., 1., 2., 3., 4.]
class DummyExtractor (Extractor):
def __init__(self):
Extractor.__init__(self, requires_training=True)
self.model = False
def train(self, train_data, extractor_file):
assert isinstance(train_data, list)
bob.io.base.save(_data, extractor_file)
def load(self, extractor_file):
data = bob.io.base.load(extractor_file)
assert (_data == data).all()
self.model = True
def __call__(self, data):
"""Does nothing, simply converts the data type of the data, ignoring any annotation."""
assert self.model
return data.astype(numpy.float)
extractor = DummyExtractor()
@@ -81,7 +81,7 @@ def test_gmm():
# create smaller GMM object
gmm2 = bob.bio.gmm.algorithm.GMM(
number_of_gaussians = 2,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
INIT_SEED = seed_value,
)
@@ -138,7 +138,7 @@ def test_gmm_regular():
# create smaller GMM object
gmm2 = bob.bio.gmm.algorithm.GMMRegular(
number_of_gaussians = 2,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
INIT_SEED = seed_value,
)
@@ -193,7 +193,7 @@ def test_isv():
isv2 = bob.bio.gmm.algorithm.ISV(
number_of_gaussians = 2,
subspace_dimension_of_u = 10,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
isv_training_iterations = 1,
INIT_SEED = seed_value
@@ -260,7 +260,7 @@ def test_jfa():
number_of_gaussians = 2,
subspace_dimension_of_u = 2,
subspace_dimension_of_v = 2,
-k_means_training_iterations = 1,
+kmeans_training_iterations = 1,
gmm_training_iterations = 1,
jfa_training_iterations = 1,
INIT_SEED = seed_value
......
from __future__ import print_function
import bob.measure
import os
import sys
import shutil
import tempfile
import numpy
import bob.io.base.test_utils
import bob.io.image
import bob.bio.base
import bob.bio.gmm
from . import utils
from nose.plugins.skip import SkipTest
import pkg_resources
regenerate_reference = False
from bob.bio.base.script.verify import main
data_dir = pkg_resources.resource_filename('bob.bio.gmm', 'test/data')
def _verify(parameters, test_dir, sub_dir, ref_modifier="", score_modifier=('scores',''), executable = main):
try:
executable([sys.argv[0]] + parameters)
# assert that the score file exists
score_files = [os.path.join(test_dir, sub_dir, 'Default', norm, '%s-dev%s'%score_modifier) for norm in ('nonorm', 'ztnorm')]
assert os.path.exists(score_files[0]), "Score file %s does not exist" % score_files[0]
assert os.path.exists(score_files[1]), "Score file %s does not exist" % score_files[1]
# also assert that the scores are still the same -- though they have no real meaning
reference_files = [os.path.join(data_dir, 'scores-%s%s-dev'%(norm, ref_modifier)) for norm in ('nonorm', 'ztnorm')]
if regenerate_reference:
for i in (0,1):
shutil.copy(score_files[i], reference_files[i])
for i in (0,1):
d = []
# read reference and new data
for score_file in (score_files[i], reference_files[i]):
f = bob.measure.load.open_file(score_file)
d_ = []
for line in f:
if isinstance(line, bytes): line = line.decode('utf-8')
d_.append(line.rstrip().split())
d.append(numpy.array(d_))
assert d[0].shape == d[1].shape
# assert that the data order is still correct
assert (d[0][:,0:3] == d[1][:, 0:3]).all()
# assert that the values are OK
assert numpy.allclose(d[0][:,3].astype(float), d[1][:,3].astype(float), 1e-5)
finally:
shutil.rmtree(test_dir)
def test_gmm_base():
test_dir = tempfile.mkdtemp(prefix='frltest_')
# define dummy parameters
parameters = [
'-d', 'dummy',
'-p', 'dummy',
'-e', 'dummy',
'-a', 'bob.bio.gmm.algorithm.GMM(2, 2, 2)', '--import', 'bob.bio.gmm',
'--zt-norm',
'-s', 'test_gmm_sequential', '-vv',
'--temp-directory', test_dir,
'--result-directory', test_dir
]
print (bob.bio.base.tools.command_line(parameters))
_verify(parameters, test_dir, 'test_gmm_sequential', ref_modifier='-gmm')
def test_gmm_parallel():
from bob.bio.gmm.script.verify_gmm import main
test_dir = tempfile.mkdtemp(prefix='frltest_')
test_database = os.path.join(test_dir, "submitted.sql3")
# define dummy parameters
parameters = [
'-d', 'dummy',
'-p', 'dummy',
'-e', 'dummy',
'-a', 'bob.bio.gmm.algorithm.GMM(2, 2, 2)', '--import', 'bob.bio.gmm', 'bob.io.image',
'-g', 'bob.bio.base.grid.Grid(grid = "local", number_of_parallel_processes = 2, scheduler_sleep_time = 0.1)', '-G', test_database, '--run-local-scheduler', '-R',
'--clean-intermediate',
'--zt-norm',
'-s', 'test_gmm_parallel', '-vv',
'--temp-directory', test_dir,
'--result-directory', test_dir,
]
print (bob.bio.base.tools.command_line(parameters))
_verify(parameters, test_dir, 'test_gmm_parallel', executable=main, ref_modifier='-gmm')
from .command_line import *
from .gmm import *
import os
import sys
import types
import bob.core
logger = bob.core.log.setup("bob.bio.gmm")
from bob.bio.base.tools import FileSelector
def add_parallel_gmm_options(parsers, additional_functions = ['gmm']):
"""Add the options for parallel UBM training to the given parsers."""
flag_group = parsers['flag']
flag_group.add_argument('-l', '--limit-training-data', type=int,
help = 'Limit the number of training examples used for KMeans initialization and the GMM initialization')
flag_group.add_argument('-k', '--kmeans-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the KMeans training (i.e. to restart from there)')
flag_group.add_argument('-m', '--gmm-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the GMM training (i.e. to restart from there)')
flag_group.add_argument('-C', '--clean-intermediate', action='store_true',
help = 'Clean up temporary files of older iterations?')
sub_dir_group = parsers['sub-dir']
sub_dir_group.add_argument('--kmeans-directory', default = 'kmeans_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate kmeans files should be stored')
sub_dir_group.add_argument('--gmm-directory', default = 'gmm_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate gmm files should be stored')
# Functions to be added to the FileSelector class, once it is instantiated
def _kmeans_intermediate_file(self, round):
return os.path.join(self.kmeans_temp_directory, 'round_%05d' % round, 'kmeans.hdf5')
def _kmeans_stats_file(self, round, start_index, end_index):
return os.path.join(self.kmeans_temp_directory, 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def _gmm_intermediate_file(self, round):
return os.path.join(self.gmm_temp_directory, 'round_%05d' % round, 'gmm.hdf5')
def _gmm_stats_file(self, round, start_index, end_index):
return os.path.join(self.gmm_temp_directory, 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def initialize_parallel_gmm(args):
# get the relevant sub-directory, which depends on the database and the protocol
protocol = 'None' if args.database.protocol is None else args.database.protocol
extractor_sub_dir = protocol if args.database.training_depends_on_protocol and args.extractor.requires_training else '.'
sub_dir = protocol if args.database.training_depends_on_protocol else '.'
fs = FileSelector.instance()
# add relevant directories to file selector object
fs.kmeans_temp_directory = os.path.join(args.temp_directory, sub_dir, args.kmeans_directory)
fs.kmeans_file = os.path.join(args.temp_directory, sub_dir, "kmeans.hdf5")
fs.gmm_temp_directory = os.path.join(args.temp_directory, sub_dir, args.gmm_directory)
# fs.gmm_file = os.path.join(args.temp_directory, sub_dir, "gmm.hdf5")
fs.gmm_file = fs.projector_file
# add relevant **functions** to file selector object
fs.kmeans_intermediate_file = types.MethodType(_kmeans_intermediate_file, fs)
fs.kmeans_stats_file = types.MethodType(_kmeans_stats_file, fs)
fs.gmm_intermediate_file = types.MethodType(_gmm_intermediate_file, fs)
fs.gmm_stats_file = types.MethodType(_gmm_stats_file, fs)
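The types.MethodType calls above bind module-level functions as methods of the FileSelector singleton at runtime. A self-contained sketch of this binding pattern (names are illustrative):

import types

class Selector(object):
    pass

def intermediate_file(self, round):
    # once bound, 'self' refers to the instance the function was attached to
    return 'round_%05d/machine.hdf5' % round

selector = Selector()
selector.intermediate_file = types.MethodType(intermediate_file, selector)
assert selector.intermediate_file(3) == 'round_00003/machine.hdf5'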
import bob.io.base
import bob.learn.em
import shutil
import numpy
import os
import logging
logger = logging.getLogger("bob.bio.gmm")
from bob.bio.base.tools.FileSelector import FileSelector
from bob.bio.base import utils, tools
def kmeans_initialize(algorithm, extractor, limit_data = None, force = False):
"""Initializes the K-Means training (non-parallel)."""
fs = FileSelector.instance()
output_file = fs.kmeans_intermediate_file(0)
if utils.check_file(output_file, force, 1000):
logger.info("UBM training: Skipping KMeans initialization since the file '%s' already exists", output_file)
else:
# read data
logger.info("UBM training: initializing kmeans")
training_list = utils.selected_elements(fs.training_list('extracted', 'train_projector'), limit_data)
data = numpy.vstack([extractor.read_feature(feature_file) for feature_file in training_list])
# Perform KMeans initialization
kmeans_machine = bob.learn.em.KMeansMachine(algorithm.gaussians, data.shape[1])
# Uses the algorithm's KMeansTrainer to run the initialization procedure
algorithm.kmeans_trainer.initialize(kmeans_machine, data)
bob.io.base.create_directories_safe(os.path.dirname(output_file))
kmeans_machine.save(bob.io.base.HDF5File(output_file, 'w'))
logger.info("UBM training: saved initial KMeans machine to '%s'", output_file)
def kmeans_estep(algorithm, extractor, iteration, indices, force=False):
"""Performs a single E-step of the K-Means algorithm (parallel)"""
if indices[0] > indices[1]:
return
fs = FileSelector.instance()
# check if we need to compute this step
stats_file = fs.kmeans_stats_file(iteration, indices[0], indices[1])
new_machine_file = fs.kmeans_intermediate_file(iteration + 1)
if utils.check_file(stats_file, force, 1000) or utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping KMeans E-Step since the file '%s' or '%s' already exists", stats_file, new_machine_file)
else:
training_list = fs.training_list('extracted', 'train_projector')
last_machine_file = fs.kmeans_intermediate_file(iteration)
kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(last_machine_file))
logger.info("UBM training: KMeans E-Step round %d from range(%d, %d)", iteration, *indices)
# read data
data = numpy.vstack([extractor.read_feature(training_list[index]) for index in range(indices[0], indices[1])])
# Performs the E-step
trainer = algorithm.kmeans_trainer
trainer.e_step(kmeans_machine, data)
# write results to file
dist = numpy.array(trainer.average_min_distance)
nsamples = numpy.array([indices[1] - indices[0]], dtype=numpy.float64)
# write statistics
bob.io.base.create_directories_safe(os.path.dirname(stats_file))
hdf5 = bob.io.base.HDF5File(stats_file, 'w')
hdf5.set('zeros', trainer.zeroeth_order_statistics)
hdf5.set('first', trainer.first_order_statistics)
hdf5.set('dist', dist * nsamples)
hdf5.set('nsamples', nsamples)
logger.info("UBM training: Wrote Stats file '%s'", stats_file)
def _read_stats(filename):
"""Reads accumulated K-Means statistics from file"""
logger.debug("UBM training: Reading stats file '%s'", filename)
hdf5 = bob.io.base.HDF5File(filename)
zeroeth = hdf5.read('zeros')
first = hdf5.read('first')
nsamples = hdf5.read('nsamples')
dist = hdf5.read('dist')
return (zeroeth, first, nsamples, dist)
def _accumulate(filenames):
zeroeth, first, nsamples, dist = _read_stats(filenames[0])
for stat in filenames[1:]:
zeroeth_, first_, nsamples_, dist_ = _read_stats(stat)
zeroeth += zeroeth_
first += first_
nsamples += nsamples_
dist += dist_
return (zeroeth, first, nsamples, dist)
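This accumulation works because the zeroth- and first-order statistics are plain sums over samples, so the results of parallel jobs can simply be added. A toy numpy check of the underlying map-reduce identity (an illustration, not the bob.learn.em accumulators themselves):

import numpy

data = numpy.random.RandomState(0).randn(100, 3)
chunks = numpy.array_split(data, 4)

# a sum computed over the full data ...
full_sum = data.sum(axis=0)
# ... equals the sum of the per-chunk sums, which is what _accumulate exploits
chunk_sum = sum(chunk.sum(axis=0) for chunk in chunks)
assert numpy.allclose(full_sum, chunk_sum)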
def kmeans_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
"""Performs a single M-step of the K-Means algorithm (non-parallel)"""
fs = FileSelector.instance()
old_machine_file = fs.kmeans_intermediate_file(iteration)
new_machine_file = fs.kmeans_intermediate_file(iteration+1)
if utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping KMeans M-Step since the file '%s' already exists", new_machine_file)
else:
# get the files from e-step
training_list = fs.training_list('extracted', 'train_projector')
# check whether there is a single file containing all statistics
if os.path.exists(fs.kmeans_stats_file(iteration, 0, len(training_list))):
stats_file = fs.kmeans_stats_file(iteration, 0, len(training_list))
# load stats file
statistics = _read_stats(stats_file)
else:
# load several files
filenames = []
for job in range(number_of_parallel_jobs):
job_indices = tools.indices(training_list, number_of_parallel_jobs, job+1)
if job_indices[-1] >= job_indices[0]:
filenames.append(fs.kmeans_stats_file(iteration, job_indices[0], job_indices[-1]))
statistics = _accumulate(filenames)
# Creates the KMeansMachine
kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(old_machine_file))
trainer = algorithm.kmeans_trainer
trainer.reset_accumulators(kmeans_machine)
trainer.zeroeth_order_statistics = statistics[0]
trainer.first_order_statistics = statistics[1]
trainer.average_min_distance = statistics[3]
error = statistics[3] / statistics[2]
# Performs the M-step
trainer.m_step(kmeans_machine, None) # data is not used in M-step
logger.info("UBM training: Performed M step %d with result %f" % (iteration, error))
# Save the K-Means model
bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
kmeans_machine.save(bob.io.base.HDF5File(new_machine_file, 'w'))
# after the last iteration, copy the resulting KMeans file to its final location
if iteration == algorithm.kmeans_training_iterations-1:
shutil.copy(new_machine_file, fs.kmeans_file)
logger.info("UBM training: Wrote new KMeans machine '%s'", fs.kmeans_file)
if clean and iteration > 0:
old_dir = os.path.dirname(fs.kmeans_intermediate_file(iteration-1))
logger.info("Removing old intermediate directory '%s'", old_dir)
shutil.rmtree(old_dir)
def gmm_initialize(algorithm, extractor, limit_data = None, force = False):
"""Initializes the GMM calculation with the result of the K-Means algorithm (non-parallel).
This might require a lot of memory."""
fs = FileSelector.instance()
output_file = fs.gmm_intermediate_file(0)
if utils.check_file(output_file, force, 800):
logger.info("UBM Training: Skipping GMM initialization since '%s' already exists", output_file)
else:
logger.info("UBM Training: Initializing GMM")
# read features
training_list = utils.selected_elements(fs.training_list('extracted', 'train_projector'), limit_data)
data = numpy.vstack([extractor.read_feature(feature_file) for feature_file in training_list])
# get means and variances of kmeans result
kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(fs.kmeans_file))
[variances, weights] = kmeans_machine.get_variances_and_weights_for_each_cluster(data)
# Create initial GMM Machine
gmm_machine = bob.learn.em.GMMMachine(algorithm.gaussians, data.shape[1])
# Initializes the GMM
gmm_machine.means = kmeans_machine.means
gmm_machine.variances = variances
gmm_machine.weights = weights
gmm_machine.set_variance_thresholds(algorithm.variance_threshold)
# write gmm machine to file
bob.io.base.create_directories_safe(os.path.dirname(output_file))
gmm_machine.save(bob.io.base.HDF5File(output_file, 'w'))
logger.info("UBM Training: Wrote GMM file '%s'", output_file)
def gmm_estep(algorithm, extractor, iteration, indices, force=False):
"""Performs a single E-step of the GMM training (parallel)."""
if indices[0] > indices[1]:
return
fs = FileSelector.instance()
stats_file = fs.gmm_stats_file(iteration, indices[0], indices[1])
new_machine_file = fs.gmm_intermediate_file(iteration + 1)
if utils.check_file(stats_file, force, 1000) or utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping GMM E-Step since the file '%s' or '%s' already exists", stats_file, new_machine_file)
else:
training_list = fs.training_list('extracted', 'train_projector')
last_machine_file = fs.gmm_intermediate_file(iteration)
gmm_machine = bob.learn.em.GMMMachine(bob.io.base.HDF5File(last_machine_file))
logger.info("UBM training: GMM E-Step from range(%d, %d)", indices)
# read data
data = numpy.vstack([extractor.read_feature(training_list[index]) for index in range(indices[0], indices[1])])
trainer = algorithm.ubm_trainer
trainer.initialize(gmm_machine, None)
# Calls the E-step and extracts the GMM statistics
trainer.e_step(gmm_machine, data)
gmm_stats = trainer.gmm_statistics
# Saves the GMM statistics to the file
bob.io.base.create_directories_safe(os.path.dirname(stats_file))
gmm_stats.save(bob.io.base.HDF5File(stats_file, 'w'))
logger.info("UBM training: Wrote GMM stats '%s'", stats_file)
def gmm_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
"""Performs a single M-step of the GMM training (non-parallel)"""
fs = FileSelector.instance()
old_machine_file = fs.gmm_intermediate_file(iteration)
new_machine_file = fs.gmm_intermediate_file(iteration + 1)
if utils.check_file(new_machine_file, force, 1000):
logger.info("UBM training: Skipping GMM M-Step since the file '%s' already exists", new_machine_file)
else:
# get the files from e-step
training_list = fs.training_list('extracted', 'train_projector')
# check whether there is a single file containing all statistics
if os.path.exists(fs.gmm_stats_file(iteration, 0, len(training_list))):
stats_file = fs.gmm_stats_file(iteration, 0, len(training_list))
# load stats file
gmm_stats = bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_file))
else:
# load several files
stats_files = []
for job in range(number_of_parallel_jobs):
job_indices = tools.indices(training_list, number_of_parallel_jobs, job+1)
if job_indices[-1] >= job_indices[0]:
stats_files.append(fs.gmm_stats_file(iteration, job_indices[0], job_indices[-1]))
# read all stats files
gmm_stats = bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_files[0]))
for stats_file in stats_files[1:]:
gmm_stats += bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_file))
# load the old gmm machine
gmm_machine = bob.learn.em.GMMMachine(bob.io.base.HDF5File(old_machine_file))
# initialize the trainer
trainer = algorithm.ubm_trainer
trainer.initialize(gmm_machine)
trainer.gmm_statistics = gmm_stats
# Calls M-step (no data required)
trainer.m_step(gmm_machine)
# Saves the new GMM machine to the file
bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
gmm_machine.save(bob.io.base.HDF5File(new_machine_file, 'w'))
if iteration == algorithm.gmm_training_iterations-1:
shutil.copy(new_machine_file, fs.gmm_file)
logger.info("UBM training: Wrote new GMM machine '%s'", fs.gmm_file)
if clean and iteration > 0:
old_dir = os.path.dirname(fs.gmm_intermediate_file(iteration-1))
logger.info("Removing old intermediate directory '%s'", old_dir)
shutil.rmtree(old_dir)
This diff is collapsed.