Commit 336cbe02 authored by Manuel Günther's avatar Manuel Günther
Browse files

Added IVector tool and parallel algorithm.

parent 2f94648c
......@@ -65,19 +65,20 @@ class IVector (GMM):
def train_ivector(self, training_stats):
logger.info(" -> Training IVector enroller")
self.tv = bob.learn.em.IVectorMachine(self.ubm, self.subspace_dimension_of_t)
self.tv.variance_threshold = self.variance_threshold
self.tv = bob.learn.em.IVectorMachine(self.ubm, self.subspace_dimension_of_t, self.variance_threshold)
# train IVector model
bob.learn.em.train(self.ivector_trainer, self.tv, training_stats, self.tv_training_iterations, rng=self.rng)
def train_whitening(self, training_features):
def train_whitener(self, training_features):
ivectors_matrix = numpy.vstack(training_features)
# create a Linear Machine
self.whitener = bob.learn.linear.Machine(ivectors_matrix.shape[1],ivectors_matrix.shape[1])
# create the whitening trainer
self.whitening_trainer.train(ivectors_matrix, self.whitener)
def train_projector(self, train_features, projector_file):
"""Train Projector and Enroller at the same time"""
[self._check_feature(feature) for feature in train_features]
......@@ -94,12 +95,13 @@ class IVector (GMM):
self.train_ivector(training_stats)
# project training i-vectors
whitening_train_data = [self.project_ivec(stats) for stats in training_stats]
self.train_whitening(whitening_train_data)
whitening_train_data = [self.project_ivector(stats) for stats in training_stats]
self.train_whitener(whitening_train_data)
# save
self.save_projector(projector_file)
def save_projector(self, projector_file):
# Save the IVector base AND the UBM AND the whitening into the same file
hdf5file = bob.io.base.HDF5File(projector_file, "w")
......@@ -124,7 +126,7 @@ class IVector (GMM):
# add UBM model from base class
self.tv.ubm = self.ubm
def load_whitening(self, whitening_file):
def load_whitener(self, whitening_file):
hdf5file = bob.io.base.HDF5File(whitening_file)
self.whitener = bob.learn.linear.Machine(hdf5file)
......@@ -143,10 +145,10 @@ class IVector (GMM):
# Load Whitening
hdf5file.cd('/Whitener')
self.load_whitening(hdf5file)
self.load_whitener(hdf5file)
def project_ivec(self, gmm_stats):
def project_ivector(self, gmm_stats):
return self.tv.project(gmm_stats)
def project_whitening(self, ivector):
......@@ -161,7 +163,7 @@ class IVector (GMM):
# project UBM
projected_ubm = self.project_ubm(feature_array)
# project I-Vector
ivector = self.project_ivec(projected_ubm)
ivector = self.project_ivector(projected_ubm)
# whiten I-Vector
return self.project_whitening(ivector)
......
......@@ -26,14 +26,14 @@ def parse_arguments(command_line_parameters, exclude_resources_from = []):
parsers['config'].add_argument('-g', '--grid', metavar = 'x', nargs = '+', required=True,
help = 'Configuration for the grid setup; required for the parallel execution script.')
parsers['config'].add_argument('-a', '--algorithm', metavar = 'x', nargs = '+', default = ['gmm'],
parsers['config'].add_argument('-a', '--algorithm', metavar = 'x', nargs = '+', default = ['isv'],
help = 'Face recognition; only GMM-related algorithms are allowed')
# Add sub-tasks that can be executed by this script
parser = parsers['main']
parser.add_argument('--sub-task',
choices = ('preprocess', 'train-extractor', 'extract', 'normalize-features', 'kmeans-init', 'kmeans-e-step', 'kmeans-m-step', 'gmm-init', 'gmm-e-step', 'gmm-m-step', 'gmm-project', 'isv-train', 'project', 'enroll', 'compute-scores', 'concatenate'),
choices = ('preprocess', 'train-extractor', 'extract', 'normalize-features', 'kmeans-init', 'kmeans-e-step', 'kmeans-m-step', 'gmm-init', 'gmm-e-step', 'gmm-m-step', 'gmm-project', 'train-isv', 'project', 'enroll', 'compute-scores', 'concatenate'),
help = argparse.SUPPRESS) #'Executes a subtask (FOR INTERNAL USE ONLY!!!)'
parser.add_argument('--iteration', type = int,
help = argparse.SUPPRESS) #'Which type of models to generate (Normal or TModels)'
......@@ -80,7 +80,7 @@ def add_isv_jobs(args, job_ids, deps, submitter):
deps.append(job_ids['gmm-projection'])
job_ids['isv-training'] = submitter.submit(
'--sub-task isv-train',
'--sub-task train-isv',
name = 'train-isv',
dependencies = deps,
**args.grid.training_queue)
......@@ -113,8 +113,8 @@ def execute(args):
force = args.force)
# train the feature projector
elif args.sub_task == 'isv-train':
tools.isv_training(
elif args.sub_task == 'train-isv':
tools.train_isv(
args.algorithm,
force = args.force)
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
import bob.bio.base
from .. import tools, algorithm
from bob.bio.base import tools as base_tools
def parse_arguments(command_line_parameters, exclude_resources_from=None):
    """Parses the command line options for the parallel IVector verification script.

    Parameters
    ----------
    command_line_parameters : [str]
        The command line options to parse (usually ``sys.argv[1:]``).
    exclude_resources_from : [str] or None
        Optional list of packages whose resources are not listed in the help
        message; ``None`` (the default) behaves like an empty list.

    Returns
    -------
    The namespace of parsed and post-processed arguments.

    Raises
    ------
    ValueError
        If the configured algorithm is not a (pure) ``algorithm.IVector``.
    """
    # avoid the mutable-default-argument pitfall of ``exclude_resources_from = []``
    if exclude_resources_from is None:
        exclude_resources_from = []

    # set up command line parser
    parsers = base_tools.command_line_parser(exclude_resources_from = exclude_resources_from)

    # add GMM-related options
    tools.add_parallel_gmm_options(parsers, sub_module = 'ivector')

    # override some parameters: the grid configuration is mandatory here,
    # and the default algorithm is the ivector resource
    parsers['config'].add_argument('-g', '--grid', metavar = 'x', nargs = '+', required=True,
        help = 'Configuration for the grid setup; required for the parallel execution script.')

    parsers['config'].add_argument('-a', '--algorithm', metavar = 'x', nargs = '+', default = ['ivector'],
        help = 'Face recognition; only GMM-related algorithms are allowed')

    # Add sub-tasks that can be executed by this script (FOR INTERNAL USE ONLY,
    # hence all help texts are suppressed)
    parser = parsers['main']
    parser.add_argument('--sub-task',
        choices = ('preprocess', 'train-extractor', 'extract', 'normalize-features', 'kmeans-init', 'kmeans-e-step', 'kmeans-m-step', 'gmm-init', 'gmm-e-step', 'gmm-m-step', 'gmm-project', 'ivector-e-step', 'ivector-m-step', 'ivector-project', 'train-whitener', 'project', 'enroll', 'compute-scores', 'concatenate'),
        help = argparse.SUPPRESS)  # 'Executes a subtask (FOR INTERNAL USE ONLY!!!)'
    parser.add_argument('--iteration', type = int,
        help = argparse.SUPPRESS)  # 'The iteration of the E/M step to execute'
    parser.add_argument('--model-type', choices = ['N', 'T'],
        help = argparse.SUPPRESS)  # 'Which type of models to generate (Normal or TModels)'
    parser.add_argument('--score-type', choices = ['A', 'B', 'C', 'D', 'Z'],
        help = argparse.SUPPRESS)  # 'The type of scores that should be computed'
    parser.add_argument('--group',
        help = argparse.SUPPRESS)  # 'The group for which the current action should be performed'

    # now that we have set up everything, get the command line arguments
    args = base_tools.initialize(parsers, command_line_parameters,
        skips = ['preprocessing', 'extractor-training', 'extraction', 'normalization', 'kmeans', 'gmm', 'ivector', 'whitening', 'projection', 'enroller-training', 'enrollment', 'score-computation', 'concatenation', 'calibration']
    )

    # projector training is performed by the parallel sub-tasks, not by the base tools
    args.skip_projector_training = True

    # and add the GMM-related parameters
    tools.initialize_parallel_gmm(args, sub_module = 'ivector')

    # assert that the algorithm is a *pure* IVector (exact type check on purpose:
    # derived classes might override steps that the parallel script re-implements)
    if args.algorithm.__class__ != algorithm.IVector:
        raise ValueError("The given algorithm %s is not a (pure) IVector algorithm" % type(args.algorithm))

    return args
from .verify_gmm import add_gmm_jobs
def add_ivector_jobs(args, job_ids, deps, submitter):
    """Adds the IVector-specific grid jobs (on top of all GMM-related jobs).

    Submits GMM projection, the iterated TV E/M steps, i-vector projection and
    whitener training, chaining each job onto the ids collected in ``deps``.
    Returns the updated ``(job_ids, deps)`` pair.
    """
    # first, add gmm jobs (UBM training etc.); these become dependencies of ours
    job_ids, deps = add_gmm_jobs(args, job_ids, deps, submitter)

    # now, add the extra steps for ivector
    if not args.skip_ivector:
        # gmm projection (computes GMM statistics of the training features)
        job_ids['gmm-projection'] = submitter.submit(
            '--sub-task gmm-project',
            name = 'pro-gmm',
            number_of_parallel_jobs = args.grid.number_of_projection_jobs,
            dependencies = deps,
            **args.grid.projection_queue)
        deps.append(job_ids['gmm-projection'])

        # several iterations of E and M steps for the TV matrix;
        # --tv-start-iteration allows restarting from an intermediate round
        for iteration in range(args.tv_start_iteration, args.algorithm.tv_training_iterations):
            # E-step: depends on the previous M-step, except for the very first round
            job_ids['ivector-e-step'] = submitter.submit(
                '--sub-task ivector-e-step --iteration %d' % iteration,
                name='i-e-%d' % iteration,
                number_of_parallel_jobs = args.grid.number_of_projection_jobs,
                dependencies = [job_ids['ivector-m-step']] if iteration != args.tv_start_iteration else deps,
                **args.grid.projection_queue)

            # M-step
            job_ids['ivector-m-step'] = submitter.submit(
                '--sub-task ivector-m-step --iteration %d' % iteration,
                name='i-m-%d' % iteration,
                dependencies = [job_ids['ivector-e-step']],
                **args.grid.training_queue)
        # only the final M-step is a dependency for the following jobs
        deps.append(job_ids['ivector-m-step'])

    # whitening
    if not args.skip_whitening:
        # ivector projection (projects GMM statistics to i-vectors)
        job_ids['ivector-projection'] = submitter.submit(
            '--sub-task ivector-project',
            name = 'pro-ivector',
            number_of_parallel_jobs = args.grid.number_of_projection_jobs,
            dependencies = deps,
            **args.grid.projection_queue)
        deps.append(job_ids['ivector-projection'])

        # whitener training (NOTE: not TV training -- that happened above)
        job_ids['whitener-training'] = submitter.submit(
            '--sub-task train-whitener',
            name = 'train-whitener',
            dependencies = deps,
            **args.grid.training_queue)
        deps.append(job_ids['whitener-training'])

    return job_ids, deps
from .verify_gmm import execute as gmm_execute
def execute(args):
    """Run the desired job of the tool chain that is specified on command line.

    This job might be executed either in the grid, or locally.
    Returns ``True`` when the ``--sub-task`` was handled (either here or by the
    GMM base script), ``False`` when the keyword is unknown to this script.
    """
    # first, let the base (GMM) script decide if it knows how to execute the job
    if gmm_execute(args):
        return True

    # now, check what we can do
    # the file selector object (singleton holding all file paths)
    fs = tools.FileSelector.instance()

    # project the training features with the UBM
    if args.sub_task == 'gmm-project':
        tools.gmm_project(
            args.algorithm,
            args.extractor,
            indices = base_tools.indices(fs.training_list('extracted', 'train_projector'), args.grid.number_of_projection_jobs),
            force = args.force)

    # one parallel E-step of the TV training (works on a slice of the data)
    elif args.sub_task == 'ivector-e-step':
        tools.ivector_estep(
            args.algorithm,
            args.iteration,
            indices = base_tools.indices(fs.training_list('projected_gmm', 'train_projector'), args.grid.number_of_projection_jobs),
            force = args.force)

    # the M-step, accumulating the statistics of all parallel E-step jobs
    elif args.sub_task == 'ivector-m-step':
        tools.ivector_mstep(
            args.algorithm,
            args.iteration,
            number_of_parallel_jobs = args.grid.number_of_projection_jobs,
            clean = args.clean_intermediate,
            force = args.force)

    # project the GMM statistics to i-vectors (input for whitener training)
    elif args.sub_task == 'ivector-project':
        tools.ivector_project(
            args.algorithm,
            indices = base_tools.indices(fs.training_list('projected_gmm', 'train_projector'), args.grid.number_of_projection_jobs),
            force = args.force)

    # train the whitening transformation on the projected i-vectors
    elif args.sub_task == 'train-whitener':
        tools.train_whitener(
            args.algorithm,
            force = args.force)

    else:
        # Not our keyword...
        return False
    return True
def verify(args, command_line_parameters, external_fake_job_id = 0):
    """This is the main entry point for computing verification experiments.

    You just have to specify configurations for any of the steps of the toolchain, which are:
    -- the database
    -- the preprocessing
    -- feature extraction
    -- the recognition algorithm
    -- and the grid configuration.
    Additionally, you can skip parts of the toolchain by selecting proper --skip-... parameters.
    If your probe files are not too big, you can also specify the --preload-probes switch to speed up the score computation.
    If files should be re-generated, please specify the --force option (might be combined with the --skip-... options).

    Returns an empty dict, or a dict of submitted grid job ids when jobs were
    added to a (non-local) grid.
    """
    # as the main entry point, check whether the sub-task is specified
    if args.sub_task is not None:
        # execute the desired sub-task (dispatched through execute(), which also
        # tries the GMM base script first)
        if not execute(args):
            raise ValueError("The specified --sub-task '%s' is not known to the system" % args.sub_task)
        return {}
    else:
        # add jobs
        # NOTE(review): args.grid is required by parse_arguments, so the
        # `if args.grid else None` fallback should never trigger here
        submitter = base_tools.GridSubmission(args, command_line_parameters, executable = 'verify_ivector.py', first_fake_job_id = 0) if args.grid else None
        retval = tools.add_jobs(args, submitter, local_job_adder = add_ivector_jobs)
        base_tools.write_info(args, command_line_parameters)

        if args.grid.is_local() and args.run_local_scheduler:
            if args.dry_run:
                print ("Would have started the local scheduler to run the experiments with parallel jobs")
            else:
                # start the jman local deamon
                submitter.execute_local()
            return {}

        else:
            # return job ids as a dictionary
            return retval
def main(command_line_parameters = sys.argv):
    """Command line entry point: parses the options and runs the verification."""
    try:
        # parse the options and execute the requested (sub-)task
        verify(parse_arguments(command_line_parameters[1:]), command_line_parameters)
    except Exception as error:
        # log the exception with a time stamp before propagating it
        logger.error("During the execution, an exception was raised: %s" % error)
        raise
# run the experiments when this module is executed as a script
if __name__ == "__main__":
    main()
This diff is collapsed.
This diff is collapsed.
......@@ -66,7 +66,7 @@ def test_gmm_sequential():
'-d', 'dummy',
'-p', 'dummy',
'-e', 'dummy',
'-a', 'bob.bio.gmm.algorithm.GMM(2, 2, 2)', '--import', 'bob.bio.gmm',
'-a', 'bob.bio.gmm.algorithm.GMM(2, 2, 2)',
'--zt-norm',
'-s', 'test_gmm_sequential',
'--temp-directory', test_dir,
......@@ -108,7 +108,7 @@ def test_isv_sequential():
'-d', 'dummy',
'-p', 'dummy',
'-e', 'dummy',
'-a', 'bob.bio.gmm.algorithm.ISV(10, number_of_gaussians=2, kmeans_training_iterations=2, gmm_training_iterations=2, isv_training_iterations=2)', '--import', 'bob.bio.gmm',
'-a', 'bob.bio.gmm.algorithm.ISV(10, number_of_gaussians=2, kmeans_training_iterations=2, gmm_training_iterations=2, isv_training_iterations=2)',
'--zt-norm',
'-s', 'test_isv_sequential',
'--temp-directory', test_dir,
......@@ -141,3 +141,45 @@ def test_isv_parallel():
print (bob.bio.base.tools.command_line(parameters))
_verify(parameters, test_dir, 'test_isv_parallel', executable=main, ref_modifier='-isv')
def test_ivector_sequential():
    """Runs a tiny sequential IVector experiment on the dummy database and
    checks the resulting scores against the '-ivector' reference files."""
    test_dir = tempfile.mkdtemp(prefix='bobtest_')
    # define dummy parameters; all training iteration counts are kept minimal
    # so that the test runs fast (temp dir cleanup is presumably handled by
    # _verify -- TODO confirm)
    parameters = [
        '-d', 'dummy',
        '-p', 'dummy',
        '-e', 'dummy',
        '-a', 'bob.bio.gmm.algorithm.IVector(10, number_of_gaussians=2, kmeans_training_iterations=2, gmm_training_iterations=2, tv_training_iterations=2)',
        '--zt-norm',
        '-s', 'test_ivector_sequential',
        '--temp-directory', test_dir,
        '--result-directory', test_dir
    ]

    print (bob.bio.base.tools.command_line(parameters))

    _verify(parameters, test_dir, 'test_ivector_sequential', ref_modifier='-ivector')
def test_ivector_parallel():
    """Runs the same tiny IVector experiment through the parallel
    verify_ivector script, using a local two-process scheduler."""
    from bob.bio.gmm.script.verify_ivector import main
    test_dir = tempfile.mkdtemp(prefix='bobtest_')
    test_database = os.path.join(test_dir, "submitted.sql3")

    # define dummy parameters; the grid runs locally with 2 parallel processes,
    # -G names the job database, -R starts the local scheduler right away
    parameters = [
        '-d', 'dummy',
        '-p', 'dummy',
        '-e', 'dummy',
        '-a', 'bob.bio.gmm.algorithm.IVector(10, number_of_gaussians=2, kmeans_training_iterations=2, gmm_training_iterations=2, tv_training_iterations=2)', '--import', 'bob.bio.gmm', 'bob.io.image',
        '-g', 'bob.bio.base.grid.Grid(grid = "local", number_of_parallel_processes = 2, scheduler_sleep_time = 0.1)', '-G', test_database, '--run-local-scheduler', '-R',
        '--clean-intermediate',
        '--zt-norm',
        '-s', 'test_ivector_parallel',
        '--temp-directory', test_dir,
        '--result-directory', test_dir
    ]

    print (bob.bio.base.tools.command_line(parameters))

    _verify(parameters, test_dir, 'test_ivector_parallel', executable=main, ref_modifier='-ivector')
......@@ -2,3 +2,4 @@ from .utils import *
from .command_line import *
from .gmm import *
from .isv import *
from .ivector import *
......@@ -27,9 +27,17 @@ def add_parallel_gmm_options(parsers, sub_module = None):
sub_dir_group.add_argument('--gmm-directory', default = 'gmm_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate gmm files should be stored')
if sub_module == 'isv':
sub_dir_group.add_argument('--isv-directory', default = 'isv_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate isv training files should be stored')
if sub_module is not None:
sub_dir_group.add_argument('--projected-gmm-directory', default = 'projetced_gmm',
help = 'The sub-directory (relative to --temp-directory), where projected gmm training files should be stored')
if sub_module == 'ivector':
sub_dir_group.add_argument('--ivector-directory', default = 'ivector_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate ivector files should be stored')
sub_dir_group.add_argument('--projected-ivector-directory', default = 'projected_ivector_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate projected ivector training files should be stored')
flag_group.add_argument('-i', '--tv-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the IVector training (i.e. to restart from there)')
......@@ -38,13 +46,20 @@ def _kmeans_intermediate_file(self, round):
return os.path.join(self.directories['kmeans'], 'round_%05d' % round, 'kmeans.hdf5')
def _kmeans_stats_file(self, round, start_index, end_index):
return os.path.join(self.directories['kmeans'], 'round_%05d' % round, 'stats-%05d-%95d.hdf5' % (start_index, end_index))
return os.path.join(self.directories['kmeans'], 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def _gmm_intermediate_file(self, round):
return os.path.join(self.directories['gmm'], 'round_%05d' % round, 'gmm.hdf5')
return os.path.join(self.directories['gmm'], 'round_%05d' % round, 'ubm.hdf5')
def _gmm_stats_file(self, round, start_index, end_index):
return os.path.join(self.directories['gmm'], 'round_%05d' % round, 'stats-%05d-%95d.hdf5' % (start_index, end_index))
return os.path.join(self.directories['gmm'], 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def _ivector_intermediate_file(self, round):
return os.path.join(self.directories['ivector'], 'round_%05d' % round, 'tv.hdf5')
def _ivector_stats_file(self, round, start_index, end_index):
return os.path.join(self.directories['ivector'], 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def initialize_parallel_gmm(args, sub_module = None):
......@@ -69,4 +84,12 @@ def initialize_parallel_gmm(args, sub_module = None):
fs.ubm_file = fs.projector_file
else:
fs.ubm_file = os.path.join(args.temp_directory, sub_dir, "ubm.hdf5")
fs.directories['isv'] = os.path.join(args.temp_directory, sub_dir, args.isv_directory)
fs.directories['projected_gmm'] = os.path.join(args.temp_directory, sub_dir, args.projected_gmm_directory)
if sub_module == 'ivector':
fs.ivector_intermediate_file = types.MethodType(_ivector_intermediate_file, fs)
fs.ivector_stats_file = types.MethodType(_ivector_stats_file, fs)
fs.directories['ivector'] = os.path.join(args.temp_directory, sub_dir, args.ivector_directory)
fs.tv_file = os.path.join(args.temp_directory, sub_dir, "tv.hdf5")
fs.directories['projected_ivector'] = os.path.join(args.temp_directory, sub_dir, args.projected_ivector_directory)
fs.whitener_file = os.path.join(args.temp_directory, sub_dir, "whitener.hdf5")
......@@ -270,9 +270,35 @@ def gmm_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=
if iteration == algorithm.gmm_training_iterations-1:
shutil.copy(new_machine_file, fs.ubm_file)
logger.info("UBM training: Wrote new GMM machine '%s'", fs.ubm_file)
logger.info("UBM training: Wrote new UBM '%s'", fs.ubm_file)
if clean and iteration > 0:
old_dir = os.path.dirname(fs.gmm_intermediate_file(iteration-1))
logger.info("Removing old intermediate directory '%s'", old_dir)
shutil.rmtree(old_dir)
def gmm_project(algorithm, extractor, indices, force=False):
    """Performs GMM projection for the training-feature slice ``indices``.

    Reads extracted features ``indices[0] .. indices[1]-1``, projects each with
    the (already trained) UBM and writes the resulting GMM statistics to the
    'projected_gmm' directory.  Existing files are skipped unless ``force``.
    """
    fs = FileSelector.instance()

    # load the UBM that was trained in the previous tool-chain steps
    algorithm.load_ubm(fs.ubm_file)

    feature_files = fs.training_list('extracted', 'train_projector')
    projected_files = fs.training_list('projected_gmm', 'train_projector')

    # NOTE(review): the "ISV training" prefix is a leftover -- this function is
    # shared by the ISV and IVector tool chains
    logger.info("ISV training: Project features range (%d, %d) from '%s' to '%s'", indices[0], indices[1], fs.directories['extracted'], fs.directories['projected_gmm'])

    # project the features of this slice
    for i in range(indices[0], indices[1]):
        feature_file = feature_files[i]
        projected_file = projected_files[i]

        if not utils.check_file(projected_file, force):
            # load feature
            feature = extractor.read_feature(feature_file)
            # project feature
            projected = algorithm.project_ubm(feature)
            # write it
            bob.io.base.create_directories_safe(os.path.dirname(projected_file))
            bob.bio.base.save(projected, projected_file)
......@@ -7,33 +7,7 @@ import os
from bob.bio.base.tools.FileSelector import FileSelector
from bob.bio.base import utils, tools
def gmm_project(algorithm, extractor, indices, force=False):
    """Performs GMM projection for the training-feature slice ``indices``.

    Reads extracted features ``indices[0] .. indices[1]-1``, projects each with
    the (already trained) UBM and writes the resulting GMM statistics to the
    'isv' directory.  Existing files are skipped unless ``force``.
    """
    fs = FileSelector.instance()

    algorithm.load_ubm(fs.ubm_file)
    feature_files = fs.training_list('extracted', 'train_projector')
    projected_files = fs.training_list('isv', 'train_projector')

    # BUG FIX: the format string has four placeholders, but the original code
    # passed the `indices` tuple as a single argument (three arguments total),
    # which raises a TypeError when the message is actually formatted
    logger.info("ISV training: Project features range (%d, %d) from '%s' to '%s'", indices[0], indices[1], fs.directories['extracted'], fs.directories['isv'])

    # project the features of this slice
    for i in range(indices[0], indices[1]):
        feature_file = feature_files[i]
        projected_file = projected_files[i]

        if not utils.check_file(projected_file, force):
            # load feature
            feature = extractor.read_feature(feature_file)
            # project feature
            projected = algorithm.project_ubm(feature)
            # write it
            bob.io.base.create_directories_safe(os.path.dirname(projected_file))
            bob.bio.base.save(projected, projected_file)
def isv_training(algorithm, force=False):
def train_isv(algorithm, force=False):
"""Finally, the UBM is used to train the ISV projector/enroller."""
fs = FileSelector.instance()
......@@ -44,7 +18,7 @@ def isv_training(algorithm, force=False):
algorithm.load_ubm(fs.ubm_file)
# read training data
training_list = fs.training_list('isv', 'train_projector', arrange_by_client = True)
training_list = fs.training_list('projected_gmm', 'train_projector', arrange_by_client = True)
train_gmm_stats = [[algorithm.read_gmm_stats(filename) for filename in client_files] for client_files in training_list]
# perform ISV training
......
import logging
logger = logging.getLogger("bob.bio.gmm")
import bob.io.base
import os
import shutil
from bob.bio.base.tools.FileSelector import FileSelector
from bob.bio.base import utils, tools
def ivector_estep(algorithm, iteration, indices, force=False):
"""Performs a single E-step of the IVector algorithm (parallel)"""
fs = FileSelector.instance()
stats_file = fs.ivector_stats_file(iteration, indices[0], indices[1])
if utils.check_file(stats_file, force, 1000):
logger.info("IVector training: Skipping IVector E-Step since the file '%s' already exists", stats_file)
else:
logger.info("IVector training: E-Step from range(%d, %d)", *indices)
# Temporary machine used for initialization
algorithm.load_ubm(fs.ubm_file)
# get the IVectorTrainer and call the initialization procedure
trainer = algorithm.ivector_trainer
# Load machine
if iteration:
# load last TV file
tv = bob.learn.em.IVectorMachine(bob.io.base.HDF5File(fs.ivector_intermediate_file(iteration)))
tv.ubm = algorithm.ubm
else:
# create new TV machine
tv = bob.learn.em.IVectorMachine(algorithm.ubm, algorithm.subspace_dimension_of_t, algorithm.variance_threshold)
trainer.initialize(tv)
# Load data
training_list = fs.training_list('projected_gmm', 'train_projector')
data = [algorithm.read_gmm_stats(training_list[i]) for i in range(indices[0], indices[1])]
# Perform the E-step
trainer.e_step(tv