Commit 76b82784 authored by Amir MOHAMMADI

Merge branch 'dask-pipelines' into 'master'

Dask pipelines

See merge request !24
parents 0709716f 85df1ff0
from . import algorithm
from . import tools
from . import test
from . import script
def get_config():
......
from . import train_gmm
from . import verify_gmm
from . import train_isv
from . import verify_isv
from . import train_ivector
from . import verify_ivector
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Elie Khoury <Elie.Khoury@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
from . import verify_gmm
def main(command_line_parameters = None):
"""Executes the main function"""
try:
# do the command line parsing
args = verify_gmm.parse_arguments(command_line_parameters)
args.groups = ['world']
args.group = 'world'
args.skip_projection = True
args.skip_enrollment = True
args.skip_score_computation = True
args.skip_concatenation = True
args.skip_calibration = True
# run only the training part of the GMM tool chain
verify_gmm.verify(args, command_line_parameters)
except Exception as e:
# track any exceptions as error logs (i.e., to get a time stamp)
logger.error("During the execution, an exception was raised: %s" % e)
raise
if __name__ == "__main__":
main()
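# Hypothetical usage sketch (not part of the original file): with the skip
# flags set above, this wrapper executes only the UBM training stages of
# verify_gmm, restricted to the 'world' (training) group. A call could look
# like this, where all resource names are placeholders for registered
# bob.bio resources and the option names follow bob.bio.base's verify script:
#
#   main(['-d', '<database>', '-p', '<preprocessor>', '-e', '<extractor>',
#         '-a', 'gmm', '--parallel', '4', '-s', 'gmm_training'])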
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Elie Khoury <Elie.Khoury@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
from . import verify_isv
def main(command_line_parameters = None):
"""Executes the main function"""
try:
# do the command line parsing
args = verify_isv.parse_arguments(command_line_parameters)
args.groups = ['world']
args.group = 'world'
args.skip_projection = True
args.skip_enrollment = True
args.skip_score_computation = True
args.skip_concatenation = True
args.skip_calibration = True
# run only the training part of the ISV tool chain
verify_isv.verify(args, command_line_parameters)
except Exception as e:
# track any exceptions as error logs (i.e., to get a time stamp)
logger.error("During the execution, an exception was raised: %s" % e)
raise
if __name__ == "__main__":
main()
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Elie Khoury <Elie.Khoury@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
from . import verify_ivector
def main(command_line_parameters = None):
"""Executes the main function"""
try:
# do the command line parsing
args = verify_ivector.parse_arguments(command_line_parameters)
args.groups = ['world']
args.group = 'world'
args.skip_projection = True
args.skip_enrollment = True
args.skip_score_computation = True
args.skip_concatenation = True
args.skip_calibration = True
# run only the training part of the i-vector tool chain
verify_ivector.verify(args, command_line_parameters)
except Exception as e:
# track any exceptions as error logs (i.e., to get a time stamp)
logger.error("During the execution, an exception was raised: %s" % e)
raise
if __name__ == "__main__":
main()
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Manuel Guenther <Manuel.Guenther@idiap.ch>
from __future__ import print_function
import sys
import argparse
import logging
logger = logging.getLogger("bob.bio.gmm")
import bob.bio.base
from .. import tools, algorithm
from bob.bio.base import tools as base_tools
def parse_arguments(command_line_parameters, exclude_resources_from = []):
"""This function parses the given options (which by default are the command line options). If exclude_resources_from is specified (as a list), the resources from the given packages are not listed in the help message."""
# set up command line parser
parsers = base_tools.command_line_parser(exclude_resources_from = exclude_resources_from)
# add GMM-related options
tools.add_parallel_gmm_options(parsers, sub_module = 'isv')
# override some parameters
parsers['config'].add_argument('-a', '--algorithm', metavar = 'x', nargs = '+', default = ['isv'],
help = 'Face recognition; only ISV algorithms are allowed')
# Add sub-tasks that can be executed by this script
parser = parsers['main']
parser.add_argument('--sub-task',
choices = ('preprocess', 'train-extractor', 'extract', 'normalize-features', 'kmeans-init', 'kmeans-e-step', 'kmeans-m-step', 'gmm-init', 'gmm-e-step', 'gmm-m-step', 'gmm-project', 'train-isv', 'project', 'enroll', 'compute-scores', 'concatenate'),
help = argparse.SUPPRESS) #'Executes a subtask (FOR INTERNAL USE ONLY!!!)'
parser.add_argument('--iteration', type = int,
help = argparse.SUPPRESS) #'Which type of models to generate (Normal or TModels)'
parser.add_argument('--model-type', choices = ['N', 'T'],
help = argparse.SUPPRESS) #'Which type of models to generate (Normal or TModels)'
parser.add_argument('--score-type', choices = ['A', 'B', 'C', 'D', 'Z'],
help = argparse.SUPPRESS) #'The type of scores that should be computed'
parser.add_argument('--group',
help = argparse.SUPPRESS) #'The group for which the current action should be performed'
# now that we have set up everything, get the command line arguments
args = base_tools.initialize(parsers, command_line_parameters,
skips = ['preprocessing', 'extractor-training', 'extraction', 'normalization', 'kmeans', 'gmm', 'isv', 'projection', 'enroller-training', 'enrollment', 'score-computation', 'concatenation', 'calibration']
)
if args.grid is None and args.parallel is None:
raise ValueError("To be able to run the parallelized ISV script, either the --grid or the --parallel option need to be specified!")
args.skip_projector_training = True
# and add the GMM-related parameters
tools.initialize_parallel_gmm(args, sub_module = 'isv')
# assert that the algorithm is a GMM
if tools.base(args.algorithm).__class__ != algorithm.ISV:
raise ValueError("The given algorithm %s is not a (pure) ISV algorithm" % type(args.algorithm))
# check if one of the internal parameters is given without the sub-task
if args.sub_task is None:
if args.iteration is not None: raise ValueError("The option --iteration is an internal option and cannot be used to define experiments")
if args.model_type is not None: raise ValueError("The option --model-type is an internal option and cannot be used to define experiments")
if args.score_type is not None: raise ValueError("The option --score-type is an internal option and cannot be used to define experiments")
if args.group is not None: raise ValueError("The option --group is an internal option and cannot be used to define experiments; did you mean to use --groups?")
return args
from .verify_gmm import add_gmm_jobs
def add_isv_jobs(args, job_ids, deps, submitter):
"""Adds all GMM-related jobs."""
# first, add gmm jobs
job_ids, deps = add_gmm_jobs(args, job_ids, deps, submitter)
# now, add two extra steps for ISV
if not args.skip_isv:
# gmm projection
job_ids['gmm-projection'] = submitter.submit(
'--sub-task gmm-project',
name = 'pro-gmm',
number_of_parallel_jobs = args.grid.number_of_projection_jobs,
dependencies = deps,
**args.grid.projection_queue)
deps.append(job_ids['gmm-projection'])
job_ids['isv-training'] = submitter.submit(
'--sub-task train-isv',
name = 'train-isv',
dependencies = deps,
**args.grid.training_queue)
deps.append(job_ids['isv-training'])
return job_ids, deps
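# Standalone sketch (illustration, not part of this module) of the dependency
# chaining used above: each submitted job depends on all previously collected
# job ids and appends its own, so the stages run strictly in order
# (GMM jobs -> gmm-projection -> isv-training).
_deps, _job_ids = [], {}
for _name, _fake_id in (('gmm-projection', 101), ('isv-training', 102)):
    _job_ids[_name] = _fake_id  # stand-in for submitter.submit(..., dependencies=list(_deps))
    _deps.append(_fake_id)
assert _deps == [101, 102]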
from .verify_gmm import execute as gmm_execute
def execute(args):
"""Run the desired job of the tool chain that is specified on command line.
This job might be executed either in the grid, or locally."""
# first, let the base script decide if it knows how to execute the job
if gmm_execute(args):
return True
# now, check what we can do
algorithm = tools.base(args.algorithm)
# the file selector object
fs = tools.FileSelector.instance()
if args.sub_task == 'gmm-project':
tools.gmm_project(
algorithm,
args.extractor,
indices = base_tools.indices(fs.training_list('extracted', 'train_projector'), args.grid.number_of_projection_jobs),
allow_missing_files = args.allow_missing_files,
force = args.force)
# train the feature projector
elif args.sub_task == 'train-isv':
tools.train_isv(
algorithm,
allow_missing_files = args.allow_missing_files,
force = args.force)
else:
# Not our keyword...
return False
return True
def verify(args, command_line_parameters, external_fake_job_id = 0):
"""This is the main entry point for computing verification experiments.
You just have to specify configurations for any of the steps of the toolchain, which are:
-- the database
-- the preprocessing
-- feature extraction
-- the recognition algorithm
-- and the grid configuration.
Additionally, you can skip parts of the toolchain by selecting proper --skip-... parameters.
If your probe files are not too big, you can also specify the --preload-probes switch to speed up the score computation.
If files should be re-generated, please specify the --force option (might be combined with the --skip-... options)."""
# as the main entry point, check whether the sub-task is specified
if args.sub_task is not None:
# execute the desired sub-task
if not execute(args):
raise ValueError("The specified --sub-task '%s' is not known to the system" % args.sub_task)
return {}
else:
# add jobs
submitter = base_tools.GridSubmission(args, command_line_parameters, executable = 'verify_isv.py', first_fake_job_id = 0)
retval = tools.add_jobs(args, submitter, local_job_adder = add_isv_jobs)
base_tools.write_info(args, command_line_parameters, submitter.executable)
if args.grid.is_local() and args.run_local_scheduler:
if args.dry_run:
print ("Would have started the local scheduler to run the experiments with parallel jobs")
else:
# start the jman local daemon
submitter.execute_local()
return {}
else:
# return job ids as a dictionary
return retval
def main(command_line_parameters = None):
"""Executes the main function"""
try:
# do the command line parsing
args = parse_arguments(command_line_parameters)
# perform face verification test
verify(args, command_line_parameters)
except Exception as e:
# track any exceptions as error logs (i.e., to get a time stamp)
logger.error("During the execution, an exception was raised: %s" % e)
raise
if __name__ == "__main__":
main()
from .utils import *
from .command_line import *
from .gmm import *
from .isv import *
from .ivector import *
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
import os
import sys
import types
import bob.core
logger = bob.core.log.setup("bob.bio.gmm")
from bob.bio.base.tools import FileSelector
def add_parallel_gmm_options(parsers, sub_module = None):
"""Add the options for parallel UBM training to the given parsers."""
flag_group = parsers['flag']
flag_group.add_argument('-l', '--limit-training-data', type=int,
help = 'Limit the number of training examples used for KMeans initialization and the GMM initialization')
flag_group.add_argument('-k', '--kmeans-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the KMeans training (i.e. to restart from there)')
flag_group.add_argument('-m', '--gmm-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the GMM training (i.e. to restart from there)')
flag_group.add_argument('-C', '--clean-intermediate', action='store_true',
help = 'Clean up temporary files of older iterations?')
sub_dir_group = parsers['sub-dir']
sub_dir_group.add_argument('--kmeans-directory', default = 'kmeans_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate kmeans files should be stored')
sub_dir_group.add_argument('--gmm-directory', default = 'gmm_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate gmm files should be stored')
if sub_module is not None:
sub_dir_group.add_argument('--projected-gmm-directory', default = 'projected_gmm',
help = 'The sub-directory (relative to --temp-directory), where projected gmm training files should be stored')
if sub_module == 'ivector':
sub_dir_group.add_argument('--ivector-directory', default = 'ivector_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate ivector files should be stored')
sub_dir_group.add_argument('--projected-ivector-directory', default = 'projected_ivector_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate projected ivector training files should be stored')
sub_dir_group.add_argument('--whitened-directory', default = 'whitened_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate whitened ivector training files should be stored')
sub_dir_group.add_argument('--lda-projected-directory', default = 'lda_projected_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate LDA projected ivector training files should be stored')
sub_dir_group.add_argument('--wccn-projected-directory', default = 'wccn_projected_temp',
help = 'The sub-directory (relative to --temp-directory), where intermediate WCCN projected ivector training files should be stored')
flag_group.add_argument('-i', '--tv-start-iteration', type=int, default=0,
help = 'Specify the first iteration for the IVector training (i.e. to restart from there)')
# Functions to be added to the FileSelector class, once it is instantiated
def _kmeans_intermediate_file(self, round):
return os.path.join(self.directories['kmeans'], 'round_%05d' % round, 'kmeans.hdf5')
def _kmeans_stats_file(self, round, start_index, end_index):
return os.path.join(self.directories['kmeans'], 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def _gmm_intermediate_file(self, round):
return os.path.join(self.directories['gmm'], 'round_%05d' % round, 'ubm.hdf5')
def _gmm_stats_file(self, round, start_index, end_index):
return os.path.join(self.directories['gmm'], 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
def _ivector_intermediate_file(self, round):
return os.path.join(self.directories['ivector'], 'round_%05d' % round, 'tv.hdf5')
def _ivector_stats_file(self, round, start_index, end_index):
return os.path.join(self.directories['ivector'], 'round_%05d' % round, 'stats-%05d-%05d.hdf5' % (start_index, end_index))
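# Illustration (assumption, standalone): exercising the helpers above with a
# stand-in selector shows the per-round directory layout they produce.
class _StubSelector(object):
    directories = {'kmeans': 'TEMP/kmeans_temp'}
assert _kmeans_intermediate_file(_StubSelector(), 2) == \
    os.path.join('TEMP/kmeans_temp', 'round_00002', 'kmeans.hdf5')
assert _kmeans_stats_file(_StubSelector(), 2, 0, 100) == \
    os.path.join('TEMP/kmeans_temp', 'round_00002', 'stats-00000-00100.hdf5')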
def initialize_parallel_gmm(args, sub_module = None):
# get the relevant sub_directory, which depends on the database and the protocol
protocol = 'None' if args.database.protocol is None else args.database.protocol
extractor_sub_dir = protocol if args.database.training_depends_on_protocol and args.extractor.requires_training else '.'
sub_dir = protocol if args.database.training_depends_on_protocol else '.'
fs = FileSelector.instance()
# add relevant **functions** to file selector object
fs.kmeans_intermediate_file = types.MethodType(_kmeans_intermediate_file, fs)
fs.kmeans_stats_file = types.MethodType(_kmeans_stats_file, fs)
fs.gmm_intermediate_file = types.MethodType(_gmm_intermediate_file, fs)
fs.gmm_stats_file = types.MethodType(_gmm_stats_file, fs)
# add relevant directories to file selector object
fs.directories['kmeans'] = os.path.join(args.temp_directory, sub_dir, args.kmeans_directory)
fs.kmeans_file = os.path.join(args.temp_directory, sub_dir, "kmeans.hdf5")
fs.directories['gmm'] = os.path.join(args.temp_directory, sub_dir, args.gmm_directory)
if sub_module is None:
fs.ubm_file = fs.projector_file
else:
fs.ubm_file = os.path.join(args.temp_directory, sub_dir, "ubm.hdf5")
fs.directories['projected_gmm'] = os.path.join(args.temp_directory, sub_dir, args.projected_gmm_directory)
if sub_module == 'ivector':
fs.ivector_intermediate_file = types.MethodType(_ivector_intermediate_file, fs)
fs.ivector_stats_file = types.MethodType(_ivector_stats_file, fs)
fs.directories['ivector'] = os.path.join(args.temp_directory, sub_dir, args.ivector_directory)
fs.tv_file = os.path.join(args.temp_directory, sub_dir, "tv.hdf5")
fs.whitener_file = os.path.join(args.temp_directory, sub_dir, "whitener.hdf5")
fs.lda_file = os.path.join(args.temp_directory, sub_dir, "lda.hdf5")
fs.wccn_file = os.path.join(args.temp_directory, sub_dir, "wccn.hdf5")
fs.plda_file = os.path.join(args.temp_directory, sub_dir, "plda.hdf5")
fs.directories['projected_ivector'] = os.path.join(args.temp_directory, sub_dir, args.projected_ivector_directory)
fs.directories['whitened'] = os.path.join(args.temp_directory, sub_dir, args.whitened_directory)
fs.directories['lda_projected'] = os.path.join(args.temp_directory, sub_dir, args.lda_projected_directory)
fs.directories['wccn_projected'] = os.path.join(args.temp_directory, sub_dir, args.wccn_projected_directory)
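# Minimal standalone sketch (assumption, not part of this module) of the
# types.MethodType pattern used above: binding a plain function to an
# already-created instance so it can be called like a regular method.
import os
import types

class _Selector(object):
    def __init__(self):
        self.directories = {'kmeans': 'TEMP/kmeans_temp'}

def _intermediate_file(self, round):
    return os.path.join(self.directories['kmeans'], 'round_%05d' % round, 'kmeans.hdf5')

_fs = _Selector()
_fs.kmeans_intermediate_file = types.MethodType(_intermediate_file, _fs)
assert _fs.kmeans_intermediate_file(3).endswith(os.path.join('round_00003', 'kmeans.hdf5'))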
import logging
logger = logging.getLogger("bob.bio.gmm")
import bob.io.base
import os
from bob.bio.base.tools.FileSelector import FileSelector
from bob.bio.base import utils, tools
def train_isv(algorithm, force=False, allow_missing_files=False):
"""Finally, the UBM is used to train the ISV projector/enroller."""
fs = FileSelector.instance()
if utils.check_file(fs.projector_file, force, 800):
logger.info("ISV training: Skipping ISV training since '%s' already exists", fs.projector_file)
else:
# read UBM into the ISV class
algorithm.load_ubm(fs.ubm_file)
# read training data
training_list = fs.training_list('projected_gmm', 'train_projector', arrange_by_client = True)
training_list = utils.filter_missing_files(training_list, split_by_client=True, allow_missing_files=allow_missing_files)
train_gmm_stats = [[algorithm.read_gmm_stats(filename) for filename in client_files] for client_files in training_list]
# perform ISV training
logger.info("ISV training: training ISV with %d clients", len(train_gmm_stats))
algorithm.train_isv(train_gmm_stats)
# save result
bob.io.base.create_directories_safe(os.path.dirname(fs.projector_file))
algorithm.save_projector(fs.projector_file)
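# Shape sketch (assumption, standalone): with arrange_by_client=True the
# training list is a list of per-client file lists, so train_gmm_stats above
# mirrors that nesting -- one inner list of GMM statistics per client, which
# lets the ISV trainer separate within-client from between-client variation.
_training_list = [['c1_f1.hdf5', 'c1_f2.hdf5'], ['c2_f1.hdf5']]
_train_gmm_stats = [['stats(%s)' % f for f in client_files]
                    for client_files in _training_list]
assert len(_train_gmm_stats) == 2 and len(_train_gmm_stats[0]) == 2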
import bob.bio.base
import numpy
def add_jobs(args, submitter, local_job_adder):
"""Adds all (desired) jobs of the tool chain to the grid, or to the local list to be executed."""
assert args.grid is not None
# Here, we use the default bob.bio.base add_jobs function, but intercept it for adding the training
SKIPS = ['preprocessing', 'extractor_training', 'extraction', 'projector_training', 'projection', 'enroller_training', 'enrollment', 'score_computation', 'concatenation', 'calibration']
original_skips = {key: args.__dict__["skip_%s" % key] for key in SKIPS}
# first, submit preprocessing and feature extraction; skip all others
for key in SKIPS[3:]:
setattr(args, "skip_%s" % key, True)
job_ids = bob.bio.base.script.verify.add_jobs(args, submitter)
for key in SKIPS[3:]:
setattr(args, "skip_%s" % key, original_skips[key])
# reset skips
args.skip_preprocessing = original_skips['preprocessing']
args.skip_extractor_training = original_skips['extractor_training']
args.skip_extraction = original_skips['extraction']
# if there are any external dependencies, we need to respect them
deps = args.external_dependencies[:]
# also, we depend on all previous steps
for n in ['preprocessing', 'extractor-training', 'extraction']:
if n in job_ids:
deps.append(job_ids[n])
# now, add our jobs
job_ids, deps = local_job_adder(args, job_ids, deps, submitter)
# alright, finish the remaining bits
for key in SKIPS[:4]:
setattr(args, "skip_%s" % key, True)
args.external_dependencies = deps
job_ids.update(bob.bio.base.script.verify.add_jobs(args, submitter))
# restore the original skip flags
for key in SKIPS[:4]:
setattr(args, "skip_%s" % key, original_skips[key])
return job_ids
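# Standalone sketch (assumption) of the save/flip/restore pattern above,
# which lets the base add_jobs submit only a subset of the tool chain:
class _Args(object): pass
_args = _Args()
_SKIPS = ['preprocessing', 'extraction', 'projection']
for _key in _SKIPS:
    setattr(_args, 'skip_%s' % _key, False)
_original = dict((k, getattr(_args, 'skip_%s' % k)) for k in _SKIPS)
for _key in _SKIPS[1:]:   # temporarily skip everything after preprocessing
    setattr(_args, 'skip_%s' % _key, True)
# ... submit the first batch of jobs here ...
for _key in _SKIPS[1:]:   # restore the user's original choices
    setattr(_args, 'skip_%s' % _key, _original[_key])
assert not _args.skip_projection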
def is_video_extension(algorithm):
try:
import bob.bio.video
if isinstance(algorithm, bob.bio.video.algorithm.Wrapper):
return True
except ImportError:
pass
return False
def base(algorithm):
"""Returns the base algorithm, if it is a video extension, otherwise returns the algorithm itself"""
return algorithm.algorithm if is_video_extension(algorithm) else algorithm
def read_feature(extractor, feature_file):
feature = extractor.read_feature(feature_file)
try:
import bob.bio.video
if isinstance(extractor, bob.bio.video.extractor.Wrapper):
assert isinstance(feature, bob.bio.video.FrameContainer)
return numpy.vstack([frame for _, frame, _ in feature])
except ImportError:
pass
return feature
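# Tiny runnable sketch (assumption): bob.bio.video FrameContainers iterate as
# (index, data, quality) triples, as unpacked above; stacking the per-frame
# features turns N frames of dimension D into a single (N, D) array.
import numpy
_container = [('0', numpy.arange(4.0), None), ('1', numpy.arange(4.0) + 1, None)]
_stacked = numpy.vstack([frame for _, frame, _ in _container])
assert _stacked.shape == (2, 4)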
@@ -7,12 +7,6 @@ package:
build:
entry_points:
- verify_gmm.py = bob.bio.gmm.script.verify_gmm:main
- verify_isv.py = bob.bio.gmm.script.verify_isv:main
- verify_ivector.py = bob.bio.gmm.script.verify_ivector:main
- train_gmm.py = bob.bio.gmm.script.train_gmm:main
- train_isv.py = bob.bio.gmm.script.train_isv:main
- train_ivector.py = bob.bio.gmm.script.train_ivector:main
number: {{ environ.get('BOB_BUILD_NUMBER', 0) }}
run_exports:
- {{ pin_subpackage(name) }}
@@ -50,12 +44,6 @@ test:
imports:
- {{ name }}
commands:
- verify_gmm.py --help
- verify_isv.py --help
- verify_ivector.py --help
- train_gmm.py --help
- train_isv.py --help
- train_ivector.py --help
- nosetests --with-coverage --cover-package={{ name }} -sv {{ name }}
- sphinx-build -aEW {{ project_dir }}/doc {{ project_dir }}/sphinx
- sphinx-build -aEb doctest {{ project_dir }}/doc sphinx
......
@@ -23,7 +23,6 @@ Users Guide
:maxdepth: 2
implementation
parallel
Reference Manual
================
......
.. _bob.bio.gmm.parallel:
==================================
Executing the Training in Parallel
==================================
Sometimes the training of the GMM-based models requires a lot of time.
However, the training procedures can be parallelized, i.e., by running the E-steps of the EM loop in parallel.
For this purpose, we provide a set of scripts ``verify_gmm.py``, ``verify_isv.py`` and ``verify_ivector.py``.
These scripts integrate seamlessly with the ``bob.bio`` packages.
Particularly, they have exactly the same set of options as documented in :ref:`bob.bio.base.experiments`.
In fact, the scripts above only run in parallelized mode, i.e., either the ``--grid`` or ``--parallel`` option is required.
During the submission of the jobs, several hundred jobs might be created (depending on the ``number_of_..._training_iterations`` that you specify in the :py:class:`bob.bio.gmm.algorithm.GMM` constructor).
However, after the training has finished, it is possible to use the normal ``verify.py`` script to run similar experiments, e.g., if you want to change the protocol of your experiment.
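A hypothetical invocation, running the training locally with four parallel processes, might look like this (the database, preprocessor and extractor names are placeholders for resources registered in your installation)::

   verify_gmm.py --database <database> --preprocessor <preprocessor> --extractor <extractor> --algorithm gmm --parallel 4 --sub-directory gmm_parallel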
.. todo:: improve the documentation of the parallelized scripts.
@@ -15,58 +15,4 @@ Miscellaneous functions
bob.bio.base.get_config
Tools to run recognition experiments
------------------------------------
Command line generation
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
bob.bio.gmm.tools.add_parallel_gmm_options
bob.bio.gmm.tools.initialize_parallel_gmm
bob.bio.gmm.tools.add_jobs
Parallel GMM
~~~~~~~~~~~~
.. autosummary::
bob.bio.gmm.tools.kmeans_initialize
bob.bio.gmm.tools.kmeans_estep
bob.bio.gmm.tools.kmeans_mstep
bob.bio.gmm.tools.gmm_initialize
bob.bio.gmm.tools.gmm_estep
bob.bio.gmm.tools.gmm_mstep
bob.bio.gmm.tools.gmm_project
Parallel ISV
~~~~~~~~~~~~
.. autosummary::
bob.bio.gmm.tools.train_isv
Parallel I-Vector
~~~~~~~~~~~~~~~~~
.. autosummary::
bob.bio.gmm.tools.ivector_estep
bob.bio.gmm.tools.ivector_mstep
bob.bio.gmm.tools.ivector_project
bob.bio.gmm.tools.train_whitener
Integration with bob.bio.video
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
bob.bio.gmm.tools.is_video_extension
bob.bio.gmm.tools.base
bob.bio.gmm.tools.read_feature
Details
-------
.. automodule:: bob.bio.gmm.tools
.. include:: links.rst
@@ -101,12 +101,6 @@ setup(
# scripts should be declared using this entry:
'console_scripts' : [
'verify_gmm.py = bob.bio.gmm.script.verify_gmm:main',
'verify_isv.py = bob.bio.gmm.script.verify_isv:main',
'verify_ivector.py = bob.bio.gmm.script.verify_ivector:main',
'train_gmm.py = bob.bio.gmm.script.train_gmm:main',
'train_isv.py = bob.bio.gmm.script.train_isv:main',
'train_ivector.py = bob.bio.gmm.script.train_ivector:main',
],
'bob.bio.database': [
......