diff --git a/bob/pad/base/script/spoof.py b/bob/pad/base/script/spoof.py index 8e04ce849af3e8d7844c9ab1b88aad5ed4db5ba5..e985e271ceadc5e4bb1ed15cd8eb22e8e5d9467d 100644 --- a/bob/pad/base/script/spoof.py +++ b/bob/pad/base/script/spoof.py @@ -31,15 +31,16 @@ def parse_arguments(command_line_parameters, exclude_resources_from=[]): # Add sub-tasks that can be executed by this script parser = parsers['main'] parser.add_argument('--sub-task', - choices=('preprocess', 'extract', 'train-projector', 'project', 'compute-scores'), + choices=('preprocess', 'train-extractor', 'extract', 'train-projector', 'project', + 'compute-scores'), help=argparse.SUPPRESS) # 'Executes a subtask (FOR INTERNAL USE ONLY!!!)' parser.add_argument('--group', help=argparse.SUPPRESS) # 'The group for which the current action should be performed' # now that we have set up everything, get the command line arguments return tools.initialize(parsers, command_line_parameters, - skips=['preprocessing', 'extraction', 'projector-training', 'projection', - 'score-computation']) + skips=['preprocessing', 'extractor-training', 'extraction', 'projector-training', + 'projection', 'score-computation']) def add_jobs(args, submitter): @@ -65,6 +66,18 @@ def add_jobs(args, submitter): **args.grid.preprocessing_queue) deps.append(job_ids['preprocessing']) + # feature extraction training + if not args.skip_extractor_training and args.extractor.requires_training: + if args.grid is None: + jobs_to_execute.append(('train-extractor',)) + else: + job_ids['extractor-training'] = submitter.submit( + '--sub-task train-extractor', + name='train-f', + dependencies=deps, + **args.grid.training_queue) + deps.append(job_ids['extractor-training']) + # feature extraction if not args.skip_extraction: if args.grid is None: @@ -151,6 +164,15 @@ def execute(args): groups=tools.groups(args), indices=biotools.indices(fs.original_data_list(groups=tools.groups(args)), None if args.grid is None else args.grid.number_of_preprocessing_jobs), 
+ allow_missing_files=args.allow_missing_files, + force=args.force) + + # train the feature extractor + elif args.sub_task == 'train-extractor': + tools.train_extractor( + args.extractor, + args.preprocessor, + allow_missing_files=args.allow_missing_files, force=args.force) # extract the features @@ -161,6 +183,7 @@ def execute(args): groups=tools.groups(args), indices=biotools.indices(fs.preprocessed_data_list(groups=tools.groups(args)), None if args.grid is None else args.grid.number_of_extraction_jobs), + allow_missing_files=args.allow_missing_files, force=args.force) # train the feature projector @@ -168,6 +191,7 @@ def execute(args): tools.train_projector( args.algorithm, args.extractor, + allow_missing_files=args.allow_missing_files, force=args.force) # project the features @@ -178,6 +202,7 @@ def execute(args): groups=tools.groups(args), indices=biotools.indices(fs.preprocessed_data_list(groups=tools.groups(args)), None if args.grid is None else args.grid.number_of_projection_jobs), + allow_missing_files=args.allow_missing_files, force=args.force) # compute scores @@ -185,6 +210,7 @@ def execute(args): tools.compute_scores( args.algorithm, groups=[args.group], + allow_missing_files=args.allow_missing_files, force=args.force, write_compressed=args.write_compressed_score_files) diff --git a/bob/pad/base/tools/FileSelector.py b/bob/pad/base/tools/FileSelector.py index aa0e8342a7797b6b61708fe73afc62afa4138f51..79401f65d12140875cb67cf273310bed18239a6b 100644 --- a/bob/pad/base/tools/FileSelector.py +++ b/bob/pad/base/tools/FileSelector.py @@ -25,6 +25,9 @@ class FileSelector(object): preprocessed_directory : str The directory, where preprocessed data should be written to. + extractor_file : str + The filename, where the extractor should be written to (if any). + extracted_directory : str The directory, where extracted features should be written to. 
@@ -51,6 +54,7 @@ class FileSelector(object): self, database, preprocessed_directory, + extractor_file, extracted_directory, projector_file, projected_directory, @@ -61,6 +65,7 @@ class FileSelector(object): """Initialize the file selector object with the current configuration.""" self.database = database + self.extractor_file = extractor_file self.projector_file = projector_file self.score_directories = score_directories diff --git a/bob/pad/base/tools/algorithm.py b/bob/pad/base/tools/algorithm.py index 34ed55cf23cb5a5a06b117f00393a3806f757198..0bd4cc3c17df51b98682e5b98ecde1115936e3ab 100644 --- a/bob/pad/base/tools/algorithm.py +++ b/bob/pad/base/tools/algorithm.py @@ -18,7 +18,7 @@ from .extractor import read_features from bob.bio.base import utils -def train_projector(algorithm, extractor, force=False): +def train_projector(algorithm, extractor, allow_missing_files=False, force=False): """Trains the feature projector using extracted features of the ``'train'`` group, if the algorithm requires projector training. This function should only be called, when the ``algorithm`` actually requires projector training. @@ -60,7 +60,7 @@ def train_projector(algorithm, extractor, force=False): algorithm.train_projector(train_features, fs.projector_file) -def project(algorithm, extractor, groups=None, indices=None, force=False): +def project(algorithm, extractor, groups=None, indices=None, allow_missing_files=False, force=False): """Projects the features for all files of the database. The given ``algorithm`` is used to project all features required for the current experiment. 
diff --git a/bob/pad/base/tools/command_line.py b/bob/pad/base/tools/command_line.py index 041e31d0e3adc47be8644af62efff259646f2099..8146a5827629c1bba46a342461df6236064b42ed 100644 --- a/bob/pad/base/tools/command_line.py +++ b/bob/pad/base/tools/command_line.py @@ -31,14 +31,17 @@ def is_idiap(): def command_line_parser(description=__doc__, exclude_resources_from=[]): """command_line_parser(description=__doc__, exclude_resources_from=[]) -> parsers - Creates an :py:class:`argparse.ArgumentParser` object that includes the minimum set of command line options (which is not so few). + Creates an :py:class:`argparse.ArgumentParser` object that includes the minimum set of command line + options (which is not so few). The ``description`` can be overwritten, but has a (small) default. Included in the parser, several groups are defined. Each group specifies a set of command line options. - For the configurations, registered resources are listed, which can be limited by the ``exclude_resources_from`` list of extensions. + For the configurations, registered resources are listed, which can be limited by the + ``exclude_resources_from`` list of extensions. - It returns a dictionary, containing the parser object itself (in the ``'main'`` keyword), and a list of command line groups. + It returns a dictionary, containing the parser object itself (in the ``'main'`` keyword), and + a list of command line groups. **Parameters:** @@ -60,7 +63,8 @@ def command_line_parser(description=__doc__, exclude_resources_from=[]): ####################################################################################### ############## options that are required to be specified ####################### config_group = parser.add_argument_group( - '\nParameters defining the experiment. Most of these parameters can be a registered resource, a configuration file, or even a string that defines a newly created object') + '\nParameters defining the experiment. 
Most of these parameters can be a registered resource, a ' + 'configuration file, or even a string that defines a newly created object') config_group.add_argument('-d', '--database', metavar='x', nargs='+', required=True, help='Database and the protocol; registered databases are: %s' % utils.resource_keys( 'database', exclude_resources_from, package_prefix='bob.pad.')) @@ -83,8 +87,8 @@ def command_line_parser(description=__doc__, exclude_resources_from=[]): help='The sub-directory where the files of the current experiment should be stored. ' 'Please specify a directory name with a name describing your experiment.') config_group.add_argument('--groups', metavar='GROUP', nargs='+', default=['dev'], - help="The groups (i.e., 'train', 'dev', 'eval') for which the models and scores should be " - "generated; by default, only the 'dev' group is evaluated") + help="The groups (i.e., 'train', 'dev', 'eval') for which the models and scores " + "should be generated; by default, only the 'dev' group is evaluated") config_group.add_argument('-P', '--protocol', metavar='PROTOCOL', help='Overwrite the protocol that is stored in the database by the given one ' '(might not by applicable for all databases).') @@ -104,18 +108,27 @@ def command_line_parser(description=__doc__, exclude_resources_from=[]): help='The directory for resulting score files, default is: %s.' % results) file_group = parser.add_argument_group( - '\nName (maybe including a path relative to the --temp-directory, if not specified otherwise) of files that will be generated. Note that not all files will be used by all algorithms') + '\nName (maybe including a path relative to the --temp-directory, if not specified otherwise) of files ' + 'that will be generated. 
Note that not all files will be used by all algorithms') + file_group.add_argument('--extractor-file', metavar='FILE', default='Extractor.hdf5', + help='Name of the file to write the feature extractor into ' + '(used only if the extractor requires training).') file_group.add_argument('--projector-file', metavar='FILE', default='Projector.hdf5', help='Name of the file to write the feature projector into.') file_group.add_argument('-G', '--gridtk-database-file', metavar='FILE', default='submitted.sql3', - help='The database file in which the submitted jobs will be written; relative to the current directory (only valid with the --grid option).') + help='The database file in which the submitted jobs will be written; relative to the ' + 'current directory (only valid with the --grid option).') file_group.add_argument('--experiment-info-file', metavar='FILE', default='Experiment.info', - help='The file where the configuration of all parts of the experiments are written; relative to te --result-directory.') + help='The file where the configuration of all parts of the experiments are written; ' + 'relative to the --result-directory.') file_group.add_argument('-D', '--database-directories-file', metavar='FILE', default=database_replacement, - help='An optional file, where database directories are stored (to avoid changing the database configurations)') + help='An optional file, where database directories are stored (to avoid changing the ' + 'database configurations)') sub_dir_group = parser.add_argument_group( - '\nSubdirectories of certain parts of the tool chain. You can specify directories in case you want to reuse parts of the experiments (e.g. extracted features) in other experiments. Please note that these directories are relative to the --temp-directory, but you can also specify absolute paths') + '\nSubdirectories of certain parts of the tool chain. You can specify directories in case you want to ' + 'reuse parts of the experiments (e.g. 
extracted features) in other experiments. Please note that these ' + 'directories are relative to the --temp-directory, but you can also specify absolute paths') sub_dir_group.add_argument('--preprocessed-directory', metavar='DIR', default='preprocessed', help='Name of the directory of the preprocessed data.') sub_dir_group.add_argument('--extracted-directory', metavar='DIR', default='extracted', @@ -123,9 +136,11 @@ def command_line_parser(description=__doc__, exclude_resources_from=[]): sub_dir_group.add_argument('--projected-directory', metavar='DIR', default='projected', help='Name of the directory where the projected data should be stored.') sub_dir_group.add_argument('--score-directories', metavar='DIR', nargs='+', default=['scores'], - help='Name of the directory (relative to --result-directory) where to write the results to') + help='Name of the directory (relative to --result-directory) where to write ' + 'the results to') sub_dir_group.add_argument('--grid-log-directory', metavar='DIR', default='gridtk_logs', - help='Name of the directory (relative to --temp-directory) where to log files are written; only used with --grid') + help='Name of the directory (relative to --temp-directory) where the log files ' + 'are written; only used with --grid') flag_group = parser.add_argument_group('\nFlags that change the behavior of the experiment') bob.core.log.add_command_line_option(flag_group) @@ -136,18 +151,25 @@ def command_line_parser(description=__doc__, exclude_resources_from=[]): flag_group.add_argument('-Z', '--write-compressed-score-files', action='store_true', help='Writes score files which are compressed with tar.bz2.') flag_group.add_argument('-S', '--stop-on-failure', action='store_true', - help='Try to recursively stop the dependent jobs from the SGE grid queue, when a job failed') + help='Try to recursively stop the dependent jobs from the SGE grid queue, ' + 'when a job failed') flag_group.add_argument('-X', '--external-dependencies', type=int, 
default=[], nargs='+', help='The jobs submitted to the grid have dependencies on the given job ids.') flag_group.add_argument('-B', '--timer', choices=('real', 'system', 'user'), nargs='*', - help='Measure and report the time required by the execution of the tool chain (only on local machine)') + help='Measure and report the time required by the execution of the tool chain ' + '(only on local machine)') flag_group.add_argument('-L', '--run-local-scheduler', action='store_true', - help='Starts the local scheduler after submitting the jobs to the local queue (by default, local jobs must be started by hand, e.g., using ./bin/jman --local -vv run-scheduler -x)') + help='Starts the local scheduler after submitting the jobs to the local queue ' + '(by default, local jobs must be started by hand, e.g., using ./bin/jman ' + '--local -vv run-scheduler -x)') flag_group.add_argument('-N', '--nice', type=int, default=10, help='Runs the local scheduler with the given nice value') flag_group.add_argument('-D', '--delete-jobs-finished-with-status', choices=('all', 'failure', 'success'), help='If selected, local scheduler jobs that finished with the given status are deleted ' 'from the --gridtk-database-file; otherwise the jobs remain in the database') + flag_group.add_argument('-A', '--allow-missing-files', action='store_true', + help="If given, missing files will not stop the processing; this is helpful if not " + "all files of the database can be processed; missing scores will be NaN.") flag_group.add_argument('-t', '--environment', dest='env', nargs='*', default=[], help='Passes specific environment variables to the job.') @@ -191,7 +213,8 @@ def initialize(parsers, command_line_parameters=None, skips=[]): args : namespace A namespace of arguments as read from the command line. - .. note:: The database, preprocessor, extractor, algorithm and grid (if specified) are actual instances of the according classes. + .. 
note:: The database, preprocessor, extractor, algorithm and grid (if specified) are actual + instances of the according classes. """ # execute-only @@ -222,10 +245,14 @@ def initialize(parsers, command_line_parameters=None, skips=[]): args.timer = ('real', 'system', 'user') # load configuration resources - args.database = utils.load_resource(' '.join(args.database), 'database', imports=args.imports, package_prefix='bob.pad.') - args.preprocessor = utils.load_resource(' '.join(args.preprocessor), 'preprocessor', imports=args.imports, package_prefix='bob.pad.') - args.extractor = utils.load_resource(' '.join(args.extractor), 'extractor', imports=args.imports, package_prefix='bob.pad.') - args.algorithm = utils.load_resource(' '.join(args.algorithm), 'algorithm', imports=args.imports, package_prefix='bob.pad.') + args.database = utils.load_resource(' '.join(args.database), 'database', imports=args.imports, + package_prefix='bob.pad.') + args.preprocessor = utils.load_resource(' '.join(args.preprocessor), 'preprocessor', imports=args.imports, + package_prefix='bob.pad.') + args.extractor = utils.load_resource(' '.join(args.extractor), 'extractor', imports=args.imports, + package_prefix='bob.pad.') + args.algorithm = utils.load_resource(' '.join(args.algorithm), 'algorithm', imports=args.imports, + package_prefix='bob.pad.') if args.grid is not None: args.grid = utils.load_resource(' '.join(args.grid), 'grid', imports=args.imports, package_prefix='bob.pad.') @@ -234,7 +261,7 @@ def initialize(parsers, command_line_parameters=None, skips=[]): args.temp_directory = "/idiap/temp/%s/%s" % (os.environ["USER"], args.database.name) if is_idiap() else "temp" if args.result_directory is None: args.result_directory = "/idiap/user/%s/%s" % ( - os.environ["USER"], args.database.name) if is_idiap() else "results" + os.environ["USER"], args.database.name) if is_idiap() else "results" args.temp_directory = os.path.join(args.temp_directory, args.sub_directory) args.result_directory = 
os.path.join(args.result_directory, args.sub_directory) @@ -250,8 +277,10 @@ def initialize(parsers, command_line_parameters=None, skips=[]): args.info_file = os.path.join(args.result_directory, protocol, args.experiment_info_file) # sub-directorues that depend on the database - extractor_sub_dir = '.' - projector_sub_dir = extractor_sub_dir + extractor_sub_dir = protocol if args.database.training_depends_on_protocol and \ + args.extractor.requires_training else '.' + projector_sub_dir = protocol if args.database.training_depends_on_protocol and \ + args.algorithm.requires_projector_training else extractor_sub_dir # Database directories, which should be automatically replaced if isinstance(args.database, PadDatabase): @@ -260,6 +289,7 @@ def initialize(parsers, command_line_parameters=None, skips=[]): # initialize the file selector FileSelector.create( database=args.database, + extractor_file=os.path.join(args.temp_directory, extractor_sub_dir, args.extractor_file), projector_file=os.path.join(args.temp_directory, projector_sub_dir, args.projector_file), preprocessed_directory=os.path.join(args.temp_directory, args.preprocessed_directory), extracted_directory=os.path.join(args.temp_directory, extractor_sub_dir, args.extracted_directory), @@ -299,7 +329,8 @@ def command_line(cmdline): """command_line(cmdline) -> str Converts the given options to a string that can be executed in a terminal. - Parameters are enclosed into ``'...'`` quotes so that the command line can interpret them (e.g., if they contain spaces or special characters). + Parameters are enclosed into ``'...'`` quotes so that the command line can interpret them (e.g., if they + contain spaces or special characters). 
**Parameters:** diff --git a/bob/pad/base/tools/extractor.py b/bob/pad/base/tools/extractor.py index 018076d3c8160a93d70b1cafe8827b7ef4ab7977..f67783acf909fff5bf32c180ba959b106876ceb3 100644 --- a/bob/pad/base/tools/extractor.py +++ b/bob/pad/base/tools/extractor.py @@ -15,9 +15,55 @@ logger = logging.getLogger("bob.pad.base") from .FileSelector import FileSelector from bob.bio.base import utils +from .preprocessor import read_preprocessed_data -def extract(extractor, preprocessor, groups=None, indices=None, force=False): +def train_extractor(extractor, preprocessor, allow_missing_files=False, force=False): + """Trains the feature extractor using preprocessed data of the ``'train'`` group, + if the feature extractor requires training. + + This function should only be called, when the ``extractor`` actually requires training. + The given ``extractor`` is trained using preprocessed data. + It writes the extractor to the file specified by the :py:class:`bob.pad.base.tools.FileSelector`. + By default, if the target file already exist, it is not re-created. + + **Parameters:** + + extractor : py:class:`bob.bio.base.extractor.Extractor` or derived + The extractor to be trained. + + preprocessor : py:class:`bob.bio.base.preprocessor.Preprocessor` or derived + The preprocessor, used for reading the preprocessed data. + + allow_missing_files : bool + If set to ``True``, preprocessed data files that are not found are silently ignored during training. + + force : bool + If given, the extractor file is regenerated, even if it already exists. 
+ """ + + if not extractor.requires_training: + logger.warn( + "The train_extractor function should not have been called, since the extractor does not need training.") + return + + # the file selector object + fs = FileSelector.instance() + # the file to write + if utils.check_file(fs.extractor_file, force, + extractor.min_extractor_file_size): + logger.info("- Extraction: extractor '%s' already exists.", fs.extractor_file) + else: + bob.io.base.create_directories_safe(os.path.dirname(fs.extractor_file)) + # read training files + train_files = fs.training_list('preprocessed', 'train_extractor') + train_data = read_preprocessed_data(train_files, preprocessor) + logger.info("- Extraction: training extractor '%s' using %d training files:", fs.extractor_file, + len(train_files)) + # train model + extractor.train(train_data, fs.extractor_file) + +def extract(extractor, preprocessor, groups=None, indices=None, allow_missing_files=False, force=False): """Extracts features from the preprocessed data using the given extractor. The given ``extractor`` is used to extract all features required for the current experiment. 
@@ -46,6 +92,7 @@ def extract(extractor, preprocessor, groups=None, indices=None, force=False): """ # the file selector object fs = FileSelector.instance() + extractor.load(fs.extractor_file) data_files = fs.preprocessed_data_list(groups=groups) feature_files = fs.feature_list(groups=groups) diff --git a/bob/pad/base/tools/preprocessor.py b/bob/pad/base/tools/preprocessor.py index 163ceef1ac4b3f915adabb6e8f4ad229471ea728..9ce6ee6630538f007f9e51b5a29d011495777c43 100644 --- a/bob/pad/base/tools/preprocessor.py +++ b/bob/pad/base/tools/preprocessor.py @@ -17,7 +17,7 @@ from .FileSelector import FileSelector from bob.bio.base import utils -def preprocess(preprocessor, groups=None, indices=None, force=False): +def preprocess(preprocessor, groups=None, indices=None, allow_missing_files=False, force=False): """Preprocesses the original data of the database with the given preprocessor. The given ``preprocessor`` is used to preprocess all data required for the current experiment. diff --git a/bob/pad/base/tools/scoring.py b/bob/pad/base/tools/scoring.py index 12c713cac00c8624e3aec8e55b5f69b1f7f43498..7c5639a74d149bc0139d233be247bb2439619039 100644 --- a/bob/pad/base/tools/scoring.py +++ b/bob/pad/base/tools/scoring.py @@ -147,7 +147,7 @@ def _scores_all(algorithm, group, force, write_compressed=False): current_toscore_objects[0]+current_toscore_objects[1], write_compressed) -def compute_scores(algorithm, force=False, groups=['dev', 'eval'], write_compressed=False): +def compute_scores(algorithm, force=False, groups=['dev', 'eval'], allow_missing_files=False, write_compressed=False): """Computes the scores for the given groups. This function computes all scores for the experiment and writes them to score files.