diff --git a/bob/bio/base/database/database.py b/bob/bio/base/database/database.py
index 2ab5156477c2803242ece20cb1bb8ad27640eabc..d20f76f67caed469ae13ba33095aa2c2bf8f5535 100644
--- a/bob/bio/base/database/database.py
+++ b/bob/bio/base/database/database.py
@@ -9,7 +9,7 @@
 from numpy.testing.decorators import setastest
 import bob.db.base
 
-class BioDatabase(six.with_metaclass(abc.ABCMeta, bob.db.base.Database)):
+class BioDatabase(six.with_metaclass(abc.ABCMeta, bob.db.base.FileDatabase)):
     """This class represents the basic API for database access.
     Please use this class as a base class for your database access classes.
     Do not forget to call the constructor of this base class in your derived class.
@@ -90,7 +90,8 @@ class BioDatabase(six.with_metaclass(abc.ABCMeta, bob.db.base.Database)):
 
         super(BioDatabase, self).__init__(
             original_directory=original_directory,
-            original_extension=original_extension)
+            original_extension=original_extension,
+            **kwargs)
 
         self.name = name
@@ -100,7 +101,7 @@ class BioDatabase(six.with_metaclass(abc.ABCMeta, bob.db.base.Database)):
         self.enroller_training_options = enroller_training_options
         self.check_existence = check_original_files_for_existence
 
-        self._kwargs = kwargs
+        self._kwargs = {}
 
         self.annotation_directory = annotation_directory
         self.annotation_extension = annotation_extension
@@ -196,18 +197,9 @@ class BioDatabase(six.with_metaclass(abc.ABCMeta, bob.db.base.Database)):
 
         if self.original_directory in replacements:
             self.original_directory = replacements[self.original_directory]
-            try:
-                self._db.original_directory = self.original_directory
-            except AttributeError:
-                pass
-
         try:
             if self.annotation_directory in replacements:
                 self.annotation_directory = replacements[self.annotation_directory]
-                try:
-                    self._db.annotation_directory = self.annotation_directory
-                except AttributeError:
-                    pass
         except AttributeError:
             pass
diff --git a/bob/bio/base/database/filelist/models.py b/bob/bio/base/database/filelist/models.py
index f5507f877796c63efe2cee143304bade57ed9df0..a6317136dca29f70d9bec0dd41c7891ed37ab0ff 100644
--- a/bob/bio/base/database/filelist/models.py
+++ b/bob/bio/base/database/filelist/models.py
@@ -87,6 +87,8 @@ class ListReader(object):
             raise RuntimeError('File %s does not exist.' % (list_file,))
         try:
             for line in fileinput.input(list_file):
+                if line.strip().startswith('#'):
+                    continue
                 parsed_line = re.findall('[\w/(-.)]+', line)
                 if len(parsed_line):
                     # perform some sanity checks
diff --git a/bob/bio/base/database/filelist/query.py b/bob/bio/base/database/filelist/query.py
index 2d6eaf1399d63608f3ea6f49564ee7bf507f23f1..4bb6487ba9a86e080f6cd5e2ee91b11653a3cf57 100644
--- a/bob/bio/base/database/filelist/query.py
+++ b/bob/bio/base/database/filelist/query.py
@@ -10,6 +10,9 @@
 from .. import BioFile
 from .models import ListReader
 
+import logging
+logger = logging.getLogger('bob.bio.base')
+
 
 class FileListBioDatabase(ZTBioDatabase):
     """This class provides a user-friendly interface to databases that are given as file lists.
@@ -127,7 +130,6 @@ class FileListBioDatabase(ZTBioDatabase):
        and the given sub-directories and file names (which default to useful values if not given)."""
 
         super(FileListBioDatabase, self).__init__(
-            filelists_directory=filelists_directory,
             name=name,
             protocol=protocol,
             original_directory=original_directory,
@@ -135,7 +137,10 @@ class FileListBioDatabase(ZTBioDatabase):
             annotation_directory=annotation_directory,
             annotation_extension=annotation_extension,
             annotation_type=annotation_type,
-            # extra args for pretty printing
+            **kwargs)
+        # extra args for pretty printing
+        self._kwargs.update(dict(
+            filelists_directory=filelists_directory,
             dev_sub_directory=dev_sub_directory,
             eval_sub_directory=eval_sub_directory,
             world_filename=world_filename,
@@ -147,9 +152,10 @@ class FileListBioDatabase(ZTBioDatabase):
             tnorm_filename=tnorm_filename,
             znorm_filename=znorm_filename,
             use_dense_probe_file_list=use_dense_probe_file_list,
-            # if both probe_filename and scores_filename are given, what kind of list should be used?
+            # if both probe_filename and scores_filename are given, what kind
+            # of list should be used?
             keep_read_lists_in_memory=keep_read_lists_in_memory,
-            **kwargs)
+            ))
         # self.original_directory = original_directory
         # self.original_extension = original_extension
         self.bio_file_class = bio_file_class
@@ -226,10 +232,12 @@ class FileListBioDatabase(ZTBioDatabase):
             if group == 'world':
                 continue
             if add_zt_files:
-                if not self.implements_zt(self.protocol, group):
-                    raise ValueError("ZT score files are requested, but no such files are defined in group %s for protocol %s", group, self.protocol)
-                files += self.tobjects(group, self.protocol)
-                files += self.zobjects(group, self.protocol, **self.z_probe_options)
+                if self.implements_zt(self.protocol, group):
+                    files += self.tobjects(group, self.protocol)
+                    files += self.zobjects(group, self.protocol, **self.z_probe_options)
+                else:
+                    logger.warn("ZT score files are requested, but no such files are defined in group %s for protocol %s", group, self.protocol)
+
         return self.sort(self._make_bio(files))
diff --git a/bob/bio/base/extractor/stacks.py b/bob/bio/base/extractor/stacks.py
index aae378385d0b4ebeacbc080f1462e73056fae4d7..780dafdb47b211ad0b8aa70af2b0c559b9f636e1 100644
--- a/bob/bio/base/extractor/stacks.py
+++ b/bob/bio/base/extractor/stacks.py
@@ -7,14 +7,15 @@ class MultipleExtractor(Extractor):
     """Base class for SequentialExtractor and ParallelExtractor. This class
     is not meant to be used directly."""
 
-    def get_attributes(self, processors):
+    @staticmethod
+    def get_attributes(processors):
         requires_training = any(p.requires_training for p in processors)
         split_training_data_by_client = any(p.split_training_data_by_client
                                             for p in processors)
         min_extractor_file_size = min(p.min_extractor_file_size for p in
                                       processors)
-        min_feature_file_size = min(
-            p.min_feature_file_size for p in processors)
+        min_feature_file_size = min(p.min_feature_file_size for p in
+                                    processors)
         return (requires_training, split_training_data_by_client,
                 min_extractor_file_size, min_feature_file_size)
@@ -23,35 +24,59 @@ class MultipleExtractor(Extractor):
         return groups
 
     def train_one(self, e, training_data, extractor_file, apply=False):
+        """Trains one extractor and optionally applies the extractor on the
+        training data after training.
+
+        Parameters
+        ----------
+        e : :any:`Extractor`
+            The extractor to train. The extractor should be able to save itself
+            in an opened hdf5 file.
+        training_data : [object] or [[object]]
+            The data to be used for training.
+        extractor_file : :any:`bob.io.base.HDF5File`
+            The opened hdf5 file to save the trained extractor inside.
+        apply : :obj:`bool`, optional
+            If ``True``, the extractor is applied to the training data after it
+            is trained and the data is returned.
+
+        Returns
+        -------
+        None or [object] or [[object]]
+            Returns ``None`` if ``apply`` is ``False``. Otherwise, returns the
+            transformed ``training_data``.
+        """
         if not e.requires_training:
-            return
+            # do nothing since e does not require training!
+            pass
         # if any of the extractors require splitting the data, the
         # split_training_data_by_client is True.
-        if e.split_training_data_by_client:
+        elif e.split_training_data_by_client:
             e.train(training_data, extractor_file)
-            if not apply:
-                return
-            training_data = [[e(d) for d in datalist]
-                             for datalist in training_data]
         # when no extractor needs splitting
         elif not self.split_training_data_by_client:
             e.train(training_data, extractor_file)
-            if not apply:
-                return
-            training_data = [e(d) for d in training_data]
         # when e here wants it flat but the data is split
         else:
             # make training_data flat
-            aligned_training_data = [d for datalist in training_data for d in
-                                     datalist]
-            e.train(aligned_training_data, extractor_file)
-            if not apply:
-                return
+            flat_training_data = [d for datalist in training_data for d in
+                                  datalist]
+            e.train(flat_training_data, extractor_file)
+
+        if not apply:
+            return
+
+        # prepare the training data for the next extractor
+        if self.split_training_data_by_client:
             training_data = [[e(d) for d in datalist]
                              for datalist in training_data]
+        else:
+            training_data = [e(d) for d in training_data]
         return training_data
 
     def load(self, extractor_file):
+        if not self.requires_training:
+            return
         with HDF5File(extractor_file) as f:
             groups = self.get_extractor_groups()
             for e, group in zip(self.processors, groups):
@@ -88,7 +113,7 @@ class SequentialExtractor(SequentialProcessor, MultipleExtractor):
     True
     """
 
-    def __init__(self, processors):
+    def __init__(self, processors, **kwargs):
 
         (requires_training, split_training_data_by_client,
          min_extractor_file_size, min_feature_file_size) = \
@@ -99,15 +124,18 @@ class SequentialExtractor(SequentialProcessor, MultipleExtractor):
             requires_training=requires_training,
             split_training_data_by_client=split_training_data_by_client,
             min_extractor_file_size=min_extractor_file_size,
-            min_feature_file_size=min_feature_file_size)
+            min_feature_file_size=min_feature_file_size,
+            **kwargs)
 
     def train(self, training_data, extractor_file):
         with HDF5File(extractor_file, 'w') as f:
             groups = self.get_extractor_groups()
-            for e, group in zip(self.processors, groups):
+            for i, (e, group) in enumerate(zip(self.processors, groups)):
+                apply = i != len(self.processors) - 1
                 f.create_group(group)
                 f.cd(group)
-                training_data = self.train_one(e, training_data, f, apply=True)
+                training_data = self.train_one(e, training_data, f,
+                                               apply=apply)
                 f.cd('..')
 
     def read_feature(self, feature_file):
@@ -154,7 +182,7 @@ class ParallelExtractor(ParallelProcessor, MultipleExtractor):
           [ 1. ,  2. ,  3. ,  0.5,  1. ,  1.5]])
     """
 
-    def __init__(self, processors):
+    def __init__(self, processors, **kwargs):
 
         (requires_training, split_training_data_by_client,
          min_extractor_file_size, min_feature_file_size) = self.get_attributes(
@@ -165,7 +193,8 @@ class ParallelExtractor(ParallelProcessor, MultipleExtractor):
             requires_training=requires_training,
             split_training_data_by_client=split_training_data_by_client,
             min_extractor_file_size=min_extractor_file_size,
-            min_feature_file_size=min_feature_file_size)
+            min_feature_file_size=min_feature_file_size,
+            **kwargs)
 
     def train(self, training_data, extractor_file):
         with HDF5File(extractor_file, 'w') as f:
diff --git a/bob/bio/base/test/data/example_filelist2/dev/for_models.lst b/bob/bio/base/test/data/example_filelist2/dev/for_models.lst
index 7bdbf751bd8c6f432f780199a77a8d60a45fa388..c6618bb72f48eebe17a5be41c74bbc1a626e2ffd 100644
--- a/bob/bio/base/test/data/example_filelist2/dev/for_models.lst
+++ b/bob/bio/base/test/data/example_filelist2/dev/for_models.lst
@@ -1,3 +1,6 @@
+# Model samples
+#Modelsamples
+#data/model3_session1_sample1 3 3
 data/model3_session1_sample1 3 3
 data/model3_session1_sample2 3 3
 data/model3_session1_sample3 3 3
diff --git a/bob/bio/base/test/dummy/extractor.py b/bob/bio/base/test/dummy/extractor.py
index a3aaf6f7ea04347393db8cbc8efd1dac95e98000..eca7517ca8a46676074f93410d95b98d71757721 100644
--- a/bob/bio/base/test/dummy/extractor.py
+++ b/bob/bio/base/test/dummy/extractor.py
@@ -1,5 +1,5 @@
 import numpy
-import bob.io.base
+import bob.bio.base
 
 from bob.bio.base.extractor import Extractor
 
@@ -12,10 +12,10 @@ class DummyExtractor (Extractor):
 
     def train(self, train_data, extractor_file):
         assert isinstance(train_data, list)
-        bob.io.base.save(_data, extractor_file)
+        bob.bio.base.save(_data, extractor_file)
 
     def load(self, extractor_file):
-        data = bob.io.base.load(extractor_file)
+        data = bob.bio.base.load(extractor_file)
         assert (_data == data).all()
         self.model = True
diff --git a/bob/bio/base/test/test_filelist.py b/bob/bio/base/test/test_filelist.py
index 405591a866ba3e7bc2c630a27b533d2e5a39dbb6..c1c0735b475755e8476b783379ffb568a274640b 100644
--- a/bob/bio/base/test/test_filelist.py
+++ b/bob/bio/base/test/test_filelist.py
@@ -136,6 +136,12 @@ def test_query_protocol():
     assert len(db.objects(protocol=prot, groups='dev', purposes='probe')) == 9
 
 
+def test_noztnorm():
+    db = FileListBioDatabase(os.path.join(os.path.dirname(example_dir),
+                                          'example_filelist2'), 'test')
+    assert len(db.all_files())
+
+
 def test_query_dense():
     db = FileListBioDatabase(example_dir, 'test', use_dense_probe_file_list=True)
diff --git a/bob/bio/base/test/test_stacks.py b/bob/bio/base/test/test_stacks.py
index 926901382af5455ee292ace46abc885582baaaa6..a296a9a33ebed07b3176d5955c84a7e254f9ca71 100644
--- a/bob/bio/base/test/test_stacks.py
+++ b/bob/bio/base/test/test_stacks.py
@@ -1,11 +1,13 @@
 from functools import partial
 import numpy as np
+import tempfile
 from bob.bio.base.utils.processors import (
     SequentialProcessor, ParallelProcessor)
 from bob.bio.base.preprocessor import (
     SequentialPreprocessor, ParallelPreprocessor, CallablePreprocessor)
 from bob.bio.base.extractor import (
     SequentialExtractor, ParallelExtractor, CallableExtractor)
+from bob.bio.base.test.dummy.extractor import extractor as dummy_extractor
 
 DATA = [0, 1, 2, 3, 4]
 PROCESSORS = [partial(np.power, 2), np.mean]
@@ -37,9 +39,31 @@ def test_preprocessors():
 def test_extractors():
     processors = [CallableExtractor(p) for p in PROCESSORS]
     proc = SequentialExtractor(processors)
+    proc.load(None)
     data = proc(DATA)
     assert np.allclose(data, SEQ_DATA)
 
     proc = ParallelExtractor(processors)
+    proc.load(None)
     data = proc(DATA)
     assert all(np.allclose(x1, x2) for x1, x2 in zip(data, PAR_DATA))
+
+
+def test_sequential_trainable_extractors():
+    processors = [CallableExtractor(p) for p in PROCESSORS] + [dummy_extractor]
+    proc = SequentialExtractor(processors)
+    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
+        proc.train(DATA, f.name)
+        proc.load(f.name)
+    data = proc(DATA)
+    assert np.allclose(data, SEQ_DATA)
+
+
+def test_parallel_trainable_extractors():
+    processors = [CallableExtractor(p) for p in PROCESSORS] + [dummy_extractor]
+    proc = ParallelExtractor(processors)
+    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
+        proc.train(DATA, f.name)
+        proc.load(f.name)
+    data = proc(np.array(DATA))
+    assert all(np.allclose(x1, x2) for x1, x2 in zip(data, PAR_DATA))
diff --git a/bob/bio/base/tools/algorithm.py b/bob/bio/base/tools/algorithm.py
index 8cd0010538f0c17351c329be33571b0a14598a56..0447df71cbc0a8331bf799b81bfc3aafc19210d9 100644
--- a/bob/bio/base/tools/algorithm.py
+++ b/bob/bio/base/tools/algorithm.py
@@ -123,7 +123,8 @@ def project(algorithm, extractor, groups = None, indices = None, allow_missing_f
 
     if not utils.check_file(projected_file, force, algorithm.min_projected_file_size):
-      logger.debug("... Projecting features for file '%s'", feature_file)
+      logger.debug("... Projecting features for file '%s' (%d/%d)",
+                   feature_file, index_range.index(i)+1, len(index_range))
       # create output directory before reading the data file (is sometimes required, when relative directories are specified, especially, including a .. somewhere)
       bob.io.base.create_directories_safe(os.path.dirname(projected_file))
       # load feature
@@ -256,7 +257,7 @@ def enroll(algorithm, extractor, compute_zt_norm, indices = None, groups = ['dev
       logger.info("- Enrollment: splitting of index range %s", str(indices))
 
     logger.info("- Enrollment: enrolling models of group '%s'", group)
-    for model_id in model_ids:
+    for pos, model_id in enumerate(model_ids):
       # Path to the model
       model_file = fs.model_file(model_id, group)
 
@@ -271,7 +272,9 @@ def enroll(algorithm, extractor, compute_zt_norm, indices = None, groups = ['dev
           logger.debug("... Skipping model file %s since no feature file could be found", model_file)
           continue
 
-        logger.debug("... Enrolling model from %d features to file '%s'", len(enroll_files), model_file)
+        logger.debug("... Enrolling model '%d' from %d feature(s) to "
+                     "file '%s' (%d/%d)", model_id, len(enroll_files), model_file,
+                     pos+1, len(model_ids))
         bob.io.base.create_directories_safe(os.path.dirname(model_file))
 
         # load all files into memory
diff --git a/bob/bio/base/tools/command_line.py b/bob/bio/base/tools/command_line.py
index 91af40e7a67739478993edea4402ce32baaf6b2c..e766070605b3e5b60593e8d9ba3ae22047331f91 100644
--- a/bob/bio/base/tools/command_line.py
+++ b/bob/bio/base/tools/command_line.py
@@ -295,6 +295,9 @@ def parse_config_file(parsers, args, args_dictionary, keywords, skips):
   take_from_config_or_command_line(args, config, "sub_directory",
                                    parser.get_default("sub_directory"), is_resource=False)
 
+  take_from_config_or_command_line(args, config, "env",
+                                   parser.get_default("env"), is_resource=False)
+
   skip_keywords = tuple(['skip_' + k.replace('-', '_') for k in skips])
 
   for keyword in keywords + skip_keywords + ('execute_only',):
diff --git a/bob/bio/base/tools/extractor.py b/bob/bio/base/tools/extractor.py
index 7a0b62e2cf30ac55d33ba9a507961978aa10c88c..281314677f7a5e8766fcbe8ee92986c91414de64 100644
--- a/bob/bio/base/tools/extractor.py
+++ b/bob/bio/base/tools/extractor.py
@@ -112,7 +112,8 @@ def extract(extractor, preprocessor, groups=None, indices = None, allow_missing_
 
     if not utils.check_file(feature_file, force, extractor.min_feature_file_size):
-      logger.debug("... Extracting features for data file '%s'", data_file)
+      logger.debug("... Extracting features for data file '%s' (%d/%d)",
+                   data_file, index_range.index(i)+1, len(index_range))
      # create output directory before reading the data file (is sometimes required, when relative directories are specified, especially, including a .. somewhere)
      bob.io.base.create_directories_safe(os.path.dirname(feature_file))
      # load data
diff --git a/bob/bio/base/tools/preprocessor.py b/bob/bio/base/tools/preprocessor.py
index 1f7374cf98a3d9135035eabfb76fd140491094d2..83eafab8953940e3a668c398e6b4e6166fa5cccf 100644
--- a/bob/bio/base/tools/preprocessor.py
+++ b/bob/bio/base/tools/preprocessor.py
@@ -67,7 +67,8 @@ def preprocess(preprocessor, groups = None, indices = None, allow_missing_files
 
     # check for existence
     if not utils.check_file(preprocessed_data_file, force, preprocessor.min_preprocessed_file_size):
-      logger.debug("... Processing original data file '%s'", file_name)
+      logger.debug("... Processing original data file '%s' (%d/%d)", file_name,
+                   index_range.index(i)+1, len(index_range))
       data = preprocessor.read_original_data(file_object, original_directory, original_extension)
       # create output directory before reading the data file (is sometimes required, when relative directories are specified, especially, including a .. somewhere)
diff --git a/bob/bio/base/tools/scoring.py b/bob/bio/base/tools/scoring.py
index 778b4f46e530c56fc86e9be642c4218fb0a556c6..1649add9826561f8c13a9336c9538154958a6773 100644
--- a/bob/bio/base/tools/scoring.py
+++ b/bob/bio/base/tools/scoring.py
@@ -131,9 +131,11 @@ def _scores_a(algorithm, reader, model_ids, group, compute_zt_norm, force, write
   logger.info("- Scoring: computing scores for group '%s'", group)
 
   # Computes the raw scores for each model
-  for model_id in model_ids:
+  for pos, model_id in enumerate(model_ids):
     # test if the file is already there
     score_file = fs.a_file(model_id, group) if compute_zt_norm else fs.no_norm_file(model_id, group)
+    logger.debug("... Scoring model '%s' at '%s' (%d/%d)", model_id, score_file,
+                 pos+1, len(model_ids))
     if utils.check_file(score_file, force):
       logger.warn("Score file '%s' already exists.", score_file)
     else:
@@ -166,9 +168,11 @@ def _scores_b(algorithm, reader, model_ids, group, force, allow_missing_files):
   logger.info("- Scoring: computing score matrix B for group '%s'", group)
 
   # Loads the models
-  for model_id in model_ids:
+  for pos, model_id in enumerate(model_ids):
     # test if the file is already there
     score_file = fs.b_file(model_id, group)
+    logger.debug("... Scoring model '%s' at '%s' (%d/%d)", model_id,
+                 score_file, pos+1, len(model_ids))
     if utils.check_file(score_file, force):
       logger.warn("Score file '%s' already exists.", score_file)
     else:
@@ -191,9 +195,11 @@ def _scores_c(algorithm, reader, t_model_ids, group, force, allow_missing_files)
   logger.info("- Scoring: computing score matrix C for group '%s'", group)
 
   # Computes the raw scores for the T-Norm model
-  for t_model_id in t_model_ids:
+  for pos, t_model_id in enumerate(t_model_ids):
     # test if the file is already there
     score_file = fs.c_file(t_model_id, group)
+    logger.debug("... Scoring model '%s' at '%s' (%d/%d)", t_model_id,
+                 score_file, pos+1, len(t_model_ids))
     if utils.check_file(score_file, force):
       logger.warn("Score file '%s' already exists.", score_file)
     else:
@@ -219,9 +225,11 @@ def _scores_d(algorithm, reader, t_model_ids, group, force, allow_missing_files)
   z_probe_ids = [z_probe_object.client_id for z_probe_object in z_probe_objects]
 
   # Loads the T-Norm models
-  for t_model_id in t_model_ids:
+  for pos, t_model_id in enumerate(t_model_ids):
     # test if the file is already there
     score_file = fs.d_file(t_model_id, group)
+    logger.debug("... Scoring model '%s' at '%s' (%d/%d)", t_model_id,
+                 score_file, pos+1, len(t_model_ids))
     same_score_file = fs.d_same_value_file(t_model_id, group)
     if utils.check_file(score_file, force) and utils.check_file(same_score_file, force):
       logger.warn("score files '%s' and '%s' already exist.", score_file, same_score_file)
diff --git a/bob/bio/base/utils/processors.py b/bob/bio/base/utils/processors.py
index ca04d37cf3c2b34df5b188a49c9dd0e9a5f2908e..b01a953dc34e5962bb74e03801d60f9227ed2838 100644
--- a/bob/bio/base/utils/processors.py
+++ b/bob/bio/base/utils/processors.py
@@ -26,7 +26,7 @@ class SequentialProcessor(object):
     """
 
     def __init__(self, processors, **kwargs):
-        super(SequentialProcessor, self).__init__()
+        super(SequentialProcessor, self).__init__(**kwargs)
         self.processors = processors
 
     def __call__(self, data, **kwargs):
@@ -86,7 +86,7 @@ class ParallelProcessor(object):
     """
 
    def __init__(self, processors, **kwargs):
-        super(ParallelProcessor, self).__init__()
+        super(ParallelProcessor, self).__init__(**kwargs)
         self.processors = processors
 
     def __call__(self, data, **kwargs):
diff --git a/doc/filelist-guide.rst b/doc/filelist-guide.rst
index 547ac79767b12d6fafe8f64333a4675056bc6144..969e812f4ab3ab4f5f28935bfb9d5e6e9e5ceeec 100644
--- a/doc/filelist-guide.rst
+++ b/doc/filelist-guide.rst
@@ -124,6 +124,8 @@ The following list files need to be created:
 
       filename client_id
 
+Please note that in all list files, lines starting with any amount of
+whitespace followed by ``#`` will be ignored.
 
 Protocols and File Lists
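
Below is a short usage sketch (not part of the patch) of the reworked extractor stacks; it mirrors the new ``test_sequential_trainable_extractors`` test above. The chain of two stateless callables plus the trainable dummy extractor is taken from the test suite; treat the concrete values as placeholders.

```python
from functools import partial
import tempfile

import numpy as np

from bob.bio.base.extractor import SequentialExtractor, CallableExtractor
# the dummy extractor shipped with the test suite; it requires training
from bob.bio.base.test.dummy.extractor import extractor as dummy_extractor

# two stateless callables followed by one trainable extractor
processors = [CallableExtractor(partial(np.power, 2)),
              CallableExtractor(np.mean),
              dummy_extractor]
proc = SequentialExtractor(processors)

with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
    # train() now applies every extractor except the last one to the
    # training data, so each stage is trained on its actual input
    proc.train([0, 1, 2, 3, 4], f.name)
    # load() now short-circuits when no extractor requires training
    proc.load(f.name)
data = proc([0, 1, 2, 3, 4])
```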
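
Similarly, the new comment-line handling in ``ListReader`` can be exercised with the ``example_filelist2`` data used by ``test_noztnorm``; this sketch assumes it is run from the repository root and that ``FileListBioDatabase`` is importable from ``bob.bio.base.database``:

```python
import os

from bob.bio.base.database import FileListBioDatabase

# dev/for_models.lst now starts with '# Model samples' style comment
# lines, which ListReader skips while parsing
lists_dir = os.path.join('bob', 'bio', 'base', 'test', 'data',
                         'example_filelist2')
db = FileListBioDatabase(lists_dir, 'test')
# commented-out entries do not show up as files; also, since this
# protocol defines no ZT lists, all_files() now only logs a warning
# instead of raising
print(len(db.all_files()))
```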