diff --git a/bob/bio/base/algorithm/BIC.py b/bob/bio/base/algorithm/BIC.py
index 42aa785c53af8fd36898f6f4818567b2bb633782..b9be9138d1aee8996162c4b574eb4b8bc5ce7e05 100644
--- a/bob/bio/base/algorithm/BIC.py
+++ b/bob/bio/base/algorithm/BIC.py
@@ -103,7 +103,7 @@ class BIC(Algorithm):
 
   def _trainset_for(self, pairs):
     """Computes the array containing the comparison results for the given set of image pairs."""
-    return numpy.vstack([self.comparison_function(f1, f2) for (f1, f2) in pairs])
+    return numpy.vstack(self.comparison_function(f1, f2) for (f1, f2) in pairs)
 
   def train_enroller(self, train_features, enroller_file):
     """Trains the BIC by computing intra-personal and extra-personal subspaces.
diff --git a/bob/bio/base/algorithm/Distance.py b/bob/bio/base/algorithm/Distance.py
index c6a235960d97f7691a2c01631125261ed2bf782e..e0fc5261fab616845176bd038d0fbe512e5ab3ad 100644
--- a/bob/bio/base/algorithm/Distance.py
+++ b/bob/bio/base/algorithm/Distance.py
@@ -70,7 +70,7 @@ class Distance (Algorithm):
     assert len(enroll_features)
     [self._check_feature(feature) for feature in enroll_features]
     # just store all the features
-    return numpy.vstack([f.flatten() for f in enroll_features])
+    return numpy.vstack(f.flatten() for f in enroll_features)
 
 
   def score(self, model, probe):
diff --git a/bob/bio/base/algorithm/LDA.py b/bob/bio/base/algorithm/LDA.py
index b0cb34b983ff75454bb36a024da4f3470a955d04..5bcc21e606b17acf78b79d47024e078f628c768c 100644
--- a/bob/bio/base/algorithm/LDA.py
+++ b/bob/bio/base/algorithm/LDA.py
@@ -109,7 +109,7 @@ class LDA (Algorithm):
       if len(client_files) < 2:
         logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
         continue
-      data.append(numpy.vstack([feature.flatten() for feature in client_files]))
+      data.append(numpy.vstack(feature.flatten() for feature in client_files))
 
     # Returns the list of lists of arrays
     return data
@@ -117,7 +117,7 @@ class LDA (Algorithm):
 
   def _train_pca(self, training_set):
     """Trains and returns a LinearMachine that is trained using PCA"""
-    data_list = [feature for client in training_set for feature in client]
+    data_list = (feature for client in training_set for feature in client)
     data = numpy.vstack(data_list)
 
     logger.info(" -> Training Linear Machine using PCA")
@@ -145,7 +145,7 @@ class LDA (Algorithm):
 
   def _perform_pca(self, machine, training_set):
     """Perform PCA on data of the training set"""
-    return [numpy.vstack([machine(feature) for feature in client_features]) for client_features in training_set]
+    return [numpy.vstack(machine(feature) for feature in client_features) for client_features in training_set]
 
 
   def train_projector(self, training_features, projector_file):
diff --git a/bob/bio/base/algorithm/PLDA.py b/bob/bio/base/algorithm/PLDA.py
index 712b9d0f352be4e90c4399ab1285718de7847e77..e29499d29e5377c2b09f62f4a3ea6ab25b91505a 100644
--- a/bob/bio/base/algorithm/PLDA.py
+++ b/bob/bio/base/algorithm/PLDA.py
@@ -72,7 +72,7 @@ class PLDA (Algorithm):
 
   def _train_pca(self, training_set):
     """Trains and returns a LinearMachine that is trained using PCA"""
-    data = numpy.vstack([feature for feature in training_set])
+    data = numpy.vstack(feature for feature in training_set)
 
     logger.info(" -> Training LinearMachine using PCA ")
     trainer = bob.learn.linear.PCATrainer()
@@ -103,7 +103,7 @@ class PLDA (Algorithm):
       if len(client_files) < 2:
         logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
         continue
-      data.append(numpy.vstack([feature.flatten() for feature in client_files]))
+      data.append(numpy.vstack(feature.flatten() for feature in client_files))
 
     # Returns the list of lists of arrays
     return data
@@ -179,7 +179,7 @@ class PLDA (Algorithm):
     In this base class implementation, it computes the scores for each probe file using the 'score' method, and fuses the scores using the fusion method specified in the constructor of this class."""
     if self.pca_machine is not None:
-      probes = [self.pca_machine(probe) for probe in probes]
+      probes = (self.pca_machine(probe) for probe in probes)
     # forward
     if self.score_set == 'joint_likelihood':
       return model.log_likelihood_ratio(numpy.vstack(probes))
diff --git a/bob/bio/base/script/fuse_scores.py b/bob/bio/base/script/fuse_scores.py
index 693e6ec651842210ed42fb2df0213319386386d2..29152981af9d3e823da7c04ca0a3474b0f8ef1d1 100755
--- a/bob/bio/base/script/fuse_scores.py
+++ b/bob/bio/base/script/fuse_scores.py
@@ -70,8 +70,8 @@ def main(command_line_options = None):
   import numpy
 
   trainer = bob.learn.linear.CGLogRegTrainer(0.5, args.convergence_threshold, args.max_iterations, mean_std_norm=not args.no_whitening)
-  data_neg = numpy.vstack([data[k][0] for k in range(n_systems)]).T
-  data_pos = numpy.vstack([data[k][1] for k in range(n_systems)]).T
+  data_neg = numpy.vstack(data[k][0] for k in range(n_systems)).T
+  data_pos = numpy.vstack(data[k][1] for k in range(n_systems)).T
   machine = trainer.train(data_neg, data_pos)
 
   # fuse development scores
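
Note on the stacking changes above: a minimal sketch (not part of the patch) showing that the generator form is equivalent to the list comprehension it replaces. numpy.vstack materializes its input internally either way, so the saving is only the temporary Python list; also, passing generators to the stacking functions is deprecated since NumPy 1.16 and rejected by recent releases, so the list form remains the portable spelling.

    import numpy

    rows = [numpy.arange(3), numpy.arange(3, 6)]
    stacked_list = numpy.vstack([r for r in rows])  # old form: builds a temporary list
    try:
        # new form used throughout this patch: skips the temporary list
        stacked_gen = numpy.vstack(r for r in rows)
        assert numpy.array_equal(stacked_list, stacked_gen)
    except TypeError:
        pass  # recent NumPy releases refuse generators here
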
diff --git a/bob/bio/base/test/test_utils.py b/bob/bio/base/test/test_utils.py
index 72ebfbbd641f584740cdd3ab320fa979d82a6138..8c7d3d4a0833b77604f95f210599c89723fee634 100644
--- a/bob/bio/base/test/test_utils.py
+++ b/bob/bio/base/test/test_utils.py
@@ -3,7 +3,7 @@
 import bob.learn.linear
 import pkg_resources
 import os
 import numpy
-
+import nose
 import bob.io.base.test_utils
 from . import utils
@@ -84,6 +84,81 @@ def test_io():
     if os.path.exists(filename):
       os.remove(filename)
 
+
+def test_io_vstack():
+
+  paths = [1, 2, 3, 4, 5]
+
+  def oracle(reader, paths):
+    return numpy.vstack([reader(p) for p in paths])
+
+  def reader_same_size_C(path):
+    return numpy.arange(10).reshape(5, 2)
+
+  def reader_different_size_C(path):
+    return numpy.arange(2 * path).reshape(path, 2)
+
+  def reader_same_size_F(path):
+    return numpy.asfortranarray(numpy.arange(10).reshape(5, 2))
+
+  def reader_different_size_F(path):
+    return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2))
+
+  def reader_same_size_C2(path):
+    return numpy.arange(30).reshape(5, 2, 3)
+
+  def reader_different_size_C2(path):
+    return numpy.arange(6 * path).reshape(path, 2, 3)
+
+  def reader_same_size_F2(path):
+    return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3))
+
+  def reader_different_size_F2(path):
+    return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3))
+
+  def reader_wrong_size(path):
+    return numpy.arange(2 * path).reshape(2, path)
+
+  # test C and F readers
+  assert numpy.all(bob.bio.base.vstack_features(reader_different_size_C,
+                                                paths, False) ==
+                   oracle(reader_different_size_C, paths))
+  assert numpy.all(bob.bio.base.vstack_features(reader_different_size_F,
+                                                paths, False) ==
+                   oracle(reader_different_size_F, paths))
+
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) ==
+                   oracle(reader_same_size_C, paths))
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) ==
+                   oracle(reader_same_size_F, paths))
+
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
+                   oracle(reader_same_size_C, paths))
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) ==
+                   oracle(reader_same_size_F, paths))
+
+  # test 3 dimensional readers
+  assert numpy.all(bob.bio.base.vstack_features(reader_different_size_C2,
+                                                paths, False) ==
+                   oracle(reader_different_size_C2, paths))
+  assert numpy.all(bob.bio.base.vstack_features(reader_different_size_F2,
+                                                paths, False) ==
+                   oracle(reader_different_size_F2, paths))
+
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
+                   oracle(reader_same_size_C2, paths))
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
+                   oracle(reader_same_size_F2, paths))
+
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
+                   oracle(reader_same_size_C2, paths))
+  assert numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
+                   oracle(reader_same_size_F2, paths))
+
+  with nose.tools.assert_raises(AssertionError):
+    bob.bio.base.vstack_features(reader_wrong_size, paths)
+
+
 def test_sampling():
   # test selection of elements
   indices = bob.bio.base.selected_indices(100, 10)
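
The test above exercises the new bob.bio.base.vstack_features API added in bob/bio/base/utils/io.py below. A usage sketch (illustration only; it assumes the package-level re-export that the test relies on, and the reader and paths here are hypothetical): a reader taking extra arguments is adapted with functools.partial, as the docstring recommends.

    import functools
    import numpy
    from bob.bio.base import vstack_features  # package-level re-export assumed

    def load_scaled(scale, path):
        # hypothetical reader: each "path" yields 5 samples of 2 features
        return scale * numpy.arange(10, dtype=float).reshape(5, 2)

    reader = functools.partial(load_scaled, 0.5)
    features = vstack_features(reader, ['a.hdf5', 'b.hdf5'], same_size=True)
    assert features.shape == (10, 2)
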
diff --git a/bob/bio/base/utils/io.py b/bob/bio/base/utils/io.py
index 3e25a5255fcd40b32e9b781ef86a4b8e3ab66802..f8dcddcc93629bf58a842d45a4c773115f9baac3 100644
--- a/bob/bio/base/utils/io.py
+++ b/bob/bio/base/utils/io.py
@@ -1,12 +1,16 @@
 import os
-import tempfile, tarfile
-
+import tempfile
+import tarfile
+import collections  # this is needed for the sphinx documentation
+import functools  # this is needed for the sphinx documentation
+import numpy
 import logging
-logger = logging.getLogger("bob.bio.base")
+logger = logging.getLogger(__name__)
 
 from .. import database
 import bob.io.base
 
+
 def filter_missing_files(file_names, split_by_client=False, allow_missing_files=True):
   """This function filters out files that do not exist, but only if ``allow_missing_files`` is set to ``True``, otherwise the list of ``file_names`` is returned unaltered."""
@@ -15,8 +19,10 @@ def filter_missing_files(file_names, split_by_client=False, allow_missing_files=
 
   if split_by_client:
     # filter out missing files and empty clients
-    existing_files = [[f for f in client_files if os.path.exists(f)] for client_files in file_names]
-    existing_files = [client_files for client_files in existing_files if client_files]
+    existing_files = [
+        [f for f in client_files if os.path.exists(f)] for client_files in file_names]
+    existing_files = [
+        client_files for client_files in existing_files if client_files]
   else:
     # filter out missing files
     existing_files = [f for f in file_names if os.path.exists(f)]
@@ -28,17 +34,17 @@ def filter_none(data, split_by_client=False):
 
   if split_by_client:
     # filter out missing files and empty clients
-    existing_data = [[d for d in client_data if d is not None] for client_data in data]
-    existing_data = [client_data for client_data in existing_data if client_data]
+    existing_data = [[d for d in client_data if d is not None]
+                     for client_data in data]
+    existing_data = [
+        client_data for client_data in existing_data if client_data]
   else:
     # filter out missing files
     existing_data = [d for d in data if d is not None]
 
   return existing_data
 
-
-
-def check_file(filename, force, expected_file_size = 1):
+def check_file(filename, force, expected_file_size=1):
   """Checks if the file with the given ``filename`` exists and has size greater or equal to ``expected_file_size``.
   If the file is to small, **or** if the ``force`` option is set to ``True``, the file is removed.
   This function returns ``True`` is the file exists (and has not been removed), otherwise ``False``"""
@@ -86,18 +92,20 @@ def load(file):
   else:
     return bob.io.base.load(file)
 
+
 def save(data, file, compression=0):
   """Saves the data to file using HDF5.
   The given file might be an HDF5 file open for writing, or a string.
   If the given data contains a ``save`` method, this method is called with the given HDF5 file.
   Otherwise the data is written to the HDF5 file using the given compression."""
-  f = file if isinstance(file, bob.io.base.HDF5File) else bob.io.base.HDF5File(file, 'w')
+  f = file if isinstance(
+      file, bob.io.base.HDF5File) else bob.io.base.HDF5File(file, 'w')
   if hasattr(data, 'save'):
     data.save(f)
   else:
     f.set("array", data, compression=compression)
 
 
-def open_compressed(filename, open_flag = 'r', compression_type='bz2'):
+def open_compressed(filename, open_flag='r', compression_type='bz2'):
   """Opens a compressed HDF5File with the given opening flags.
   For the 'r' flag, the given compressed file will be extracted to a local space.
   For 'w', an empty HDF5File is created.
@@ -108,7 +116,7 @@ def open_compressed(filename, open_flag = 'r', compression_type='bz2'):
 
   if open_flag == 'r':
     # extract the HDF5 file from the given file name into a temporary file name
-    tar = tarfile.open(filename, mode="r:"+compression_type)
+    tar = tarfile.open(filename, mode="r:" + compression_type)
     memory_file = tar.extractfile(tar.next())
     real_file = open(hdf5_file_name, 'wb')
     real_file.write(memory_file.read())
@@ -130,13 +138,14 @@ def close_compressed(filename, hdf5_file, compression_type='bz2', create_link=Fa
 
   if is_writable:
     # create compressed tar file
-    tar = tarfile.open(filename, mode="w:"+compression_type)
+    tar = tarfile.open(filename, mode="w:" + compression_type)
     tar.add(hdf5_file_name, os.path.basename(filename))
     tar.close()
 
   if create_link:
-    extension = {'':'.tar', 'bz2':'.tar.bz2', 'gz':'tar.gz'}[compression_type]
-    link_file = filename+extension
+    extension = {'': '.tar', 'bz2': '.tar.bz2',
+                 'gz': '.tar.gz'}[compression_type]
+    link_file = filename + extension
     if not os.path.exists(link_file):
       os.symlink(os.path.basename(filename), link_file)
@@ -165,3 +174,132 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
   hdf5 = open_compressed(filename, 'w')
   save(data, hdf5)
   close_compressed(filename, hdf5, compression_type, create_link)
+
+
+def _generate_features(reader, paths):
+  """Load and stack features in a memory-efficient way. This function is meant
+  to be used inside :py:func:`vstack_features`.
+
+  Parameters
+  ----------
+  reader : collections.Callable
+    See the documentation of :py:func:`vstack_features`.
+  paths : collections.Iterable
+    See the documentation of :py:func:`vstack_features`.
+
+  Yields
+  ------
+  object
+    The first object yielded is the :py:class:`numpy.dtype` of the features.
+    The second object yielded is the shape of the first feature. The remaining
+    objects are the actual values of the features. The features are returned
+    in C order.
+
+  Examples
+  --------
+  This function can be used with :py:func:`numpy.fromiter`:
+
+  >>> def reader(path):
+  ...     # in each file, there are 5 samples and features are 2 dimensional.
+  ...     return numpy.arange(10).reshape(5, 2)
+  >>> paths = ['path1', 'path2']
+  >>> iterator = _generate_features(reader, paths)
+  >>> dtype = next(iterator)
+  >>> dtype
+  dtype('int64')
+  >>> first_feature_shape = next(iterator)
+  >>> first_feature_shape
+  [5, 2]
+  >>> all_features_flat = numpy.fromiter(iterator, dtype)
+  >>> all_features_flat
+  array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+  >>> all_features = all_features_flat.reshape(-1, first_feature_shape[1])
+  >>> all_features
+  array([[0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9],
+         [0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9]])
+  >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
+  >>> assert numpy.allclose(all_features, all_features_with_more_memory)
+
+  You can allocate the array at once to improve the performance if you know
+  that all features in paths have the same shape and you know the total number
+  of the paths:
+
+  >>> iterator = _generate_features(reader, paths)
+  >>> dtype = next(iterator)
+  >>> first_feature_shape = next(iterator)
+  >>> total_size = len(paths) * numpy.prod(first_feature_shape)
+  >>> all_features_flat = numpy.fromiter(iterator, dtype, total_size)
+  >>> all_features = all_features_flat.reshape(-1, first_feature_shape[1])
+  >>> all_features
+  array([[0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9],
+         [0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9]])
+  """
+  for i, path in enumerate(paths):
+    feature = numpy.atleast_2d(reader(path))
+    feature = numpy.ascontiguousarray(feature)
+    if i == 0:
+      dtype = feature.dtype
+      shape = list(feature.shape)
+      yield dtype
+      yield shape
+    else:
+      # make sure all features have the same shape[1:]
+      assert shape[1:] == list(feature.shape[1:])
+
+    for value in feature.flat:
+      yield value
+
+
+def vstack_features(reader, paths, same_size=False):
+  """Stacks all features in a memory-efficient way.
+
+  Parameters
+  ----------
+  reader : collections.Callable
+    The function to load the features. The function should only take one
+    argument, being the path to the features. Use :py:func:`functools.partial`
+    to accommodate your reader to this format. The features returned by
+    ``reader`` are expected to have the same :py:class:`numpy.dtype` and the
+    same shape, except for their first dimension. The first dimension should
+    correspond to the number of samples.
+  paths : collections.Iterable
+    An iterable of paths to iterate on. Whatever is inside ``path`` is given
+    to ``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
+  same_size : :obj:`bool`, optional
+    If ``True``, it assumes that the arrays inside all the paths have the same
+    shape. If you know the features are the same size in all paths, set this
+    to ``True`` to improve the performance.
+
+  Returns
+  -------
+  numpy.ndarray
+    The read features with the shape ``(n_samples, *features_shape[1:])``.
+  """
+  iterable = _generate_features(reader, paths)
+  dtype = next(iterable)
+  shape = next(iterable)
+  if same_size:
+    total_size = int(len(paths) * numpy.prod(shape))
+    all_features = numpy.fromiter(iterable, dtype, total_size)
+  else:
+    all_features = numpy.fromiter(iterable, dtype)
+
+  # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 3)
+  shape = list(shape)
+  shape[0] = -1
+  return numpy.reshape(all_features, shape, order='C')
diff --git a/version.txt b/version.txt
index ba1053ecd0644155680bcaea68a33f9d54af6b9e..878694a626336eef2d5ca7870e72aefd586821b5 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-3.1.3b0
\ No newline at end of file
+3.2.0b0
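
For reference, the core trick in vstack_features is numpy.fromiter, which fills a single flat output buffer from a stream of scalars, so no intermediate list of per-file arrays is ever held in memory. A self-contained sketch of that mechanism (illustration only, not part of the patch):

    import numpy

    def scalars(arrays):
        # yield every element in C order, mirroring what _generate_features does
        for a in arrays:
            for value in numpy.ascontiguousarray(a).flat:
                yield value

    chunks = [numpy.arange(10).reshape(5, 2), numpy.arange(10, 20).reshape(5, 2)]
    flat = numpy.fromiter(scalars(chunks), dtype=chunks[0].dtype)
    stacked = flat.reshape(-1, 2)
    assert numpy.array_equal(stacked, numpy.vstack(chunks))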