diff --git a/bob/bio/base/test/test_utils.py b/bob/bio/base/test/test_utils.py index 5748dd69549f07c35aceb845cd6ae777d1e018fe..5cebac0e17225592199da005c32ad00e9071658d 100644 --- a/bob/bio/base/test/test_utils.py +++ b/bob/bio/base/test/test_utils.py @@ -3,7 +3,6 @@ import bob.learn.linear import pkg_resources import os import numpy -import nose import bob.io.base.test_utils from . import utils @@ -67,118 +66,6 @@ def test_io(): os.remove(filename) -def test_io_vstack(): - - paths = [1, 2, 3, 4, 5] - - def oracle(reader, paths): - return numpy.vstack([reader(p) for p in paths]) - - def reader_same_size_C(path): - return numpy.arange(10).reshape(5, 2) - - def reader_different_size_C(path): - return numpy.arange(2 * path).reshape(path, 2) - - def reader_same_size_F(path): - return numpy.asfortranarray(numpy.arange(10).reshape(5, 2)) - - def reader_different_size_F(path): - return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2)) - - def reader_same_size_C2(path): - return numpy.arange(30).reshape(5, 2, 3) - - def reader_different_size_C2(path): - return numpy.arange(6 * path).reshape(path, 2, 3) - - def reader_same_size_F2(path): - return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3)) - - def reader_different_size_F2(path): - return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3)) - - def reader_wrong_size(path): - return numpy.arange(2 * path).reshape(2, path) - - # when same_size is False - for reader in [ - reader_different_size_C, - reader_different_size_F, - reader_same_size_C, - reader_same_size_F, - reader_different_size_C2, - reader_different_size_F2, - reader_same_size_C2, - reader_same_size_F2, - ]: - numpy.all(bob.bio.base.vstack_features(reader, paths) == - oracle(reader, paths)) - - # when same_size is True - for reader in [ - reader_same_size_C, - reader_same_size_F, - reader_same_size_C2, - reader_same_size_F2, - ]: - numpy.all(bob.bio.base.vstack_features(reader, paths, True) == - oracle(reader, paths)) - - with nose.tools.assert_raises(AssertionError): - bob.bio.base.vstack_features(reader_wrong_size, paths) - - # test actual files - paths = [bob.io.base.test_utils.temporary_filename(), - bob.io.base.test_utils.temporary_filename(), - bob.io.base.test_utils.temporary_filename()] - try: - # try different readers: - for reader in [ - reader_different_size_C, - reader_different_size_F, - reader_same_size_C, - reader_same_size_F, - reader_different_size_C2, - reader_different_size_F2, - reader_same_size_C2, - reader_same_size_F2, - ]: - # save some data in files - for i, path in enumerate(paths): - bob.bio.base.save(reader(i + 1), path) - # test when all data is present - reference = oracle(bob.bio.base.load, paths) - numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) == - reference) - # delete the first one - os.remove(paths[0]) - reference = oracle(bob.bio.base.load, paths[1:]) - target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False, - True) - numpy.all(target == reference) - # save back first one and delete second one - bob.bio.base.save(reader(1), paths[0]) - os.remove(paths[1]) - reference = oracle(bob.bio.base.load, paths[:1] + paths[2:]) - target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False, - True) - numpy.all(target == reference) - # Check if RuntimeError is raised when one of the files is missing and - # allow_missing_files if False - with nose.tools.assert_raises(RuntimeError): - bob.bio.base.vstack_features(bob.bio.base.load, paths) - # Check if ValueError is raised. - with nose.tools.assert_raises(ValueError): - bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True) - finally: - try: - for path in paths: - os.remove(path) - except Exception: - pass - - def test_sampling(): # test selection of elements indices = bob.bio.base.selected_indices(100, 10) diff --git a/bob/bio/base/utils/io.py b/bob/bio/base/utils/io.py index d69fece9f96a3cd2bef4735df9ffa144ab583398..dc65b189ff0d4774d97472611e16ba219267970d 100644 --- a/bob/bio/base/utils/io.py +++ b/bob/bio/base/utils/io.py @@ -173,150 +173,3 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False): hdf5 = open_compressed(filename, 'w') save(data, hdf5) close_compressed(filename, hdf5, compression_type, create_link) - - -def _generate_features(reader, paths, same_size=False, - allow_missing_files=False): - """Load and stack features in a memory efficient way. This function is meant - to be used inside :py:func:`vstack_features`. - - Parameters - ---------- - reader : ``collections.Callable`` - See the documentation of :py:func:`vstack_features`. - paths : ``collections.Iterable`` - See the documentation of :py:func:`vstack_features`. - same_size : :obj:`bool`, optional - See the documentation of :py:func:`vstack_features`. - allow_missing_files : :obj:`bool`, optional - See the documentation of :py:func:`vstack_features`. - - Yields - ------ - object - The first object returned is a tuple of :py:class:`numpy.dtype` of - features and the shape of the first feature. The rest of objects are - the actual values in features. The features are returned in C order. - """ - - shape_determined = False - for i, path in enumerate(paths): - if allow_missing_files and not os.path.isfile(path): - logger.debug("... File %s, that does not exist, has been ignored.", path) - continue - - feature = numpy.atleast_2d(reader(path)) - feature = numpy.ascontiguousarray(feature) - if not shape_determined: - shape_determined = True - dtype = feature.dtype - shape = list(feature.shape) - yield (dtype, shape) - else: - # make sure all features have the same shape and dtype - if same_size: - assert shape == list(feature.shape) - else: - assert shape[1:] == list(feature.shape[1:]) - assert dtype == feature.dtype - - for value in feature.flat: - yield value - - -def vstack_features(reader, paths, same_size=False, allow_missing_files=False): - """Stacks all features in a memory efficient way. - - Parameters - ---------- - reader : ``collections.Callable`` - The function to load the features. The function should only take one - argument being the path to the features. Use - :any:`functools.partial` to accommodate your reader to this format. - The features returned by ``reader`` are expected to have the same - :py:class:`numpy.dtype` and the same shape except for their first - dimension. First dimension is should correspond to the number of samples. - paths : ``collections.Iterable`` - An iterable of paths to iterate on. Whatever is inside path is given to - ``reader`` so they do not need to be necessarily paths to actual files. - If ``same_size`` is ``True``, ``len(paths)`` must be valid. - same_size : :obj:`bool`, optional - If ``True``, it assumes that arrays inside all the paths are the same - shape. If you know the features are the same size in all paths, set this - to ``True`` to improve the performance. - allow_missing_files : :obj:`bool`, optional - If ``True``, it assumes that the items inside paths are actual files and - ignores the ones that do not exist. - - Returns - ------- - numpy.ndarray - The read features with the shape (n_samples, \*features_shape[1:]). - - Raises - ------ - ValueError - If both same_size and allow_missing_files are ``True``. - - Examples - -------- - This function in a simple way is equivalent to calling - ``numpy.vstack(reader(p) for p in paths)``. - - >>> import numpy - >>> from bob.bio.base import vstack_features - >>> def reader(path): - ... # in each file, there are 5 samples and features are 2 dimensional. - ... return numpy.arange(10).reshape(5,2) - >>> paths = ['path1', 'path2'] - >>> all_features = vstack_features(reader, paths) - >>> all_features - array([[0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9]]) - >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths) - >>> numpy.allclose(all_features, all_features_with_more_memory) - True - - You can allocate the array at once to improve the performance if you know - that all features in paths have the same shape and you know the total number - of the paths: - - >>> vstack_features(reader, paths, same_size=True) - array([[0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9]]) - - """ - if same_size and allow_missing_files: - raise ValueError("Both same_size and allow_missing_files cannot be True at" - " the same time.") - iterable = _generate_features(reader, paths, same_size, allow_missing_files) - try: - dtype, shape = next(iterable) - except StopIteration: - return numpy.array([]) - if same_size: - total_size = int(len(paths) * numpy.prod(shape)) - all_features = numpy.fromiter(iterable, dtype, total_size) - else: - all_features = numpy.fromiter(iterable, dtype) - - # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4). - shape = list(shape) - shape[0] = -1 - return numpy.reshape(all_features, shape, order='C') diff --git a/doc/py_api.rst b/doc/py_api.rst index d127ce4ddd0973075cb6ef34ebaf40e74559d83d..9ac8d563462267338747a86777f1c718c8fddb52 100644 --- a/doc/py_api.rst +++ b/doc/py_api.rst @@ -17,7 +17,6 @@ IO-related functions bob.bio.base.open_compressed bob.bio.base.close_compressed bob.bio.base.check_file - bob.bio.base.vstack_features Pipelines