diff --git a/bob/io/base/__init__.py b/bob/io/base/__init__.py
index ca1d2581da304c66a55dc0306521346539e21da4..9cd5b9a8a34fdfe2e3980ba5b639896daa0a3274 100644
--- a/bob/io/base/__init__.py
+++ b/bob/io/base/__init__.py
@@ -371,5 +371,138 @@ def get_macros():
     return [('HAVE_HDF5', '1')]
 
 
+def _generate_features(reader, paths, same_size=False):
+    """Load and stack features in a memory efficient way. This function is meant
+    to be used inside :py:func:`vstack_features`.
+
+    Parameters
+    ----------
+    reader : ``collections.Callable``
+        See the documentation of :py:func:`vstack_features`.
+    paths : ``collections.Iterable``
+        See the documentation of :py:func:`vstack_features`.
+    same_size : :obj:`bool`, optional
+        See the documentation of :py:func:`vstack_features`.
+
+    Yields
+    ------
+    object
+        The first object yielded is a tuple of the :py:class:`numpy.dtype` of
+        the features and the shape of the first feature. The remaining objects
+        are the actual feature values, yielded in C order.
+    """
+
+    shape_determined = False
+    for i, path in enumerate(paths):
+
+        feature = numpy.atleast_2d(reader(path))
+        feature = numpy.ascontiguousarray(feature)
+        if not shape_determined:
+            shape_determined = True
+            dtype = feature.dtype
+            shape = list(feature.shape)
+            yield (dtype, shape)
+        else:
+            # make sure all features have the same shape and dtype
+            if same_size:
+                assert shape == list(feature.shape)
+            else:
+                assert shape[1:] == list(feature.shape[1:])
+            assert dtype == feature.dtype
+
+        for value in feature.flat:
+            yield value
+
+
+def vstack_features(reader, paths, same_size=False):
+    """Stacks all features in a memory efficient way.
+
+    Parameters
+    ----------
+    reader : ``collections.Callable``
+        The function used to load the features. It should take exactly one
+        argument, ``path``, and return the loaded features. Use
+        :any:`functools.partial` to adapt your reader to this signature.
+        The features returned by ``reader`` are expected to have the same
+        :py:class:`numpy.dtype` and the same shape except for their first
+        dimension, which corresponds to the number of samples.
+    paths : ``collections.Iterable``
+        An iterable of paths to iterate over. Each element is passed to ``reader``
+        as-is, so the elements do not necessarily have to be paths to actual
+        files. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
+    same_size : :obj:`bool`, optional
+        If ``True``, the arrays loaded from all paths are assumed to have the
+        same shape. If you know the features are the same size in all paths,
+        set this to ``True`` to improve performance.
+
+    Returns
+    -------
+    numpy.ndarray
+        The read features with the shape ``(n_samples, *features_shape[1:])``.
+
+    Examples
+    --------
+    In its simplest form, this function is equivalent to calling
+    ``numpy.vstack(reader(p) for p in paths)``.
+
+    >>> import numpy
+    >>> from bob.io.base import vstack_features
+    >>> def reader(path):
+    ...     # in each file, there are 5 samples and features are 2 dimensional.
+    ...     return numpy.arange(10).reshape(5, 2)
+    >>> paths = ['path1', 'path2']
+    >>> all_features = vstack_features(reader, paths)
+    >>> numpy.allclose(all_features, numpy.array(
+    ...     [[0, 1],
+    ...      [2, 3],
+    ...      [4, 5],
+    ...      [6, 7],
+    ...      [8, 9],
+    ...      [0, 1],
+    ...      [2, 3],
+    ...      [4, 5],
+    ...      [6, 7],
+    ...      [8, 9]]))
+    True
+    >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths)
+    >>> numpy.allclose(all_features, all_features_with_more_memory)
+    True
+
+    If you know that the features in all paths have the same shape and you know
+    the total number of paths, you can improve performance by letting the
+    output array be allocated at once:
+
+    >>> all_features = vstack_features(reader, paths, same_size=True)
+    >>> numpy.allclose(all_features, numpy.array(
+    ...     [[0, 1],
+    ...      [2, 3],
+    ...      [4, 5],
+    ...      [6, 7],
+    ...      [8, 9],
+    ...      [0, 1],
+    ...      [2, 3],
+    ...      [4, 5],
+    ...      [6, 7],
+    ...      [8, 9]]))
+    True
+
+    .. note::
+
+        This function runs very slowly. Only use it when RAM is precious.
+    """
+    iterable = _generate_features(reader, paths, same_size)
+    dtype, shape = next(iterable)
+    if same_size:
+        total_size = int(len(paths) * numpy.prod(shape))
+        all_features = numpy.fromiter(iterable, dtype, total_size)
+    else:
+        all_features = numpy.fromiter(iterable, dtype)
+
+    # The shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4).
+    shape = list(shape)
+    shape[0] = -1
+    return numpy.reshape(all_features, shape, order="C")
+
+
 # gets sphinx autodoc done right - don't remove it
 __all__ = [_ for _ in dir() if not _.startswith('_')]
diff --git a/bob/io/base/test_io.py b/bob/io/base/test_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9092289667c1e6fd83bac7be9ea383fcedfb580
--- /dev/null
+++ b/bob/io/base/test_io.py
@@ -0,0 +1,96 @@
+import nose
+import numpy as np
+import os
+from . import vstack_features, save, load
+from .test_utils import temporary_filename
+
+
+def test_io_vstack():
+
+    paths = [1, 2, 3, 4, 5]
+
+    def oracle(reader, paths):
+        return np.vstack([reader(p) for p in paths])
+
+    def reader_same_size_C(path):
+        return np.arange(10).reshape(5, 2)
+
+    def reader_different_size_C(path):
+        return np.arange(2 * path).reshape(path, 2)
+
+    def reader_same_size_F(path):
+        return np.asfortranarray(np.arange(10).reshape(5, 2))
+
+    def reader_different_size_F(path):
+        return np.asfortranarray(np.arange(2 * path).reshape(path, 2))
+
+    def reader_same_size_C2(path):
+        return np.arange(30).reshape(5, 2, 3)
+
+    def reader_different_size_C2(path):
+        return np.arange(6 * path).reshape(path, 2, 3)
+
+    def reader_same_size_F2(path):
+        return np.asfortranarray(np.arange(30).reshape(5, 2, 3))
+
+    def reader_different_size_F2(path):
+        return np.asfortranarray(np.arange(6 * path).reshape(path, 2, 3))
+
+    def reader_wrong_size(path):
+        return np.arange(2 * path).reshape(2, path)
+
+    # when same_size is False
+    for reader in [
+        reader_different_size_C,
+        reader_different_size_F,
+        reader_same_size_C,
+        reader_same_size_F,
+        reader_different_size_C2,
+        reader_different_size_F2,
+        reader_same_size_C2,
+        reader_same_size_F2,
+    ]:
+        assert np.all(vstack_features(reader, paths) == oracle(reader, paths))
+
+    # when same_size is True
+    for reader in [
+        reader_same_size_C,
+        reader_same_size_F,
+        reader_same_size_C2,
+        reader_same_size_F2,
+    ]:
+        assert np.all(vstack_features(reader, paths, True) == oracle(reader, paths))
+
+    with nose.tools.assert_raises(AssertionError):
+        vstack_features(reader_wrong_size, paths)
+
+    # test actual files
+    paths = [temporary_filename(), temporary_filename(), temporary_filename()]
+    try:
+        # try different readers:
+        for reader in [
+            reader_different_size_C,
+            reader_different_size_F,
+            reader_same_size_C,
+            reader_same_size_F,
+            reader_different_size_C2,
+            reader_different_size_F2,
+            reader_same_size_C2,
+            reader_same_size_F2,
+        ]:
+            # save some data in files
+            for i, path in enumerate(paths):
+                save(reader(i + 1), path)
+            # test when all data is present
+            reference = oracle(load, paths)
+            assert np.all(vstack_features(load, paths) == reference)
+            os.remove(paths[0])
+            # Check if RuntimeError is raised when one of the files is missing
+            with nose.tools.assert_raises(RuntimeError):
+                vstack_features(load, paths)
+    finally:
+        for path in paths:
+            try:
+                os.remove(path)
+            except Exception:
+                pass
diff --git a/doc/py_api.rst b/doc/py_api.rst
index bd8a9a713b3346583274afbca3bbb92306cf8ddf..512c928a6cc045817a8fb067bdeac926530cc066 100644
--- a/doc/py_api.rst
+++ b/doc/py_api.rst
@@ -27,6 +27,7 @@ Functions
    bob.io.base.peek
    bob.io.base.peek_all
    bob.io.base.create_directories_safe
+   bob.io.base.vstack_features
    bob.io.base.extensions
    bob.io.base.get_config