Commit 99024b66 authored 6 years ago by Amir MOHAMMADI
Add a function to load features in a memory efficient way
parent 0fc24e19
1 merge request: !101 Add a function to load features in a memory efficient way
Pipeline #30739 failed 6 years ago (Stage: build)
Showing 2 changed files with 252 additions and 7 deletions:
bob/extension/processors.py (+134 −0)
bob/extension/test_stack_processors.py (+118 −7)
bob/extension/processors.py +134 −0
import logging
import numpy
import os

logger = logging.getLogger(__name__)


class SequentialProcessor(object):
    """A helper class which takes several processors and applies them one by
    one on data sequentially. See :ref:`bob.extension.processors` for more
    ...

...

@@ -85,3 +92,130 @@ class ParallelProcessor(object):
        """
        for processor in self.processors:
            yield processor(data, **kwargs)


def _generate_features(reader, paths, same_size=False):
    """Loads and stacks features in a memory-efficient way. This function is
    meant to be used inside :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.Callable``
        See the documentation of :py:func:`vstack_features`.
    paths : ``collections.Iterable``
        See the documentation of :py:func:`vstack_features`.
    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object yielded is a tuple of the :py:class:`numpy.dtype` of
        the features and the shape of the first feature. The remaining objects
        are the actual feature values. The features are returned in C order.
    """
    shape_determined = False
    for i, path in enumerate(paths):
        feature = numpy.atleast_2d(reader(path))
        feature = numpy.ascontiguousarray(feature)
        if not shape_determined:
            shape_determined = True
            dtype = feature.dtype
            shape = list(feature.shape)
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype
            if same_size:
                assert shape == list(feature.shape)
            else:
                assert shape[1:] == list(feature.shape[1:])
            assert dtype == feature.dtype
        for value in feature.flat:
            yield value


def vstack_features(reader, paths, same_size=False):
    """Stacks all features in a memory-efficient way.

    Parameters
    ----------
    reader : ``collections.Callable``
        The function to load the features. The function should take exactly
        one argument: the path to the features. Use :any:`functools.partial`
        to adapt your reader to this signature. The features returned by
        ``reader`` are expected to have the same :py:class:`numpy.dtype` and
        the same shape except for their first dimension. The first dimension
        should correspond to the number of samples.
    paths : ``collections.Iterable``
        An iterable of paths to iterate on. Whatever is inside ``paths`` is
        given to ``reader``, so the entries do not necessarily need to be
        paths to actual files. If ``same_size`` is ``True``, ``len(paths)``
        must be valid.
    same_size : :obj:`bool`, optional
        If ``True``, it is assumed that the arrays behind all the paths have
        the same shape. If you know the features are the same size in all
        paths, set this to ``True`` to improve performance.

    Returns
    -------
    numpy.ndarray
        The read features with the shape (n_samples, \*features_shape[1:]).

    Examples
    --------
    Put simply, this function is equivalent to calling
    ``numpy.vstack(reader(p) for p in paths)``.

    >>> import numpy
    >>> from bob.extension.processors import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5, 2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> all_features
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths)
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the output array at once to improve performance if you
    know that all features in ``paths`` have the same shape and you know the
    total number of paths:

    >>> vstack_features(reader, paths, same_size=True)
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    """
    iterable = _generate_features(reader, paths, same_size)
    dtype, shape = next(iterable)
    if same_size:
        total_size = int(len(paths) * numpy.prod(shape))
        all_features = numpy.fromiter(iterable, dtype, total_size)
    else:
        all_features = numpy.fromiter(iterable, dtype)

    # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4).
    shape = list(shape)
    shape[0] = -1
    return numpy.reshape(all_features, shape, order="C")
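The docstring above suggests using functools.partial to adapt a reader that needs more than a path argument. Here is a minimal sketch of that pattern; the read_array function, its scale argument, and the fake paths are illustrative only and are not part of this commit:

import functools
import numpy

from bob.extension.processors import vstack_features

def read_array(path, scale):
    # Hypothetical reader: a real one would load features from ``path``;
    # here it just fabricates a (4, 3) array so the example runs anywhere.
    return numpy.arange(12).reshape(4, 3) * scale

# Fix the extra argument so the reader matches the single-argument
# signature that vstack_features expects.
reader = functools.partial(read_array, scale=2)

# Entries of ``paths`` are handed straight to ``reader``; they do not
# have to be real files.
paths = ["sample1", "sample2", "sample3"]

# Features are pulled lazily through the internal generator and
# numpy.fromiter, so roughly one feature array is materialized at a time.
stacked = vstack_features(reader, paths)
print(stacked.shape)  # (12, 3): 3 paths x 4 samples each with 3-dimensional features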
bob/extension/test_stack_processors.py +118 −7
from functools import partial
import numpy as np
import tempfile
import nose
import os
from bob.extension.processors import (
-    SequentialProcessor, ParallelProcessor)
+    SequentialProcessor,
+    ParallelProcessor,
+    vstack_features,
+)

DATA = [0, 1, 2, 3, 4]
PROCESSORS = [partial(np.power, 2), np.mean]
...

@@ -10,6 +15,21 @@ SEQ_DATA = PROCESSORS[1](PROCESSORS[0](DATA))
PAR_DATA = (PROCESSORS[0](DATA), PROCESSORS[1](DATA))


def temporary_filename(prefix="bobtest_", suffix=".npy"):
    fd, name = tempfile.mkstemp(suffix, prefix)
    os.close(fd)
    os.unlink(name)
    return name


def save(data, path):
    np.save(path, data)


def load(path):
    return np.load(path)


def test_processors():
    proc = SequentialProcessor(PROCESSORS)
    data = proc(DATA)
...

@@ -18,3 +38,94 @@ def test_processors():
    proc = ParallelProcessor(PROCESSORS)
    data = proc(DATA)
    assert all(np.allclose(x1, x2) for x1, x2 in zip(data, PAR_DATA))
def test_io_vstack():
    paths = [1, 2, 3, 4, 5]

    def oracle(reader, paths):
        return np.vstack([reader(p) for p in paths])

    def reader_same_size_C(path):
        return np.arange(10).reshape(5, 2)

    def reader_different_size_C(path):
        return np.arange(2 * path).reshape(path, 2)

    def reader_same_size_F(path):
        return np.asfortranarray(np.arange(10).reshape(5, 2))

    def reader_different_size_F(path):
        return np.asfortranarray(np.arange(2 * path).reshape(path, 2))

    def reader_same_size_C2(path):
        return np.arange(30).reshape(5, 2, 3)

    def reader_different_size_C2(path):
        return np.arange(6 * path).reshape(path, 2, 3)

    def reader_same_size_F2(path):
        return np.asfortranarray(np.arange(30).reshape(5, 2, 3))

    def reader_different_size_F2(path):
        return np.asfortranarray(np.arange(6 * path).reshape(path, 2, 3))

    def reader_wrong_size(path):
        return np.arange(2 * path).reshape(2, path)

    # when same_size is False
    for reader in [
        reader_different_size_C,
        reader_different_size_F,
        reader_same_size_C,
        reader_same_size_F,
        reader_different_size_C2,
        reader_different_size_F2,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        assert np.all(vstack_features(reader, paths) == oracle(reader, paths))

    # when same_size is True
    for reader in [
        reader_same_size_C,
        reader_same_size_F,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        assert np.all(vstack_features(reader, paths, True) == oracle(reader, paths))

    with nose.tools.assert_raises(AssertionError):
        vstack_features(reader_wrong_size, paths)

    # test actual files
    paths = [temporary_filename(), temporary_filename(), temporary_filename()]
    try:
        # try different readers:
        for reader in [
            reader_different_size_C,
            reader_different_size_F,
            reader_same_size_C,
            reader_same_size_F,
            reader_different_size_C2,
            reader_different_size_F2,
            reader_same_size_C2,
            reader_same_size_F2,
        ]:
            # save some data in files
            for i, path in enumerate(paths):
                save(reader(i + 1), path)
            # test when all data is present
            reference = oracle(load, paths)
            assert np.all(vstack_features(load, paths) == reference)
            os.remove(paths[0])
            # check that FileNotFoundError is raised when one of the files is missing
            with nose.tools.assert_raises(FileNotFoundError):
                vstack_features(load, paths)
    finally:
        try:
            for path in paths:
                os.remove(path)
        except Exception:
            pass
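As a complement to the test above, here is a hedged, self-contained sketch of the same on-disk round trip: it saves a few .npy arrays of different lengths to temporary files, stacks them back with vstack_features, and compares the result against plain numpy.vstack. The file layout and data below are illustrative and not part of the commit:

import os
import tempfile

import numpy as np

from bob.extension.processors import vstack_features

# Write three small arrays with different numbers of rows to temporary files.
paths = []
for n_rows in (1, 2, 3):
    fd, name = tempfile.mkstemp(suffix=".npy", prefix="bobtest_")
    os.close(fd)
    np.save(name, np.arange(2 * n_rows).reshape(n_rows, 2))
    paths.append(name)

try:
    # np.load already takes a single path argument, so no functools.partial is needed.
    stacked = vstack_features(np.load, paths)
    reference = np.vstack([np.load(p) for p in paths])
    assert np.array_equal(stacked, reference)
    assert stacked.shape == (6, 2)  # 1 + 2 + 3 rows, 2 columns each
finally:
    for p in paths:
        os.remove(p)

Note that calling vstack_features with same_size=True on these files would trip the internal shape assertion, since the three arrays deliberately have different numbers of rows.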