Add a function to load features in a memory efficient way

Merged Amir MOHAMMADI requested to merge vstack_features into master
3 files  +230  −0
@@ -371,5 +371,138 @@ def get_macros():
    return [('HAVE_HDF5', '1')]


def _generate_features(reader, paths, same_size=False):
    """Load and stack features in a memory efficient way. This function is
    meant to be used inside :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.Callable``
        See the documentation of :py:func:`vstack_features`.
    paths : ``collections.Iterable``
        See the documentation of :py:func:`vstack_features`.
    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object yielded is a tuple of the :py:class:`numpy.dtype` of
        the features and the shape of the first feature. The rest of the
        objects are the actual values of the features. The features are
        returned in C order.
    """
    shape_determined = False
    for i, path in enumerate(paths):
        feature = numpy.atleast_2d(reader(path))
        feature = numpy.ascontiguousarray(feature)
        if not shape_determined:
            shape_determined = True
            dtype = feature.dtype
            shape = list(feature.shape)
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype
            if same_size:
                assert shape == list(feature.shape)
            else:
                assert shape[1:] == list(feature.shape[1:])
            assert dtype == feature.dtype

        for value in feature.flat:
            yield value


def vstack_features(reader, paths, same_size=False):
    """Stacks all features in a memory efficient way.

    Parameters
    ----------
    reader : ``collections.Callable``
        The function to load the features. The function should only take one
        argument, ``path``, and return the loaded features. Use
        :any:`functools.partial` to accommodate your reader to this format.
        The features returned by ``reader`` are expected to have the same
        :py:class:`numpy.dtype` and the same shape, except for their first
        dimension. The first dimension should correspond to the number of
        samples.
    paths : ``collections.Iterable``
        An iterable of paths to iterate over. Whatever is inside each path is
        given to ``reader``, so the entries do not necessarily need to be
        paths to actual files. If ``same_size`` is ``True``, ``len(paths)``
        must be valid.
    same_size : :obj:`bool`, optional
        If ``True``, it is assumed that the arrays inside all the paths have
        the same shape. If you know the features are the same size in all
        paths, set this to ``True`` to improve the performance.

    Returns
    -------
    numpy.ndarray
        The read features with the shape ``(n_samples, *features_shape[1:])``.

    Examples
    --------
    Put simply, this function is equivalent to calling
    ``numpy.vstack(reader(p) for p in paths)``.

    >>> import numpy
    >>> from bob.io.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5, 2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True
    >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths)
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the array at once to improve the performance if you know
    that all features in the paths have the same shape and you know the total
    number of paths:

    >>> all_features = vstack_features(reader, paths, same_size=True)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True

    .. note::
        This function runs very slowly. Only use it when RAM is precious.
    """
    iterable = _generate_features(reader, paths, same_size)
    dtype, shape = next(iterable)
    if same_size:
        total_size = int(len(paths) * numpy.prod(shape))
        all_features = numpy.fromiter(iterable, dtype, total_size)
    else:
        all_features = numpy.fromiter(iterable, dtype)

    # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4).
    shape = list(shape)
    shape[0] = -1
    return numpy.reshape(all_features, shape, order="C")


# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
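
As a usage sketch (not part of this merge request), the docstring's suggestion to adapt a reader with :any:`functools.partial` could look like the following. The ``numpy.load`` reader, the ``mmap_mode`` keyword, and the ``.npy`` file names are illustrative assumptions, not files or helpers shipped with this change:

import functools
import numpy
from bob.io.base import vstack_features

def load_block(path, mmap_mode=None):
    # hypothetical reader: each .npy file holds an (n_samples, n_features) array
    return numpy.load(path, mmap_mode=mmap_mode)

# bind the extra keyword argument so the reader matches the expected
# single-argument ``reader(path)`` signature
reader = functools.partial(load_block, mmap_mode="r")
paths = ["block1.npy", "block2.npy", "block3.npy"]  # hypothetical paths
all_features = vstack_features(reader, paths)

With ``same_size=True`` the same call would preallocate the output buffer, provided every file holds the same number of samples.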