Skip to content
Snippets Groups Projects
Commit edf71158 authored by Amir MOHAMMADI's avatar Amir MOHAMMADI
Browse files

Merge branch 'vstack-features-improve-load' into 'master'

[vstack_features] Improve load speeds using numpy field arrays

See merge request !32
parents dbe6e793 25cff9c8
No related branches found
No related tags found
1 merge request!32[vstack_features] Improve load speeds using numpy field arrays
Pipeline #46445 skipped
...@@ -366,8 +366,8 @@ def get_macros(): ...@@ -366,8 +366,8 @@ def get_macros():
def _generate_features(reader, paths, same_size=False): def _generate_features(reader, paths, same_size=False):
"""Load and stack features in a memory efficient way. This function is meant """Load and stack features in a memory efficient way. This function is
to be used inside :py:func:`vstack_features`. meant to be used inside :py:func:`vstack_features`.
Parameters Parameters
---------- ----------
...@@ -404,11 +404,14 @@ def _generate_features(reader, paths, same_size=False): ...@@ -404,11 +404,14 @@ def _generate_features(reader, paths, same_size=False):
assert shape[1:] == list(feature.shape[1:]) assert shape[1:] == list(feature.shape[1:])
assert dtype == feature.dtype assert dtype == feature.dtype
for value in feature.flat: if same_size:
yield value yield (feature.ravel(),)
else:
for feat in feature:
yield (feat.ravel(),)
def vstack_features(reader, paths, same_size=False): def vstack_features(reader, paths, same_size=False, dtype=None):
"""Stacks all features in a memory efficient way. """Stacks all features in a memory efficient way.
Parameters Parameters
...@@ -428,6 +431,8 @@ def vstack_features(reader, paths, same_size=False): ...@@ -428,6 +431,8 @@ def vstack_features(reader, paths, same_size=False):
If ``True``, it assumes that arrays inside all the paths are the same If ``True``, it assumes that arrays inside all the paths are the same
shape. If you know the features are the same size in all paths, set this shape. If you know the features are the same size in all paths, set this
to ``True`` to improve the performance. to ``True`` to improve the performance.
dtype : :py:class:`numpy.dtype`, optional
If provided, the data will be casted to this format.
Returns Returns
------- -------
...@@ -437,7 +442,7 @@ def vstack_features(reader, paths, same_size=False): ...@@ -437,7 +442,7 @@ def vstack_features(reader, paths, same_size=False):
Examples Examples
-------- --------
This function in a simple way is equivalent to calling This function in a simple way is equivalent to calling
``numpy.vstack(reader(p) for p in paths)``. ``numpy.vstack([reader(p) for p in paths])``.
>>> import numpy >>> import numpy
>>> from bob.io.base import vstack_features >>> from bob.io.base import vstack_features
...@@ -458,7 +463,7 @@ def vstack_features(reader, paths, same_size=False): ...@@ -458,7 +463,7 @@ def vstack_features(reader, paths, same_size=False):
... [6, 7], ... [6, 7],
... [8, 9]])) ... [8, 9]]))
True True
>>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths) >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
>>> numpy.allclose(all_features, all_features_with_more_memory) >>> numpy.allclose(all_features, all_features_with_more_memory)
True True
...@@ -479,19 +484,22 @@ def vstack_features(reader, paths, same_size=False): ...@@ -479,19 +484,22 @@ def vstack_features(reader, paths, same_size=False):
... [6, 7], ... [6, 7],
... [8, 9]])) ... [8, 9]]))
True True
.. note::
This function runs very slowly. Only use it when RAM is precious.
""" """
iterable = _generate_features(reader, paths, same_size) iterable = _generate_features(reader, paths, same_size)
dtype, shape = next(iterable) data_dtype, shape = next(iterable)
if dtype is None:
dtype = data_dtype
if same_size: if same_size:
total_size = int(len(paths) * numpy.prod(shape)) # numpy black magic: https://stackoverflow.com/a/12473478/1286165
all_features = numpy.fromiter(iterable, dtype, total_size) field_dtype = [("", (dtype, (numpy.prod(shape),)))]
total_size = len(paths)
all_features = numpy.fromiter(iterable, field_dtype, total_size)
else: else:
all_features = numpy.fromiter(iterable, dtype) field_dtype = [("", (dtype, (numpy.prod(shape[1:]),)))]
all_features = numpy.fromiter(iterable, field_dtype)
# go from a field array to a normal array
all_features = all_features.view(dtype)
# the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4). # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4).
shape = list(shape) shape = list(shape)
shape[0] = -1 shape[0] = -1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment