diff --git a/bob/io/base/__init__.py b/bob/io/base/__init__.py index 30afaa3c5c80992152491f48e65df3493a00c831..7f30f18bb6c942b6568a6f957361ba50f0af7782 100644 --- a/bob/io/base/__init__.py +++ b/bob/io/base/__init__.py @@ -366,8 +366,8 @@ def get_macros(): def _generate_features(reader, paths, same_size=False): - """Load and stack features in a memory efficient way. This function is meant - to be used inside :py:func:`vstack_features`. + """Load and stack features in a memory efficient way. This function is + meant to be used inside :py:func:`vstack_features`. Parameters ---------- @@ -404,11 +404,14 @@ def _generate_features(reader, paths, same_size=False): assert shape[1:] == list(feature.shape[1:]) assert dtype == feature.dtype - for value in feature.flat: - yield value + if same_size: + yield (feature.ravel(),) + else: + for feat in feature: + yield (feat.ravel(),) -def vstack_features(reader, paths, same_size=False): +def vstack_features(reader, paths, same_size=False, dtype=None): """Stacks all features in a memory efficient way. Parameters @@ -428,6 +431,8 @@ def vstack_features(reader, paths, same_size=False): If ``True``, it assumes that arrays inside all the paths are the same shape. If you know the features are the same size in all paths, set this to ``True`` to improve the performance. + dtype : :py:class:`numpy.dtype`, optional + If provided, the data will be casted to this format. Returns ------- @@ -437,7 +442,7 @@ def vstack_features(reader, paths, same_size=False): Examples -------- This function in a simple way is equivalent to calling - ``numpy.vstack(reader(p) for p in paths)``. + ``numpy.vstack([reader(p) for p in paths])``. >>> import numpy >>> from bob.io.base import vstack_features @@ -458,7 +463,7 @@ def vstack_features(reader, paths, same_size=False): ... [6, 7], ... [8, 9]])) True - >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths) + >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths]) >>> numpy.allclose(all_features, all_features_with_more_memory) True @@ -479,19 +484,22 @@ def vstack_features(reader, paths, same_size=False): ... [6, 7], ... [8, 9]])) True - - .. note:: - - This function runs very slowly. Only use it when RAM is precious. """ iterable = _generate_features(reader, paths, same_size) - dtype, shape = next(iterable) + data_dtype, shape = next(iterable) + if dtype is None: + dtype = data_dtype if same_size: - total_size = int(len(paths) * numpy.prod(shape)) - all_features = numpy.fromiter(iterable, dtype, total_size) + # numpy black magic: https://stackoverflow.com/a/12473478/1286165 + field_dtype = [("", (dtype, (numpy.prod(shape),)))] + total_size = len(paths) + all_features = numpy.fromiter(iterable, field_dtype, total_size) else: - all_features = numpy.fromiter(iterable, dtype) + field_dtype = [("", (dtype, (numpy.prod(shape[1:]),)))] + all_features = numpy.fromiter(iterable, field_dtype) + # go from a field array to a normal array + all_features = all_features.view(dtype) # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4). shape = list(shape) shape[0] = -1