Merge branch 'remove-vstack-features' into 'master'

Move vstack_features to bob.io.base. See merge request !46.

Merge branch 'remove-vstack-features' into 'master'
6ea25543 · Amir MOHAMMADI · a4ee858b · 6858dff8 · 6ea25543 · 6ea25543
Commit 6ea25543 authored 4 years ago by Amir MOHAMMADI
--- a/bob/pipelines/sample.py
+++ b/bob/pipelines/sample.py
@@ -6,7 +6,7 @@ from collections.abc import Sequence
 import h5py
 import numpy as np

-from .utils import vstack_features
+from bob.io.base import vstack_features

 SAMPLE_DATA_ATTRS = ("data", "load", "samples", "_data")


--- a/bob/pipelines/tests/test_utils.py
+++ b/bob/pipelines/tests/test_utils.py
-import os
-
-from tempfile import NamedTemporaryFile
-
-import nose
-import numpy as np
-
 import bob.pipelines as mario


-def test_io_vstack():
-
-    paths = [1, 2, 3, 4, 5]
-
-    def asser_(actual, desired, dtype=None):
-        np.testing.assert_allclose(actual, desired)
-        if dtype is not None:
-            assert actual.dtype == dtype, (actual.dtype, dtype)
-
-    def oracle(reader, paths):
-        return np.vstack([reader(p) for p in paths])
-
-    def reader_same_size_C(path):
-        return np.arange(10).reshape(5, 2) + path
-
-    def reader_different_size_C(path):
-        return np.arange(2 * path).reshape(path, 2) + path
-
-    def reader_same_size_F(path):
-        return np.asfortranarray(np.arange(10).reshape(5, 2)) + path
-
-    def reader_different_size_F(path):
-        return np.asfortranarray(np.arange(2 * path).reshape(path, 2)) + path
-
-    def reader_same_size_C2(path):
-        return np.arange(30).reshape(5, 2, 3) + path
-
-    def reader_different_size_C2(path):
-        return np.arange(6 * path).reshape(path, 2, 3) + path
-
-    def reader_same_size_F2(path):
-        return np.asfortranarray(np.arange(30).reshape(5, 2, 3)) + path
-
-    def reader_different_size_F2(path):
-        return np.asfortranarray(np.arange(6 * path).reshape(path, 2, 3)) + path
-
-    def reader_wrong_size(path):
-        return np.arange(2 * path).reshape(2, path) + path
-
-    dtype = "float32"
-    # when same_size is False
-    for reader in [
-        reader_different_size_C,
-        reader_different_size_F,
-        reader_same_size_C,
-        reader_same_size_F,
-        reader_different_size_C2,
-        reader_different_size_F2,
-        reader_same_size_C2,
-        reader_same_size_F2,
-    ]:
-        asser_(mario.utils.vstack_features(reader, paths), oracle(reader, paths))
-        asser_(
-            mario.utils.vstack_features(reader, paths, dtype=dtype),
-            oracle(reader, paths),
-            dtype,
-        )
-
-    # when same_size is True
-    for reader in [
-        reader_same_size_C,
-        reader_same_size_F,
-        reader_same_size_C2,
-        reader_same_size_F2,
-    ]:
-        asser_(mario.utils.vstack_features(reader, paths, True), oracle(reader, paths))
-        asser_(
-            mario.utils.vstack_features(reader, paths, True, dtype=dtype),
-            oracle(reader, paths),
-            dtype,
-        )
-
-    with nose.tools.assert_raises(AssertionError):
-        mario.utils.vstack_features(reader_wrong_size, paths)
-
-    # test actual files
-    suffix = ".npy"
-    with NamedTemporaryFile(suffix=suffix) as f1, NamedTemporaryFile(
-        suffix=suffix
-    ) as f2, NamedTemporaryFile(suffix=suffix) as f3:
-        paths = [f1.name, f2.name, f3.name]
-        # try different readers:
-        for reader in [
-            reader_different_size_C,
-            reader_different_size_F,
-            reader_same_size_C,
-            reader_same_size_F,
-            reader_different_size_C2,
-            reader_different_size_F2,
-            reader_same_size_C2,
-            reader_same_size_F2,
-        ]:
-            # save some data in files
-            for i, path in enumerate(paths):
-                np.save(path, reader(i + 1), allow_pickle=False)
-            # test when all data is present
-            reference = oracle(np.load, paths)
-            asser_(mario.utils.vstack_features(np.load, paths), reference)
-            asser_(
-                mario.utils.vstack_features(np.load, paths, dtype=dtype),
-                reference,
-                dtype,
-            )
-            try:
-                os.remove(paths[0])
-                # Check if RuntimeError is raised when one of the files is missing
-                with nose.tools.assert_raises(FileNotFoundError):
-                    mario.utils.vstack_features(np.load, paths)
-            finally:
-                # create the file back so NamedTemporaryFile does not complain
-                np.save(paths[0], reader(i + 1))
-
-
 def test_isinstance_nested():
    class A:
        pass

--- a/bob/pipelines/utils.py
+++ b/bob/pipelines/utils.py
@@ -45,151 +45,6 @@ def is_estimator_stateless(estimator):
    return False


-def _generate_features(reader, paths, same_size=False):
-    """Load and stack features in a memory efficient way. This function is
-    meant to be used inside :py:func:`vstack_features`.
-
-    Parameters
-    ----------
-    reader : ``collections.Callable``
-      See the documentation of :py:func:`vstack_features`.
-    paths : ``collections.Iterable``
-      See the documentation of :py:func:`vstack_features`.
-    same_size : :obj:`bool`, optional
-      See the documentation of :py:func:`vstack_features`.
-
-    Yields
-    ------
-    object
-      The first object returned is a tuple of :py:class:`numpy.dtype` of
-      features and the shape of the first feature. The rest of objects are
-      the actual values in features. The features are returned in C order.
-    """
-
-    shape_determined = False
-    for i, path in enumerate(paths):
-
-        feature = np.atleast_2d(reader(path))
-        feature = np.ascontiguousarray(feature)
-        if not shape_determined:
-            shape_determined = True
-            dtype = feature.dtype
-            shape = list(feature.shape)
-            yield (dtype, shape)
-        else:
-            # make sure all features have the same shape and dtype
-            if same_size:
-                assert shape == list(feature.shape)
-            else:
-                assert shape[1:] == list(feature.shape[1:])
-            assert dtype == feature.dtype
-
-        if same_size:
-            yield (feature.ravel(),)
-        else:
-            for feat in feature:
-                yield (feat.ravel(),)
-
-
-def vstack_features(reader, paths, same_size=False, dtype=None):
-    """Stacks all features in a memory efficient way.
-
-    Parameters
-    ----------
-    reader : ``collections.Callable``
-      The function to load the features. The function should only take one
-      argument ``path`` and return loaded features. Use :any:`functools.partial`
-      to accommodate your reader to this format.
-      The features returned by ``reader`` are expected to have the same
-      :py:class:`numpy.dtype` and the same shape except for their first
-      dimension. First dimension should correspond to the number of samples.
-    paths : ``collections.Iterable``
-      An iterable of paths to iterate on. Whatever is inside path is given to
-      ``reader`` so they do not need to be necessarily paths to actual files.
-      If ``same_size`` is ``True``, ``len(paths)`` must be valid.
-    same_size : :obj:`bool`, optional
-      If ``True``, it assumes that arrays inside all the paths are the same
-      shape. If you know the features are the same size in all paths, set this
-      to ``True`` to improve the performance.
-    dtype : :py:class:`numpy.dtype`, optional
-      If provided, the data will be casted to this format.
-
-    Returns
-    -------
-    numpy.ndarray
-      The read features with the shape ``(n_samples, *features_shape[1:])``.
-
-    Examples
-    --------
-    This function in a simple way is equivalent to calling
-    ``numpy.vstack(reader(p) for p in paths)``.
-
-    >>> import numpy
-    >>> from bob.io.base import vstack_features
-    >>> def reader(path):
-    ...     # in each file, there are 5 samples and features are 2 dimensional.
-    ...     return numpy.arange(10).reshape(5,2)
-    >>> paths = ['path1', 'path2']
-    >>> all_features = vstack_features(reader, paths)
-    >>> numpy.allclose(all_features, numpy.array(
-    ...     [[0, 1],
-    ...      [2, 3],
-    ...      [4, 5],
-    ...      [6, 7],
-    ...      [8, 9],
-    ...      [0, 1],
-    ...      [2, 3],
-    ...      [4, 5],
-    ...      [6, 7],
-    ...      [8, 9]]))
-    True
-    >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths)
-    >>> numpy.allclose(all_features, all_features_with_more_memory)
-    True
-
-    You can allocate the array at once to improve the performance if you know
-    that all features in paths have the same shape and you know the total number
-    of the paths:
-
-    >>> all_features = vstack_features(reader, paths, same_size=True)
-    >>> numpy.allclose(all_features, numpy.array(
-    ...     [[0, 1],
-    ...      [2, 3],
-    ...      [4, 5],
-    ...      [6, 7],
-    ...      [8, 9],
-    ...      [0, 1],
-    ...      [2, 3],
-    ...      [4, 5],
-    ...      [6, 7],
-    ...      [8, 9]]))
-    True
-
-    .. note::
-
-      This function runs very slowly. Only use it when RAM is precious.
-    """
-    iterable = _generate_features(reader, paths, same_size)
-    data_dtype, shape = next(iterable)
-    if dtype is None:
-        dtype = data_dtype
-    if same_size:
-        # numpy black magic: https://stackoverflow.com/a/12473478/1286165
-        field_dtype = [("", (dtype, (np.prod(shape),)))]
-        total_size = len(paths)
-        all_features = np.fromiter(iterable, field_dtype, total_size)
-    else:
-        field_dtype = [("", (dtype, (np.prod(shape[1:]),)))]
-        all_features = np.fromiter(iterable, field_dtype)
-
-    # go from a field array to a normal array
-    all_features = all_features.view(dtype)
-    # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4).
-    shape = list(shape)
-    shape[0] = -1
-    return np.reshape(all_features, shape, order="C")
-
-
 def isinstance_nested(instance, attribute, isinstance_of):
    """
    Check if an object and its nested objects is an instance of a class.