Commit 6ea25543 authored by Amir MOHAMMADI
Browse files

Merge branch 'remove-vstack-features' into 'master'

move vstack_features to bob.io.base

See merge request !46
parents a4ee858b 6858dff8
Pipeline #45543 passed with stages
in 11 minutes and 3 seconds
......@@ -6,7 +6,7 @@ from collections.abc import Sequence
import h5py
import numpy as np
from .utils import vstack_features
from bob.io.base import vstack_features
SAMPLE_DATA_ATTRS = ("data", "load", "samples", "_data")
......
import os
from tempfile import NamedTemporaryFile
import nose
import numpy as np
import bob.pipelines as mario
def test_io_vstack():
    """Compare ``mario.utils.vstack_features`` with a plain ``np.vstack``.

    The comparison is done for in-memory readers (C and Fortran ordered,
    2-D and 3-D, equal and varying number of rows) and for ``.npy`` files
    written to temporary files.
    """
    paths = [1, 2, 3, 4, 5]

    def check_stack(actual, desired, dtype=None):
        # values must match; the dtype is only checked when one is requested
        np.testing.assert_allclose(actual, desired)
        if dtype is not None:
            assert actual.dtype == dtype, (actual.dtype, dtype)

    def expected_stack(reader, paths):
        # reference result: load everything, then stack in one go
        return np.vstack([reader(p) for p in paths])

    def reader_same_size_C(path):
        return np.arange(10).reshape(5, 2) + path

    def reader_different_size_C(path):
        return np.arange(2 * path).reshape(path, 2) + path

    def reader_same_size_F(path):
        return np.asfortranarray(np.arange(10).reshape(5, 2)) + path

    def reader_different_size_F(path):
        return np.asfortranarray(np.arange(2 * path).reshape(path, 2)) + path

    def reader_same_size_C2(path):
        return np.arange(30).reshape(5, 2, 3) + path

    def reader_different_size_C2(path):
        return np.arange(6 * path).reshape(path, 2, 3) + path

    def reader_same_size_F2(path):
        return np.asfortranarray(np.arange(30).reshape(5, 2, 3)) + path

    def reader_different_size_F2(path):
        return np.asfortranarray(np.arange(6 * path).reshape(path, 2, 3)) + path

    def reader_wrong_size(path):
        # the trailing dimension changes with ``path`` so stacking must fail
        return np.arange(2 * path).reshape(2, path) + path

    dtype = "float32"

    # readers usable when same_size is False
    all_readers = [
        reader_different_size_C,
        reader_different_size_F,
        reader_same_size_C,
        reader_same_size_F,
        reader_different_size_C2,
        reader_different_size_F2,
        reader_same_size_C2,
        reader_same_size_F2,
    ]
    # readers usable when same_size is True
    same_size_readers = [
        reader_same_size_C,
        reader_same_size_F,
        reader_same_size_C2,
        reader_same_size_F2,
    ]

    # when same_size is False
    for reader in all_readers:
        stacked = mario.utils.vstack_features(reader, paths)
        check_stack(stacked, expected_stack(reader, paths))

        stacked = mario.utils.vstack_features(reader, paths, dtype=dtype)
        check_stack(stacked, expected_stack(reader, paths), dtype)

    # when same_size is True
    for reader in same_size_readers:
        stacked = mario.utils.vstack_features(reader, paths, True)
        check_stack(stacked, expected_stack(reader, paths))

        stacked = mario.utils.vstack_features(reader, paths, True, dtype=dtype)
        check_stack(stacked, expected_stack(reader, paths), dtype)

    # features whose trailing dimensions disagree are rejected
    with nose.tools.assert_raises(AssertionError):
        mario.utils.vstack_features(reader_wrong_size, paths)

    # test actual files
    suffix = ".npy"
    with NamedTemporaryFile(suffix=suffix) as f1, NamedTemporaryFile(
        suffix=suffix
    ) as f2, NamedTemporaryFile(suffix=suffix) as f3:
        file_paths = [f1.name, f2.name, f3.name]

        # try different readers:
        for reader in all_readers:
            # save some data in files
            for index, path in enumerate(file_paths, start=1):
                np.save(path, reader(index), allow_pickle=False)

            # test when all data is present
            reference = expected_stack(np.load, file_paths)
            check_stack(mario.utils.vstack_features(np.load, file_paths), reference)
            check_stack(
                mario.utils.vstack_features(np.load, file_paths, dtype=dtype),
                reference,
                dtype,
            )

            try:
                os.remove(file_paths[0])
                # Check that FileNotFoundError is raised when one of the
                # files is missing
                with nose.tools.assert_raises(FileNotFoundError):
                    mario.utils.vstack_features(np.load, file_paths)
            finally:
                # create the file back so NamedTemporaryFile does not complain
                np.save(file_paths[0], reader(index))
def test_isinstance_nested():
class A:
pass
......
......@@ -45,151 +45,6 @@ def is_estimator_stateless(estimator):
return False
def _generate_features(reader, paths, same_size=False):
    """Load features one path at a time and yield them for stacking.

    This generator is meant to be consumed by :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.abc.Callable``
        See the documentation of :py:func:`vstack_features`.

    paths : ``collections.abc.Iterable``
        See the documentation of :py:func:`vstack_features`.

    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object yielded is a tuple ``(dtype, shape)`` holding the
        :py:class:`numpy.dtype` and the shape (as a list) of the first
        feature. Every following object is a 1-tuple holding a flattened
        array: one whole feature per item when ``same_size`` is ``True``,
        otherwise one entry along the first axis of a feature per item.
        The values are flattened in C order.

    Raises
    ------
    AssertionError
        If a feature does not have the dtype of the first feature, or if
        its shape differs from the first feature's shape (the whole shape
        when ``same_size`` is ``True``, all but the first dimension
        otherwise).
    """
    dtype = shape = None
    for path in paths:
        # at least 2-D so that the first axis always counts samples, and
        # C-contiguous so that ravel() below flattens in C order
        feature = np.ascontiguousarray(np.atleast_2d(reader(path)))
        current_shape = list(feature.shape)

        if shape is None:
            # the first feature fixes the dtype and shape for all the others
            dtype, shape = feature.dtype, current_shape
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype; raise
            # explicitly so the checks are not stripped under ``python -O``
            if same_size:
                expected, found = shape, current_shape
            else:
                expected, found = shape[1:], current_shape[1:]
            if expected != found:
                raise AssertionError(
                    "feature read from {!r} has shape {} which is incompatible "
                    "with the shape of the first feature {}".format(
                        path, current_shape, shape
                    )
                )
            if dtype != feature.dtype:
                raise AssertionError(
                    "feature read from {!r} has dtype {} but the first "
                    "feature has dtype {}".format(path, feature.dtype, dtype)
                )

        if same_size:
            yield (feature.ravel(),)
        else:
            for row in feature:
                yield (row.ravel(),)
def vstack_features(reader, paths, same_size=False, dtype=None):
    """Stacks all features in a memory efficient way.

    Parameters
    ----------
    reader : ``collections.abc.Callable``
        The function to load the features. The function should only take one
        argument ``path`` and return loaded features. Use
        :any:`functools.partial` to accommodate your reader to this format.
        The features returned by ``reader`` are expected to have the same
        :py:class:`numpy.dtype` and the same shape except for their first
        dimension. First dimension should correspond to the number of samples.

    paths : ``collections.abc.Iterable``
        An iterable of paths to iterate on. Whatever is inside path is given to
        ``reader`` so they do not need to be necessarily paths to actual files.
        If ``same_size`` is ``True``, ``len(paths)`` must be valid.

    same_size : :obj:`bool`, optional
        If ``True``, it assumes that arrays inside all the paths are the same
        shape. If you know the features are the same size in all paths, set
        this to ``True`` to improve the performance.

    dtype : :py:class:`numpy.dtype`, optional
        If provided, the data will be casted to this format.

    Returns
    -------
    numpy.ndarray
        The read features with the shape ``(n_samples, *features_shape[1:])``.

    Raises
    ------
    ValueError
        If ``paths`` is empty, since there is then no feature to take the
        dtype and shape from.
    AssertionError
        If the features returned by ``reader`` do not agree in dtype or
        shape (raised while the features are being loaded).

    Examples
    --------
    This function in a simple way is equivalent to calling
    ``numpy.vstack([reader(p) for p in paths])``.

    >>> import numpy
    >>> from bob.io.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5,2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True
    >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the array at once to improve the performance if you know
    that all features in paths have the same shape and you know the total
    number of the paths:

    >>> all_features = vstack_features(reader, paths, same_size=True)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True

    .. note::

        This function runs very slowly. Only use it when RAM is precious.
    """
    iterable = _generate_features(reader, paths, same_size)
    try:
        # the first item carries the dtype and shape of the first feature
        data_dtype, shape = next(iterable)
    except StopIteration:
        # do not leak a bare StopIteration to the caller on empty input
        raise ValueError("need at least one path to stack features from") from None

    if dtype is None:
        dtype = data_dtype

    # numpy black magic: https://stackoverflow.com/a/12473478/1286165
    # each 1-tuple coming out of the generator fills one record of a
    # single-field structured dtype whose field is a flat sub-array
    if same_size:
        # one record per path, so the output size is known up front
        field_dtype = [("", (dtype, (np.prod(shape),)))]
        total_size = len(paths)
        all_features = np.fromiter(iterable, field_dtype, total_size)
    else:
        # one record per sample; the number of samples is not known
        field_dtype = [("", (dtype, (np.prod(shape[1:]),)))]
        all_features = np.fromiter(iterable, field_dtype)

    # go from a field array to a normal array
    all_features = all_features.view(dtype)
    # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4).
    shape = list(shape)
    shape[0] = -1
    return np.reshape(all_features, shape, order="C")
def isinstance_nested(instance, attribute, isinstance_of):
"""
Check if an object and its nested objects is an instance of a class.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment