Commit 37e82fd8 authored by Tiago de Freitas Pereira

Merge branch 'vstack_features' into 'dask-pipelines'

Remove vstack_features -> use the one in bob.pipelines

See merge request !187
parents c95c92d6 42f23502
Related merge requests: !187 "Remove vstack_features -> use the one in bob.pipelines" and !180 "[dask] Preparing bob.bio.base for dask pipelines"
Pipeline #39822 passed
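
For downstream code, the change amounts to importing the helper from bob.pipelines instead of bob.bio.base. A minimal migration sketch, assuming bob.pipelines exposes an equivalent vstack_features under bob.pipelines.utils (the exact import path is an assumption; check the bob.pipelines API):

import numpy

# Before this merge request:
# from bob.bio.base import vstack_features
# After this merge request (assumed location; verify against bob.pipelines):
from bob.pipelines.utils import vstack_features

def reader(path):
    # toy reader: every "path" yields 5 two-dimensional samples
    return numpy.arange(10).reshape(5, 2)

stacked = vstack_features(reader, ["path1", "path2"])
assert stacked.shape == (10, 2)
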
@@ -3,7 +3,6 @@ import bob.learn.linear
import pkg_resources
import os
import numpy
import nose
import bob.io.base.test_utils
from . import utils
@@ -67,118 +66,6 @@ def test_io():
    os.remove(filename)
def test_io_vstack():
    paths = [1, 2, 3, 4, 5]

    def oracle(reader, paths):
        return numpy.vstack([reader(p) for p in paths])

    def reader_same_size_C(path):
        return numpy.arange(10).reshape(5, 2)

    def reader_different_size_C(path):
        return numpy.arange(2 * path).reshape(path, 2)

    def reader_same_size_F(path):
        return numpy.asfortranarray(numpy.arange(10).reshape(5, 2))

    def reader_different_size_F(path):
        return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2))

    def reader_same_size_C2(path):
        return numpy.arange(30).reshape(5, 2, 3)

    def reader_different_size_C2(path):
        return numpy.arange(6 * path).reshape(path, 2, 3)

    def reader_same_size_F2(path):
        return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3))

    def reader_different_size_F2(path):
        return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3))

    def reader_wrong_size(path):
        return numpy.arange(2 * path).reshape(2, path)

    # when same_size is False
    for reader in [
        reader_different_size_C,
        reader_different_size_F,
        reader_same_size_C,
        reader_same_size_F,
        reader_different_size_C2,
        reader_different_size_F2,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        numpy.all(bob.bio.base.vstack_features(reader, paths) ==
                  oracle(reader, paths))

    # when same_size is True
    for reader in [
        reader_same_size_C,
        reader_same_size_F,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        numpy.all(bob.bio.base.vstack_features(reader, paths, True) ==
                  oracle(reader, paths))

    with nose.tools.assert_raises(AssertionError):
        bob.bio.base.vstack_features(reader_wrong_size, paths)

    # test actual files
    paths = [bob.io.base.test_utils.temporary_filename(),
             bob.io.base.test_utils.temporary_filename(),
             bob.io.base.test_utils.temporary_filename()]
    try:
        # try different readers:
        for reader in [
            reader_different_size_C,
            reader_different_size_F,
            reader_same_size_C,
            reader_same_size_F,
            reader_different_size_C2,
            reader_different_size_F2,
            reader_same_size_C2,
            reader_same_size_F2,
        ]:
            # save some data in files
            for i, path in enumerate(paths):
                bob.bio.base.save(reader(i + 1), path)
            # test when all data is present
            reference = oracle(bob.bio.base.load, paths)
            numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) ==
                      reference)
            # delete the first one
            os.remove(paths[0])
            reference = oracle(bob.bio.base.load, paths[1:])
            target = bob.bio.base.vstack_features(bob.bio.base.load, paths,
                                                  False, True)
            numpy.all(target == reference)
            # save back first one and delete second one
            bob.bio.base.save(reader(1), paths[0])
            os.remove(paths[1])
            reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
            target = bob.bio.base.vstack_features(bob.bio.base.load, paths,
                                                  False, True)
            numpy.all(target == reference)
            # Check that a RuntimeError is raised when one of the files is
            # missing and allow_missing_files is False
            with nose.tools.assert_raises(RuntimeError):
                bob.bio.base.vstack_features(bob.bio.base.load, paths)
            # Check that a ValueError is raised when same_size and
            # allow_missing_files are both True
            with nose.tools.assert_raises(ValueError):
                bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
    finally:
        try:
            for path in paths:
                os.remove(path)
        except Exception:
            pass

def test_sampling():
    # test selection of elements
    indices = bob.bio.base.selected_indices(100, 10)
......
@@ -173,150 +173,3 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
    hdf5 = open_compressed(filename, 'w')
    save(data, hdf5)
    close_compressed(filename, hdf5, compression_type, create_link)

def _generate_features(reader, paths, same_size=False,
                       allow_missing_files=False):
    """Load and stack features in a memory-efficient way. This function is
    meant to be used inside :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.Callable``
        See the documentation of :py:func:`vstack_features`.
    paths : ``collections.Iterable``
        See the documentation of :py:func:`vstack_features`.
    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.
    allow_missing_files : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object yielded is a tuple of the :py:class:`numpy.dtype` of
        the features and the shape of the first feature. The remaining objects
        are the feature values themselves, yielded one element at a time in C
        order.
    """
    shape_determined = False
    for i, path in enumerate(paths):
        if allow_missing_files and not os.path.isfile(path):
            logger.debug("... File %s, that does not exist, has been ignored.",
                         path)
            continue
        feature = numpy.atleast_2d(reader(path))
        feature = numpy.ascontiguousarray(feature)
        if not shape_determined:
            shape_determined = True
            dtype = feature.dtype
            shape = list(feature.shape)
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype
            if same_size:
                assert shape == list(feature.shape)
            else:
                assert shape[1:] == list(feature.shape[1:])
            assert dtype == feature.dtype
        for value in feature.flat:
            yield value
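
The generator protocol above (one ``(dtype, shape)`` tuple followed by the flattened values) is what lets the caller build the final array with ``numpy.fromiter`` without holding every feature array in memory at once. A minimal consumption sketch with a toy reader, mirroring what ``vstack_features`` below does:

import numpy

# sketch only: toy reader, two "paths", 3 x 2 samples per path
iterable = _generate_features(lambda p: numpy.arange(6).reshape(3, 2), [1, 2])
dtype, shape = next(iterable)             # first item fixes dtype and shape
flat = numpy.fromiter(iterable, dtype)    # remaining items are the raw values
stacked = flat.reshape([-1] + shape[1:])  # back to (n_samples, 2)
assert stacked.shape == (6, 2)
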

def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
    """Stacks all features in a memory-efficient way.

    Parameters
    ----------
    reader : ``collections.Callable``
        The function to load the features. The function should take exactly
        one argument: the path to the features. Use :any:`functools.partial`
        to accommodate your reader to this signature. The features returned
        by ``reader`` are expected to have the same :py:class:`numpy.dtype`
        and the same shape except for their first dimension. The first
        dimension should correspond to the number of samples.
    paths : ``collections.Iterable``
        An iterable of paths to iterate on. Whatever is inside ``paths`` is
        given to ``reader``, so the items do not necessarily need to be paths
        to actual files. If ``same_size`` is ``True``, ``len(paths)`` must be
        valid.
    same_size : :obj:`bool`, optional
        If ``True``, it is assumed that the arrays inside all the paths have
        the same shape. If you know the features are the same size in all
        paths, set this to ``True`` to improve performance.
    allow_missing_files : :obj:`bool`, optional
        If ``True``, it is assumed that the items inside ``paths`` are actual
        files, and the ones that do not exist are ignored.

    Returns
    -------
    numpy.ndarray
        The read features with the shape ``(n_samples, \*features_shape[1:])``.

    Raises
    ------
    ValueError
        If both ``same_size`` and ``allow_missing_files`` are ``True``.

    Examples
    --------
    In its simplest form, this function is equivalent to calling
    ``numpy.vstack([reader(p) for p in paths])``, but it stacks the features
    in a memory-efficient way.

    >>> import numpy
    >>> from bob.bio.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5, 2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> all_features
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the output array at once to improve performance if you
    know that all features in ``paths`` have the same shape and the total
    number of paths is known:

    >>> vstack_features(reader, paths, same_size=True)
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    """
    if same_size and allow_missing_files:
        raise ValueError("Both same_size and allow_missing_files cannot be "
                         "True at the same time.")
    iterable = _generate_features(reader, paths, same_size, allow_missing_files)
    try:
        dtype, shape = next(iterable)
    except StopIteration:
        return numpy.array([])
    if same_size:
        total_size = int(len(paths) * numpy.prod(shape))
        all_features = numpy.fromiter(iterable, dtype, total_size)
    else:
        all_features = numpy.fromiter(iterable, dtype)
    # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4)
    shape = list(shape)
    shape[0] = -1
    return numpy.reshape(all_features, shape, order='C')
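
Since ``reader`` must take a single positional argument (the path), readers that need extra parameters can be adapted with :any:`functools.partial`, as the docstring suggests. A small illustrative sketch with a made-up reader (``read_subset`` is hypothetical, not part of bob.bio.base):

import functools
import numpy

def read_subset(path, n_samples):
    # hypothetical reader with an extra parameter; here "path" is just a number
    return numpy.full((n_samples, 2), path)

# freeze the extra argument so the reader matches the one-argument signature
reader = functools.partial(read_subset, n_samples=3)
stacked = vstack_features(reader, [1, 2])  # -> shape (6, 2)
assert stacked.shape == (6, 2)
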
@@ -17,7 +17,6 @@ IO-related functions
   bob.bio.base.open_compressed
   bob.bio.base.close_compressed
   bob.bio.base.check_file
   bob.bio.base.vstack_features
Pipelines
......