Skip to content
Snippets Groups Projects
Commit 700d7a48 authored by Tiago de Freitas Pereira's avatar Tiago de Freitas Pereira
Browse files

Merge branch 'issue-96' into 'master'

Propagated the option --allow-missing-files .....

See merge request !103
parents 54be3074 3ba3e30f
No related branches found
No related tags found
1 merge request!103Propagated the option --allow-missing-files .....
Pipeline #
......@@ -119,45 +119,83 @@ def test_io_vstack():
def reader_wrong_size(path):
return numpy.arange(2 * path).reshape(2, path)
# test C and F readers
numpy.all(bob.bio.base.vstack_features(reader_different_size_C,
paths, False) ==
oracle(reader_different_size_C, paths))
numpy.all(bob.bio.base.vstack_features(reader_different_size_F,
paths, False) ==
oracle(reader_different_size_F, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) ==
oracle(reader_same_size_C, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) ==
oracle(reader_same_size_F, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
oracle(reader_same_size_C, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) ==
oracle(reader_same_size_F, paths))
# test 3 dimensional readers
numpy.all(bob.bio.base.vstack_features(reader_different_size_C2,
paths, False) ==
oracle(reader_different_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_different_size_F2,
paths, False) ==
oracle(reader_different_size_F2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
oracle(reader_same_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
oracle(reader_same_size_F2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
oracle(reader_same_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
oracle(reader_same_size_F2, paths))
# when same_size is False
for reader in [
reader_different_size_C,
reader_different_size_F,
reader_same_size_C,
reader_same_size_F,
reader_different_size_C2,
reader_different_size_F2,
reader_same_size_C2,
reader_same_size_F2,
]:
numpy.all(bob.bio.base.vstack_features(reader, paths) ==
oracle(reader, paths))
# when same_size is True
for reader in [
reader_same_size_C,
reader_same_size_F,
reader_same_size_C2,
reader_same_size_F2,
]:
numpy.all(bob.bio.base.vstack_features(reader, paths, True) ==
oracle(reader, paths))
with nose.tools.assert_raises(AssertionError):
bob.bio.base.vstack_features(reader_wrong_size, paths)
# test actual files
paths = [bob.io.base.test_utils.temporary_filename(),
bob.io.base.test_utils.temporary_filename(),
bob.io.base.test_utils.temporary_filename()]
try:
# try different readers:
for reader in [
reader_different_size_C,
reader_different_size_F,
reader_same_size_C,
reader_same_size_F,
reader_different_size_C2,
reader_different_size_F2,
reader_same_size_C2,
reader_same_size_F2,
]:
# save some data in files
for i, path in enumerate(paths):
bob.bio.base.save(reader(i + 1), path)
# test when all data is present
reference = oracle(bob.bio.base.load, paths)
numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) ==
reference)
# delete the first one
os.remove(paths[0])
reference = oracle(bob.bio.base.load, paths[1:])
target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
True)
numpy.all(target == reference)
# save back first one and delete second one
bob.bio.base.save(reader(1), paths[0])
os.remove(paths[1])
reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
True)
numpy.all(target == reference)
# Check if RuntimeError is raised when one of the files is missing and
# allow_missing_files is False
with nose.tools.assert_raises(RuntimeError):
bob.bio.base.vstack_features(bob.bio.base.load, paths)
# Check if ValueError is raised when same_size and allow_missing_files are both True.
with nose.tools.assert_raises(ValueError):
bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
finally:
try:
for path in paths:
os.remove(path)
except Exception:
pass
def test_sampling():
# test selection of elements
......
......@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
close_compressed(filename, hdf5, compression_type, create_link)
def _generate_features(reader, paths):
"""Load and stack features a memory efficient way. This function is meant to
be used inside :py:func:`vstack_features`.
def _generate_features(reader, paths, same_size=False,
allow_missing_files=False):
"""Load and stack features in a memory efficient way. This function is meant
to be used inside :py:func:`vstack_features`.
Parameters
----------
......@@ -185,6 +186,10 @@ def _generate_features(reader, paths):
See the documentation of :py:func:`vstack_features`.
paths : ``collections.Iterable``
See the documentation of :py:func:`vstack_features`.
same_size : :obj:`bool`, optional
See the documentation of :py:func:`vstack_features`.
allow_missing_files : :obj:`bool`, optional
See the documentation of :py:func:`vstack_features`.
Yields
------
......@@ -193,23 +198,33 @@ def _generate_features(reader, paths):
features and the shape of the first feature. The remaining objects are
the actual values of the features. The features are returned in C order.
"""
shape_determined = False
for i, path in enumerate(paths):
if allow_missing_files and not os.path.isfile(path):
logger.debug("... File %s, that does not exist, has been ignored.", path)
continue
feature = numpy.atleast_2d(reader(path))
feature = numpy.ascontiguousarray(feature)
if i == 0:
if not shape_determined:
shape_determined = True
dtype = feature.dtype
shape = list(feature.shape)
yield (dtype, shape)
else:
# make sure all features have the same shape[1:] and dtype
assert shape[1:] == list(feature.shape[1:])
# make sure all features have the same shape and dtype
if same_size:
assert shape == list(feature.shape)
else:
assert shape[1:] == list(feature.shape[1:])
assert dtype == feature.dtype
for value in feature.flat:
yield value
def vstack_features(reader, paths, same_size=False):
def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
"""Stacks all features in a memory efficient way.
Parameters
......@@ -223,20 +238,29 @@ def vstack_features(reader, paths, same_size=False):
dimension. The first dimension should correspond to the number of samples.
paths : ``collections.Iterable``
An iterable of paths to iterate on. Whatever is inside path is given to
``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
``reader`` so they do not need to be necessarily paths to actual files.
If ``same_size`` is ``True``, ``len(paths)`` must be valid.
same_size : :obj:`bool`, optional
If ``True``, it assumes that arrays inside all the paths are the same
shape. If you know the features are the same size in all paths, set this
to ``True`` to improve the performance.
allow_missing_files : :obj:`bool`, optional
If ``True``, it assumes that the items inside paths are actual files and
ignores the ones that do not exist.
Returns
-------
numpy.ndarray
The read features with the shape (n_samples, \*features_shape[1:]).
Raises
------
ValueError
If both same_size and allow_missing_files are ``True``.
Examples
--------
This function is equivalent to calling
Put simply, this function is equivalent to calling
``numpy.vstack(reader(p) for p in paths)``.
>>> import numpy
......@@ -276,8 +300,12 @@ def vstack_features(reader, paths, same_size=False):
[4, 5],
[6, 7],
[8, 9]])
"""
iterable = _generate_features(reader, paths)
if same_size and allow_missing_files:
raise ValueError("Both same_size and allow_missing_files cannot be True at"
" the same time.")
iterable = _generate_features(reader, paths, same_size, allow_missing_files)
dtype, shape = next(iterable)
if same_size:
total_size = int(len(paths) * numpy.prod(shape))
......@@ -285,7 +313,7 @@ def vstack_features(reader, paths, same_size=False):
else:
all_features = numpy.fromiter(iterable, dtype)
# the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 3).
# the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4).
shape = list(shape)
shape[0] = -1
return numpy.reshape(all_features, shape, order='C')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment