Skip to content
Snippets Groups Projects
Commit 700d7a48 authored by Tiago de Freitas Pereira's avatar Tiago de Freitas Pereira
Browse files

Merge branch 'issue-96' into 'master'

Propagated the  option --allow-missing-files .....

See merge request !103
parents 54be3074 3ba3e30f
No related branches found
No related tags found
1 merge request!103Propagated the option --allow-missing-files .....
Pipeline #
...@@ -119,45 +119,83 @@ def test_io_vstack(): ...@@ -119,45 +119,83 @@ def test_io_vstack():
def reader_wrong_size(path): def reader_wrong_size(path):
return numpy.arange(2 * path).reshape(2, path) return numpy.arange(2 * path).reshape(2, path)
# test C and F readers # when same_size is False
numpy.all(bob.bio.base.vstack_features(reader_different_size_C, for reader in [
paths, False) == reader_different_size_C,
oracle(reader_different_size_C, paths)) reader_different_size_F,
numpy.all(bob.bio.base.vstack_features(reader_different_size_F, reader_same_size_C,
paths, False) == reader_same_size_F,
oracle(reader_different_size_F, paths)) reader_different_size_C2,
reader_different_size_F2,
numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) == reader_same_size_C2,
oracle(reader_same_size_C, paths)) reader_same_size_F2,
numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) == ]:
oracle(reader_same_size_F, paths)) numpy.all(bob.bio.base.vstack_features(reader, paths) ==
oracle(reader, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
oracle(reader_same_size_C, paths)) # when same_size is True
numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) == for reader in [
oracle(reader_same_size_F, paths)) reader_same_size_C,
reader_same_size_F,
# test 3 dimensional readers reader_same_size_C2,
numpy.all(bob.bio.base.vstack_features(reader_different_size_C2, reader_same_size_F2,
paths, False) == ]:
oracle(reader_different_size_C2, paths)) numpy.all(bob.bio.base.vstack_features(reader, paths, True) ==
numpy.all(bob.bio.base.vstack_features(reader_different_size_F2, oracle(reader, paths))
paths, False) ==
oracle(reader_different_size_F2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
oracle(reader_same_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
oracle(reader_same_size_F2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
oracle(reader_same_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
oracle(reader_same_size_F2, paths))
with nose.tools.assert_raises(AssertionError): with nose.tools.assert_raises(AssertionError):
bob.bio.base.vstack_features(reader_wrong_size, paths) bob.bio.base.vstack_features(reader_wrong_size, paths)
# test actual files
paths = [bob.io.base.test_utils.temporary_filename(),
bob.io.base.test_utils.temporary_filename(),
bob.io.base.test_utils.temporary_filename()]
try:
# try different readers:
for reader in [
reader_different_size_C,
reader_different_size_F,
reader_same_size_C,
reader_same_size_F,
reader_different_size_C2,
reader_different_size_F2,
reader_same_size_C2,
reader_same_size_F2,
]:
# save some data in files
for i, path in enumerate(paths):
bob.bio.base.save(reader(i + 1), path)
# test when all data is present
reference = oracle(bob.bio.base.load, paths)
numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) ==
reference)
# delete the first one
os.remove(paths[0])
reference = oracle(bob.bio.base.load, paths[1:])
target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
True)
numpy.all(target == reference)
# save back first one and delete second one
bob.bio.base.save(reader(1), paths[0])
os.remove(paths[1])
reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
True)
numpy.all(target == reference)
# Check if RuntimeError is raised when one of the files is missing and
# allow_missing_files is False
with nose.tools.assert_raises(RuntimeError):
bob.bio.base.vstack_features(bob.bio.base.load, paths)
# Check if ValueError is raised.
with nose.tools.assert_raises(ValueError):
bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
finally:
try:
for path in paths:
os.remove(path)
except Exception:
pass
def test_sampling(): def test_sampling():
# test selection of elements # test selection of elements
......
...@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False): ...@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
close_compressed(filename, hdf5, compression_type, create_link) close_compressed(filename, hdf5, compression_type, create_link)
def _generate_features(reader, paths): def _generate_features(reader, paths, same_size=False,
"""Load and stack features a memory efficient way. This function is meant to allow_missing_files=False):
be used inside :py:func:`vstack_features`. """Load and stack features in a memory efficient way. This function is meant
to be used inside :py:func:`vstack_features`.
Parameters Parameters
---------- ----------
...@@ -185,6 +186,10 @@ def _generate_features(reader, paths): ...@@ -185,6 +186,10 @@ def _generate_features(reader, paths):
See the documentation of :py:func:`vstack_features`. See the documentation of :py:func:`vstack_features`.
paths : ``collections.Iterable`` paths : ``collections.Iterable``
See the documentation of :py:func:`vstack_features`. See the documentation of :py:func:`vstack_features`.
same_size : :obj:`bool`, optional
See the documentation of :py:func:`vstack_features`.
allow_missing_files : :obj:`bool`, optional
See the documentation of :py:func:`vstack_features`.
Yields Yields
------ ------
...@@ -193,23 +198,33 @@ def _generate_features(reader, paths): ...@@ -193,23 +198,33 @@ def _generate_features(reader, paths):
features and the shape of the first feature. The rest of objects are features and the shape of the first feature. The rest of objects are
the actual values in features. The features are returned in C order. the actual values in features. The features are returned in C order.
""" """
shape_determined = False
for i, path in enumerate(paths): for i, path in enumerate(paths):
if allow_missing_files and not os.path.isfile(path):
logger.debug("... File %s, that does not exist, has been ignored.", path)
continue
feature = numpy.atleast_2d(reader(path)) feature = numpy.atleast_2d(reader(path))
feature = numpy.ascontiguousarray(feature) feature = numpy.ascontiguousarray(feature)
if i == 0: if not shape_determined:
shape_determined = True
dtype = feature.dtype dtype = feature.dtype
shape = list(feature.shape) shape = list(feature.shape)
yield (dtype, shape) yield (dtype, shape)
else: else:
# make sure all features have the same shape[1:] and dtype # make sure all features have the same shape and dtype
assert shape[1:] == list(feature.shape[1:]) if same_size:
assert shape == list(feature.shape)
else:
assert shape[1:] == list(feature.shape[1:])
assert dtype == feature.dtype assert dtype == feature.dtype
for value in feature.flat: for value in feature.flat:
yield value yield value
def vstack_features(reader, paths, same_size=False): def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
"""Stacks all features in a memory efficient way. """Stacks all features in a memory efficient way.
Parameters Parameters
...@@ -223,20 +238,29 @@ def vstack_features(reader, paths, same_size=False): ...@@ -223,20 +238,29 @@ def vstack_features(reader, paths, same_size=False):
dimension. First dimension should correspond to the number of samples. dimension. First dimension should correspond to the number of samples.
paths : ``collections.Iterable`` paths : ``collections.Iterable``
An iterable of paths to iterate on. Whatever is inside path is given to An iterable of paths to iterate on. Whatever is inside path is given to
``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid. ``reader`` so they do not need to be necessarily paths to actual files.
If ``same_size`` is ``True``, ``len(paths)`` must be valid.
same_size : :obj:`bool`, optional same_size : :obj:`bool`, optional
If ``True``, it assumes that arrays inside all the paths are the same If ``True``, it assumes that arrays inside all the paths are the same
shape. If you know the features are the same size in all paths, set this shape. If you know the features are the same size in all paths, set this
to ``True`` to improve the performance. to ``True`` to improve the performance.
allow_missing_files : :obj:`bool`, optional
If ``True``, it assumes that the items inside paths are actual files and
ignores the ones that do not exist.
Returns Returns
------- -------
numpy.ndarray numpy.ndarray
The read features with the shape (n_samples, \*features_shape[1:]). The read features with the shape (n_samples, \*features_shape[1:]).
Raises
------
ValueError
If both same_size and allow_missing_files are ``True``.
Examples Examples
-------- --------
This function is equivalent to calling In the simplest case, this function is equivalent to calling
``numpy.vstack(reader(p) for p in paths)``. ``numpy.vstack(reader(p) for p in paths)``.
>>> import numpy >>> import numpy
...@@ -276,8 +300,12 @@ def vstack_features(reader, paths, same_size=False): ...@@ -276,8 +300,12 @@ def vstack_features(reader, paths, same_size=False):
[4, 5], [4, 5],
[6, 7], [6, 7],
[8, 9]]) [8, 9]])
""" """
iterable = _generate_features(reader, paths) if same_size and allow_missing_files:
raise ValueError("Both same_size and allow_missing_files cannot be True at"
" the same time.")
iterable = _generate_features(reader, paths, same_size, allow_missing_files)
dtype, shape = next(iterable) dtype, shape = next(iterable)
if same_size: if same_size:
total_size = int(len(paths) * numpy.prod(shape)) total_size = int(len(paths) * numpy.prod(shape))
...@@ -285,7 +313,7 @@ def vstack_features(reader, paths, same_size=False): ...@@ -285,7 +313,7 @@ def vstack_features(reader, paths, same_size=False):
else: else:
all_features = numpy.fromiter(iterable, dtype) all_features = numpy.fromiter(iterable, dtype)
# the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 3). # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4).
shape = list(shape) shape = list(shape)
shape[0] = -1 shape[0] = -1
return numpy.reshape(all_features, shape, order='C') return numpy.reshape(all_features, shape, order='C')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment