Skip to content
Snippets Groups Projects
Commit 3f78e620 authored by Amir MOHAMMADI's avatar Amir MOHAMMADI
Browse files

improve allow_missing_files option and add tests

parent 68412583
No related branches found
No related tags found
1 merge request: !103 "Propagated the option --allow-missing-files ....."
Pipeline #
...@@ -119,45 +119,83 @@ def test_io_vstack(): ...@@ -119,45 +119,83 @@ def test_io_vstack():
def reader_wrong_size(path): def reader_wrong_size(path):
return numpy.arange(2 * path).reshape(2, path) return numpy.arange(2 * path).reshape(2, path)
# test C and F readers # when same_size is False
numpy.all(bob.bio.base.vstack_features(reader_different_size_C, for reader in [
paths, False) == reader_different_size_C,
oracle(reader_different_size_C, paths)) reader_different_size_F,
numpy.all(bob.bio.base.vstack_features(reader_different_size_F, reader_same_size_C,
paths, False) == reader_same_size_F,
oracle(reader_different_size_F, paths)) reader_different_size_C2,
reader_different_size_F2,
numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) == reader_same_size_C2,
oracle(reader_same_size_C, paths)) reader_same_size_F2,
numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) == ]:
oracle(reader_same_size_F, paths)) numpy.all(bob.bio.base.vstack_features(reader, paths) ==
oracle(reader, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
oracle(reader_same_size_C, paths)) # when same_size is True
numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) == for reader in [
oracle(reader_same_size_F, paths)) reader_same_size_C,
reader_same_size_F,
# test 3 dimensional readers reader_same_size_C2,
numpy.all(bob.bio.base.vstack_features(reader_different_size_C2, reader_same_size_F2,
paths, False) == ]:
oracle(reader_different_size_C2, paths)) numpy.all(bob.bio.base.vstack_features(reader, paths, True) ==
numpy.all(bob.bio.base.vstack_features(reader_different_size_F2, oracle(reader, paths))
paths, False) ==
oracle(reader_different_size_F2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
oracle(reader_same_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
oracle(reader_same_size_F2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
oracle(reader_same_size_C2, paths))
numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
oracle(reader_same_size_F2, paths))
with nose.tools.assert_raises(AssertionError): with nose.tools.assert_raises(AssertionError):
bob.bio.base.vstack_features(reader_wrong_size, paths) bob.bio.base.vstack_features(reader_wrong_size, paths)
# test actual files
paths = [bob.io.base.test_utils.temporary_filename(),
bob.io.base.test_utils.temporary_filename(),
bob.io.base.test_utils.temporary_filename()]
try:
# try different readers:
for reader in [
reader_different_size_C,
reader_different_size_F,
reader_same_size_C,
reader_same_size_F,
reader_different_size_C2,
reader_different_size_F2,
reader_same_size_C2,
reader_same_size_F2,
]:
# save some data in files
for i, path in enumerate(paths):
bob.bio.base.save(reader(i + 1), path)
# test when all data is present
reference = oracle(bob.bio.base.load, paths)
numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) ==
reference)
# delete the first one
os.remove(paths[0])
reference = oracle(bob.bio.base.load, paths[1:])
target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
True)
numpy.all(target == reference)
# save back first one and delete second one
bob.bio.base.save(reader(1), paths[0])
os.remove(paths[1])
reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
True)
numpy.all(target == reference)
# Check if RuntimeError is raised when one of the files is missing and
# allow_missing_files is False
with nose.tools.assert_raises(RuntimeError):
bob.bio.base.vstack_features(bob.bio.base.load, paths)
# Check if ValueError is raised.
with nose.tools.assert_raises(ValueError):
bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
finally:
try:
for path in paths:
os.remove(path)
except Exception:
pass
def test_sampling(): def test_sampling():
# test selection of elements # test selection of elements
......
...@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False): ...@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
close_compressed(filename, hdf5, compression_type, create_link) close_compressed(filename, hdf5, compression_type, create_link)
def _generate_features(reader, paths, allow_missing_files=False): def _generate_features(reader, paths, same_size=False,
"""Load and stack features a memory efficient way. This function is meant to allow_missing_files=False):
be used inside :py:func:`vstack_features`. """Load and stack features in a memory efficient way. This function is meant
to be used inside :py:func:`vstack_features`.
Parameters Parameters
---------- ----------
...@@ -185,8 +186,10 @@ def _generate_features(reader, paths, allow_missing_files=False): ...@@ -185,8 +186,10 @@ def _generate_features(reader, paths, allow_missing_files=False):
See the documentation of :py:func:`vstack_features`. See the documentation of :py:func:`vstack_features`.
paths : ``collections.Iterable`` paths : ``collections.Iterable``
See the documentation of :py:func:`vstack_features`. See the documentation of :py:func:`vstack_features`.
same_size : bool, optional
See the documentation of :py:func:`vstack_features`.
allow_missing_files : :obj:`bool`, optional allow_missing_files : :obj:`bool`, optional
If ``True``, it ignores files that doesn't exists See the documentation of :py:func:`vstack_features`.
Yields Yields
------ ------
...@@ -195,23 +198,26 @@ def _generate_features(reader, paths, allow_missing_files=False): ...@@ -195,23 +198,26 @@ def _generate_features(reader, paths, allow_missing_files=False):
features and the shape of the first feature. The rest of objects are features and the shape of the first feature. The rest of objects are
the actual values in features. The features are returned in C order. the actual values in features. The features are returned in C order.
""" """
shape_check = False shape_determined = False
for i, path in enumerate(paths): for i, path in enumerate(paths):
if allow_missing_files and not os.path.isfile(path): if allow_missing_files and not os.path.isfile(path):
logger.debug("... The file {0}, that does not exist, has been ignored . ".format(path)) logger.debug("... File %s, that does not exist, has been ignored.", path)
continue continue
feature = numpy.atleast_2d(reader(path)) feature = numpy.atleast_2d(reader(path))
feature = numpy.ascontiguousarray(feature) feature = numpy.ascontiguousarray(feature)
if not shape_check: if not shape_determined:
shape_check = True shape_determined = True
dtype = feature.dtype dtype = feature.dtype
shape = list(feature.shape) shape = list(feature.shape)
yield (dtype, shape) yield (dtype, shape)
else: else:
# make sure all features have the same shape[1:] and dtype # make sure all features have the same shape and dtype
assert shape[1:] == list(feature.shape[1:]) if same_size:
assert shape == list(feature.shape)
else:
assert shape[1:] == list(feature.shape[1:])
assert dtype == feature.dtype assert dtype == feature.dtype
for value in feature.flat: for value in feature.flat:
...@@ -232,23 +238,29 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False): ...@@ -232,23 +238,29 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
dimension. First dimension should correspond to the number of samples. dimension. First dimension should correspond to the number of samples.
paths : ``collections.Iterable`` paths : ``collections.Iterable``
An iterable of paths to iterate on. Whatever is inside path is given to An iterable of paths to iterate on. Whatever is inside path is given to
``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid. ``reader`` so they do not need to be necessarily paths to actual files.
If ``same_size`` is ``True``, ``len(paths)`` must be valid.
same_size : :obj:`bool`, optional same_size : :obj:`bool`, optional
If ``True``, it assumes that arrays inside all the paths are the same If ``True``, it assumes that arrays inside all the paths are the same
shape. If you know the features are the same size in all paths, set this shape. If you know the features are the same size in all paths, set this
to ``True`` to improve the performance. to ``True`` to improve the performance.
allow_missing_files : :obj:`bool`, optional allow_missing_files : :obj:`bool`, optional
If ``True``, it ignores files that doesn't exists If ``True``, it assumes that the items inside paths are actual files and
ignores the ones that do not exist.
Returns Returns
------- -------
numpy.ndarray numpy.ndarray
The read features with the shape (n_samples, \*features_shape[1:]). The read features with the shape (n_samples, \*features_shape[1:]).
Raises
------
ValueError
If both same_size and allow_missing_files are ``True``.
Examples Examples
-------- --------
This function is equivalent to calling This function in a simple way is equivalent to calling
``numpy.vstack(reader(p) for p in paths)``. ``numpy.vstack(reader(p) for p in paths)``.
>>> import numpy >>> import numpy
...@@ -288,8 +300,13 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False): ...@@ -288,8 +300,13 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
[4, 5], [4, 5],
[6, 7], [6, 7],
[8, 9]]) [8, 9]])
""" """
iterable = _generate_features(reader, paths, allow_missing_files=allow_missing_files) if same_size and allow_missing_files:
raise ValueError("Both same_size and allow_missing_files cannot be True at"
" the same time.")
iterable = _generate_features(
reader, paths, allow_missing_files=allow_missing_files)
dtype, shape = next(iterable) dtype, shape = next(iterable)
if same_size: if same_size:
total_size = int(len(paths) * numpy.prod(shape)) total_size = int(len(paths) * numpy.prod(shape))
...@@ -297,7 +314,7 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False): ...@@ -297,7 +314,7 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
else: else:
all_features = numpy.fromiter(iterable, dtype) all_features = numpy.fromiter(iterable, dtype)
# the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 3). # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4).
shape = list(shape) shape = list(shape)
shape[0] = -1 shape[0] = -1
return numpy.reshape(all_features, shape, order='C') return numpy.reshape(all_features, shape, order='C')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment