diff --git a/bob/bio/base/test/test_utils.py b/bob/bio/base/test/test_utils.py index 8c7d3d4a0833b77604f95f210599c89723fee634..ce46232bf515a021342379ef4efc9828b94d49d5 100644 --- a/bob/bio/base/test/test_utils.py +++ b/bob/bio/base/test/test_utils.py @@ -119,45 +119,83 @@ def test_io_vstack(): def reader_wrong_size(path): return numpy.arange(2 * path).reshape(2, path) - # test C and F readers - numpy.all(bob.bio.base.vstack_features(reader_different_size_C, - paths, False) == - oracle(reader_different_size_C, paths)) - numpy.all(bob.bio.base.vstack_features(reader_different_size_F, - paths, False) == - oracle(reader_different_size_F, paths)) - - numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) == - oracle(reader_same_size_C, paths)) - numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) == - oracle(reader_same_size_F, paths)) - - numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) == - oracle(reader_same_size_C, paths)) - numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) == - oracle(reader_same_size_F, paths)) - - # test 3 dimensional readers - numpy.all(bob.bio.base.vstack_features(reader_different_size_C2, - paths, False) == - oracle(reader_different_size_C2, paths)) - numpy.all(bob.bio.base.vstack_features(reader_different_size_F2, - paths, False) == - oracle(reader_different_size_F2, paths)) - - numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) == - oracle(reader_same_size_C2, paths)) - numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) == - oracle(reader_same_size_F2, paths)) - - numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) == - oracle(reader_same_size_C2, paths)) - numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) == - oracle(reader_same_size_F2, paths)) + # when same_size is False + for reader in [ + reader_different_size_C, + reader_different_size_F, + reader_same_size_C, + 
reader_same_size_F, + reader_different_size_C2, + reader_different_size_F2, + reader_same_size_C2, + reader_same_size_F2, + ]: + numpy.all(bob.bio.base.vstack_features(reader, paths) == + oracle(reader, paths)) + + # when same_size is True + for reader in [ + reader_same_size_C, + reader_same_size_F, + reader_same_size_C2, + reader_same_size_F2, + ]: + numpy.all(bob.bio.base.vstack_features(reader, paths, True) == + oracle(reader, paths)) with nose.tools.assert_raises(AssertionError): bob.bio.base.vstack_features(reader_wrong_size, paths) + # test actual files + paths = [bob.io.base.test_utils.temporary_filename(), + bob.io.base.test_utils.temporary_filename(), + bob.io.base.test_utils.temporary_filename()] + try: + # try different readers: + for reader in [ + reader_different_size_C, + reader_different_size_F, + reader_same_size_C, + reader_same_size_F, + reader_different_size_C2, + reader_different_size_F2, + reader_same_size_C2, + reader_same_size_F2, + ]: + # save some data in files + for i, path in enumerate(paths): + bob.bio.base.save(reader(i + 1), path) + # test when all data is present + reference = oracle(bob.bio.base.load, paths) + numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) == + reference) + # delete the first one + os.remove(paths[0]) + reference = oracle(bob.bio.base.load, paths[1:]) + target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False, + True) + numpy.all(target == reference) + # save back first one and delete second one + bob.bio.base.save(reader(1), paths[0]) + os.remove(paths[1]) + reference = oracle(bob.bio.base.load, paths[:1] + paths[2:]) + target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False, + True) + numpy.all(target == reference) + # Check if RuntimeError is raised when one of the files is missing and + # allow_missing_files is False + with nose.tools.assert_raises(RuntimeError): + bob.bio.base.vstack_features(bob.bio.base.load, paths) + # Check if ValueError is raised. 
+ with nose.tools.assert_raises(ValueError): + bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True) + finally: + try: + for path in paths: + os.remove(path) + except Exception: + pass + def test_sampling(): # test selection of elements diff --git a/bob/bio/base/utils/io.py b/bob/bio/base/utils/io.py index 9d57f7d4fcf14459af10c11a1f93fb52dd7252f4..e9f6424f6deb5282031c3fc76c7feee4b07f9fa6 100644 --- a/bob/bio/base/utils/io.py +++ b/bob/bio/base/utils/io.py @@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False): close_compressed(filename, hdf5, compression_type, create_link) -def _generate_features(reader, paths): - """Load and stack features a memory efficient way. This function is meant to - be used inside :py:func:`vstack_features`. +def _generate_features(reader, paths, same_size=False, + allow_missing_files=False): + """Load and stack features in a memory efficient way. This function is meant + to be used inside :py:func:`vstack_features`. Parameters ---------- @@ -185,6 +186,10 @@ def _generate_features(reader, paths): See the documentation of :py:func:`vstack_features`. paths : ``collections.Iterable`` See the documentation of :py:func:`vstack_features`. + same_size : :obj:`bool`, optional + See the documentation of :py:func:`vstack_features`. + allow_missing_files : :obj:`bool`, optional + See the documentation of :py:func:`vstack_features`. Yields ------ @@ -193,23 +198,33 @@ def _generate_features(reader, paths): features and the shape of the first feature. The rest of objects are the actual values in features. The features are returned in C order. """ + + shape_determined = False for i, path in enumerate(paths): + if allow_missing_files and not os.path.isfile(path): + logger.debug("... 
File %s, that does not exist, has been ignored.", path) + continue + feature = numpy.atleast_2d(reader(path)) feature = numpy.ascontiguousarray(feature) - if i == 0: + if not shape_determined: + shape_determined = True dtype = feature.dtype shape = list(feature.shape) yield (dtype, shape) else: - # make sure all features have the same shape[1:] and dtype - assert shape[1:] == list(feature.shape[1:]) + # make sure all features have the same shape and dtype + if same_size: + assert shape == list(feature.shape) + else: + assert shape[1:] == list(feature.shape[1:]) assert dtype == feature.dtype for value in feature.flat: yield value -def vstack_features(reader, paths, same_size=False): +def vstack_features(reader, paths, same_size=False, allow_missing_files=False): """Stacks all features in a memory efficient way. Parameters @@ -223,20 +238,29 @@ def vstack_features(reader, paths, same_size=False): dimension. First dimension is should correspond to the number of samples. paths : ``collections.Iterable`` An iterable of paths to iterate on. Whatever is inside path is given to - ``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid. + ``reader`` so they do not need to be necessarily paths to actual files. + If ``same_size`` is ``True``, ``len(paths)`` must be valid. same_size : :obj:`bool`, optional If ``True``, it assumes that arrays inside all the paths are the same shape. If you know the features are the same size in all paths, set this to ``True`` to improve the performance. + allow_missing_files : :obj:`bool`, optional + If ``True``, it assumes that the items inside paths are actual files and + ignores the ones that do not exist. Returns ------- numpy.ndarray The read features with the shape (n_samples, \*features_shape[1:]). + Raises + ------ + ValueError + If both same_size and allow_missing_files are ``True``. 
+ Examples -------- - This function is equivalent to calling + This function in a simple way is equivalent to calling ``numpy.vstack(reader(p) for p in paths)``. >>> import numpy @@ -276,8 +300,12 @@ def vstack_features(reader, paths, same_size=False): [4, 5], [6, 7], [8, 9]]) + """ - iterable = _generate_features(reader, paths) + if same_size and allow_missing_files: + raise ValueError("Both same_size and allow_missing_files cannot be True at" + " the same time.") + iterable = _generate_features(reader, paths, same_size, allow_missing_files) dtype, shape = next(iterable) if same_size: total_size = int(len(paths) * numpy.prod(shape)) @@ -285,7 +313,7 @@ def vstack_features(reader, paths, same_size=False): else: all_features = numpy.fromiter(iterable, dtype) - # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 3). + # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 4). shape = list(shape) shape[0] = -1 return numpy.reshape(all_features, shape, order='C')