improve allow_missing_files option and add tests

3f78e620 · Amir MOHAMMADI · 68412583 · 3f78e620 · 3f78e620
Commit 3f78e620 authored 7 years ago by Amir MOHAMMADI
--- a/bob/bio/base/test/test_utils.py
+++ b/bob/bio/base/test/test_utils.py
@@ -119,45 +119,83 @@ def test_io_vstack():
  def reader_wrong_size(path):
    return numpy.arange(2 * path).reshape(2, path)
-  # test C and F readers
+  # when same_size is False
-  numpy.all(bob.bio.base.vstack_features(reader_different_size_C,
+  for reader in [
-                                         paths, False) ==
+      reader_different_size_C,
-            oracle(reader_different_size_C, paths))
+      reader_different_size_F,
-  numpy.all(bob.bio.base.vstack_features(reader_different_size_F,
+      reader_same_size_C,
-                                         paths, False) ==
+      reader_same_size_F,
-            oracle(reader_different_size_F, paths))
+      reader_different_size_C2,
+      reader_different_size_F2,
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) ==
+      reader_same_size_C2,
-            oracle(reader_same_size_C, paths))
+      reader_same_size_F2,
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) ==
+  ]:
-            oracle(reader_same_size_F, paths))
+    numpy.all(bob.bio.base.vstack_features(reader, paths) ==
+              oracle(reader, paths))
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
-            oracle(reader_same_size_C, paths))
+  # when same_size is True
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) ==
+  for reader in [
-            oracle(reader_same_size_F, paths))
+      reader_same_size_C,
+      reader_same_size_F,
-  # test 3 dimensional readers
+      reader_same_size_C2,
-  numpy.all(bob.bio.base.vstack_features(reader_different_size_C2,
+      reader_same_size_F2,
-                                         paths, False) ==
+  ]:
-            oracle(reader_different_size_C2, paths))
+    numpy.all(bob.bio.base.vstack_features(reader, paths, True) ==
-  numpy.all(bob.bio.base.vstack_features(reader_different_size_F2,
+              oracle(reader, paths))
-                                         paths, False) ==
-            oracle(reader_different_size_F2, paths))
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
-            oracle(reader_same_size_C2, paths))
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
-            oracle(reader_same_size_F2, paths))
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
-            oracle(reader_same_size_C2, paths))
-  numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
-            oracle(reader_same_size_F2, paths))
  with nose.tools.assert_raises(AssertionError):
    bob.bio.base.vstack_features(reader_wrong_size, paths)
+  # test actual files
+  paths = [bob.io.base.test_utils.temporary_filename(),
+           bob.io.base.test_utils.temporary_filename(),
+           bob.io.base.test_utils.temporary_filename()]
+  try:
+    # try different readers:
+    for reader in [
+        reader_different_size_C,
+        reader_different_size_F,
+        reader_same_size_C,
+        reader_same_size_F,
+        reader_different_size_C2,
+        reader_different_size_F2,
+        reader_same_size_C2,
+        reader_same_size_F2,
+    ]:
+      # save some data in files
+      for i, path in enumerate(paths):
+        bob.bio.base.save(reader(i + 1), path)
+      # test when all data is present
+      reference = oracle(bob.bio.base.load, paths)
+      numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) ==
+                reference)
+      # delete the first one
+      os.remove(paths[0])
+      reference = oracle(bob.bio.base.load, paths[1:])
+      target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
+                                            True)
+      numpy.all(target == reference)
+      # save back first one and delete second one
+      bob.bio.base.save(reader(1), paths[0])
+      os.remove(paths[1])
+      reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
+      target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False,
+                                            True)
+      numpy.all(target == reference)
+      # Check if RuntimeError is raised when one of the files is missing and
+      # allow_missing_files is False
+      with nose.tools.assert_raises(RuntimeError):
+        bob.bio.base.vstack_features(bob.bio.base.load, paths)
+      # Check if ValueError is raised.
+      with nose.tools.assert_raises(ValueError):
+        bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
+  finally:
+    try:
+      for path in paths:
+        os.remove(path)
+    except Exception:
+      pass
 def test_sampling():
  # test selection of elements

--- a/bob/bio/base/utils/io.py
+++ b/bob/bio/base/utils/io.py
@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
  close_compressed(filename, hdf5, compression_type, create_link)
-def _generate_features(reader, paths, allow_missing_files=False):
+def _generate_features(reader, paths, same_size=False,
-  """Load and stack features a memory efficient way. This function is meant to
+                       allow_missing_files=False):
-  be used inside :py:func:`vstack_features`.
+  """Load and stack features in a memory efficient way. This function is meant
+  to be used inside :py:func:`vstack_features`.
  Parameters
  ----------
@@ -185,8 +186,10 @@ def _generate_features(reader, paths, allow_missing_files=False):
      See the documentation of :py:func:`vstack_features`.
  paths : ``collections.Iterable``
      See the documentation of :py:func:`vstack_features`.
+  same_size : bool, optional
+      See the documentation of :py:func:`vstack_features`.
  allow_missing_files : :obj:`bool`, optional
-      If ``True``, it ignores files that doesn't exists
+      See the documentation of :py:func:`vstack_features`.
  Yields
  ------
@@ -195,23 +198,26 @@ def _generate_features(reader, paths, allow_missing_files=False):
      features and the shape of the first feature. The rest of objects are
      the actual values in features. The features are returned in C order.
  """
-  shape_check = False
+  shape_determined = False
  for i, path in enumerate(paths):
    if allow_missing_files and not os.path.isfile(path):
-        logger.debug("... The file {0}, that does not exist, has been ignored . ".format(path))
+      logger.debug("... File %s, that does not exist, has been ignored.", path)
-        continue
+      continue
    feature = numpy.atleast_2d(reader(path))
    feature = numpy.ascontiguousarray(feature)
-    if not shape_check:
+    if not shape_determined:
-      shape_check = True
+      shape_determined = True
      dtype = feature.dtype
      shape = list(feature.shape)
      yield (dtype, shape)
    else:
-      # make sure all features have the same shape[1:] and dtype
+      # make sure all features have the same shape and dtype
-      assert shape[1:] == list(feature.shape[1:])
+      if same_size:
+        assert shape == list(feature.shape)
+      else:
+        assert shape[1:] == list(feature.shape[1:])
      assert dtype == feature.dtype
    for value in feature.flat:
@@ -232,23 +238,29 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
      dimension. First dimension should correspond to the number of samples.
  paths : ``collections.Iterable``
      An iterable of paths to iterate on. Whatever is inside path is given to
-      ``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
+      ``reader`` so they do not need to be necessarily paths to actual files.
+      If ``same_size`` is ``True``, ``len(paths)`` must be valid.
  same_size : :obj:`bool`, optional
      If ``True``, it assumes that arrays inside all the paths are the same
      shape. If you know the features are the same size in all paths, set this
      to ``True`` to improve the performance.
  allow_missing_files : :obj:`bool`, optional
-      If ``True``, it ignores files that doesn't exists
+      If ``True``, it assumes that the items inside paths are actual files and
+      ignores the ones that do not exist.
  Returns
  -------
  numpy.ndarray
      The read features with the shape (n_samples, \*features_shape[1:]).
+  Raises
+  ------
+  ValueError
+      If both same_size and allow_missing_files are ``True``.
  Examples
  --------
-  This function is equivalent to calling
+  In its simplest form, this function is equivalent to calling
  ``numpy.vstack(reader(p) for p in paths)``.
  >>> import numpy
@@ -288,8 +300,13 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
         [4, 5],
         [6, 7],
         [8, 9]])
  """
-  iterable = _generate_features(reader, paths, allow_missing_files=allow_missing_files)
+  if same_size and allow_missing_files:
+    raise ValueError("Both same_size and allow_missing_files cannot be True at"
+                     " the same time.")
+  iterable = _generate_features(
+      reader, paths, allow_missing_files=allow_missing_files)
  dtype, shape = next(iterable)
  if same_size:
    total_size = int(len(paths) * numpy.prod(shape))
@@ -297,7 +314,7 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
  else:
    all_features = numpy.fromiter(iterable, dtype)
-  # the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3, 3).
+  # the shape is assumed to be (n_samples, ...), e.g. (5, 2) or (5, 3, 4).
  shape = list(shape)
  shape[0] = -1
  return numpy.reshape(all_features, shape, order='C')