Commit 37e82fd8 authored by Tiago de Freitas Pereira

Merge branch 'vstack_features' into 'dask-pipelines'

Remove vstack_features -> use the one in bob.pipelines

See merge request !187
parents c95c92d6 42f23502
Related merge requests: !187 "Remove vstack_features -> use the one in bob.pipelines" and !180 "[dask] Preparing bob.bio.base for dask pipelines"
Pipeline #39822 passed
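
For downstream code, the change amounts to importing the helper from bob.pipelines instead of bob.bio.base. A minimal migration sketch, assuming bob.pipelines exposes an equivalent vstack_features under bob.pipelines.utils (the exact import path is an assumption; check the bob.pipelines API):

import numpy

# Before this merge request:
# from bob.bio.base import vstack_features
# After this merge request (assumed location; verify against bob.pipelines):
from bob.pipelines.utils import vstack_features

def reader(path):
    # toy reader: every "path" yields 5 two-dimensional samples
    return numpy.arange(10).reshape(5, 2)

stacked = vstack_features(reader, ["path1", "path2"])
assert stacked.shape == (10, 2)
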
@@ -3,7 +3,6 @@ import bob.learn.linear
import pkg_resources
import os
import numpy
import nose
import bob.io.base.test_utils
from . import utils
@@ -67,118 +66,6 @@ def test_io():
    os.remove(filename)
def test_io_vstack():
    paths = [1, 2, 3, 4, 5]

    def oracle(reader, paths):
        return numpy.vstack([reader(p) for p in paths])

    def reader_same_size_C(path):
        return numpy.arange(10).reshape(5, 2)

    def reader_different_size_C(path):
        return numpy.arange(2 * path).reshape(path, 2)

    def reader_same_size_F(path):
        return numpy.asfortranarray(numpy.arange(10).reshape(5, 2))

    def reader_different_size_F(path):
        return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2))

    def reader_same_size_C2(path):
        return numpy.arange(30).reshape(5, 2, 3)

    def reader_different_size_C2(path):
        return numpy.arange(6 * path).reshape(path, 2, 3)

    def reader_same_size_F2(path):
        return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3))

    def reader_different_size_F2(path):
        return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3))

    def reader_wrong_size(path):
        return numpy.arange(2 * path).reshape(2, path)

    # when same_size is False
    for reader in [
        reader_different_size_C,
        reader_different_size_F,
        reader_same_size_C,
        reader_same_size_F,
        reader_different_size_C2,
        reader_different_size_F2,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        numpy.all(bob.bio.base.vstack_features(reader, paths) ==
                  oracle(reader, paths))

    # when same_size is True
    for reader in [
        reader_same_size_C,
        reader_same_size_F,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        numpy.all(bob.bio.base.vstack_features(reader, paths, True) ==
                  oracle(reader, paths))

    with nose.tools.assert_raises(AssertionError):
        bob.bio.base.vstack_features(reader_wrong_size, paths)

    # test actual files
    paths = [bob.io.base.test_utils.temporary_filename(),
             bob.io.base.test_utils.temporary_filename(),
             bob.io.base.test_utils.temporary_filename()]
    try:
        # try different readers:
        for reader in [
            reader_different_size_C,
            reader_different_size_F,
            reader_same_size_C,
            reader_same_size_F,
            reader_different_size_C2,
            reader_different_size_F2,
            reader_same_size_C2,
            reader_same_size_F2,
        ]:
            # save some data in files
            for i, path in enumerate(paths):
                bob.bio.base.save(reader(i + 1), path)
            # test when all data is present
            reference = oracle(bob.bio.base.load, paths)
            numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) ==
                      reference)
            # delete the first one
            os.remove(paths[0])
            reference = oracle(bob.bio.base.load, paths[1:])
            target = bob.bio.base.vstack_features(bob.bio.base.load, paths,
                                                  False, True)
            numpy.all(target == reference)
            # save back first one and delete second one
            bob.bio.base.save(reader(1), paths[0])
            os.remove(paths[1])
            reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
            target = bob.bio.base.vstack_features(bob.bio.base.load, paths,
                                                  False, True)
            numpy.all(target == reference)
            # Check that a RuntimeError is raised when one of the files is
            # missing and allow_missing_files is False
            with nose.tools.assert_raises(RuntimeError):
                bob.bio.base.vstack_features(bob.bio.base.load, paths)
            # Check that a ValueError is raised when same_size and
            # allow_missing_files are both True
            with nose.tools.assert_raises(ValueError):
                bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
    finally:
        try:
            for path in paths:
                os.remove(path)
        except Exception:
            pass

def test_sampling():
    # test selection of elements
    indices = bob.bio.base.selected_indices(100, 10)
......
@@ -173,150 +173,3 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
    hdf5 = open_compressed(filename, 'w')
    save(data, hdf5)
    close_compressed(filename, hdf5, compression_type, create_link)

def _generate_features(reader, paths, same_size=False,
                       allow_missing_files=False):
    """Load and stack features in a memory-efficient way. This function is
    meant to be used inside :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.Callable``
        See the documentation of :py:func:`vstack_features`.
    paths : ``collections.Iterable``
        See the documentation of :py:func:`vstack_features`.
    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.
    allow_missing_files : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object yielded is a tuple of the :py:class:`numpy.dtype` of
        the features and the shape of the first feature. The remaining objects
        are the feature values themselves, yielded one element at a time in C
        order.
    """
    shape_determined = False
    for i, path in enumerate(paths):
        if allow_missing_files and not os.path.isfile(path):
            logger.debug("... File %s, that does not exist, has been ignored.",
                         path)
            continue
        feature = numpy.atleast_2d(reader(path))
        feature = numpy.ascontiguousarray(feature)
        if not shape_determined:
            shape_determined = True
            dtype = feature.dtype
            shape = list(feature.shape)
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype
            if same_size:
                assert shape == list(feature.shape)
            else:
                assert shape[1:] == list(feature.shape[1:])
            assert dtype == feature.dtype
        for value in feature.flat:
            yield value
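
The generator protocol above (one ``(dtype, shape)`` tuple followed by the flattened values) is what lets the caller build the final array with ``numpy.fromiter`` without holding every feature array in memory at once. A minimal consumption sketch with a toy reader, mirroring what ``vstack_features`` below does:

import numpy

# sketch only: toy reader, two "paths", 3 x 2 samples per path
iterable = _generate_features(lambda p: numpy.arange(6).reshape(3, 2), [1, 2])
dtype, shape = next(iterable)             # first item fixes dtype and shape
flat = numpy.fromiter(iterable, dtype)    # remaining items are the raw values
stacked = flat.reshape([-1] + shape[1:])  # back to (n_samples, 2)
assert stacked.shape == (6, 2)
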

def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
    """Stacks all features in a memory-efficient way.

    Parameters
    ----------
    reader : ``collections.Callable``
        The function to load the features. The function should take exactly
        one argument: the path to the features. Use :any:`functools.partial`
        to accommodate your reader to this signature. The features returned
        by ``reader`` are expected to have the same :py:class:`numpy.dtype`
        and the same shape except for their first dimension. The first
        dimension should correspond to the number of samples.
    paths : ``collections.Iterable``
        An iterable of paths to iterate on. Whatever is inside ``paths`` is
        given to ``reader``, so the items do not necessarily need to be paths
        to actual files. If ``same_size`` is ``True``, ``len(paths)`` must be
        valid.
    same_size : :obj:`bool`, optional
        If ``True``, it is assumed that the arrays inside all the paths have
        the same shape. If you know the features are the same size in all
        paths, set this to ``True`` to improve performance.
    allow_missing_files : :obj:`bool`, optional
        If ``True``, it is assumed that the items inside ``paths`` are actual
        files, and the ones that do not exist are ignored.

    Returns
    -------
    numpy.ndarray
        The read features with the shape ``(n_samples, \*features_shape[1:])``.

    Raises
    ------
    ValueError
        If both ``same_size`` and ``allow_missing_files`` are ``True``.

    Examples
    --------
    In its simplest form, this function is equivalent to calling
    ``numpy.vstack([reader(p) for p in paths])``, but it stacks the features
    in a memory-efficient way.

    >>> import numpy
    >>> from bob.bio.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5, 2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> all_features
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the output array at once to improve performance if you
    know that all features in ``paths`` have the same shape and the total
    number of paths is known:

    >>> vstack_features(reader, paths, same_size=True)
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    """
    if same_size and allow_missing_files:
        raise ValueError("Both same_size and allow_missing_files cannot be "
                         "True at the same time.")
    iterable = _generate_features(reader, paths, same_size, allow_missing_files)
    try:
        dtype, shape = next(iterable)
    except StopIteration:
        return numpy.array([])
    if same_size:
        total_size = int(len(paths) * numpy.prod(shape))
        all_features = numpy.fromiter(iterable, dtype, total_size)
    else:
        all_features = numpy.fromiter(iterable, dtype)
    # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4)
    shape = list(shape)
    shape[0] = -1
    return numpy.reshape(all_features, shape, order='C')
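
Since ``reader`` must take a single positional argument (the path), readers that need extra parameters can be adapted with :any:`functools.partial`, as the docstring suggests. A small illustrative sketch with a made-up reader (``read_subset`` is hypothetical, not part of bob.bio.base):

import functools
import numpy

def read_subset(path, n_samples):
    # hypothetical reader with an extra parameter; here "path" is just a number
    return numpy.full((n_samples, 2), path)

# freeze the extra argument so the reader matches the one-argument signature
reader = functools.partial(read_subset, n_samples=3)
stacked = vstack_features(reader, [1, 2])  # -> shape (6, 2)
assert stacked.shape == (6, 2)
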
@@ -17,7 +17,6 @@ IO-related functions
   bob.bio.base.open_compressed
   bob.bio.base.close_compressed
   bob.bio.base.check_file
   bob.bio.base.vstack_features
Pipelines
......