From 9ae5f1c95f5874f8974949c1f6a594e21547d83c Mon Sep 17 00:00:00 2001
From: Amir MOHAMMADI <amir.mohammadi@idiap.ch>
Date: Mon, 12 Jun 2017 10:40:51 +0200
Subject: [PATCH] Add a function to read features with generators

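Stacking features with numpy.vstack([reader(p) for p in paths]) keeps
two copies of the data in memory: the temporary list of arrays and the
stacked result. The new bob.bio.base.vstack_features reads features
through a generator and numpy.fromiter instead, so the full data is not
duplicated.

A minimal usage sketch (my_reader and paths below are placeholders for
your own loading function and list of files):

    import bob.bio.base

    # my_reader(path) returns a 2D array with one sample per row
    features = bob.bio.base.vstack_features(my_reader, paths)
    # pass same_size=True when every file holds the same number of
    # samples, so the output array can be pre-allocated
    features = bob.bio.base.vstack_features(my_reader, paths,
                                            same_size=True)
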
---
 bob/bio/base/algorithm/BIC.py      |   2 +-
 bob/bio/base/algorithm/Distance.py |   2 +-
 bob/bio/base/algorithm/LDA.py      |   6 +-
 bob/bio/base/algorithm/PLDA.py     |   6 +-
 bob/bio/base/script/fuse_scores.py |   4 +-
 bob/bio/base/test/test_utils.py    |  77 ++++++++++++-
 bob/bio/base/utils/io.py           | 170 ++++++++++++++++++++++++++---
 version.txt                        |   2 +-
 8 files changed, 241 insertions(+), 28 deletions(-)

diff --git a/bob/bio/base/algorithm/BIC.py b/bob/bio/base/algorithm/BIC.py
index 42aa785c..b9be9138 100644
--- a/bob/bio/base/algorithm/BIC.py
+++ b/bob/bio/base/algorithm/BIC.py
@@ -103,7 +103,7 @@ class BIC(Algorithm):
 
     def _trainset_for(self, pairs):
         """Computes the array containing the comparison results for the given set of image pairs."""
-        return numpy.vstack([self.comparison_function(f1, f2) for (f1, f2) in pairs])
+        return numpy.vstack(self.comparison_function(f1, f2) for (f1, f2) in pairs)
 
     def train_enroller(self, train_features, enroller_file):
         """Trains the BIC by computing intra-personal and extra-personal subspaces.
diff --git a/bob/bio/base/algorithm/Distance.py b/bob/bio/base/algorithm/Distance.py
index c6a23596..e0fc5261 100644
--- a/bob/bio/base/algorithm/Distance.py
+++ b/bob/bio/base/algorithm/Distance.py
@@ -70,7 +70,7 @@ class Distance (Algorithm):
     assert len(enroll_features)
     [self._check_feature(feature) for feature in enroll_features]
     # just store all the features
-    return numpy.vstack([f.flatten() for f in enroll_features])
+    return numpy.vstack(f.flatten() for f in enroll_features)
 
 
   def score(self, model, probe):
diff --git a/bob/bio/base/algorithm/LDA.py b/bob/bio/base/algorithm/LDA.py
index b0cb34b9..5bcc21e6 100644
--- a/bob/bio/base/algorithm/LDA.py
+++ b/bob/bio/base/algorithm/LDA.py
@@ -109,7 +109,7 @@ class LDA (Algorithm):
       if len(client_files) < 2:
         logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
         continue
-      data.append(numpy.vstack([feature.flatten() for feature in client_files]))
+      data.append(numpy.vstack(feature.flatten() for feature in client_files))
 
     # Returns the list of lists of arrays
     return data
@@ -117,7 +117,7 @@ class LDA (Algorithm):
 
   def _train_pca(self, training_set):
     """Trains and returns a LinearMachine that is trained using PCA"""
-    data_list = [feature for client in training_set for feature in client]
+    data_list = (feature for client in training_set for feature in client)
     data = numpy.vstack(data_list)
 
     logger.info("  -> Training Linear Machine using PCA")
@@ -145,7 +145,7 @@ class LDA (Algorithm):
 
   def _perform_pca(self, machine, training_set):
     """Perform PCA on data of the training set"""
-    return [numpy.vstack([machine(feature) for feature in client_features]) for client_features in training_set]
+    return [numpy.vstack(machine(feature) for feature in client_features) for client_features in training_set]
 
 
   def train_projector(self, training_features, projector_file):
diff --git a/bob/bio/base/algorithm/PLDA.py b/bob/bio/base/algorithm/PLDA.py
index 712b9d0f..e29499d2 100644
--- a/bob/bio/base/algorithm/PLDA.py
+++ b/bob/bio/base/algorithm/PLDA.py
@@ -72,7 +72,7 @@ class PLDA (Algorithm):
 
   def _train_pca(self, training_set):
     """Trains and returns a LinearMachine that is trained using PCA"""
-    data = numpy.vstack([feature for feature in training_set])
+    data = numpy.vstack(feature for feature in training_set)
 
     logger.info("  -> Training LinearMachine using PCA ")
     trainer = bob.learn.linear.PCATrainer()
@@ -103,7 +103,7 @@ class PLDA (Algorithm):
       if len(client_files) < 2:
         logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
         continue
-      data.append(numpy.vstack([feature.flatten() for feature in client_files]))
+      data.append(numpy.vstack(feature.flatten() for feature in client_files))
 
     # Returns the list of lists of arrays
     return data
@@ -179,7 +179,7 @@ class PLDA (Algorithm):
     In this base class implementation, it computes the scores for each probe file using the 'score' method,
     and fuses the scores using the fusion method specified in the constructor of this class."""
     if self.pca_machine is not None:
-      probes = [self.pca_machine(probe) for probe in probes]
+      probes = (self.pca_machine(probe) for probe in probes)
     # forward
     if self.score_set == 'joint_likelihood':
       return model.log_likelihood_ratio(numpy.vstack(probes))
diff --git a/bob/bio/base/script/fuse_scores.py b/bob/bio/base/script/fuse_scores.py
index 693e6ec6..29152981 100755
--- a/bob/bio/base/script/fuse_scores.py
+++ b/bob/bio/base/script/fuse_scores.py
@@ -70,8 +70,8 @@ def main(command_line_options = None):
   import numpy
 
   trainer = bob.learn.linear.CGLogRegTrainer(0.5, args.convergence_threshold, args.max_iterations, mean_std_norm=not args.no_whitening)
-  data_neg = numpy.vstack([data[k][0] for k in range(n_systems)]).T
-  data_pos = numpy.vstack([data[k][1] for k in range(n_systems)]).T
+  data_neg = numpy.vstack(data[k][0] for k in range(n_systems)).T
+  data_pos = numpy.vstack(data[k][1] for k in range(n_systems)).T
   machine = trainer.train(data_neg, data_pos)
 
   # fuse development scores
diff --git a/bob/bio/base/test/test_utils.py b/bob/bio/base/test/test_utils.py
index 72ebfbbd..8c7d3d4a 100644
--- a/bob/bio/base/test/test_utils.py
+++ b/bob/bio/base/test/test_utils.py
@@ -3,7 +3,7 @@ import bob.learn.linear
 import pkg_resources
 import os
 import numpy
-
+import nose
 import bob.io.base.test_utils
 
 from . import utils
@@ -84,6 +84,81 @@ def test_io():
     if os.path.exists(filename):
       os.remove(filename)
 
+
+def test_io_vstack():
+
+  paths = [1, 2, 3, 4, 5]
+
+  def oracle(reader, paths):
+    return numpy.vstack([reader(p) for p in paths])
+
+  def reader_same_size_C(path):
+    return numpy.arange(10).reshape(5, 2)
+
+  def reader_different_size_C(path):
+    return numpy.arange(2 * path).reshape(path, 2)
+
+  def reader_same_size_F(path):
+    return numpy.asfortranarray(numpy.arange(10).reshape(5, 2))
+
+  def reader_different_size_F(path):
+    return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2))
+
+  def reader_same_size_C2(path):
+    return numpy.arange(30).reshape(5, 2, 3)
+
+  def reader_different_size_C2(path):
+    return numpy.arange(6 * path).reshape(path, 2, 3)
+
+  def reader_same_size_F2(path):
+    return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3))
+
+  def reader_different_size_F2(path):
+    return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3))
+
+  def reader_wrong_size(path):
+    return numpy.arange(2 * path).reshape(2, path)
+
+  # test C and F readers
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_different_size_C, paths, False) ==
+      oracle(reader_different_size_C, paths))
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_different_size_F, paths, False) ==
+      oracle(reader_different_size_F, paths))
+
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_C, paths, False) ==
+      oracle(reader_same_size_C, paths))
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_F, paths, False) ==
+      oracle(reader_same_size_F, paths))
+
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
+      oracle(reader_same_size_C, paths))
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_F, paths, True) ==
+      oracle(reader_same_size_F, paths))
+
+  # test 3-dimensional readers
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_different_size_C2, paths, False) ==
+      oracle(reader_different_size_C2, paths))
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_different_size_F2, paths, False) ==
+      oracle(reader_different_size_F2, paths))
+
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
+      oracle(reader_same_size_C2, paths))
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
+      oracle(reader_same_size_F2, paths))
+
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
+      oracle(reader_same_size_C2, paths))
+  assert numpy.all(
+      bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
+      oracle(reader_same_size_F2, paths))
+
+  with nose.tools.assert_raises(AssertionError):
+    bob.bio.base.vstack_features(reader_wrong_size, paths)
+
+
 def test_sampling():
   # test selection of elements
   indices = bob.bio.base.selected_indices(100, 10)
diff --git a/bob/bio/base/utils/io.py b/bob/bio/base/utils/io.py
index 3e25a525..f8dcddcc 100644
--- a/bob/bio/base/utils/io.py
+++ b/bob/bio/base/utils/io.py
@@ -1,12 +1,16 @@
 import os
-import tempfile, tarfile
-
+import tempfile
+import tarfile
+import collections  # this is needed for the sphinx documentation
+import functools  # this is needed for the sphinx documentation
+import numpy
 import logging
-logger = logging.getLogger("bob.bio.base")
+logger = logging.getLogger(__name__)
 
 from .. import database
 import bob.io.base
 
+
 def filter_missing_files(file_names, split_by_client=False, allow_missing_files=True):
   """This function filters out files that do not exist, but only if ``allow_missing_files`` is set to ``True``, otherwise the list of ``file_names`` is returned unaltered."""
 
@@ -15,8 +19,10 @@ def filter_missing_files(file_names, split_by_client=False, allow_missing_files=
 
   if split_by_client:
     # filter out missing files and empty clients
-    existing_files = [[f for f in client_files if os.path.exists(f)] for client_files in file_names]
-    existing_files = [client_files for client_files in existing_files if client_files]
+    existing_files = [
+        [f for f in client_files if os.path.exists(f)] for client_files in file_names]
+    existing_files = [
+        client_files for client_files in existing_files if client_files]
   else:
     # filter out missing files
     existing_files = [f for f in file_names if os.path.exists(f)]
@@ -28,17 +34,17 @@ def filter_none(data, split_by_client=False):
 
   if split_by_client:
     # filter out missing files and empty clients
-    existing_data = [[d for d in client_data if d is not None] for client_data in data]
-    existing_data = [client_data for client_data in existing_data if client_data]
+    existing_data = [[d for d in client_data if d is not None]
+                     for client_data in data]
+    existing_data = [
+        client_data for client_data in existing_data if client_data]
   else:
     # filter out missing files
     existing_data = [d for d in data if d is not None]
   return existing_data
 
 
-
-
-def check_file(filename, force, expected_file_size = 1):
+def check_file(filename, force, expected_file_size=1):
   """Checks if the file with the given ``filename`` exists and has size greater or equal to ``expected_file_size``.
   If the file is too small, **or** if the ``force`` option is set to ``True``, the file is removed.
   This function returns ``True`` if the file exists (and has not been removed), otherwise ``False``"""
@@ -86,18 +92,20 @@ def load(file):
   else:
     return bob.io.base.load(file)
 
+
 def save(data, file, compression=0):
   """Saves the data to file using HDF5. The given file might be an HDF5 file open for writing, or a string.
   If the given data contains a ``save`` method, this method is called with the given HDF5 file.
   Otherwise the data is written to the HDF5 file using the given compression."""
-  f = file if isinstance(file, bob.io.base.HDF5File) else bob.io.base.HDF5File(file, 'w')
+  f = file if isinstance(
+      file, bob.io.base.HDF5File) else bob.io.base.HDF5File(file, 'w')
   if hasattr(data, 'save'):
     data.save(f)
   else:
     f.set("array", data, compression=compression)
 
 
-def open_compressed(filename, open_flag = 'r', compression_type='bz2'):
+def open_compressed(filename, open_flag='r', compression_type='bz2'):
   """Opens a compressed HDF5File with the given opening flags.
   For the 'r' flag, the given compressed file will be extracted to a local space.
   For 'w', an empty HDF5File is created.
@@ -108,7 +116,7 @@ def open_compressed(filename, open_flag = 'r', compression_type='bz2'):
 
   if open_flag == 'r':
     # extract the HDF5 file from the given file name into a temporary file name
-    tar = tarfile.open(filename, mode="r:"+compression_type)
+    tar = tarfile.open(filename, mode="r:" + compression_type)
     memory_file = tar.extractfile(tar.next())
     real_file = open(hdf5_file_name, 'wb')
     real_file.write(memory_file.read())
@@ -130,13 +138,14 @@ def close_compressed(filename, hdf5_file, compression_type='bz2', create_link=Fa
 
   if is_writable:
     # create compressed tar file
-    tar = tarfile.open(filename, mode="w:"+compression_type)
+    tar = tarfile.open(filename, mode="w:" + compression_type)
     tar.add(hdf5_file_name, os.path.basename(filename))
     tar.close()
 
   if create_link:
-    extension = {'':'.tar', 'bz2':'.tar.bz2', 'gz':'tar.gz'}[compression_type]
-    link_file = filename+extension
+    extension = {'': '.tar', 'bz2': '.tar.bz2',
+                 'gz': '.tar.gz'}[compression_type]
+    link_file = filename + extension
     if not os.path.exists(link_file):
       os.symlink(os.path.basename(filename), link_file)
 
@@ -165,3 +174,132 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
   hdf5 = open_compressed(filename, 'w')
   save(data, hdf5)
   close_compressed(filename, hdf5, compression_type, create_link)
+
+
+def _generate_features(reader, paths):
+  """Load and stack features a memory efficient way. This function is meant to
+  be used inside :py:func:`vstack_features`.
+
+  Parameters
+  ----------
+  reader : collections.Callable
+      See the documentation of :py:func:`vstack_features`.
+  paths : collections.Iterable
+      See the documentation of :py:func:`vstack_features`.
+
+  Yields
+  ------
+  object
+      The first object yielded is the :py:class:`numpy.dtype` of the
+      features. The second is the shape of the first feature (as a list).
+      The remaining objects are the actual feature values, yielded in
+      C order.
+
+  Examples
+  --------
+  This function can be used with :py:func:`numpy.fromiter`:
+
+  >>> def reader(path):
+  ...     # in each file, there are 5 samples and features are 2 dimensional.
+  ...     return numpy.arange(10).reshape(5,2)
+  >>> paths = ['path1', 'path2']
+  >>> iterator = _generate_features(reader, paths)
+  >>> dtype = next(iterator)
+  >>> dtype
+  dtype('int64')
+  >>> first_feature_shape = next(iterator)
+  >>> first_feature_shape
+  [5, 2]
+  >>> all_features_flat = numpy.fromiter(iterator, dtype)
+  >>> all_features_flat
+  array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+  >>> all_features = all_features_flat.reshape(-1, first_feature_shape[1])
+  >>> all_features
+  array([[0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9],
+         [0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9]])
+  >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
+  >>> assert numpy.all(all_features == all_features_with_more_memory)
+
+  You can allocate the output array at once and improve performance when all
+  features have the same shape and the total number of paths is known:
+
+  >>> iterator = _generate_features(reader, paths)
+  >>> dtype = next(iterator)
+  >>> first_feature_shape = next(iterator)
+  >>> total_size = len(paths) * numpy.prod(first_feature_shape)
+  >>> all_features_flat = numpy.fromiter(iterator, dtype, total_size)
+  >>> all_features = all_features_flat.reshape(-1, first_feature_shape[1])
+  >>> all_features
+  array([[0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9],
+         [0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9]])
+  """
+  for i, path in enumerate(paths):
+    feature = numpy.atleast_2d(reader(path))
+    feature = numpy.ascontiguousarray(feature)
+    if i == 0:
+      dtype = feature.dtype
+      shape = list(feature.shape)
+      yield dtype
+      yield shape
+    else:
+      # make sure all features have the same shape[1:]
+      assert shape[1:] == list(feature.shape[1:])
+
+    for value in feature.flat:
+      yield value
+
+
+def vstack_features(reader, paths, same_size=False):
+  """Stacks all features in a memory efficient way.
+
+  Parameters
+  ----------
+  reader : collections.Callable
+      The function to load the features. It should take exactly one
+      argument: the path to the features. Use :py:func:`functools.partial`
+      to adapt your reader to this signature. The features returned by
+      ``reader`` are expected to have the same :py:class:`numpy.dtype` and
+      the same shape except for their first dimension, which corresponds to
+      the number of samples.
+  paths : collections.Iterable
+      An iterable of paths to iterate over. Each item is passed to
+      ``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
+  same_size : :obj:`bool`, optional
+      If ``True``, assumes that the arrays in all paths have the same shape.
+      Set this to ``True`` when you know all features have the same size; it
+      improves performance.
+
+  Returns
+  -------
+  numpy.ndarray
+      The stacked features; the first dimension is the total number of
+      samples and the remaining dimensions match those of the individual
+      features.
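+
+  Examples
+  --------
+  A minimal sketch; the ``reader`` below is a stand-in for any function
+  that loads one file of features:
+
+  >>> def reader(path):
+  ...     # pretend each file holds 5 samples of 2-dimensional features
+  ...     return numpy.arange(10).reshape(5, 2)
+  >>> paths = ['path1', 'path2']
+  >>> vstack_features(reader, paths).shape
+  (10, 2)
+  >>> vstack_features(reader, paths, same_size=True).shape
+  (10, 2)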
+  """
+  iterable = _generate_features(reader, paths)
+  dtype = next(iterable)
+  shape = next(iterable)
+  if same_size:
+    total_size = int(len(paths) * numpy.prod(shape))
+    all_features = numpy.fromiter(iterable, dtype, total_size)
+  else:
+    all_features = numpy.fromiter(iterable, dtype)
+
+  # the shape is assumed to be (n_samples, ...); e.g., (5, 2) or (5, 3, 3).
+  shape = list(shape)
+  shape[0] = -1
+  return numpy.reshape(all_features, shape, order='C')
diff --git a/version.txt b/version.txt
index ba1053ec..878694a6 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-3.1.3b0
\ No newline at end of file
+3.2.0b0
-- 
GitLab