From b6c65c06ee75a33d357675c7e3d69d2ca20bd733 Mon Sep 17 00:00:00 2001
From: Manuel Gunther <siebenkopf@googlemail.com>
Date: Tue, 17 May 2016 13:00:12 -0600
Subject: [PATCH] Implemented Preprocessor.writes_data flag so that
 preprocessing can be skipped more easily

---
 bob/bio/base/preprocessor/Filename.py     |  2 +-
 bob/bio/base/preprocessor/Preprocessor.py |  3 ++-
 bob/bio/base/tools/extractor.py           |  2 +-
 bob/bio/base/tools/preprocessor.py        | 12 +++++++++---
 bob/bio/base/utils/io.py                  | 15 +++++++++++++++
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/bob/bio/base/preprocessor/Filename.py b/bob/bio/base/preprocessor/Filename.py
index 8d2f8ba0..a1ab0f64 100644
--- a/bob/bio/base/preprocessor/Filename.py
+++ b/bob/bio/base/preprocessor/Filename.py
@@ -27,7 +27,7 @@ class Filename (Preprocessor):
   """
 
   def __init__(self):
-    pass
+    Preprocessor.__init__(self, writes_data=False)
 
 
   # The call function (i.e. the operator() in C++ terms)
diff --git a/bob/bio/base/preprocessor/Preprocessor.py b/bob/bio/base/preprocessor/Preprocessor.py
index a01026a7..f5a7e16a 100644
--- a/bob/bio/base/preprocessor/Preprocessor.py
+++ b/bob/bio/base/preprocessor/Preprocessor.py
@@ -33,9 +33,10 @@ class Preprocessor:
     A list of keyword arguments to be written in the :py:meth:`__str__` function.
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, writes_data = True, **kwargs):
     # Each class needs to have a constructor taking
     # all the parameters that are required for the preprocessing as arguments
+    self.writes_data = writes_data
     self._kwargs = kwargs
     pass
 
diff --git a/bob/bio/base/tools/extractor.py b/bob/bio/base/tools/extractor.py
index 5872a4ba..a28016af 100644
--- a/bob/bio/base/tools/extractor.py
+++ b/bob/bio/base/tools/extractor.py
@@ -102,7 +102,7 @@ def extract(extractor, preprocessor, groups=None, indices = None, allow_missing_
     data_file = data_files[i]
     feature_file = feature_files[i]
 
-    if not os.path.exists(data_file):
+    if not os.path.exists(data_file) and preprocessor.writes_data:
       if allow_missing_files:
         logger.debug("... Cannot find preprocessed data file %s; skipping", data_file)
         continue
diff --git a/bob/bio/base/tools/preprocessor.py b/bob/bio/base/tools/preprocessor.py
index d28b2852..01dea478 100644
--- a/bob/bio/base/tools/preprocessor.py
+++ b/bob/bio/base/tools/preprocessor.py
@@ -33,6 +33,11 @@ def preprocess(preprocessor, groups = None, indices = None, allow_missing_files
   force : bool
     If given, files are regenerated, even if they already exist.
   """
+  if not preprocessor.writes_data:
+    # The preprocessor does not write anything, so no need to call it
+    logger.info("Skipping preprocessing as preprocessor does not write any data")
+    return
+
   # the file selector object
   fs = FileSelector.instance()
 
@@ -110,9 +115,10 @@ def read_preprocessed_data(file_names, preprocessor, split_by_client = False, al
   preprocessed : [object] or [[object]]
     The list of preprocessed data, in the same order as in the ``file_names``.
   """
-  file_names = utils.filter_missing_files(file_names, split_by_client, allow_missing_files)
+  file_names = utils.filter_missing_files(file_names, split_by_client, allow_missing_files and preprocessor.writes_data)
 
   if split_by_client:
-    return [[preprocessor.read_data(f) for f in client_files] for client_files in file_names]
+    preprocessed = [[preprocessor.read_data(f) for f in client_files] for client_files in file_names]
   else:
-    return [preprocessor.read_data(f) for f in file_names]
+    preprocessed = [preprocessor.read_data(f) for f in file_names]
+  return utils.filter_none(preprocessed, split_by_client)
diff --git a/bob/bio/base/utils/io.py b/bob/bio/base/utils/io.py
index 1e2851bc..1e73f499 100644
--- a/bob/bio/base/utils/io.py
+++ b/bob/bio/base/utils/io.py
@@ -22,6 +22,21 @@ def filter_missing_files(file_names, split_by_client=False, allow_missing_files=
   return existing_files
 
 
+def filter_none(data, split_by_client=False):
+  """This function filters out ``None`` values from the given list (or list of lists, when ``split_by_client`` is enabled)."""
+
+  if split_by_client:
+    # drop the None entries of each client first, then drop every client whose list ended up empty
+    existing_data = [[d for d in client_data if d is not None] for client_data in data]
+    existing_data = [client_data for client_data in existing_data if client_data]
+  else:
+    # flat list: drop the None entries
+    existing_data = [d for d in data if d is not None]
+  return existing_data
+
+
+
+
 def check_file(filename, force, expected_file_size = 1):
   """Checks if the file with the given ``filename`` exists and has size greater or equal to ``expected_file_size``.
   If the file is to small, **or** if the ``force`` option is set to ``True``, the file is removed.
-- 
GitLab