Commit b6c65c06 authored by Manuel Günther's avatar Manuel Günther
Browse files

Implemented the Preprocessor.writes_data flag so that preprocessing can be skipped more easily

parent 9a25a354
......@@ -27,7 +27,7 @@ class Filename (Preprocessor):
def __init__(self):
    """Initializes the :py:class:`Preprocessor` base class with ``writes_data=False``."""
    # writes_data=False marks this preprocessor as one that does not write any data.
    Preprocessor.__init__(self, writes_data=False)
# The call function (i.e. the operator() in C++ terms)
......@@ -33,9 +33,10 @@ class Preprocessor:
A list of keyword arguments to be written in the :py:meth:`__str__` function.
def __init__(self, **kwargs):
def __init__(self, writes_data = True, **kwargs):
# Each class needs to have a constructor taking
# all the parameters that are required for the preprocessing as arguments
self.writes_data = writes_data
self._kwargs = kwargs
......@@ -102,7 +102,7 @@ def extract(extractor, preprocessor, groups=None, indices = None, allow_missing_
data_file = data_files[i]
feature_file = feature_files[i]
if not os.path.exists(data_file):
if not os.path.exists(data_file) and preprocessor.writes_data:
if allow_missing_files:
logger.debug("... Cannot find preprocessed data file %s; skipping", data_file)
......@@ -33,6 +33,11 @@ def preprocess(preprocessor, groups = None, indices = None, allow_missing_files
force : bool
If given, files are regenerated, even if they already exist.
if not preprocessor.writes_data:
# The preprocessor does not write anything, so no need to call it"Skipping preprocessing as preprocessor does not write any data")
# the file selector object
fs = FileSelector.instance()
......@@ -110,9 +115,10 @@ def read_preprocessed_data(file_names, preprocessor, split_by_client = False, al
preprocessed : [object] or [[object]]
The list of preprocessed data, in the same order as in the ``file_names``.
file_names = utils.filter_missing_files(file_names, split_by_client, allow_missing_files)
file_names = utils.filter_missing_files(file_names, split_by_client, allow_missing_files and preprocessor.writes_data)
if split_by_client:
return [[preprocessor.read_data(f) for f in client_files] for client_files in file_names]
preprocessed = [[preprocessor.read_data(f) for f in client_files] for client_files in file_names]
return [preprocessor.read_data(f) for f in file_names]
preprocessed = [preprocessor.read_data(f) for f in file_names]
return utils.filter_none(preprocessed, split_by_client)
......@@ -22,6 +22,21 @@ def filter_missing_files(file_names, split_by_client=False, allow_missing_files=
return existing_files
def filter_none(data, split_by_client=False):
    """This function filters out ``None`` values from the given list (or list of lists, when ``split_by_client`` is enabled).

    The input is not modified; a new list is returned.

    **Parameters:**

    data : [object] or [[object]]
      The elements to filter. When ``split_by_client`` is enabled, this is a
      list of per-client lists.

    split_by_client : bool
      If enabled, ``None`` values are removed from each inner list, and inner
      lists that end up empty are dropped entirely.

    **Returns:**

    existing_data : [object] or [[object]]
      The remaining elements, in their original order.
    """
    if split_by_client:
        # filter out missing files and empty clients
        per_client = ([d for d in client_data if d is not None] for client_data in data)
        return [client_data for client_data in per_client if client_data]

    # filter out missing files; only ``None`` is removed, other falsy values are kept
    return [d for d in data if d is not None]
def check_file(filename, force, expected_file_size = 1):
"""Checks if the file with the given ``filename`` exists and has size greater or equal to ``expected_file_size``.
If the file is to small, **or** if the ``force`` option is set to ``True``, the file is removed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment