"""Tools for running the preprocessing step of a bob.bio.base experiment."""
import bob.io.base
import os

import logging
logger = logging.getLogger("bob.bio.base")

from .FileSelector import FileSelector
from .. import utils

10
11

def preprocess(preprocessor, groups = None, indices = None, allow_missing_files = False, force = False):
  """Preprocesses the original data of the database with the given preprocessor.

  The given ``preprocessor`` is used to preprocess all data required for the current experiment.
  It writes the preprocessed data into the directory specified by the :py:class:`bob.bio.base.tools.FileSelector`.
  By default, if target files already exist, they are not re-created.

  **Parameters:**

  preprocessor : :py:class:`bob.bio.base.preprocessor.Preprocessor` or derived
    The preprocessor, which should be applied to all data.

  groups : some of ``('world', 'dev', 'eval')`` or ``None``
    The list of groups, for which the data should be preprocessed.

  indices : (int, int) or None
    If specified, only the data for the given index range ``range(begin, end)`` should be preprocessed.
    This is usually given, when parallel threads are executed.

  allow_missing_files : bool
    If set to ``True``, files for which the preprocessor returns ``None`` (or whose
    original data file does not exist) are silently ignored.

  force : bool
    If given, files are regenerated, even if they already exist.
  """
  if not preprocessor.writes_data:
    # The preprocessor does not write anything, so no need to call it
    logger.info("Skipping preprocessing as preprocessor does not write any data")
    return

  # the file selector object
  fs = FileSelector.instance()

  # get the file lists
  data_files = fs.original_data_list(groups=groups)
  original_directory, original_extension = fs.original_directory_and_extension()
  preprocessed_data_files = fs.preprocessed_data_list(groups=groups)

  # select a subset of keys to iterate
  if indices is not None:
    index_range = range(indices[0], indices[1])
    logger.info("- Preprocessing: splitting of index range %s", str(indices))
  else:
    index_range = range(len(data_files))

  logger.info("- Preprocessing: processing %d data files from directory '%s' to directory '%s'", len(index_range), fs.directories['original'], fs.directories['preprocessed'])

  # read annotation files
  annotation_list = fs.annotation_list(groups=groups)

  # iterate over the selected files
  for i in index_range:
    preprocessed_data_file = preprocessed_data_files[i]
    file_object = data_files[i]
    file_name = file_object.make_path(original_directory, original_extension)

    # check for existence; skip work if a valid preprocessed file is already there
    if not utils.check_file(preprocessed_data_file, force,
                            preprocessor.min_preprocessed_file_size):
      logger.debug("... Processing original data file '%s'", file_name)

      # Maybe we have missing file in the database
      if not os.path.exists(file_name):
        if allow_missing_files:
          logger.debug("... Original data file is missing '%s' and will be skipped", file_name)
          continue
        else:
          raise RuntimeError("Original data file is missing '%s' " % file_name)

      # create output directory before reading the data file (is sometimes required,
      # when relative directories are specified, especially, including a .. somewhere)
      bob.io.base.create_directories_safe(os.path.dirname(preprocessed_data_file))

      data = preprocessor.read_original_data(file_object, original_directory, original_extension)

      # get the annotations; might be None
      annotations = fs.get_annotations(annotation_list[i])

      # call the preprocessor
      preprocessed_data = preprocessor(data, annotations)
      if preprocessed_data is None:
        if allow_missing_files:
          logger.debug("... Processing original data file '%s' was not successful", file_name)
          continue
        else:
          raise RuntimeError("Preprocessing of file '%s' was not successful" % file_name)

      # write the data
      preprocessor.write_data(preprocessed_data, preprocessed_data_file)

    else:
      logger.debug("... Skipping original data file '%s' since preprocessed data '%s' exists", file_name, preprocessed_data_file)


def read_preprocessed_data(file_names, preprocessor, split_by_client = False, allow_missing_files = False):
  """read_preprocessed_data(file_names, preprocessor, split_by_client = False, allow_missing_files = False) -> preprocessed

  Reads the preprocessed data from ``file_names`` using the given preprocessor.
  If ``split_by_client`` is set to ``True``, it is assumed that the ``file_names`` are already sorted by client.

  **Parameters:**

  file_names : [str] or [[str]]
    A list of names of files to be read.
    If ``split_by_client = True``, file names are supposed to be split into groups.

  preprocessor : :py:class:`bob.bio.base.preprocessor.Preprocessor` or derived
    The preprocessor, which can read the preprocessed data.

  split_by_client : bool
    Indicates if the given ``file_names`` are split into groups.

  allow_missing_files : bool
    If set to ``True``, preprocessed data files that are not found are silently ignored.

  **Returns:**

  preprocessed : [object] or [[object]]
    The list of preprocessed data, in the same order as in the ``file_names``.
  """
  # drop files that do not exist when missing files are tolerated; only relevant
  # when the preprocessor actually wrote data in the first place
  file_names = utils.filter_missing_files(file_names, split_by_client, allow_missing_files and preprocessor.writes_data)

  if split_by_client:
    # nested structure: one inner list of data objects per client
    preprocessed = [[preprocessor.read_data(f) for f in client_files] for client_files in file_names]
  else:
    preprocessed = [preprocessor.read_data(f) for f in file_names]
  # remove entries for which reading returned None
  return utils.filter_none(preprocessed, split_by_client)