import os
import tempfile
import tarfile
import logging

import collections  # this is needed for the sphinx documentation
import functools  # this is needed for the sphinx documentation

import numpy

import bob.io.base

from .. import database

logger = logging.getLogger("bob.bio.base")

def filter_missing_files(file_names, split_by_client=False, allow_missing_files=True):
  """Removes non-existing paths from ``file_names`` when ``allow_missing_files`` is ``True``; otherwise the list is returned unaltered.

  When ``split_by_client`` is enabled, ``file_names`` is a list of per-client lists;
  clients left with no existing files are dropped entirely."""

  if not allow_missing_files:
    return file_names

  if not split_by_client:
    # flat list: keep only the paths that exist on disk
    return [f for f in file_names if os.path.exists(f)]

  # per-client filtering: drop missing files first, then drop empty clients
  per_client = ([f for f in client if os.path.exists(f)] for client in file_names)
  return [client for client in per_client if client]


def filter_none(data, split_by_client=False):
  """Drops ``None`` entries from ``data`` (a list, or a list of per-client lists when ``split_by_client`` is enabled); clients whose lists become empty are removed as well."""

  if not split_by_client:
    # flat list: keep only the entries that are not None
    return [d for d in data if d is not None]

  # per-client filtering: remove None entries first, then drop empty clients
  per_client = ([d for d in client if d is not None] for client in data)
  return [client for client in per_client if client]


def check_file(filename, force, expected_file_size=1):
  """Checks whether ``filename`` exists and is at least ``expected_file_size`` bytes large.

  If the file is too small, **or** if ``force`` is set to ``True``, the file is removed.
  Returns ``True`` if the file exists (and has not been removed), otherwise ``False``."""
  if not os.path.exists(filename):
    return False
  if not force and os.path.getsize(filename) >= expected_file_size:
    return True
  # file is either stale (too small) or forcibly invalidated: remove it
  logger.debug("  .. Removing old file '%s'.", filename)
  os.remove(filename)
  return False


def read_original_data(biofile, directory, extension):
  """This function reads the original data using the given ``biofile`` instance.

  It simply calls ``load(directory, extension)`` from :py:class:`bob.bio.base.database.BioFile` or one of its derivatives.

  Parameters
  ----------

  ``biofile`` : :py:class:`bob.bio.base.database.BioFile` or one of its derivatives
    The file to read the original data.

  ``directory`` : str
    The base directory of the database.

  ``extension`` : str or ``None``
    The extension of the original data.
    Might be ``None`` if the ``biofile`` itself has the extension stored.

  Returns
  -------

  object:
    Whatever ``biofile.load`` returns; usually a :py:class:`numpy.ndarray`
  """
  assert isinstance(biofile, database.BioFile)
  return biofile.load(directory, extension)


def load(file):
  """Loads data from ``file``, which may be a :py:class:`bob.io.base.HDF5File` open for reading, or a path string."""
  if isinstance(file, bob.io.base.HDF5File):
    return file.read("array")
  return bob.io.base.load(file)

def save(data, file, compression=0):
  """Saves the data to file using HDF5.

  ``file`` may be an :py:class:`bob.io.base.HDF5File` already open for writing, or a
  path string, in which case a new HDF5 file is created (and closed again afterwards).
  If the given ``data`` contains a ``save`` method, this method is called with the HDF5 file.
  Otherwise the data is written to the HDF5 file using the given ``compression``."""
  # remember whether we own the handle: a caller-provided HDF5File must stay open,
  # but a file we create from a path string must be closed (the original leaked it)
  opened_here = not isinstance(file, bob.io.base.HDF5File)
  f = bob.io.base.HDF5File(file, 'w') if opened_here else file
  try:
    if hasattr(data, 'save'):
      data.save(f)
    else:
      f.set("array", data, compression=compression)
  finally:
    if opened_here:
      f.close()


def open_compressed(filename, open_flag='r', compression_type='bz2'):
  """Opens a compressed HDF5File with the given opening flags.

  For the 'r' flag, the given compressed file will be extracted to a local space.
  For 'w', an empty HDF5File is created.
  In any case, the opened HDF5File is returned, which needs to be closed using the
  close_compressed() function.
  """
  # create temporary HDF5 file name; mkstemp returns an *open* OS-level file
  # descriptor, which the original code discarded (leaking one fd per call)
  fd, hdf5_file_name = tempfile.mkstemp('.hdf5', 'bob_')
  os.close(fd)

  if open_flag == 'r':
    # extract the first member of the archive into the temporary HDF5 file
    tar = tarfile.open(filename, mode="r:" + compression_type)
    try:
      memory_file = tar.extractfile(tar.next())
      with open(hdf5_file_name, 'wb') as real_file:
        real_file.write(memory_file.read())
    finally:
      tar.close()

  return bob.io.base.HDF5File(hdf5_file_name, open_flag)


def close_compressed(filename, hdf5_file, compression_type='bz2', create_link=False):
  """Closes the compressed hdf5_file that was opened with open_compressed.

  When the file was opened for writing (using the 'w' flag in open_compressed), the
  created HDF5 file is compressed into the given file name.
  To be able to read the data using the real tools, a link with the correct extension
  might be created, when create_link is set to True.
  """
  hdf5_file_name = hdf5_file.filename
  is_writable = hdf5_file.writable
  hdf5_file.close()

  if is_writable:
    # compress the temporary HDF5 file into the target archive
    tar = tarfile.open(filename, mode="w:" + compression_type)
    try:
      tar.add(hdf5_file_name, os.path.basename(filename))
    finally:
      tar.close()

  if create_link:
    # fixed: the 'gz' entry was missing its leading dot ('tar.gz' -> '.tar.gz'),
    # producing link names like 'file.h5tar.gz'
    extension = {'': '.tar', 'bz2': '.tar.bz2',
                 'gz': '.tar.gz'}[compression_type]
    link_file = filename + extension
    if not os.path.exists(link_file):
      os.symlink(os.path.basename(filename), link_file)

  # clean up locally generated files
  os.remove(hdf5_file_name)


def load_compressed(filename, compression_type='bz2'):
  """Extracts the data to a temporary HDF5 file using HDF5 and reads its contents.

  Note that, though the file name is .hdf5, it contains compressed data!
  Accepted compression types are 'gz', 'bz2', ''"""
  # fixed: compression_type was previously ignored, so non-bz2 archives
  # (e.g. 'gz' or plain '.tar') could never be read back
  hdf5 = open_compressed(filename, 'r', compression_type)
  data = hdf5.read("array")
  close_compressed(filename, hdf5, compression_type)

  return data


def save_compressed(data, filename, compression_type='bz2', create_link=False):
  """Writes ``data`` to a temporary HDF5 file and compresses it into ``filename``.

  Note that, though the file name will be .hdf5, it will contain compressed data!
  Accepted compression types are 'gz', 'bz2', ''"""
  # write into a temporary HDF5 container, then pack it into the archive
  hdf5 = open_compressed(filename, 'w')
  save(data, hdf5)
  close_compressed(filename, hdf5, compression_type, create_link)