load.py 13 KB
Newer Older
André Anjos's avatar
André Anjos committed
1
2
3
4
5
6
7
8
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Mon 23 May 2011 16:23:05 CEST

"""A set of utilities to load score files with different formats.
"""

import numpy
9
import csv
10
11
import tarfile
import os
12
import sys
13

14
15
16
import logging
logger = logging.getLogger('bob.measure')

André Anjos's avatar
André Anjos committed
17

18
def open_file(filename, mode='rt'):
  """Opens the given score file for reading.

  Score files might be raw text files, or a tar-file including a single score
  file inside.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The name of the score file to
      open, or a file-like object open for reading. If a file name is given,
      the according file might be a raw text file or a (compressed) tar file
      containing a raw text file.

    mode (:py:class:`str`, optional): The mode handed to :py:func:`open` when
      ``filename`` names a plain (non-tar) file. It is ignored for objects
      that are already open and for tar archives; the member extracted from a
      tar archive is returned as provided by
      :py:meth:`tarfile.TarFile.extractfile`.


  Returns:

    ``file-like``: A read-only file-like object as it would be returned by
    :py:func:`open`.


  Raises:

    IOError: If ``filename`` is not an existing file, or if it is a tar
      archive that does not contain any regular file.

  """

  # anything that is not a string but can be read from is taken to be an
  # already opened file and handed back untouched
  if not isinstance(filename, str) and hasattr(filename, 'read'):
    return filename

  if not os.path.isfile(filename):
    raise IOError("Score file '%s' does not exist." % filename)
  if not tarfile.is_tarfile(filename):
    return open(filename, mode)

  # open the tar file and look for the first regular file stored in it
  archive = tarfile.open(filename, 'r')
  member = archive.next()
  while member is not None and not member.isfile():
    member = archive.next()

  if member is None:
    # nothing will be read from the archive, so release it before failing
    archive.close()
    raise IOError("The given file is a .tar file, but it does not contain any file.")

  # the archive has to stay open for as long as the extracted member is read
  return archive.extractfile(member)

André Anjos's avatar
André Anjos committed
63
64

def four_column(filename):
  """Iterates over the lines of a score file in four column format

  The file is read lazily, one line at a time, so that it never has to be
  held in memory as a whole.  Each line is expected to look like this:

  .. code-block:: text

     claimed_id real_id test_label score

  Only the last field of a line is interpreted (it is converted to
  ``float``); all other fields are passed on as they were read.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file name or file object
      that is handed to :py:func:`open_file` to get access to the scores.


  Yields:

    str: The claimed identity -- the client name of the model that was used in
    the comparison

    str: The real identity -- the client name of the probe that was used in the
    comparison

    str: A label of the probe -- usually the probe file name, or the probe id

    float: The result of the comparison of the model and the probe

  """
  score_rows = _iterate_score_file(filename)
  return score_rows
André Anjos's avatar
André Anjos committed
97
98
99
100



def split_four_column(filename):
  """Reads a four column score file and separates negatives from positives

  The file has to follow the four column layout described in
  :py:func:`four_column`.  Only the scores are kept; the identity and label
  strings of each line are dropped as soon as the line has been sorted into
  one of the two groups.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file name or file object
      that is handed to :py:func:`open_file` to get access to the scores.


  Returns:

    negatives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` differed (see
    :py:func:`four_column`)

    positives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` are identical (see
    :py:func:`four_column`)

  """
  # in the four column layout, the real identity sits in column 1
  return _split_scores(four_column(filename), real_id_index=1)
André Anjos's avatar
André Anjos committed
131

André Anjos's avatar
André Anjos committed
132

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
133
def cmc_four_column(filename):
  """Reads a four column score file and groups its scores for CMC curves

  The file has to follow the layout described in :py:func:`four_column`, and
  its ``test_label`` (column 3) has to hold the test/probe file name or a
  probe id, since scores are grouped by that label.

  One tuple is produced per probe, made of the negative scores and the
  positive scores of that probe.  Typically there is a single positive score
  per probe, but several are accepted.  The returned list can be given
  directly to, e.g., the :py:func:`bob.measure.cmc` function.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file name or file object
      that is handed to :py:func:`open_file` to get access to the scores.


  Returns:

    list: A list of tuples, where each tuple contains the ``negative`` and
    ``positive`` scores for one probe of the database. Both ``negatives`` and
    ``positives`` can be either an 1D :py:class:`numpy.ndarray` of type
    ``float``, or ``None``.

  """
  # in the four column layout, the real identity sits in column 1
  return _split_cmc_scores(four_column(filename), real_id_index=1)
Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
164

André Anjos's avatar
André Anjos committed
165
166

def five_column(filename):
  """Iterates over the lines of a score file in five column format

  The file is read lazily, one line at a time, so that it never has to be
  held in memory as a whole.  Each line is expected to look like this:

  .. code-block:: text

     claimed_id model_label real_id test_label score

  Only the last field of a line is interpreted (it is converted to
  ``float``); all other fields are passed on as they were read.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file name or file object
      that is handed to :py:func:`open_file` to get access to the scores.


  Yields:

    str: The claimed identity -- the client name of the model that was used in
    the comparison

    str: A label for the model -- usually the model file name, or the model id

    str: The real identity -- the client name of the probe that was used in the
    comparison

    str: A label of the probe -- usually the probe file name, or the probe id

    float: The result of the comparison of the model and the probe

  """
  score_rows = _iterate_score_file(filename)
  return score_rows
André Anjos's avatar
André Anjos committed
202

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
203

André Anjos's avatar
André Anjos committed
204
def split_five_column(filename):
  """Loads a score set from a single file and splits the scores

  Loads a score set from a single file in five column format and splits the
  scores between negatives and positives. The score file has to respect the 5
  column format as defined in the method :py:func:`five_column`.

  This method avoids loading and allocating memory for the strings present in
  the file. We only keep the scores.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` differed (see
    :py:func:`five_column`)

    positives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` are identical (see
    :py:func:`five_column`)

  """

  # read through the five column loader, so that this function keeps matching
  # the format it documents
  score_lines = five_column(filename)
  # in the five column layout, the real identity sits in column 2
  return _split_scores(score_lines, 2)
André Anjos's avatar
André Anjos committed
235

236

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
237
def cmc_five_column(filename):
  """Loads scores to compute CMC curves from a file in five column format.

  The five column file needs to be in the same format as described in
  :py:func:`five_column`, and the ``test_label`` (column 4) has to contain the
  test/probe file name or a probe id.

  This function returns a list of tuples.  For each probe file, the tuple
  consists of a list of negative scores and a list of positive scores.
  Usually, the list of positive scores should contain only one element, but
  more are allowed.  The result of this function can directly be passed to,
  e.g., the :py:func:`bob.measure.cmc` function.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, where each tuple contains the ``negative`` and
    ``positive`` scores for one probe of the database. Both ``negatives`` and
    ``positives`` can be either an 1D :py:class:`numpy.ndarray` of type
    ``float``, or ``None``.

  """
  # read through the five column loader, so that this function keeps matching
  # the format it documents
  score_lines = five_column(filename)
  # in the five column layout, the real identity sits in column 2 (and the
  # probe label, by default, in the column right after it)
  return _split_cmc_scores(score_lines, 2)
André Anjos's avatar
André Anjos committed
265

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
266

267

268
def load_score(filename, ncolumns=None):
  """Load scores using numpy.genfromtxt and return the data as a numpy array.

  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.

    ncolumns (:py:class:`int`, optional): 4, 5 or None (the default),
      specifying the number of columns in the score file. For any value other
      than 4 or 5, the number of columns is guessed from the first line of
      the file.


  Returns:

    array: A structured array which contains not only the actual scores
    (field ``score``) but also the ``claimed_id``, ``real_id``,
    ``test_label`` and, for five column files, ``model_label``.


  Raises:

    ValueError: If the number of columns guessed from the first line is
      neither 4 nor 5.

  """

  def convertfunc(x):
    # identity converter: keeps the textual columns as they were read
    return x

  if ncolumns not in (4, 5):
    f = open_file(filename)
    # open_file returns objects that are already open unchanged; such objects
    # belong to the caller and must be neither closed nor left advanced here
    caller_owned = f is filename
    try:
      if caller_owned:
        start = f.tell()
        line = f.readline()
        f.seek(start)
      else:
        line = f.readline()
      ncolumns = len(line.split())
    except Exception:
      logger.warning('Could not guess the number of columns in file: {}. '
                     'Assuming 4 column format.'.format(filename))
      ncolumns = 4
    finally:
      if not caller_owned:
        f.close()

  if ncolumns == 4:
    names = ('claimed_id', 'real_id', 'test_label', 'score')
    converters = {
      0: convertfunc,
      1: convertfunc,
      2: convertfunc,
      3: float}

  elif ncolumns == 5:
    names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score')
    converters = {
      0: convertfunc,
      1: convertfunc,
      2: convertfunc,
      3: convertfunc,
      4: float}
  else:
    raise ValueError("ncolumns of 4 and 5 are supported only.")

  handle = open_file(filename, mode='rb')
  try:
    score_lines = numpy.genfromtxt(
      handle, dtype=None, names=names,
      converters=converters, invalid_raise=True)
  finally:
    # only release handles that were created here, never the caller's
    if handle is not filename:
      handle.close()

  # turn byte-string ('S') fields into unicode ('U') fields of the same
  # length; the score field is always stored as float
  new_dtype = []
  for name in score_lines.dtype.names[:-1]:
    new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U')))
  new_dtype.append(('score', float))
  score_lines = numpy.array(score_lines, new_dtype)
  return score_lines


def get_negatives_positives(score_lines):
  """Splits the output of :py:func:`load_score` into negatives and positives

  A score counts as positive when the ``claimed_id`` and ``real_id`` fields
  of its line are equal, and as negative otherwise.  This is an alternative
  to :py:func:`split_four_column` and :py:func:`split_five_column` that
  works on data that was already loaded instead of on a file.


  Parameters:

    score_lines (array): The scores as returned by :py:func:`load_score`; the
      fields ``claimed_id``, ``real_id`` and ``score`` are read.


  Returns:

    tuple: ``(negatives, positives)``, both selected from the ``score`` field

  """
  scores = score_lines['score']
  is_genuine = score_lines['claimed_id'] == score_lines['real_id']
  is_impostor = numpy.logical_not(is_genuine)
  return (scores[is_impostor], scores[is_genuine])


def get_negatives_positives_all(score_lines_list):
  """Stacks the negatives and positives of several :py:func:`load_score` outputs

  Every element of the list is split with :py:func:`get_negatives_positives`.
  The resulting negatives are stacked with :py:func:`numpy.vstack` and
  transposed, and so are the positives.


  Parameters:

    score_lines_list (list): A list of arrays as returned by
      :py:func:`load_score`


  Returns:

    tuple: ``(negatives, positives)``, the two stacked and transposed arrays

  """
  splits = [get_negatives_positives(lines) for lines in score_lines_list]
  stacked_negatives = numpy.vstack([split[0] for split in splits]).T
  stacked_positives = numpy.vstack([split[1] for split in splits]).T
  return (stacked_negatives, stacked_positives)


def get_all_scores(score_lines_list):
  """Stacks the ``score`` fields of several :py:func:`load_score` outputs

  The ``score`` entries of all list elements are stacked with
  :py:func:`numpy.vstack`, and the transposed result is returned.
  """
  score_rows = []
  for loaded in score_lines_list:
    score_rows.append(loaded['score'])
  return numpy.vstack(score_rows).T


def dump_score(filename, score_lines):
  """Writes scores that were loaded using :py:func:`load_score`

  Whether a four or a five column file is written is decided by the number
  of fields of ``score_lines``.  All fields but the last are written as
  strings, the last one as a float with nine decimals.


  Parameters:

    filename: The destination, which is handed to :py:func:`numpy.savetxt`

    score_lines (array): The scores, as returned by :py:func:`load_score`


  Raises:

    ValueError: If ``score_lines`` has neither 4 nor 5 fields

  """
  line_formats = {
    4: '%s %s %s %.9f',
    5: '%s %s %s %s %.9f',
  }
  field_count = len(score_lines.dtype)
  if field_count not in line_formats:
    raise ValueError("Only scores with 4 and 5 columns are supported.")
  numpy.savetxt(filename, score_lines, fmt=line_formats[field_count])
379

André Anjos's avatar
André Anjos committed
380

381
382
383
384
385
386
387
388
def _iterate_score_file(filename):
  """Opens the score file for reading and yields the score file line by line in a tuple/list.

  The last element of the line (which is the score) will be transformed to float, the other elements will be str.
  Fields are separated by a single space.  Empty lines are skipped.  A file
  that was opened by this function is closed once the iteration ends;
  file objects handed in by the caller are not closed explicitly.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file name or file object
      that is handed to :py:func:`open_file` to get access to the scores.


  Yields:

    list: The fields of one line, with the last field converted to ``float``

  """
  opened = open_file(filename, 'rb')
  # open_file returns objects that are already open unchanged; only handles
  # that were created here are closed at the end
  owned = opened is not filename
  if sys.version_info.major > 2:
    import io
    # csv needs text on Python 3: wrap binary streams, but leave any stream
    # that already delivers text (e.g. io.StringIO) as it is
    if not isinstance(opened, io.TextIOBase):
      opened = io.TextIOWrapper(opened, newline="")

  try:
    reader = csv.reader(opened, delimiter=' ')
    for splits in reader:
      if not splits:
        # csv yields an empty list for an empty line; there is no score in it
        continue
      splits[-1] = float(splits[-1])
      yield splits
  finally:
    if owned:
      opened.close()


398
399
400
401
402
403
404
405
406
407
def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index = -1):
  """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives.
  """
  positives, negatives = [], []
  for line in score_lines:
    which = positives if line[claimed_id_index] == line[real_id_index] else negatives
    which.append(line[score_index])

  return (numpy.array(negatives), numpy.array(positives))

408
409
def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claimed_id_index = 0, score_index = -1):
  """Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores.
André Anjos's avatar
André Anjos committed
410
  """
411
412
413
414
415
416
417
418
419
420
421
422
423
  if probe_name_index is None:
    probe_name_index = real_id_index + 1
  # extract positives and negatives
  pos_dict = {}
  neg_dict = {}
  # read four column list
  for line in score_lines:
    which = pos_dict if line[claimed_id_index] == line[real_id_index] else neg_dict
    probe_name = line[probe_name_index]
    # append score
    if probe_name not in which:
      which[probe_name] = []
    which[probe_name].append(line[score_index])
André Anjos's avatar
André Anjos committed
424

425
426
427
428
429
430
431
  # convert to lists of tuples of ndarrays (or None)
  probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys())))
  # get all scores in the desired format
  return [(
    numpy.array(neg_dict[probe_name], numpy.float64) if probe_name in neg_dict else None,
    numpy.array(pos_dict[probe_name], numpy.float64) if probe_name in pos_dict else None
  ) for probe_name in probe_names]