load.py 13.5 KB
Newer Older
André Anjos's avatar
André Anjos committed
1
2
3
4
5
6
7
8
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Mon 23 May 2011 16:23:05 CEST

"""A set of utilities to load score files with different formats.
"""

import numpy
9
10
11
import tarfile
import os

12
13
14
import logging
# Module-level logger; load_score uses it to warn when the number of columns
# of a score file cannot be guessed.
logger = logging.getLogger('bob.measure')

André Anjos's avatar
André Anjos committed
15

16
def open_file(filename, mode='rt'):
  """Returns a readable file-like object for the given score file.

  A score file is either a plain file or a tar archive (optionally
  compressed) that holds the score file as its first regular member.


  Parameters:

    filename (:py:class:`str`, ``file-like``): Path of the score file, or an
      object that is not a string and provides a ``read`` attribute.  Such
      objects are handed back untouched.

    mode (:py:class:`str`, optional): Mode passed to :py:func:`open` when the
      path is not a tar archive.  It is not used for tar archives or for
      file-like inputs.


  Returns:

    ``file-like``: The input object itself (file-like input), the result of
    :py:func:`open` (plain file), or the object returned by
    :py:meth:`tarfile.TarFile.extractfile` for the first regular member of the
    archive.


  Raises:

    IOError: If the path does not point to an existing file, or if the tar
      archive holds no regular file.

  """

  already_open = hasattr(filename, 'read') and not isinstance(filename, str)
  if already_open:
    # nothing to open, the caller gave us a stream
    return filename

  if not os.path.isfile(filename):
    raise IOError("Score file '%s' does not exist." % filename)

  if tarfile.is_tarfile(filename):
    archive = tarfile.open(filename, 'r')
    # walk the archive lazily and stop at the first regular file
    member = next((entry for entry in archive if entry.isfile()), None)
    if member is None:
      raise IOError("The given file is a .tar file, but it does not contain any file.")
    return archive.extractfile(member)

  # plain (non-archive) score file
  return open(filename, mode)

André Anjos's avatar
André Anjos committed
61
62

def four_column(filename):
  """Loads a score set from a single file and yields its lines

  The file is read line by line (so it is never held in memory as a whole).
  Empty lines and lines starting with ``#`` are skipped.  Every other line
  must provide at least four whitespace-separated fields:

  .. code-block:: text

     claimed_id real_id test_label score


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Yields:

    tuple: ``(claimed_id, real_id, test_label, score)`` where the first three
    entries are :py:class:`str` and ``score`` is a :py:class:`float`.


  Raises:

    SyntaxError: If a line has fewer than four fields or its fourth field
      cannot be converted to ``float``.  The reported line number is 1-based.

  """

  opened = open_file(filename)
  try:
    # count lines from 1, the way text editors display them
    for lineno, line in enumerate(opened, 1):
      if isinstance(line, bytes):
        # files extracted from tar archives are read as bytes
        line = line.decode('utf-8')
      stripped = line.strip()
      if not stripped or stripped[0] == '#':
        continue  # empty or comment
      field = stripped.split()
      if len(field) < 4:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (lineno, filename, line))
      try:
        score = float(field[3])
      except ValueError:
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (lineno, filename, line))
      yield (field[0], field[1], field[2], score)
  finally:
    # open_file returns file-like inputs untouched: only close what we opened
    if opened is not filename:
      opened.close()
André Anjos's avatar
André Anjos committed
107
108
109


def split_four_column(filename):
  """Loads a four column score file and splits its scores

  The file is read with :py:func:`load_score` (using ``ncolumns=4``) and the
  resulting scores are separated with :py:func:`get_negatives_positives`.
  The expected line format is the one described in :py:func:`four_column`.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): The scores of all lines for which ``claimed_id`` and
    ``real_id`` differ.

    positives (array): The scores of all lines for which ``claimed_id`` and
    ``real_id`` are identical.

  """

  return get_negatives_positives(load_score(filename, 4))
André Anjos's avatar
André Anjos committed
140

André Anjos's avatar
André Anjos committed
141

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
142
def cmc_four_column(filename):
  """Loads scores to compute CMC curves from a file in four column format.

  The file is parsed with :py:func:`four_column`; its ``test_label`` (column
  3) is used as the probe key, so it has to contain the test/probe file name
  or a probe id.

  Scores are grouped per probe.  A score counts as positive when the first
  two columns (``claimed_id`` and ``real_id``) are equal, and as negative
  otherwise.  A probe may have any number of positive scores.  According to
  the original documentation, the result can be passed directly to, e.g.,
  :py:func:`bob.measure.cmc`.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, one per probe, holding the ``negative`` and the
    ``positive`` scores of that probe.  Each of the two entries is either a 1D
    :py:class:`numpy.ndarray` of type ``float``, or ``None`` when the probe
    has no score of that kind.

  """

  genuine = {}
  impostor = {}

  for claimed, real, label, value in four_column(filename):
    # matching identities are positives, everything else is a negative
    bucket = genuine if claimed == real else impostor
    bucket.setdefault(label, []).append(value)

  # hand the per-probe lists over to the common conversion helper
  return _convert_cmc_scores(impostor, genuine)
Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
187

André Anjos's avatar
André Anjos committed
188
189

def five_column(filename):
  """Loads a score set from a single file and yields its lines

  The file is read line by line (so it is never held in memory as a whole).
  Empty lines and lines starting with ``#`` are skipped.  Every other line
  must provide at least five whitespace-separated fields:

  .. code-block:: text

     claimed_id model_label real_id test_label score


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Yields:

    tuple: ``(claimed_id, model_label, real_id, test_label, score)`` where the
    first four entries are :py:class:`str` and ``score`` is a
    :py:class:`float`.


  Raises:

    SyntaxError: If a line has fewer than five fields or its fifth field
      cannot be converted to ``float``.  The reported line number is 1-based.

  """

  opened = open_file(filename)
  try:
    # count lines from 1, the way text editors display them
    for lineno, line in enumerate(opened, 1):
      if isinstance(line, bytes):
        # files extracted from tar archives are read as bytes
        line = line.decode('utf-8')
      stripped = line.strip()
      if not stripped or stripped[0] == '#':
        continue  # empty or comment
      field = stripped.split()
      if len(field) < 5:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (lineno, filename, line))
      try:
        score = float(field[4])
      except ValueError:
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (lineno, filename, line))
      yield (field[0], field[1], field[2], field[3], score)
  finally:
    # open_file returns file-like inputs untouched: only close what we opened
    if opened is not filename:
      opened.close()
André Anjos's avatar
André Anjos committed
236

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
237

André Anjos's avatar
André Anjos committed
238
def split_five_column(filename):
  """Loads a five column score file and splits its scores

  The file is read with :py:func:`load_score` (using ``ncolumns=5``) and the
  resulting scores are separated with :py:func:`get_negatives_positives`.
  The expected line format is the one described in :py:func:`five_column`.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): The scores of all lines for which ``claimed_id`` and
    ``real_id`` differ.

    positives (array): The scores of all lines for which ``claimed_id`` and
    ``real_id`` are identical.

  """

  return get_negatives_positives(load_score(filename, 5))
André Anjos's avatar
André Anjos committed
269

270

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
271
def cmc_five_column(filename):
  """Loads scores to compute CMC curves from a file in five column format.

  The file is parsed with :py:func:`five_column`; its ``test_label`` (column
  4) is used as the probe key, so it has to contain the test/probe file name
  or a probe id.  The ``model_label`` (column 2) is ignored.

  Scores are grouped per probe.  A score counts as positive when
  ``claimed_id`` and ``real_id`` are equal, and as negative otherwise.  A
  probe may have any number of positive scores.  According to the original
  documentation, the result can be passed directly to, e.g.,
  :py:func:`bob.measure.cmc`.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, one per probe, holding the ``negative`` and the
    ``positive`` scores of that probe, as produced by
    :py:func:`_convert_cmc_scores`.

  """

  genuine = {}
  impostor = {}

  for claimed, _, real, label, value in five_column(filename):
    # matching identities are positives, everything else is a negative
    bucket = genuine if claimed == real else impostor
    bucket.setdefault(label, []).append(value)

  # hand the per-probe lists over to the common conversion helper
  return _convert_cmc_scores(impostor, genuine)
Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
313

314

315
def _guess_ncolumns(stream):
  """Counts the fields on the first data line of an open score stream.

  Blank lines and anything following a ``#`` are ignored, so that the guess is
  made on a line that :py:func:`numpy.genfromtxt` (with its default comment
  marker) would parse as data.  Returns 0 if the stream has no such line.
  """

  while True:
    line = stream.readline()
    if not line:
      return 0  # end of stream reached without finding a data line
    if isinstance(line, bytes):
      line = line.decode('utf-8')
    fields = line.split('#', 1)[0].split()
    if fields:
      return len(fields)


def load_score(filename, ncolumns=None):
  """Load scores using numpy.genfromtxt and return the data as a numpy array.

  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.  A file-like
      object is left open; when the number of columns has to be guessed it
      is rewound to the position it had on entry.

    ncolumns (:py:class:`int`, optional): 4, 5 or None (the default),
      specifying the number of columns in the score file. If any value other
      than 4 or 5 is provided, the number of columns is guessed from the first
      data line of the file.


  Returns:

    array: A structured array with the fields ``claimed_id``, ``real_id``,
    ``test_label`` and ``score`` (plus ``model_label`` as second field for the
    five column format).  ``score`` is of type ``float``.


  Raises:

    ValueError: If the number of columns (given or guessed) is neither 4
      nor 5.

  """

  def convertfunc(x):
    return x

  if ncolumns not in (4, 5):
    f = open_file(filename)
    # open_file hands file-like inputs back untouched; those belong to the
    # caller and must neither be closed nor left at a different position
    caller_owned = f is filename
    start = None
    try:
      if caller_owned:
        start = f.tell()
      ncolumns = _guess_ncolumns(f)
    except Exception:
      logger.warning('Could not guess the number of columns in file: %s. '
                     'Assuming 4 column format.', filename)
      ncolumns = 4
    finally:
      if not caller_owned:
        f.close()
      elif start is not None:
        f.seek(start)

  if ncolumns == 4:
    names = ('claimed_id', 'real_id', 'test_label', 'score')
  elif ncolumns == 5:
    names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score')
  else:
    raise ValueError("ncolumns of 4 and 5 are supported only.")

  # every column but the last is kept as is; the last one holds the score
  converters = dict((index, convertfunc) for index in range(ncolumns - 1))
  converters[ncolumns - 1] = float

  stream = open_file(filename, mode='rb')
  try:
    score_lines = numpy.genfromtxt(
      stream, dtype=None, names=names,
      converters=converters, invalid_raise=True)
  finally:
    # only close handles opened here, never the caller's file object
    if stream is not filename:
      stream.close()

  # turn byte-string columns ('S') into unicode ones ('U')
  new_dtype = []
  for name in score_lines.dtype.names[:-1]:
    new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U')))
  new_dtype.append(('score', float))
  score_lines = numpy.array(score_lines, new_dtype)
  return score_lines


def get_negatives_positives(score_lines):
  """Splits the output of load_score into negative and positive scores.

  An entry is a positive when its ``claimed_id`` equals its ``real_id`` and a
  negative otherwise.  This is an alternative to split_four_column and
  split_five_column that works on already loaded score lines instead of a
  file.

  Returns the tuple ``(negatives, positives)`` taken from the ``score`` field.
  """

  scores = score_lines['score']
  is_genuine = score_lines['claimed_id'] == score_lines['real_id']
  is_impostor = numpy.logical_not(is_genuine)
  return (scores[is_impostor], scores[is_genuine])


def get_negatives_positives_all(score_lines_list):
  """Splits several outputs of load_score and stacks the results.

  Each element of ``score_lines_list`` is split with
  :py:func:`get_negatives_positives`.  The negatives of all elements are
  stacked with :py:func:`numpy.vstack` and transposed, and likewise the
  positives.  Returns the tuple ``(negatives, positives)``.
  """

  splits = [get_negatives_positives(entry) for entry in score_lines_list]
  stacked_negatives = numpy.vstack([pair[0] for pair in splits]).T
  stacked_positives = numpy.vstack([pair[1] for pair in splits]).T
  return (stacked_negatives, stacked_positives)


def get_all_scores(score_lines_list):
  """Stacks the ``score`` fields of a list of load_score outputs.

  The scores of every element are stacked with :py:func:`numpy.vstack` and the
  result is transposed before it is returned.
  """

  per_file = [entry['score'] for entry in score_lines_list]
  return numpy.transpose(numpy.vstack(per_file))


def dump_score(filename, score_lines):
  """Writes scores that were loaded using :py:func:`load_score`

  The number of columns is taken from the number of fields of
  ``score_lines.dtype``.  All columns but the last are written as strings, the
  last one as a float with nine decimals.  The data is written with
  :py:func:`numpy.savetxt`.

  Raises a ``ValueError`` if ``score_lines`` has neither 4 nor 5 fields.
  """

  formats = {
    4: '%s %s %s %.9f',
    5: '%s %s %s %s %.9f',
  }
  fmt = formats.get(len(score_lines.dtype))
  if fmt is None:
    raise ValueError("Only scores with 4 and 5 columns are supported.")
  numpy.savetxt(filename, score_lines, fmt=fmt)
426

André Anjos's avatar
André Anjos committed
427

428
def _convert_cmc_scores(neg_dict, pos_dict):
  """Converts the negative and positive scores collected by
  :py:func:`cmc_four_column` or :py:func:`cmc_five_column` into a list with
  one ``(negatives, positives)`` tuple per probe, ordered by sorted probe
  name.  Each entry is a ``float64`` array, or ``None`` if the probe is missing
  from the corresponding dictionary.
  """

  def as_array(scores, name):
    # probes without scores of this kind are reported as None
    if name not in scores:
      return None
    return numpy.array(scores[name], numpy.float64)

  # every probe that occurs in either dictionary, in a stable order
  names = sorted(set(neg_dict) | set(pos_dict))
  return [(as_array(neg_dict, name), as_array(pos_dict, name)) for name in names]