#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Mon 23 May 2011 16:23:05 CEST

"""A set of utilities to load score files with different formats.
"""

import numpy
import csv
import tarfile
import os
import sys

import logging
# Module-wide logger; load_score uses it to report a failed column guess.
logger = logging.getLogger('bob.measure')


def open_file(filename, mode='rt'):
  """Opens the given score file for reading.

  Score files might be raw text files, or a tar-file including a single score
  file inside.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The name of the score file to
      open, or a file-like object open for reading. If a file name is given,
      the according file might be a raw text file or a (compressed) tar file
      containing a raw text file.

    mode (:py:class:`str`, optional): The mode handed to :py:func:`open` when
      ``filename`` names a plain (non-tar) file.  It is not used for file-like
      objects nor for tar archives.


  Returns:

    ``file-like``: ``filename`` itself if it already is a file-like object, the
    result of :py:func:`open` for a plain file, or the object returned by
    :py:meth:`tarfile.TarFile.extractfile` for the first regular file found in
    a tar archive.


  Raises:

    IOError: If ``filename`` does not name an existing file, or if it is a tar
      archive which does not contain any regular file.

  """

  if not isinstance(filename, str) and hasattr(filename, 'read'):
    # It seems that this is an open file
    return filename

  if not os.path.isfile(filename):
    raise IOError("Score file '%s' does not exist." % filename)
  if not tarfile.is_tarfile(filename):
    return open(filename, mode)

  # open the tar file for reading
  tar = tarfile.open(filename, 'r')
  try:
    # get the first regular file in the tar file
    tar_info = tar.next()
    while tar_info is not None and not tar_info.isfile():
      tar_info = tar.next()
    # check that one file was found in the archive
    if tar_info is None:
      raise IOError("The given file is a .tar file, but it does not contain any file.")

    # open the file for reading; the archive handle is intentionally left open
    # here because the returned reader pulls its data from it
    return tar.extractfile(tar_info)
  except Exception:
    # nothing usable is handed back to the caller, so release the archive
    tar.close()
    raise

def four_column(filename):
  """Loads a score set from a single file and yield its lines

  Loads a score set from a single file and yield its lines (to avoid loading
  the score file at once into memory).  The only validation performed is that
  the last field of every line can be converted to ``float``.  The score file
  must contain the following information in each line:

  .. code-block:: text

     claimed_id real_id test_label score


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Yields:

    str: The claimed identity -- the client name of the model that was used in
    the comparison

    str: The real identity -- the client name of the probe that was used in the
    comparison

    str: A label of the probe -- usually the probe file name, or the probe id

    float: The result of the comparison of the model and the probe

  """
  # the line iterator does not depend on the number of columns
  return _iterate_score_file(filename)



def split_four_column(filename):
  """Loads a score set from a single file and splits the scores

  Loads a score set from a single file and splits the scores between negatives
  and positives. The score file has to respect the 4 column format as defined
  in the method :py:func:`four_column`.

  This method avoids loading and allocating memory for the strings present in
  the file. We only keep the scores.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` differed (see
    :py:func:`four_column`)

    positives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` are identical (see
    :py:func:`four_column`)

  """

  score_lines = four_column(filename)
  # in the four column format, ``real_id`` is stored in column 1
  return _split_scores(score_lines, 1)


def cmc_four_column(filename):
  """Loads scores to compute CMC curves from a file in four column format.

  The four column file needs to be in the same format as described in
  :py:func:`four_column`, and the ``test_label`` (column 3) has to contain the
  test/probe file name or a probe id.

  This function returns a list of tuples.  For each probe file, the tuple
  consists of a list of negative scores and a list of positive scores.
  Usually, the list of positive scores should contain only one element, but
  more are allowed.  The result of this function can directly be passed to,
  e.g., the :py:func:`bob.measure.cmc` function.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, where each tuple contains the ``negative`` and
    ``positive`` scores for one probe of the database. Both ``negatives`` and
    ``positives`` can be either an 1D :py:class:`numpy.ndarray` of type
    ``float``, or ``None``.

  """

  score_lines = four_column(filename)
  # in the four column format, ``real_id`` is stored in column 1
  return _split_cmc_scores(score_lines, 1)

def five_column(filename):
  """Loads a score set from a single file and yield its lines

  Loads a score set from a single file and yield its lines (to avoid loading
  the score file at once into memory).  The only validation performed is that
  the last field of every line can be converted to ``float``.  The score file
  must contain the following information in each line:

  .. code-block:: text

     claimed_id model_label real_id test_label score


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Yields:

    str: The claimed identity -- the client name of the model that was used in
    the comparison

    str: A label for the model -- usually the model file name, or the model id

    str: The real identity -- the client name of the probe that was used in the
    comparison

    str: A label of the probe -- usually the probe file name, or the probe id

    float: The result of the comparison of the model and the probe

  """

  # the line iterator does not depend on the number of columns
  return _iterate_score_file(filename)


def split_five_column(filename):
  """Loads a score set from a single file and splits the scores

  Loads a score set from a single file in five column format and splits the
  scores between negatives and positives. The score file has to respect the 5
  column format as defined in the method :py:func:`five_column`.

  This method avoids loading and allocating memory for the strings present in
  the file. We only keep the scores.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` differed (see
    :py:func:`five_column`)

    positives (array): 1D float array containing the list of scores, for which
    the ``claimed_id`` and the ``real_id`` are identical (see
    :py:func:`five_column`)

  """

  score_lines = five_column(filename)
  # in the five column format, ``real_id`` is stored in column 2
  return _split_scores(score_lines, 2)


def cmc_five_column(filename):
  """Loads scores to compute CMC curves from a file in five column format.

  The five column file needs to be in the same format as described in
  :py:func:`five_column`, and the ``test_label`` (column 4) has to contain the
  test/probe file name or a probe id.

  This function returns a list of tuples.  For each probe file, the tuple
  consists of a list of negative scores and a list of positive scores.
  Usually, the list of positive scores should contain only one element, but
  more are allowed.  The result of this function can directly be passed to,
  e.g., the :py:func:`bob.measure.cmc` function.


  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, where each tuple contains the ``negative`` and
    ``positive`` scores for one probe of the database. Both ``negatives`` and
    ``positives`` can be either an 1D :py:class:`numpy.ndarray` of type
    ``float``, or ``None``.

  """
  score_lines = five_column(filename)
  # in the five column format, ``real_id`` is stored in column 2
  return _split_cmc_scores(score_lines, 2)


def load_score(filename, ncolumns=None, minimal=False, **kwargs):
  """Load scores using numpy.genfromtxt and return the data as a numpy array.

  Parameters:

    filename (:py:class:`str`, ``file-like``): The file object that will be
      opened with :py:func:`open_file` containing the scores.

    ncolumns (:py:class:`int`, optional): 4, 5 or None (the default),
      specifying the number of columns in the score file. If None is provided,
      the number of columns will be guessed from the first line.

    minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``,
      ``real_id``, and ``scores``.  This takes precedence over a ``usecols``
      keyword argument.

    **kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt`


  Returns:

    array: An array which contains not only the actual scores but also the
    ``claimed_id``, ``real_id``, ``test_label`` and ``['model_label']``


  Raises:

    ValueError: If the number of columns (given or guessed) is neither 4 nor 5.

  """

  def convertfunc(x):
    return x

  if ncolumns not in (4, 5):
    f = open_file(filename)
    # open_file hands an already open file object back untouched; such a
    # handle belongs to the caller and must stay open and un-consumed
    owns_handle = f is not filename
    try:
      position = None if owns_handle else f.tell()
      line = f.readline()
      ncolumns = len(line.split())
      if position is not None:
        # rewind so that the first line is parsed again below
        f.seek(position)
    except Exception:
      logger.warning('Could not guess the number of columns in file: %s. '
                     'Assuming 4 column format.', filename)
      ncolumns = 4
    finally:
      if owns_handle:
        f.close()

  usecols = kwargs.pop('usecols', None)
  if ncolumns == 4:
    names = ('claimed_id', 'real_id', 'test_label', 'score')
    converters = {
      0: convertfunc,
      1: convertfunc,
      2: convertfunc,
      3: float}
    if minimal:
      usecols = (0, 1, 3)

  elif ncolumns == 5:
    names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score')
    converters = {
      0: convertfunc,
      1: convertfunc,
      2: convertfunc,
      3: convertfunc,
      4: float}
    if minimal:
      usecols = (0, 2, 4)
  else:
    raise ValueError("ncolumns of 4 and 5 are supported only.")

  source = open_file(filename, mode='rb')
  try:
    score_lines = numpy.genfromtxt(
      source, dtype=None, names=names,
      converters=converters, invalid_raise=True, usecols=usecols, **kwargs)
  finally:
    # only release handles which were opened here
    if source is not filename:
      source.close()

  # turn byte-string columns ('S') into unicode ones ('U'); the last column is
  # always the score and is stored as float
  new_dtype = []
  for name in score_lines.dtype.names[:-1]:
    new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U')))
  new_dtype.append(('score', float))
  score_lines = numpy.array(score_lines, new_dtype)
  return score_lines


def get_negatives_positives(score_lines):
  """Take the output of load_score and return negatives and positives.  This
  function aims to replace split_four_column and split_five_column but takes a
  different input. It's up to you to use which one.


  Parameters:

    score_lines (array): A structured array providing at least the fields
      ``claimed_id``, ``real_id`` and ``score``.


  Returns:

    negatives (array): The scores of all entries for which ``claimed_id`` and
    ``real_id`` differ

    positives (array): The scores of all entries for which ``claimed_id`` and
    ``real_id`` are identical

  """

  # element-wise comparison: True where the claim is genuine
  pos_mask = score_lines['claimed_id'] == score_lines['real_id']
  positives = score_lines['score'][pos_mask]
  negatives = score_lines['score'][numpy.logical_not(pos_mask)]
  return (negatives, positives)


def get_negatives_positives_from_file(filename, **kwargs):
  """Reads only the columns needed for the split (``minimal=True``) through
  load_score and hands the result to get_negatives_positives.  Additional
  keyword arguments are forwarded to load_score."""
  return get_negatives_positives(
    load_score(filename, minimal=True, **kwargs))


def get_negatives_positives_all(score_lines_list):
  """Take a list of outputs of load_score and return stacked negatives and
  positives.

  Each element of ``score_lines_list`` is split with get_negatives_positives;
  the resulting 1D arrays are stacked with :py:func:`numpy.vstack` and
  transposed, so that each input ends up in its own column.

  Returns:

    negatives (array): The stacked negative scores

    positives (array): The stacked positive scores

  """

  negatives, positives = [], []
  for score_lines in score_lines_list:
    neg_pos = get_negatives_positives(score_lines)
    negatives.append(neg_pos[0])
    positives.append(neg_pos[1])
  negatives = numpy.vstack(negatives).T
  positives = numpy.vstack(positives).T
  return (negatives, positives)


def get_all_scores(score_lines_list):
  """Take a list of outputs of load_score and return stacked scores

  The ``score`` field of every element is stacked with
  :py:func:`numpy.vstack` and the result is transposed, so that each input
  ends up in its own column.
  """

  return numpy.vstack([score_lines['score']
                       for score_lines in score_lines_list]).T


def dump_score(filename, score_lines):
  """Dump scores that were loaded using :py:func:`load_score`
  The number of columns is automatically detected.


  Parameters:

    filename: The destination, handed as is to :py:func:`numpy.savetxt`

    score_lines (array): A structured array with 4 or 5 fields, the last one
      being the score; it is written with 9 decimal places.


  Raises:

    ValueError: If ``score_lines`` has neither 4 nor 5 fields.

  """

  # the length of a structured dtype is its number of fields
  ncolumns = len(score_lines.dtype)
  if ncolumns == 5:
    fmt = '%s %s %s %s %.9f'
  elif ncolumns == 4:
    fmt = '%s %s %s %.9f'
  else:
    raise ValueError("Only scores with 4 and 5 columns are supported.")
  numpy.savetxt(filename, score_lines, fmt=fmt)


def _iterate_score_file(filename):
  """Opens the score file for reading and yields the score file line by line in a tuple/list.

  The last element of the line (which is the score) will be transformed to float, the other elements will be str.
  Lines are split at single spaces; lines without any field are skipped.
  A file that was opened here is closed once the iteration ends; a file object
  handed in by the caller is not closed explicitly.
  """
  opened = open_file(filename, 'rb')
  # open_file hands an already open file object back untouched; such a handle
  # belongs to the caller
  owns_handle = opened is not filename
  if sys.version_info.major > 2:
    import io
    # csv needs text in Python 3: wrap binary streams, keep any text stream
    # (including io.StringIO) as it is
    if not isinstance(opened, io.TextIOBase):
      opened = io.TextIOWrapper(opened, newline="")

  try:
    reader = csv.reader(opened, delimiter=' ')
    for splits in reader:
      if not splits:
        # blank line: there is no score to convert
        continue
      splits[-1] = float(splits[-1])
      yield splits
  finally:
    if owns_handle:
      opened.close()


def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index = -1):
  """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives.
  """
  positives, negatives = [], []
  for line in score_lines:
    which = positives if line[claimed_id_index] == line[real_id_index] else negatives
    which.append(line[score_index])

  return (numpy.array(negatives), numpy.array(positives))

424
425
def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claimed_id_index = 0, score_index = -1):
  """Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores.
André Anjos's avatar
André Anjos committed
426
  """
427
428
429
430
431
432
433
434
435
436
437
438
439
  if probe_name_index is None:
    probe_name_index = real_id_index + 1
  # extract positives and negatives
  pos_dict = {}
  neg_dict = {}
  # read four column list
  for line in score_lines:
    which = pos_dict if line[claimed_id_index] == line[real_id_index] else neg_dict
    probe_name = line[probe_name_index]
    # append score
    if probe_name not in which:
      which[probe_name] = []
    which[probe_name].append(line[score_index])
André Anjos's avatar
André Anjos committed
440

441
442
443
444
445
446
447
  # convert to lists of tuples of ndarrays (or None)
  probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys())))
  # get all scores in the desired format
  return [(
    numpy.array(neg_dict[probe_name], numpy.float64) if probe_name in neg_dict else None,
    numpy.array(pos_dict[probe_name], numpy.float64) if probe_name in pos_dict else None
  ) for probe_name in probe_names]