load.py 13.5 KB
Newer Older
André Anjos's avatar
André Anjos committed
1 2 3 4 5 6 7 8
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Mon 23 May 2011 16:23:05 CEST

"""A set of utilities to load score files with different formats.
"""

import numpy
9 10 11
import tarfile
import os

12 13 14
import logging
logger = logging.getLogger('bob.measure')

André Anjos's avatar
André Anjos committed
15

16
def open_file(filename, mode='rt'):
  """Opens the given score file for reading.

  Score files might be raw text files, or a tar-file including a single score
  file inside.


  Parameters:

    filename (:py:class:`str`, file object): The name of the score file to
      open, or a file-like object open for reading. If a file name is given,
      the according file might be a raw text file or a (compressed) tar file
      containing a raw text file.

    mode (:py:class:`str`, optional): The mode passed to :py:func:`open` when
      ``filename`` names a plain (non-tar) file. It is ignored when a
      file-like object is given or when the file is a tar archive.


  Returns:

    file: A read-only file-like object as it would be returned by
      :py:func:`open`. A file-like object given as ``filename`` is returned
      unchanged. For tar archives, the object returned by
      :py:meth:`tarfile.TarFile.extractfile` for the first regular file of the
      archive is returned.


  Raises:

    IOError: If ``filename`` does not name an existing file, or if it is a
      tar archive that does not contain any regular file.

  """

  if not isinstance(filename, str) and hasattr(filename, 'read'):
    # It seems that this is an open file
    return filename

  if not os.path.isfile(filename):
    raise IOError("Score file '%s' does not exist." % filename)
  if not tarfile.is_tarfile(filename):
    return open(filename, mode)

  # open the tar file for reading and hand back its first regular file; the
  # archive has to stay open for as long as the extracted member is read
  tar = tarfile.open(filename, 'r')
  for tar_info in tar:
    if tar_info.isfile():
      return tar.extractfile(tar_info)

  # nothing usable inside: release the archive before reporting the problem
  tar.close()
  raise IOError("The given file is a .tar file, but it does not contain any file.")

André Anjos's avatar
André Anjos committed
61 62

def four_column(filename):
  """Loads a score set from a single file and yields its lines

  Reads the score file line by line (to avoid loading the score file at once
  into memory) and yields one tuple per score line.  Empty lines and lines
  starting with ``#`` are skipped.  Every other line must contain at least
  four whitespace-separated fields, the fourth one being convertible to
  ``float``:

  .. code-block:: text

     claimed_id real_id test_label score


  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Yields:

    tuple: ``(claimed_id, real_id, test_label, score)`` with

      * ``claimed_id`` (str): The claimed identity -- the client name of the
        model that was used in the comparison
      * ``real_id`` (str): The real identity -- the client name of the probe
        that was used in the comparison
      * ``test_label`` (str): A label of the probe -- usually the probe file
        name, or the probe id
      * ``score`` (float): The result of the comparison of the model and the
        probe


  Raises:

    SyntaxError: If a line has fewer than four fields, or if its fourth field
      cannot be converted to ``float``. The reported line number is 1-based.

  """

  score_file = open_file(filename)
  try:
    # count lines from 1 so that error messages match what an editor shows
    for lineno, line in enumerate(score_file, 1):
      if isinstance(line, bytes):
        # binary handles yield bytes; the parsing below works on text
        line = line.decode('utf-8')
      stripped = line.strip()
      if not stripped or stripped[0] == '#':
        continue  # empty or comment
      field = stripped.split()
      if len(field) < 4:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (lineno, filename, line))
      try:
        score = float(field[3])
      except ValueError:
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (lineno, filename, line))
      yield (field[0], field[1], field[2], score)
  finally:
    # only close what was opened here; a file object supplied by the caller
    # is handed back unchanged by open_file and remains the caller's to close
    if score_file is not filename:
      score_file.close()
André Anjos's avatar
André Anjos committed
107 108 109


def split_four_column(filename):
  """Loads a four column score file and splits its scores

  The file is read with :py:func:`load_score` (forcing the 4 column format
  described in :py:func:`four_column`) and the resulting scores are divided
  into negatives and positives by :py:func:`get_negatives_positives`.


  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): The scores for which the ``claimed_id`` and the
      ``real_id`` differed (see :py:func:`four_column`)

    positives (array): The scores for which the ``claimed_id`` and the
      ``real_id`` are identical (see :py:func:`four_column`)

  """

  return get_negatives_positives(load_score(filename, 4))
André Anjos's avatar
André Anjos committed
140

André Anjos's avatar
André Anjos committed
141

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
142
def cmc_four_column(filename):
  """Loads scores to compute CMC curves from a file in four column format.

  The four column file needs to be in the same format as described in
  :py:func:`four_column`, and the ``test_label`` (column 3) has to contain the
  test/probe file name or a probe id.

  Scores are grouped by ``test_label``.  Within one group, a score counts as
  positive when its ``claimed_id`` equals its ``real_id`` and as negative
  otherwise.  Usually, a probe should have only one positive score, but more
  are allowed.  The result of this function can directly be passed to, e.g.,
  the :py:func:`bob.measure.cmc` function.


  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, one per probe, where each tuple contains the
      ``negative`` and ``positive`` scores of that probe. Both ``negatives``
      and ``positives`` can be either an 1D :py:class:`numpy.ndarray` of type
      ``float``, or ``None``.

  """

  positives_by_probe = {}
  negatives_by_probe = {}

  for claimed, real, label, value in four_column(filename):
    # a matching identity pair is a positive, everything else a negative
    if claimed == real:
      bucket = positives_by_probe
    else:
      bucket = negatives_by_probe
    bucket.setdefault(label, []).append(value)

  # hand over to the shared converter, negatives first
  return _convert_cmc_scores(negatives_by_probe, positives_by_probe)
Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
187

André Anjos's avatar
André Anjos committed
188 189

def five_column(filename):
  """Loads a score set from a single file and yields its lines

  Reads the score file line by line (to avoid loading the score file at once
  into memory) and yields one tuple per score line.  Empty lines and lines
  starting with ``#`` are skipped.  Every other line must contain at least
  five whitespace-separated fields, the fifth one being convertible to
  ``float``:

  .. code-block:: text

     claimed_id model_label real_id test_label score


  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Yields:

    tuple: ``(claimed_id, model_label, real_id, test_label, score)`` with

      * ``claimed_id`` (str): The claimed identity -- the client name of the
        model that was used in the comparison
      * ``model_label`` (str): A label for the model -- usually the model file
        name, or the model id
      * ``real_id`` (str): The real identity -- the client name of the probe
        that was used in the comparison
      * ``test_label`` (str): A label of the probe -- usually the probe file
        name, or the probe id
      * ``score`` (float): The result of the comparison of the model and the
        probe


  Raises:

    SyntaxError: If a line has fewer than five fields, or if its fifth field
      cannot be converted to ``float``. The reported line number is 1-based.

  """

  score_file = open_file(filename)
  try:
    # count lines from 1 so that error messages match what an editor shows
    for lineno, line in enumerate(score_file, 1):
      if isinstance(line, bytes):
        # binary handles yield bytes; the parsing below works on text
        line = line.decode('utf-8')
      stripped = line.strip()
      if not stripped or stripped[0] == '#':
        continue  # empty or comment
      field = stripped.split()
      if len(field) < 5:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (lineno, filename, line))
      try:
        score = float(field[4])
      except ValueError:
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (lineno, filename, line))
      yield (field[0], field[1], field[2], field[3], score)
  finally:
    # only close what was opened here; a file object supplied by the caller
    # is handed back unchanged by open_file and remains the caller's to close
    if score_file is not filename:
      score_file.close()
André Anjos's avatar
André Anjos committed
236

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
237

André Anjos's avatar
André Anjos committed
238
def split_five_column(filename):
  """Loads a five column score file and splits its scores

  The file is read with :py:func:`load_score` (forcing the 5 column format
  described in :py:func:`five_column`) and the resulting scores are divided
  into negatives and positives by :py:func:`get_negatives_positives`.


  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    negatives (array): The scores for which the ``claimed_id`` and the
      ``real_id`` differed (see :py:func:`five_column`)

    positives (array): The scores for which the ``claimed_id`` and the
      ``real_id`` are identical (see :py:func:`five_column`)

  """

  return get_negatives_positives(load_score(filename, 5))
André Anjos's avatar
André Anjos committed
269

270

Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
271
def cmc_five_column(filename):
  """Loads scores to compute CMC curves from a file in five column format.

  The five column file needs to be in the same format as described in
  :py:func:`five_column`, and the ``test_label`` (column 4) has to contain the
  test/probe file name or a probe id.  The ``model_label`` column is not used.

  Scores are grouped by ``test_label``.  Within one group, a score counts as
  positive when its ``claimed_id`` equals its ``real_id`` and as negative
  otherwise.  Usually, a probe should have only one positive score, but more
  are allowed.  The result of this function can directly be passed to, e.g.,
  the :py:func:`bob.measure.cmc` function.


  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.


  Returns:

    list: A list of tuples, one per probe, where each tuple contains the
      ``negative`` and ``positive`` scores of that probe.

  """

  positives_by_probe = {}
  negatives_by_probe = {}

  # the second column (model label) plays no role in the grouping
  for claimed, _, real, label, value in five_column(filename):
    # a matching identity pair is a positive, everything else a negative
    if claimed == real:
      bucket = positives_by_probe
    else:
      bucket = negatives_by_probe
    bucket.setdefault(label, []).append(value)

  # hand over to the shared converter, negatives first
  return _convert_cmc_scores(negatives_by_probe, positives_by_probe)
Tiago de Freitas Pereira's avatar
Tiago de Freitas Pereira committed
313

314

315
def load_score(filename, ncolumns=None):
  """Load scores using numpy.genfromtxt and return the data as a numpy array.

  Parameters:

    filename (:py:class:`str`, :py:class:`File`): The file object that will be
      opened with :py:func:`open_file` containing the scores.

    ncolumns (:py:class:`int`, optional): 4, 5 or None (the default),
      specifying the number of columns in the score file. For any value other
      than 4 or 5, the number of columns is guessed from the first line of the
      file that is neither empty nor a ``#`` comment.


  Returns:

    array: A structured array which contains not only the actual scores
      (field ``score``) but also the ``claimed_id``, ``real_id``,
      ``test_label`` and, for five column files, ``model_label`` fields.


  Raises:

    ValueError: If the (given or guessed) number of columns is neither 4
      nor 5.

  """

  def convertfunc(x):
    # identity converter: keep the label columns exactly as they were parsed
    return x

  if ncolumns not in (4, 5):
    f = open_file(filename)
    # open_file hands a caller-supplied file object back unchanged; such a
    # handle must be neither closed nor left past its first line, since the
    # very same object is parsed by genfromtxt below
    borrowed = f is filename
    try:
      start = f.tell() if borrowed else None
      ncolumns = 0
      line = f.readline()
      while line:
        if isinstance(line, bytes):
          line = line.decode('utf-8')
        stripped = line.strip()
        # skip blank and comment lines when counting the columns
        if stripped and not stripped.startswith('#'):
          ncolumns = len(stripped.split())
          break
        line = f.readline()
      if borrowed:
        f.seek(start)
    except Exception:
      logger.warning('Could not guess the number of columns in file: {}. '
                     'Assuming 4 column format.'.format(filename))
      ncolumns = 4
    finally:
      if not borrowed:
        f.close()

  if ncolumns == 4:
    names = ('claimed_id', 'real_id', 'test_label', 'score')
  elif ncolumns == 5:
    names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score')
  else:
    raise ValueError("ncolumns of 4 and 5 are supported only.")

  # every column but the last one is a label; the last one is the score
  converters = dict((index, convertfunc) for index in range(ncolumns - 1))
  converters[ncolumns - 1] = float

  source = open_file(filename, mode='rb')
  try:
    score_lines = numpy.genfromtxt(
      source, dtype=None, names=names,
      converters=converters, invalid_raise=True)
  finally:
    # release the handle opened above, but never a caller-supplied one
    if source is not filename:
      source.close()

  # re-declare byte-string ('S') label fields as unicode ('U') fields
  new_dtype = []
  for name in score_lines.dtype.names[:-1]:
    new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U')))
  new_dtype.append(('score', float))
  score_lines = numpy.array(score_lines, new_dtype)
  return score_lines


def get_negatives_positives(score_lines):
  """Splits the output of :py:func:`load_score` into negatives and positives.

  This function aims to replace :py:func:`split_four_column` and
  :py:func:`split_five_column` but takes a different input. It's up to you to
  use which one.


  Parameters:

    score_lines (array): The scores as returned by :py:func:`load_score`;
      the ``claimed_id``, ``real_id`` and ``score`` fields are read.


  Returns:

    negatives (array): The scores for which the ``claimed_id`` and the
      ``real_id`` differ

    positives (array): The scores for which the ``claimed_id`` and the
      ``real_id`` are identical

  """

  scores = score_lines['score']
  is_genuine = score_lines['claimed_id'] == score_lines['real_id']
  is_impostor = numpy.logical_not(is_genuine)
  return (scores[is_impostor], scores[is_genuine])


def get_negatives_positives_all(score_lines_list):
  """Splits several outputs of :py:func:`load_score` and stacks the results.

  Each element of the list is split with :py:func:`get_negatives_positives`;
  the negatives of all elements are then stacked with
  :py:func:`numpy.vstack` and transposed, and so are the positives.


  Parameters:

    score_lines_list (list): A list of arrays as returned by
      :py:func:`load_score`


  Returns:

    negatives (array): The transposed stack of the negatives of all elements

    positives (array): The transposed stack of the positives of all elements

  """

  # each entry is a (negatives, positives) pair
  splits = [get_negatives_positives(score_lines)
            for score_lines in score_lines_list]
  negatives = numpy.vstack([pair[0] for pair in splits]).T
  positives = numpy.vstack([pair[1] for pair in splits]).T
  return (negatives, positives)


def get_all_scores(score_lines_list):
  """Stacks the scores of several outputs of :py:func:`load_score`.

  The ``score`` field of every element is taken, the fields are stacked with
  :py:func:`numpy.vstack` and the transposed stack is returned.
  """

  score_columns = [entry['score'] for entry in score_lines_list]
  stacked = numpy.vstack(score_columns)
  return stacked.T


def dump_score(filename, score_lines):
  """Dump scores that were loaded using :py:func:`load_score`

  The number of columns is automatically detected from the number of fields
  of ``score_lines``; the scores are written with :py:func:`numpy.savetxt`,
  using nine decimal places for the score column.


  Parameters:

    filename: The output target, handed over to :py:func:`numpy.savetxt`

    score_lines (array): The scores as returned by :py:func:`load_score`


  Raises:

    ValueError: If ``score_lines`` has neither 4 nor 5 fields.

  """

  # one line format per supported number of columns
  formats = {
    4: '%s %s %s %.9f',
    5: '%s %s %s %s %.9f',
  }
  n_fields = len(score_lines.dtype)
  if n_fields not in formats:
    raise ValueError("Only scores with 4 and 5 columns are supported.")
  numpy.savetxt(filename, score_lines, fmt=formats[n_fields])
426

André Anjos's avatar
André Anjos committed
427

428
def _convert_cmc_scores(neg_dict, pos_dict):
  """Converts the negative and positive scores read with
  :py:func:`cmc_four_column` or :py:func:`cmc_five_column` into a format that
  is handled by the :py:func:`bob.measure.cmc` and similar functions.

  Both arguments map a probe name to its list of scores.  The result holds
  one ``(negatives, positives)`` tuple per probe name found in either
  dictionary, ordered by sorted probe name.  Each tuple element is a
  ``float64`` :py:class:`numpy.ndarray`, or ``None`` when the probe is
  missing from the corresponding dictionary.
  """

  def as_array(scores_by_probe, probe):
    # None marks a probe without any score of this kind
    if probe not in scores_by_probe:
      return None
    return numpy.array(scores_by_probe[probe], numpy.float64)

  # every probe that has at least one negative or one positive score
  all_probes = set(neg_dict)
  all_probes.update(pos_dict)

  return [(as_array(neg_dict, probe), as_array(pos_dict, probe))
          for probe in sorted(all_probes)]