#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Mon 23 May 2011 16:23:05 CEST
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland

"""A set of utilities to load score files with different formats.
"""

import numpy

def four_column(filename):
  """Loads a score set from a single four-column text file into memory.

  Empty lines and lines whose first non-blank character is ``#`` are skipped.
  Every other line is split on whitespace and must provide at least four
  fields; any field after the fourth is ignored.

  Parameters:

    filename
      path of the score file to read (opened in text mode)

  Returns a python list of tuples containing the following fields:

    [0]
      claimed identity (string)
    [1]
      real identity (string)
    [2]
      test label (string)
    [3]
      score (float)

  Raises ``SyntaxError`` if a line has fewer than four fields or if its fourth
  field cannot be converted to a float. The line number reported in the
  message is 1-based, as shown by text editors.
  """

  retval = []
  # the context manager guarantees the file is closed, also when a line is
  # rejected and an exception propagates
  with open(filename, 'rt') as f:
    for i, l in enumerate(f, 1):
      s = l.strip()
      if not s or s[0] == '#': continue #empty or comment
      # split() without arguments already discards surrounding whitespace
      field = s.split()
      if len(field) < 4:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l))
      try:
        score = float(field[3])
      except ValueError:
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
      retval.append((field[0], field[1], field[2], score))

  return retval

def split_four_column(filename):
  """Loads a score set from a single file and splits the scores between
  negatives and positives. The score file has to respect the 4 column format
  as defined in the method four_column().

  The whole file is first parsed by four_column(); only the scores are kept in
  the returned arrays. A score is a positive when the claimed identity
  (column 1) equals the real identity (column 2), and a negative otherwise.

  Parameters:

    filename
      path of the four-column score file to read

  Returns a python tuple (negatives, positives). The values are 1-D numpy
  arrays of float64.

  Raises whatever four_column() raises for a malformed file.
  """

  neg = []
  pos = []
  # four_column() already delivers the score as a float, so no further
  # conversion (or error handling around it) is required here
  for (client_id, probe_id, _, score) in four_column(filename):
    if client_id == probe_id:
      pos.append(score)
    else:
      neg.append(score)

  return (numpy.array(neg, numpy.float64), numpy.array(pos, numpy.float64))

def cmc_four_column(filename):
  """Loads scores to compute CMC curves from a file in four column format.

  The four column file needs to be in the same format as described in the
  four_column function, and the "test label" (column 3) has to contain the
  test/probe file name.

  Parameters:

    filename
      path of the four-column score file to read

  This function returns a list of tuples, ordered by probe name. For each
  probe file, the tuple consists of a 1-D float64 numpy array of negative
  scores and a 1-D float64 numpy array of positive scores, in this order.
  Usually, the positive scores should contain only one element, but more are
  allowed.

  Probe names for which only positive or only negative scores exist are left
  out of the result; a warning is emitted on the 'bob' logger for each of
  them.

  The result of this function can directly be passed to, e.g., the
  :py:func:`bob.measure.cmc` function.

  Raises whatever four_column() raises for a malformed file.
  """

  import logging
  logger = logging.getLogger('bob')

  # group the scores by probe name, separately for positives and negatives;
  # four_column() already delivers the score as a float
  pos_dict = {}
  neg_dict = {}
  for (client_id, probe_id, probe_name, score) in four_column(filename):
    correct_dict = pos_dict if client_id == probe_id else neg_dict
    correct_dict.setdefault(probe_name, []).append(score)

  # convert to lists of tuples of ndarrays
  retval = []
  for probe_name in sorted(pos_dict):
    if probe_name in neg_dict:
      retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
    else:
      logger.warning('For probe name "%s" there are only positive scores. This probe name is ignored.', probe_name)
  # report the probes for which only negatives exist
  for probe_name in sorted(neg_dict):
    if probe_name not in pos_dict:
      logger.warning('For probe name "%s" there are only negative scores. This probe name is ignored.', probe_name)

  return retval

def five_column(filename):
  """Loads a score set from a single five-column text file into memory.

  Empty lines and lines whose first non-blank character is ``#`` are skipped.
  Every other line is split on whitespace and must provide at least five
  fields; any field after the fifth is ignored.

  Parameters:

    filename
      path of the score file to read (opened in text mode)

  Returns a python list of tuples containing the following fields:

    [0]
      claimed identity (string)
    [1]
      model label (string)
    [2]
      real identity (string)
    [3]
      test label (string)
    [4]
      score (float)

  Raises ``SyntaxError`` if a line has fewer than five fields or if its fifth
  field cannot be converted to a float. The line number reported in the
  message is 1-based, as shown by text editors.
  """

  retval = []
  # the context manager guarantees the file is closed, also when a line is
  # rejected and an exception propagates
  with open(filename, 'rt') as f:
    for i, l in enumerate(f, 1):
      s = l.strip()
      if not s or s[0] == '#': continue #empty or comment
      # split() without arguments already discards surrounding whitespace
      field = s.split()
      if len(field) < 5:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l))
      try:
        score = float(field[4])
      except ValueError:
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
      retval.append((field[0], field[1], field[2], field[3], score))

  return retval

def split_five_column(filename):
  """Loads a score set from a single file and splits the scores between
  negatives and positives. The score file has to respect the 5 column format
  as defined in the method five_column().

  The whole file is first parsed by five_column(); only the scores are kept in
  the returned arrays. A score is a positive when the claimed identity
  (column 1) equals the real identity (column 3), and a negative otherwise.

  Parameters:

    filename
      path of the five-column score file to read

  Returns a python tuple (negatives, positives). The values are 1-D numpy
  arrays of float64.

  Raises whatever five_column() raises for a malformed file.
  """

  neg = []
  pos = []
  # five_column() already delivers the score as a float, so no further
  # conversion (or error handling around it) is required here
  for (client_id, _, probe_id, _, score) in five_column(filename):
    if client_id == probe_id:
      pos.append(score)
    else:
      neg.append(score)

  return (numpy.array(neg, numpy.float64), numpy.array(pos, numpy.float64))

def cmc_five_column(filename):
  """Loads scores to compute CMC curves from a file in five column format.

  The five column file needs to be in the same format as described in the
  five_column function, and the "test label" (column 4) has to contain the
  test/probe file name.

  Parameters:

    filename
      path of the five-column score file to read

  This function returns a list of tuples, ordered by probe name. For each
  probe file, the tuple consists of a 1-D float64 numpy array of negative
  scores and a 1-D float64 numpy array of positive scores, in this order.
  Usually, the positive scores should contain only one element, but more are
  allowed.

  Probe names for which only positive or only negative scores exist are left
  out of the result; a warning is emitted on the 'bob' logger for each of
  them.

  The result of this function can directly be passed to, e.g., the
  :py:func:`bob.measure.cmc` function.

  Raises whatever five_column() raises for a malformed file.
  """

  import logging
  logger = logging.getLogger('bob')

  # group the scores by probe name, separately for positives and negatives;
  # five_column() already delivers the score as a float
  pos_dict = {}
  neg_dict = {}
  for (client_id, _, probe_id, probe_name, score) in five_column(filename):
    correct_dict = pos_dict if client_id == probe_id else neg_dict
    correct_dict.setdefault(probe_name, []).append(score)

  # convert to lists of tuples of ndarrays
  retval = []
  for probe_name in sorted(pos_dict):
    if probe_name in neg_dict:
      retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
    else:
      logger.warning('For probe name "%s" there are only positive scores. This probe name is ignored.', probe_name)
  # report the probes for which only negatives exist
  for probe_name in sorted(neg_dict):
    if probe_name not in pos_dict:
      logger.warning('For probe name "%s" there are only negative scores. This probe name is ignored.', probe_name)
  return retval