#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Mon 23 May 2011 16:23:05 CEST
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland

"""A set of utilities to load score files with different formats.
"""

import numpy

def four_column(filename):
  """Loads a score set from a single four-column file into memory.

  Empty lines and lines whose first non-blank character is ``#`` are skipped.
  Every other line must hold at least four whitespace-separated fields;
  fields beyond the fourth are ignored.

  Parameters:

    filename
      path of the text file to read

  Returns a python list of tuples containing the following fields:

    [0]
      claimed identity (string)
    [1]
      real identity (string)
    [2]
      test label (string)
    [3]
      score (float)

  Raises ``SyntaxError`` if a line has fewer than four fields or if its
  fourth field cannot be converted to a float. The line number reported in
  the message is 1-based.
  """

  retval = []
  # the context manager guarantees the file is closed, even if parsing fails
  with open(filename, 'rt') as f:
    # count from 1 so reported line numbers match what text editors display
    for i, l in enumerate(f, 1):
      s = l.strip()
      if not s or s[0] == '#': continue #empty or comment
      field = s.split() #split() already discards surrounding whitespace
      if len(field) < 4:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l))
      try:
        score = float(field[3])
      except ValueError:
        # only a failed conversion is translated; anything else propagates
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
      retval.append((field[0], field[1], field[2], score))

  return retval

def split_four_column(filename):
  """Loads a score set from a single file to memory and splits the scores
  between positives and negatives. The score file has to respect the 4 column
  format as defined in the method four_column().

  A score is a positive when the claimed identity (column 1) equals the real
  identity (column 2), and a negative otherwise.

  The whole file is first parsed by four_column(), so the identity and label
  strings are held in memory while splitting. Only the scores are kept in the
  returned arrays.

  Returns a python tuple (negatives, positives). The values are 1-D numpy
  arrays of float64.

  Raises ``SyntaxError`` (from four_column()) if the file is malformed.
  """

  neg = []
  pos = []
  # four_column() has already validated every score and converted it to
  # float, so no further conversion (or error handling) is needed here
  for client_id, probe_id, _, score in four_column(filename):
    if client_id == probe_id:
      pos.append(score)
    else:
      neg.append(score)

  return (numpy.array(neg, numpy.float64), numpy.array(pos, numpy.float64))

def cmc_four_column(filename):
  """Loads scores to compute CMC curves from a file in four column format.

  The four column file needs to be in the same format as described in the
  four_column function, and the "test label" (column 3) has to contain the
  test/probe file name.

  This function returns a list of tuples, ordered by probe name. For each
  probe file, the tuple consists of a 1-D float64 numpy array of negative
  scores and a 1-D float64 numpy array of positive scores. Usually, the
  positive scores should contain only one element, but more are allowed.

  Probes that have only positive or only negative scores are left out of the
  result; a warning is sent to the 'bob' logger for each of them.

  The result of this function can directly be passed to, e.g., the
  :py:func:`bob.measure.cmc` function.

  Raises ``SyntaxError`` (from four_column()) if the file is malformed.
  """

  import logging
  logger = logging.getLogger('bob')

  # group the scores per probe name; four_column() already returns floats
  pos_dict = {}
  neg_dict = {}
  for client_id, probe_id, probe_name, score in four_column(filename):
    correct_dict = pos_dict if client_id == probe_id else neg_dict
    correct_dict.setdefault(probe_name, []).append(score)

  # convert to a list of tuples of ndarrays
  retval = []
  for probe_name in sorted(pos_dict):
    if probe_name in neg_dict:
      retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
    else:
      logger.warning('For probe name "%s" there are only positive scores. This probe name is ignored.', probe_name)

  # report probes for which only negatives exist
  for probe_name in sorted(neg_dict):
    if probe_name not in pos_dict:
      logger.warning('For probe name "%s" there are only negative scores. This probe name is ignored.', probe_name)

  return retval

def five_column(filename):
  """Loads a score set from a single five-column file into memory.

  Empty lines and lines whose first non-blank character is ``#`` are skipped.
  Every other line must hold at least five whitespace-separated fields;
  fields beyond the fifth are ignored.

  Parameters:

    filename
      path of the text file to read

  Returns a python list of tuples containing the following fields:

    [0]
      claimed identity (string)
    [1]
      model label (string)
    [2]
      real identity (string)
    [3]
      test label (string)
    [4]
      score (float)

  Raises ``SyntaxError`` if a line has fewer than five fields or if its
  fifth field cannot be converted to a float. The line number reported in
  the message is 1-based.
  """

  retval = []
  # the context manager guarantees the file is closed, even if parsing fails
  with open(filename, 'rt') as f:
    # count from 1 so reported line numbers match what text editors display
    for i, l in enumerate(f, 1):
      s = l.strip()
      if not s or s[0] == '#': continue #empty or comment
      field = s.split() #split() already discards surrounding whitespace
      if len(field) < 5:
        raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l))
      try:
        score = float(field[4])
      except ValueError:
        # only a failed conversion is translated; anything else propagates
        raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
      retval.append((field[0], field[1], field[2], field[3], score))

  return retval

def split_five_column(filename):
  """Loads a score set from a single file to memory and splits the scores
  between positives and negatives. The score file has to respect the 5 column
  format as defined in the method five_column().

  A score is a positive when the claimed identity (column 1) equals the real
  identity (column 3), and a negative otherwise.

  The whole file is first parsed by five_column(), so the identity and label
  strings are held in memory while splitting. Only the scores are kept in the
  returned arrays.

  Returns a python tuple (negatives, positives). The values are 1-D numpy
  arrays of float64.

  Raises ``SyntaxError`` (from five_column()) if the file is malformed.
  """

  neg = []
  pos = []
  # five_column() has already validated every score and converted it to
  # float, so no further conversion (or error handling) is needed here
  for client_id, _, probe_id, _, score in five_column(filename):
    if client_id == probe_id:
      pos.append(score)
    else:
      neg.append(score)

  return (numpy.array(neg, numpy.float64), numpy.array(pos, numpy.float64))

def cmc_five_column(filename):
  """Loads scores to compute CMC curves from a file in five column format.

  The five column file needs to be in the same format as described in the
  five_column function, and the "test label" (column 4) has to contain the
  test/probe file name.

  This function returns a list of tuples, ordered by probe name. For each
  probe file, the tuple consists of a 1-D float64 numpy array of negative
  scores and a 1-D float64 numpy array of positive scores. Usually, the
  positive scores should contain only one element, but more are allowed.

  Probes that have only positive or only negative scores are left out of the
  result; a warning is sent to the 'bob' logger for each of them.

  The result of this function can directly be passed to, e.g., the
  :py:func:`bob.measure.cmc` function.

  Raises ``SyntaxError`` (from five_column()) if the file is malformed.
  """

  import logging
  logger = logging.getLogger('bob')

  # group the scores per probe name; five_column() already returns floats
  pos_dict = {}
  neg_dict = {}
  for client_id, _, probe_id, probe_name, score in five_column(filename):
    correct_dict = pos_dict if client_id == probe_id else neg_dict
    correct_dict.setdefault(probe_name, []).append(score)

  # convert to a list of tuples of ndarrays
  retval = []
  for probe_name in sorted(pos_dict):
    if probe_name in neg_dict:
      retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
    else:
      logger.warning('For probe name "%s" there are only positive scores. This probe name is ignored.', probe_name)

  # report probes for which only negatives exist
  for probe_name in sorted(neg_dict):
    if probe_name not in pos_dict:
      logger.warning('For probe name "%s" there are only negative scores. This probe name is ignored.', probe_name)

  return retval