test_error.py 14.5 KB
Newer Older
André Anjos's avatar
André Anjos committed
1
2
3
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
4
# Wed 11 Dec 15:14:08 2013 CET
André Anjos's avatar
André Anjos committed
5
6
7
8
9
10
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland

"""Basic tests for the error measuring system of bob
"""

11
import os
André Anjos's avatar
André Anjos committed
12
import numpy
13
import nose.tools
André Anjos's avatar
André Anjos committed
14
import bob.io.base
André Anjos's avatar
André Anjos committed
15

Amir Mohammadi's avatar
lint    
Amir Mohammadi committed
16

André Anjos's avatar
André Anjos committed
17
18
def F(f):
  """Builds the path of test file ``f`` inside the "data" subdirectory

  The lookup is delegated to ``pkg_resources.resource_filename``, using this
  module's ``__name__`` as the anchor.
  """
  from pkg_resources import resource_filename

  relative = os.path.join('data', f)
  return resource_filename(__name__, relative)

22

André Anjos's avatar
André Anjos committed
23
24
def save(fname, data):
  """Stores one array as ``fname`` under the 'bob/measure/data' directory.

  The destination is a relative path, so it is resolved against the current
  working directory. The tests below only reference this helper from
  commented-out lines used to regenerate reference files.
  """
  destination = os.path.join('bob/measure/data', fname)
  bob.io.base.save(data, destination)
26
27
28
29


def test_basic_ratios():
  """Checks FAR/FRR, precision/recall and F-score at three thresholds.

  The data set used here is separable, with a separation threshold of about
  3.0. Three thresholds are probed: just below every score, just above every
  score, and the separating value itself.
  """

  from . import farfrr, precision_recall, f_score

  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  lowest = min(positives.min(), negatives.min())
  highest = max(positives.max(), negatives.max())
  below_all = lowest - 0.1
  above_all = highest + 0.1

  # Basic functionality of the FAR/FRR and precision/recall calculation.
  # Each row reads: (threshold, FAR, FRR, precision, recall)
  #  * below every score: everything is accepted
  #  * above every score: everything is rejected
  #  * at the separating threshold: no errors at all
  expectations = (
      (below_all, 1.0, 0.0, 0.5, 1.0),
      (above_all, 0.0, 1.0, 0.0, 0.0),
      (3.0, 0.0, 0.0, 1.0, 1.0),
  )
  for thres, want_far, want_frr, want_prec, want_recall in expectations:
    far, frr = farfrr(negatives, positives, thres)
    nose.tools.eq_(far, want_far)
    nose.tools.eq_(frr, want_frr)
    prec, recall = precision_recall(negatives, positives, thres)
    nose.tools.eq_(prec, want_prec)
    nose.tools.eq_(recall, want_recall)

  # F-score for the same thresholds, with the default weight and with the
  # extra argument set to 2. Only the first pair is compared approximately.
  nose.tools.assert_almost_equal(
      f_score(negatives, positives, below_all), 0.66666667)
  nose.tools.assert_almost_equal(
      f_score(negatives, positives, below_all, 2), 0.83333333)

  for thres, want_score in ((above_all, 0.0), (3.0, 1.0)):
    nose.tools.eq_(f_score(negatives, positives, thres), want_score)
    nose.tools.eq_(f_score(negatives, positives, thres, 2), want_score)
84

85
86
87

def test_indexing():
  """Validates the masks returned by the correctly_classified_* functions.

  Uses the separable data set and checks the output of
  correctly_classified_positives() and correctly_classified_negatives() for
  thresholds below all scores, above all scores and in between both classes.
  """

  from . import correctly_classified_positives, correctly_classified_negatives

  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  below_all = min(positives.min(), negatives.min()) - 0.1
  above_all = max(positives.max(), negatives.max()) + 0.1

  # Threshold below every score: all positives are correctly classified and
  # none of the negatives is.
  positives_ok = correctly_classified_positives(positives, below_all)
  assert positives_ok.all()
  negatives_ok = correctly_classified_negatives(negatives, below_all)
  assert not negatives_ok.any()

  # Threshold above every score: the situation is reversed.
  positives_ok = correctly_classified_positives(positives, above_all)
  assert not positives_ok.any()
  negatives_ok = correctly_classified_negatives(negatives, above_all)
  assert negatives_ok.all()

  # Threshold separating both sets: everything is correctly classified.
  positives_ok = correctly_classified_positives(positives, 3)
  assert positives_ok.all()
  negatives_ok = correctly_classified_negatives(negatives, 3)
  assert negatives_ok.all()
112
113
114
115


def test_thresholding():
  """Checks the threshold estimators on both data sets.

  Covers eer_threshold(), far_threshold(), frr_threshold() and
  min_hter_threshold(), first on the non-separable data set and then on the
  separable one.
  """

  from . import eer_threshold, far_threshold, frr_threshold, farfrr, \
      correctly_classified_positives, correctly_classified_negatives, \
      min_hter_threshold

  def count(array, value=True):
    """Counts how many entries of ``array`` compare equal to ``value``"""
    matches = (array == value).tolist()
    return matches.count(True)

  # First part: use eer_threshold() to calculate the threshold that minimizes
  # the EER on a test set that is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))
  threshold = eer_threshold(negatives, positives)

  sorted_positives = numpy.sort(positives)
  sorted_negatives = numpy.sort(negatives)

  # At the EER threshold the numbers of correctly classified positives and
  # negatives should be balanced.
  # NOTE(review): only ``ccp - ccn <= 1`` is enforced, so a large negative
  # difference also passes; confirm whether ``abs(ccp - ccn)`` was intended.
  ccp = count(correctly_classified_positives(positives, threshold))
  ccn = count(correctly_classified_negatives(negatives, threshold))
  assert (ccp - ccn) <= 1

  for t in (0, 0.001, 0.1, 0.5, 0.9, 0.999, 1):
    # Thresholds computed for a requested FAR (respectively FRR) value
    threshold_far = far_threshold(sorted_negatives, [], t, is_sorted=True)
    threshold_frr = frr_threshold([], sorted_positives, t, is_sorted=True)

    far = farfrr(negatives, positives, threshold_far)[0]
    frr = farfrr(negatives, positives, threshold_frr)[1]

    # The rates obtained at those thresholds must not be smaller than the
    # requested value (up to a tolerance of 1e-7) ...
    assert far + 1e-7 > t
    assert frr + 1e-7 > t
    # ... and must not exceed it by more than 0.15
    assert far - t <= 0.15
    assert frr - t <= 0.15

  # Second part: if the set is separable, the calculation of the threshold is
  # a little bit trickier, as there are no points in the middle of the range
  # to compare things to. This is where the currently used recursive
  # algorithm seems to do better. Let's verify.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))
  threshold = eer_threshold(negatives, positives)

  # the expected result here is exactly 3.2
  assert threshold == 3.2

  # Both classes must be equally well classified at that threshold
  ccp = count(correctly_classified_positives(positives, threshold))
  ccn = count(correctly_classified_negatives(negatives, threshold))
  nose.tools.eq_(ccp, ccn)

  # The second option for the calculation of the threshold is to use the
  # minimum HTER.
  threshold2 = min_hter_threshold(negatives, positives)
  assert threshold2 == 3.2
  nose.tools.eq_(threshold, threshold2)  # in this particular case

  # Same balance check, now for the minimum HTER threshold
  ccp = count(correctly_classified_positives(positives, threshold2))
  ccn = count(correctly_classified_negatives(negatives, threshold2))
  nose.tools.eq_(ccp, ccn)


181
182
def test_empty_raises():
  """Checks that a RuntimeError is raised when a score set is empty.

  Each function is called with empty negatives, with empty positives and
  with both sets empty.
  """
  from bob.measure import farfrr, precision_recall, f_score, eer_threshold, \
      min_hter_threshold, min_weighted_error_rate_threshold

  # (negatives, positives) combinations with at least one empty set
  empty_cases = (([], [1.]), ([1.], []), ([], []))

  # functions that take a third positional argument after the two score sets
  three_argument_functions = (
      farfrr, precision_recall,
      f_score, min_weighted_error_rate_threshold)
  for func in three_argument_functions:
    for negatives, positives in empty_cases:
      nose.tools.assert_raises(RuntimeError, func, negatives, positives, 0)

  # functions that only take the two score sets
  for func in (eer_threshold, min_hter_threshold):
    for negatives, positives in empty_cases:
      nose.tools.assert_raises(RuntimeError, func, negatives, positives)


199
200
def test_plots():
  """Compares the curve calculations against stored reference values.

  Covers roc(), roc_for_far(), precision_recall_curve(), det() and epc() on
  the non-separable data set. Each reference file can be regenerated by
  uncommenting the corresponding ``save`` line.
  """

  from . import roc, roc_for_far, precision_recall_curve, det, epc

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))

  # This example will test the ROC plot calculation functionality.
  xy = roc(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-roc.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-roc.hdf5'))
  assert numpy.array_equal(xy, xyref)

  # This example will test the ROC for FAR plot calculation functionality.
  far = [0.01, 0.1, 1]
  ref = [0.42, 0.12, 0]
  xy = roc_for_far(negatives, positives, far)

  # first row echoes the requested FAR values, second row holds the
  # reference values above
  assert numpy.array_equal(xy[0], far)
  assert numpy.array_equal(xy[1], ref)

  # This example will test the Precision-Recall plot calculation functionality.
  xy = precision_recall_curve(negatives, positives, 100)
  # uncomment the next line to save a reference value (the file name must
  # match the one loaded below, not the ROC reference)
  # save('nonsep-precisionrecall.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-precisionrecall.hdf5'))
  assert numpy.array_equal(xy, xyref)

  # This example will test the DET plot calculation functionality.
  det_xyzw = det(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-det.hdf5', det_xyzw)
  det_xyzw_ref = bob.io.base.load(F('nonsep-det.hdf5'))
  assert numpy.allclose(det_xyzw, det_xyzw_ref, atol=1e-15)

  # This example will test the EPC plot calculation functionality. For the
  # EPC curve, you need to have a development and a test set. We will split,
  # by the middle, the negatives and positives sample we have, just for the
  # sake of testing
  dev_negatives = negatives[:(negatives.shape[0] // 2)]
  test_negatives = negatives[(negatives.shape[0] // 2):]
  dev_positives = positives[:(positives.shape[0] // 2)]
  test_positives = positives[(positives.shape[0] // 2):]

  # The default call is compared against the first two rows of the reference
  xy = epc(dev_negatives, dev_positives,
           test_negatives, test_positives, 100)
  xyref = bob.io.base.load(F('nonsep-epc.hdf5'))
  assert numpy.allclose(xy, xyref[:2], atol=1e-15)

  # With the two extra flags set, the result is compared against the full
  # reference
  xy = epc(dev_negatives, dev_positives,
           test_negatives, test_positives, 100, False, True)
  # uncomment the next line to save a reference value
  # save('nonsep-epc.hdf5', xy)
  assert numpy.allclose(xy, xyref, atol=1e-15)
255
256
257
258


def test_rocch():
  """Checks the ROC Convex Hull and the EER derived from it.

  rocch(), rocch2eer() and eer_rocch() are compared against reference values
  for the separable and the non-separable data sets.
  """

  from . import rocch, rocch2eer, eer_rocch

  # One row per data set: (positives file, negatives file, reference convex
  # hull, reference EER). References obtained using Bosaris 1.06
  cases = (
      # This test set is separable.
      ('linsep-positives.hdf5', 'linsep-negatives.hdf5',
       numpy.array([[1., 0., 0.], [0., 0., 1.]]),
       0.),
      # This test set is not separable.
      ('nonsep-positives.hdf5', 'nonsep-negatives.hdf5',
       numpy.array([[1., 0.68, 0.28, 0.1, 0.06, 0., 0.], [
                   0, 0, 0.08, 0.12, 0.22, 0.48, 1.]]),
       0.116363636363636),
  )

  for positives_file, negatives_file, pmiss_pfa_ref, eer_ref in cases:
    positives = bob.io.base.load(F(positives_file))
    negatives = bob.io.base.load(F(negatives_file))

    # Convex hull points
    pmiss_pfa = rocch(negatives, positives)
    assert numpy.allclose(pmiss_pfa, pmiss_pfa_ref, atol=1e-15)

    # EER computed from the hull points ...
    eer = rocch2eer(pmiss_pfa)
    assert abs(eer - eer_ref) < 1e-4

    # ... and directly from the scores
    eer = eer_rocch(negatives, positives)
    assert abs(eer - eer_ref) < 1e-4
292

293
294
295

def test_cmc():
  """Checks recognition_rate() and cmc() on inline and on loaded scores."""

  from . import recognition_rate, cmc, load

  # Hand-made test data: a list of (negative scores, positive scores) pairs.
  # The expected recognition rate is 0.5 and the expected CMC is
  # [0.5, 0.75, 1, 1, 1]
  test_data = [((0.3, 1.1, 0.5), (0.7,)), ((1.4, -1.3, 0.6), (0.2,)),
               ((0.8, 0., 1.5), (-0.8, 1.8)), ((2., 1.3, 1.6, 0.9), (2.4,))]
  # compute recognition rate
  rr = recognition_rate(test_data)
  nose.tools.eq_(rr, 0.5)
  # compute CMC
  cmc_ = cmc(test_data)
  assert (cmc_ == [0.5, 0.75, 1., 1., 1]).all()

  # Scores loaded from file: every loader must lead to the same results
  desired_rr = 0.76
  desired_cmc = [0.76, 0.89, 0.96, 0.98, 1., 1., 1.,
                 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]

  loaders = (
      (load.cmc_four_column, 'scores-cmc-4col.txt'),
      (load.cmc_five_column, 'scores-cmc-5col.txt'),
      (load.cmc, 'scores-cmc-5col.txt'),
  )
  for loader, score_file in loaders:
    data = loader(F(score_file))
    rr = recognition_rate(data)
    nose.tools.eq_(rr, desired_rr)
    cmc_ = cmc(data)
    assert (cmc_ == desired_cmc).all()


333
334
def test_calibration():
  """Checks the cllr and min_cllr measures of the calibration module."""

  from . import calibration

  # Separable test set
  pos = bob.io.base.load(F('linsep-positives.hdf5'))
  neg = bob.io.base.load(F('linsep-negatives.hdf5'))

  cost = calibration.cllr(neg, pos)
  lowest_cost = calibration.min_cllr(neg, pos)

  assert lowest_cost <= cost
  nose.tools.assert_almost_equal(cost, 1.2097942129)
  # Since the test set is separable, the min_cllr needs to be zero
  nose.tools.assert_almost_equal(lowest_cost, 0.)

  # Non-separable test set
  pos = bob.io.base.load(F('nonsep-positives.hdf5'))
  neg = bob.io.base.load(F('nonsep-negatives.hdf5'))

  cost = calibration.cllr(neg, pos)
  lowest_cost = calibration.min_cllr(neg, pos)

  assert lowest_cost <= cost
  # the obtained value is reported if the comparison fails
  assert abs(cost - 3.61833) < 1e-5, cost
  assert abs(lowest_cost - 0.33736) < 1e-5, lowest_cost


362
def test_open_set_rates():
  """Checks the open-set rates on three score files.

  For each file, the detection and identification rate and the false alarm
  rate at a threshold of 0.5 are checked, together with the recognition rate
  computed both without and with that threshold. All values are compared
  with an absolute tolerance of 1e-8.
  """

  # Imported explicitly, like in the other tests: the module header only
  # imports ``bob.io.base``, so ``bob.measure`` should not be reached through
  # the bare ``bob`` name.
  from bob.measure import load, detection_identification_rate, \
      false_alarm_rate, recognition_rate

  # No error files
  cmc_scores = load.cmc(F("scores-cmc-4col-open-set.txt"))
  assert abs(detection_identification_rate(
      cmc_scores, threshold=0.5) - 1.0) < 1e-8
  assert abs(false_alarm_rate(cmc_scores, threshold=0.5)) < 1e-8

  assert abs(recognition_rate(cmc_scores) - 7. / 9.) < 1e-8
  assert abs(recognition_rate(
      cmc_scores, threshold=0.5) - 1.0) < 1e-8

  # One error
  cmc_scores = load.cmc(
      F("scores-cmc-4col-open-set-one-error.txt"))
  assert abs(detection_identification_rate(
      cmc_scores, threshold=0.5) - 6. / 7.) < 1e-8
  assert abs(false_alarm_rate(cmc_scores, threshold=0.5)) < 1e-8

  assert abs(recognition_rate(cmc_scores) - 6. / 9.) < 1e-8
  assert abs(recognition_rate(
      cmc_scores, threshold=0.5) - 6. / 7.) < 1e-8

  # Two errors (this file is read with the four-column loader)
  cmc_scores = load.cmc_four_column(
      F("scores-cmc-4col-open-set-two-errors.txt"))
  assert abs(detection_identification_rate(
      cmc_scores, threshold=0.5) - 6. / 7.) < 1e-8
  assert abs(false_alarm_rate(
      cmc_scores, threshold=0.5) - 0.5) < 1e-8

  assert abs(recognition_rate(cmc_scores) - 6. / 9.) < 1e-8
  assert abs(recognition_rate(
      cmc_scores, threshold=0.5) - 6. / 8.) < 1e-8