#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 11 Dec 15:14:08 2013 CET
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland

"""Basic tests for the error measuring system of bob
"""

import os
import numpy
import nose.tools
import bob.io.base

def F(f):
  """Returns the path of a test file in the "data" subdirectory.

  ``f`` is the file name; it is joined under ``data`` and resolved with
  ``pkg_resources.resource_filename`` relative to the module named by
  ``__name__``.
  """

  # imported at call time, so the module itself loads without pkg_resources
  import pkg_resources
  return pkg_resources.resource_filename(__name__, os.path.join('data', f))

def save(fname, data):
  """Saves a single array into a file in the 'data' directory.

  ``fname`` is the file name inside ``data`` and ``data`` is the array to
  store. The path is relative to the current working directory, so this
  helper is meant to be run from the package directory when regenerating
  reference values (see the commented-out ``save(...)`` calls in the tests).
  """

  # bob.io.base provides a module-level save(array, filename); the former
  # ``bob.io.base.Array(data).save(...)`` spelling relies on a class that
  # bob.io.base does not appear to expose -- NOTE(review): confirm.
  bob.io.base.save(data, os.path.join('data', fname))


def test_basic_ratios():

  # Checks farfrr(), precision_recall() and f_score() at three thresholds
  # (below the minimum score, above the maximum score, and at 3.0).

  from . import farfrr, precision_recall, f_score

  # We test the basic functionality of the FAR and FRR calculation. The first
  # example is separable, with a separation threshold of about 3.0

  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  minimum = min(positives.min(), negatives.min())
  maximum = max(positives.max(), negatives.max())

  # If we take a threshold on the minimum, the FAR should be 1.0 and the FRR
  # should be 0.0. Precision should be 0.5, recall should be 1.0
  far, frr = farfrr(negatives, positives, minimum-0.1)
  nose.tools.eq_(far, 1.0)
  nose.tools.eq_(frr, 0.0)
  prec, recall = precision_recall(negatives, positives, minimum-0.1)
  nose.tools.eq_(prec, 0.5)
  nose.tools.eq_(recall, 1.0)

  # Similarly, if we take a threshold on the maximum, the FRR should be 1.0
  # while the FAR should be 0.0. Both precision and recall should be 0.0.
  far, frr = farfrr(negatives, positives, maximum+0.1)
  nose.tools.eq_(far, 0.0)
  nose.tools.eq_(frr, 1.0)
  prec, recall = precision_recall(negatives, positives, maximum+0.1)
  nose.tools.eq_(prec, 0.0)
  nose.tools.eq_(recall, 0.0)

  # If we choose the appropriate threshold, we should get 0.0 for both FAR
  # and FRR. Precision will be 1.0, recall will be 1.0
  far, frr = farfrr(negatives, positives, 3.0)
  nose.tools.eq_(far, 0.0)
  nose.tools.eq_(frr, 0.0)
  prec, recall = precision_recall(negatives, positives, 3.0)
  nose.tools.eq_(prec, 1.0)
  nose.tools.eq_(recall, 1.0)

  # Testing the values of F-score depending on different choices of the
  # threshold; the optional fourth argument (2) is passed through to
  # f_score() as-is.
  f_score_ = f_score(negatives, positives, minimum-0.1)
  nose.tools.assert_almost_equal(f_score_, 0.66666667)
  f_score_ = f_score(negatives, positives, minimum-0.1, 2)
  nose.tools.assert_almost_equal(f_score_, 0.83333333)

  f_score_ = f_score(negatives, positives, maximum+0.1)
  nose.tools.eq_(f_score_, 0.0)
  f_score_ = f_score(negatives, positives, maximum+0.1, 2)
  nose.tools.eq_(f_score_, 0.0)

  f_score_ = f_score(negatives, positives, 3.0)
  nose.tools.eq_(f_score_, 1.0)
  f_score_ = f_score(negatives, positives, 3.0, 2)
  nose.tools.eq_(f_score_, 1.0)

def test_indexing():

  from . import correctly_classified_positives, correctly_classified_negatives

  # This test verifies that the output of correctly_classified_positives() and
  # correctly_classified_negatives() makes sense.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  minimum = min(positives.min(), negatives.min())
  maximum = max(positives.max(), negatives.max())

  # If the threshold is a bit below the minimum, we should have all positive
  # samples correctly classified and none of the negative samples correctly
  # classified.
  assert correctly_classified_positives(positives, minimum-0.1).all()
  assert not correctly_classified_negatives(negatives, minimum-0.1).any()

  # The inverse is true if the threshold is a bit above the maximum.
  assert not correctly_classified_positives(positives, maximum+0.1).any()
  assert correctly_classified_negatives(negatives, maximum+0.1).all()

  # If the threshold separates the sets, then all should be correctly
  # classified.
  assert correctly_classified_positives(positives, 3).all()
  assert correctly_classified_negatives(negatives, 3).all()


def test_thresholding():

  from . import (eer_threshold, far_threshold, frr_threshold, farfrr,
      correctly_classified_positives, correctly_classified_negatives,
      min_hter_threshold)

  def count(array, value=True):
    """Counts occurrences of a certain value in an array"""
    return list(array == value).count(True)

  # This example will demonstrate and check the use of eer_threshold() to
  # calculate the threshold that minimizes the EER.

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))
  threshold = eer_threshold(negatives, positives)

  sorted_positives = numpy.sort(positives)
  sorted_negatives = numpy.sort(negatives)

  # Of course we have to make sure that this sets the EER correctly:
  ccp = count(correctly_classified_positives(positives,threshold))
  ccn = count(correctly_classified_negatives(negatives,threshold))
  # NOTE(review): this check is one-sided -- it also passes when ccp is much
  # smaller than ccn. Consider abs(ccp - ccn) <= 1 once confirmed on the data.
  assert (ccp - ccn) <= 1

  for t in (0, 0.001, 0.1, 0.5, 0.9, 0.999, 1):
    # Let's also test the far_threshold and the frr_threshold functions; the
    # unused score set is passed as an empty list and the other one is given
    # pre-sorted (is_sorted=True)
    threshold_far = far_threshold(sorted_negatives, [], t, is_sorted=True)
    threshold_frr = frr_threshold([], sorted_positives, t, is_sorted=True)
    # Check that the obtained FAR and FRR values are not smaller than the
    # requested ones (up to a tolerance of 1e-7)
    far = farfrr(negatives, positives, threshold_far)[0]
    frr = farfrr(negatives, positives, threshold_frr)[1]
    assert far + 1e-7 > t
    assert frr + 1e-7 > t
    # test that the values are at least somewhere in the range, i.e., they
    # exceed the requested value by at most 0.15
    assert far-t <= 0.15
    assert frr-t <= 0.15


  # If the set is separable, the calculation of the threshold is a little bit
  # trickier, as you have no points in the middle of the range to compare
  # things to. This is where the currently used recursive algorithm seems to
  # do better. Let's verify
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))
  threshold = eer_threshold(negatives, positives)
  # the result here is 3.2 (which is what is expected ;-)
  assert threshold == 3.2

  # Of course we have to make sure that this sets the EER correctly:
  ccp = count(correctly_classified_positives(positives,threshold))
  ccn = count(correctly_classified_negatives(negatives,threshold))
  nose.tools.eq_(ccp, ccn)

  # The second option for the calculation of the threshold is to use the
  # minimum HTER.
  threshold2 = min_hter_threshold(negatives, positives)
  assert threshold2 == 3.2
  nose.tools.eq_(threshold, threshold2) #in this particular case

  # Again, both sets must be classified equally well at this threshold:
  ccp = count(correctly_classified_positives(positives,threshold2))
  ccn = count(correctly_classified_negatives(negatives,threshold2))
  nose.tools.eq_(ccp, ccn)


def test_empty_raises():
  # tests that the measures below raise a RuntimeError when the first, the
  # second, or both of the score lists passed to them are empty
  from bob.measure import farfrr, precision_recall, f_score, eer_threshold, min_hter_threshold, min_weighted_error_rate_threshold

  # functions called with two score lists plus a third argument (0 here)
  for func in (farfrr, precision_recall, f_score, min_weighted_error_rate_threshold):
    nose.tools.assert_raises(RuntimeError, func, [], [1.], 0)
    nose.tools.assert_raises(RuntimeError, func, [1.], [], 0)
    nose.tools.assert_raises(RuntimeError, func, [], [], 0)

  # functions called with the two score lists only
  for func in (eer_threshold, min_hter_threshold):
    nose.tools.assert_raises(RuntimeError, func, [], [1.])
    nose.tools.assert_raises(RuntimeError, func, [1.], [])
    nose.tools.assert_raises(RuntimeError, func, [], [])


def test_plots():

  from . import eer_threshold, roc, roc_for_far, precision_recall_curve, det, epc

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))
  # the returned value is not used below; the call only checks that
  # eer_threshold() runs on this data
  threshold = eer_threshold(negatives, positives)

  # This example will test the ROC plot calculation functionality.
  xy = roc(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-roc.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-roc.hdf5'))
  assert numpy.array_equal(xy, xyref)

  # This example will test the ROC for FAR plot calculation functionality.
  far = [0.01, 0.1, 1]
  ref = [0.42, 0.12, 0]
  xy = roc_for_far(negatives, positives, far)

  # the first row must echo the requested FAR values, the second row must
  # match the reference values
  assert numpy.array_equal(xy[0], far)
  assert numpy.array_equal(xy[1], ref)

  # This example will test the Precision-Recall plot calculation functionality.
  xy = precision_recall_curve(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-precisionrecall.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-precisionrecall.hdf5'))
  assert numpy.array_equal(xy, xyref)

  # This example will test the DET plot calculation functionality.
  det_xyzw = det(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-det.hdf5', det_xyzw)
  det_xyzw_ref = bob.io.base.load(F('nonsep-det.hdf5'))
  assert numpy.allclose(det_xyzw, det_xyzw_ref, atol=1e-15)

  # This example will test the EPC plot calculation functionality. For the
  # EPC curve, you need to have a development and a test set. We will split,
  # by the middle, the negatives and positives sample we have, just for the
  # sake of testing
  dev_negatives = negatives[:(negatives.shape[0]//2)]
  test_negatives = negatives[(negatives.shape[0]//2):]
  dev_positives = positives[:(positives.shape[0]//2)]
  test_positives = positives[(positives.shape[0]//2):]
  xy = epc(dev_negatives, dev_positives,
      test_negatives, test_positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-epc.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-epc.hdf5'))
  assert numpy.allclose(xy, xyref, atol=1e-15)
  # NOTE(review): with the two extra positional flags only the first two
  # rows of the result are compared to the reference -- confirm the meaning
  # of the flags against the signature of epc()
  xy = epc(dev_negatives, dev_positives,
      test_negatives, test_positives, 100, False, True)
  assert numpy.allclose(xy[:2], xyref, atol=1e-15)


def test_rocch():

  from . import rocch, rocch2eer, eer_rocch

  # This example will demonstrate and check the use of rocch(), rocch2eer()
  # and eer_rocch() to calculate the EER on the ROC Convex Hull

  # This test set is separable.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))
  # References obtained using Bosaris 1.06
  pmiss_pfa_ref = numpy.array([[1., 0., 0.], [0., 0., 1.]])
  eer_ref = 0.
  # Computes the convex hull, then the EER both from the hull and directly
  # from the scores
  pmiss_pfa = rocch(negatives, positives)
  assert numpy.allclose(pmiss_pfa, pmiss_pfa_ref, atol=1e-15)
  eer = rocch2eer(pmiss_pfa)
  assert abs(eer-eer_ref) < 1e-4
  eer = eer_rocch(negatives, positives)
  assert abs(eer-eer_ref) < 1e-4

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))
  # References obtained using Bosaris 1.06
  pmiss_pfa_ref = numpy.array([[1., 0.68, 0.28, 0.1, 0.06, 0., 0.], [0, 0, 0.08, 0.12, 0.22, 0.48, 1.]])
  eer_ref = 0.116363636363636
  # Computes the same three quantities for the non-separable set
  pmiss_pfa = rocch(negatives, positives)
  assert numpy.allclose(pmiss_pfa, pmiss_pfa_ref, atol=1e-15)
  eer = rocch2eer(pmiss_pfa)
  assert abs(eer-eer_ref) < 1e-4
  eer = eer_rocch(negatives, positives)
  assert abs(eer-eer_ref) < 1e-4

def test_cmc():

  from . import recognition_rate, cmc, load

  # tests the CMC calculation
  # test data: one (negatives, positives) pair of score tuples per probe;
  # should give a recognition rate of 0.5 and CMC [0.5, 0.75, 1, 1, 1]
  test_data = [((0.3, 1.1, 0.5), (0.7,)), ((1.4, -1.3, 0.6), (0.2,)), ((0.8, 0., 1.5), (-0.8, 1.8)), ((2., 1.3, 1.6, 0.9), (2.4,))]
  # compute recognition rate
  rr = recognition_rate(test_data)
  nose.tools.eq_(rr, 0.5)
  # compute CMC
  cmc_ = cmc(test_data)
  assert (cmc_ == [0.5, 0.75, 1., 1., 1]).all()

  # load test data; all three loaders below must lead to the same results
  desired_rr = 0.76
  desired_cmc = [0.76, 0.89, 0.96, 0.98, 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]
  data = load.cmc_four_column(F('scores-cmc-4col.txt'))
  rr = recognition_rate(data)
  nose.tools.eq_(rr, desired_rr)
  cmc_ = cmc(data)
  assert (cmc_ == desired_cmc).all()

  data = load.cmc_five_column(F('scores-cmc-5col.txt'))
  rr = recognition_rate(data)
  nose.tools.eq_(rr, desired_rr)
  cmc_ = cmc(data)
  assert (cmc_ == desired_cmc).all()

  # the generic loader, applied to the five-column file
  data = load.cmc(F('scores-cmc-5col.txt'))
  rr = recognition_rate(data)
  nose.tools.eq_(rr, desired_rr)
  cmc_ = cmc(data)
  assert (cmc_ == desired_cmc).all()



def test_calibration():

  from . import calibration

  # Tests the cllr and min_cllr measures
  # This test set is separable.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  cllr = calibration.cllr(negatives, positives)
  min_cllr = calibration.min_cllr(negatives, positives)

  assert min_cllr <= cllr
  nose.tools.assert_almost_equal(cllr, 1.2097942129)
  # Since the test set is separable, the min_cllr needs to be zero
  nose.tools.assert_almost_equal(min_cllr, 0.)

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))

  cllr = calibration.cllr(negatives, positives)
  min_cllr = calibration.min_cllr(negatives, positives)

  assert min_cllr <= cllr
  # the computed value is used as assertion message to ease debugging
  assert abs(cllr - 3.61833) < 1e-5, cllr
  assert abs(min_cllr - 0.33736) < 1e-5, min_cllr


def test_open_set_rates():

  # Checks detection_identification_rate(), false_alarm_rate() and
  # recognition_rate() on three open-set score files, using a threshold of
  # 0.5 wherever a threshold is given.

  # the module only imports bob.io.base at the top; import bob.measure here
  # so that this test does not depend on it being loaded elsewhere
  import bob.measure

  # Score file without errors
  cmc_scores = bob.measure.load.cmc(F("scores-cmc-4col-open-set.txt"))
  assert abs(bob.measure.detection_identification_rate(cmc_scores, threshold=0.5) - 1.0) < 1e-8
  assert abs(bob.measure.false_alarm_rate(cmc_scores, threshold=0.5)) < 1e-8

  assert abs(bob.measure.recognition_rate(cmc_scores) - 7./9.) < 1e-8
  assert abs(bob.measure.recognition_rate(cmc_scores, threshold=0.5) - 1.0) < 1e-8

  # One error
  cmc_scores = bob.measure.load.cmc(F("scores-cmc-4col-open-set-one-error.txt"))
  assert abs(bob.measure.detection_identification_rate(cmc_scores, threshold=0.5) - 6./7.) < 1e-8
  assert abs(bob.measure.false_alarm_rate(cmc_scores, threshold=0.5)) < 1e-8

  assert abs(bob.measure.recognition_rate(cmc_scores) - 6./9.) < 1e-8
  assert abs(bob.measure.recognition_rate(cmc_scores, threshold=0.5) - 6./7.) < 1e-8

  # Two errors
  cmc_scores = bob.measure.load.cmc_four_column(F("scores-cmc-4col-open-set-two-errors.txt"))
  assert abs(bob.measure.detection_identification_rate(cmc_scores, threshold=0.5) - 6./7.) < 1e-8
  assert abs(bob.measure.false_alarm_rate(cmc_scores, threshold=0.5) - 0.5) < 1e-8

  assert abs(bob.measure.recognition_rate(cmc_scores) - 6./9.) < 1e-8
  assert abs(bob.measure.recognition_rate(cmc_scores, threshold=0.5) - 6./8.) < 1e-8