#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Andre Anjos <andre.anjos@idiap.ch>
# Wed 11 Dec 15:14:08 2013 CET
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland

"""Basic tests for the error measuring system of bob
"""

import os
import numpy
import nose.tools
import bob.io.base
import math


def F(f):
  """Returns the path of the test file ``f`` on the "data" subdirectory.

  The path is resolved next to this module through
  ``pkg_resources.resource_filename``. ``pkg_resources`` is a deprecated
  setuptools API and may be missing from the environment; in that case the
  path is built from this module's own location instead.
  """
  resource = os.path.join('data', f)
  try:
    # imported lazily so that merely loading this module stays cheap
    import pkg_resources
  except ImportError:
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(here, resource)
  return pkg_resources.resource_filename(__name__, resource)


def save(fname, data):
  """Saves a single array into a file in the 'data' directory.

  Helper used to (re)generate reference files; see the commented-out calls
  in ``test_plots``. The target path ``bob/measure/data`` is relative, so it
  is resolved against the current working directory.
  """
  bob.io.base.save(data, os.path.join('bob/measure/data', fname))


def test_basic_ratios():
  """Checks farfrr(), precision_recall() and f_score() on separable scores."""

  from . import farfrr, precision_recall, f_score

  # We test the basic functionality on FAR and FRR calculation. The first
  # example is separable, with a separation threshold of about 3.0

  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  minimum = min(positives.min(), negatives.min())
  maximum = max(positives.max(), negatives.max())

  # If we take a threshold on the minimum, the FAR should be 1.0 and the FRR
  # should be 0.0. Precision should be 0.5, recall should be 1.0
  far, frr = farfrr(negatives, positives, minimum - 0.1)
  nose.tools.eq_(far, 1.0)
  nose.tools.eq_(frr, 0.0)
  prec, recall = precision_recall(negatives, positives, minimum - 0.1)
  nose.tools.eq_(prec, 0.5)
  nose.tools.eq_(recall, 1.0)

  # Similarly, if we take a threshold on the maximum, the FRR should be 1.0
  # while the FAR should be 0.0. Both precision and recall should be 0.0.
  far, frr = farfrr(negatives, positives, maximum + 0.1)
  nose.tools.eq_(far, 0.0)
  nose.tools.eq_(frr, 1.0)
  prec, recall = precision_recall(negatives, positives, maximum + 0.1)
  nose.tools.eq_(prec, 0.0)
  nose.tools.eq_(recall, 0.0)

  # If we choose the appropriate threshold, we should get 0.0 for both FAR
  # and FRR. Precision will be 1.0, recall will be 1.0
  far, frr = farfrr(negatives, positives, 3.0)
  nose.tools.eq_(far, 0.0)
  nose.tools.eq_(frr, 0.0)
  prec, recall = precision_recall(negatives, positives, 3.0)
  nose.tools.eq_(prec, 1.0)
  nose.tools.eq_(recall, 1.0)

  # Testing the values of F-score depending on different choices of the
  # threshold
  f_score_ = f_score(negatives, positives, minimum - 0.1)
  nose.tools.assert_almost_equal(f_score_, 0.66666667)
  f_score_ = f_score(negatives, positives, minimum - 0.1, 2)
  nose.tools.assert_almost_equal(f_score_, 0.83333333)

  f_score_ = f_score(negatives, positives, maximum + 0.1)
  nose.tools.eq_(f_score_, 0.0)
  f_score_ = f_score(negatives, positives, maximum + 0.1, 2)
  nose.tools.eq_(f_score_, 0.0)

  f_score_ = f_score(negatives, positives, 3.0)
  nose.tools.eq_(f_score_, 1.0)
  f_score_ = f_score(negatives, positives, 3.0, 2)
  nose.tools.eq_(f_score_, 1.0)


def test_nan_for_uncomputable_thresholds():
  """Checks that far_threshold() and frr_threshold() return NaN when no
  threshold can be computed for the requested rate."""
  # in some cases, we cannot compute an FAR or FRR threshold, e.g., when we
  # have too little data or too many equal scores; in these cases, the
  # methods should return NaN
  from . import far_threshold, frr_threshold

  # case 1: several scores are identical
  positives = [0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
  negatives = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0]

  # test that reasonable thresholds for reachable data points are provided
  assert far_threshold(negatives, positives, 0.5) == 0.9
  assert numpy.isclose(frr_threshold(negatives, positives, 0.5), 0.1)

  assert math.isnan(far_threshold(negatives, positives, 0.4))
  assert math.isnan(frr_threshold(negatives, positives, 0.4))

  # test the same with even number of scores
  positives = [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
  negatives = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0]

  assert far_threshold(negatives, positives, 0.5) == 0.9
  assert numpy.isclose(frr_threshold(negatives, positives, 0.51), 0.1)
  assert math.isnan(far_threshold(negatives, positives, 0.49))
  assert math.isnan(frr_threshold(negatives, positives, 0.5))

  # case 2: too few scores for the desired threshold
  positives = numpy.arange(10.)
  negatives = numpy.arange(10.)

  assert math.isnan(far_threshold(negatives, positives, 0.09))
  assert math.isnan(frr_threshold(negatives, positives, 0.09))
  # there is no limit above; the threshold will just be the largest possible
  # value
  assert far_threshold(negatives, positives, 0.11) == 8.
  assert far_threshold(negatives, positives, 0.91) == 0.
  assert numpy.isclose(frr_threshold(negatives, positives, 0.11), 1.)
  assert numpy.isclose(frr_threshold(negatives, positives, 0.91), 9.)


def test_indexing():
  """Checks the boolean masks returned by correctly_classified_positives()
  and correctly_classified_negatives() on separable scores."""

  from . import correctly_classified_positives, correctly_classified_negatives

  # This test verifies that the output of correctly_classified_positives() and
  # correctly_classified_negatives() makes sense.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  minimum = min(positives.min(), negatives.min())
  maximum = max(positives.max(), negatives.max())

  # If the threshold is a bit below the minimum, we should have all positive
  # samples correctly classified and none of the negative samples correctly
  # classified.
  assert correctly_classified_positives(positives, minimum - 0.1).all()
  assert not correctly_classified_negatives(negatives, minimum - 0.1).any()

  # The inverse is true if the threshold is a bit above the maximum.
  assert not correctly_classified_positives(positives, maximum + 0.1).any()
  assert correctly_classified_negatives(negatives, maximum + 0.1).all()

  # If the threshold separates the sets, then all should be correctly
  # classified.
  assert correctly_classified_positives(positives, 3).all()
  assert correctly_classified_negatives(negatives, 3).all()


def test_thresholding():
  """Checks eer_threshold(), far_threshold(), frr_threshold() and
  min_hter_threshold() on a non-separable and on a separable score set."""

  from . import eer_threshold, far_threshold, frr_threshold, farfrr, \
      correctly_classified_positives, correctly_classified_negatives, \
      min_hter_threshold

  def count(array, value=True):
    """Counts occurrences of a certain value in an array"""
    return int(numpy.count_nonzero(numpy.asarray(array) == value))

  # This example will demonstrate and check the use of eer_threshold() to
  # calculate the threshold that minimizes the EER.

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))
  threshold = eer_threshold(negatives, positives)

  sorted_positives = numpy.sort(positives)
  sorted_negatives = numpy.sort(negatives)

  # Of course we have to make sure that the threshold sets the EER correctly:
  ccp = count(correctly_classified_positives(positives, threshold))
  ccn = count(correctly_classified_negatives(negatives, threshold))
  # NOTE(review): this check is one-sided (it passes for any ccp < ccn);
  # abs(ccp - ccn) <= 1 may be what was intended -- confirm against the data
  assert (ccp - ccn) <= 1

  for t in (0, 0.001, 0.1, 0.5, 0.9, 0.999, 1):
    # Lets also test the far_threshold and the frr_threshold functions
    threshold_far = far_threshold(sorted_negatives, [], t, is_sorted=True)
    threshold_frr = frr_threshold([], sorted_positives, t, is_sorted=True)
    # Check that the FAR and FRR obtained at these thresholds are not below
    # the requested value (up to rounding) and at most 0.1 above it
    far = farfrr(negatives, positives, threshold_far)[0]
    frr = farfrr(negatives, positives, threshold_frr)[1]
    if not math.isnan(threshold_far):
      assert far + 1e-7 > t, (far, t)
      assert far - t <= 0.1
    if not math.isnan(threshold_frr):
      assert frr + 1e-7 > t, (frr, t)
      # test that the values are at least somewhere in the range
      assert frr - t <= 0.1

  # If the set is separable, the calculation of the threshold is a little bit
  # trickier, as you have no points in the middle of the range to compare
  # things to. This is where the currently used recursive algorithm seems to
  # do better. Let's verify
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))
  threshold = eer_threshold(negatives, positives)
  # the result here is 3.2 (which is what is expected ;-)
  assert threshold == 3.2

  # Of course we have to make sure that the threshold sets the EER correctly:
  ccp = count(correctly_classified_positives(positives, threshold))
  ccn = count(correctly_classified_negatives(negatives, threshold))
  nose.tools.eq_(ccp, ccn)

  # The second option for the calculation of the threshold is to use the
  # minimum HTER.
  threshold2 = min_hter_threshold(negatives, positives)
  assert threshold2 == 3.2
  nose.tools.eq_(threshold, threshold2)  # in this particular case

  # Both classes must again be classified equally well at this threshold:
  ccp = count(correctly_classified_positives(positives, threshold2))
  ccn = count(correctly_classified_negatives(negatives, threshold2))
  nose.tools.eq_(ccp, ccn)


def test_empty_raises():
  """Checks that a RuntimeError is raised when the negatives, the positives
  or both are empty."""
  from bob.measure import farfrr, precision_recall, f_score, eer_threshold, \
      min_hter_threshold, min_weighted_error_rate_threshold

  # functions called with an extra third argument (0 here)
  for func in (
          farfrr, precision_recall,
          f_score, min_weighted_error_rate_threshold):
    nose.tools.assert_raises(RuntimeError, func, [], [1.], 0)
    nose.tools.assert_raises(RuntimeError, func, [1.], [], 0)
    nose.tools.assert_raises(RuntimeError, func, [], [], 0)

  # functions called with the two score lists only
  for func in (eer_threshold, min_hter_threshold):
    nose.tools.assert_raises(RuntimeError, func, [], [1.])
    nose.tools.assert_raises(RuntimeError, func, [1.], [])
    nose.tools.assert_raises(RuntimeError, func, [], [])


def test_plots():
  """Checks roc(), roc_for_far(), precision_recall_curve(), det() and epc()
  against stored reference values for the non-separable score set."""

  from . import roc, roc_for_far, precision_recall_curve, det, epc

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))

  # This example will test the ROC plot calculation functionality.
  xy = roc(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-roc.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-roc.hdf5'))
  assert numpy.array_equal(xy, xyref)

  # This example will test the ROC for FAR plot calculation functionality.
  far = [0.01, 0.1, 1]
  ref = [0.42, 0.12, 0]
  xy = roc_for_far(negatives, positives, far)

  assert numpy.array_equal(xy[0], far)
  assert numpy.array_equal(xy[1], ref)

  # This example will test the Precision-Recall plot calculation functionality.
  xy = precision_recall_curve(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-precisionrecall.hdf5', xy)
  xyref = bob.io.base.load(F('nonsep-precisionrecall.hdf5'))
  assert numpy.array_equal(xy, xyref)

  # This example will test the DET plot calculation functionality.
  det_xyzw = det(negatives, positives, 100)
  # uncomment the next line to save a reference value
  # save('nonsep-det.hdf5', det_xyzw)
  det_xyzw_ref = bob.io.base.load(F('nonsep-det.hdf5'))
  assert numpy.allclose(det_xyzw, det_xyzw_ref, atol=1e-15)

  # This example will test the EPC plot calculation functionality. For the
  # EPC curve, you need to have a development and a test set. We will split,
  # by the middle, the negatives and positives sample we have, just for the
  # sake of testing
  dev_negatives = negatives[:(negatives.shape[0] // 2)]
  test_negatives = negatives[(negatives.shape[0] // 2):]
  dev_positives = positives[:(positives.shape[0] // 2)]
  test_positives = positives[(positives.shape[0] // 2):]
  xy = epc(dev_negatives, dev_positives,
           test_negatives, test_positives, 100)
  xyref = bob.io.base.load(F('nonsep-epc.hdf5'))
  # the default call is compared against the first two rows of the reference
  assert numpy.allclose(xy, xyref[:2], atol=1e-15)
  xy = epc(dev_negatives, dev_positives,
           test_negatives, test_positives, 100, False, True)
  # uncomment the next line to save a reference value
  # save('nonsep-epc.hdf5', xy)
  assert numpy.allclose(xy, xyref, atol=1e-15)


def test_rocch():
  """Checks rocch(), rocch2eer() and eer_rocch() against reference values."""

  from . import rocch, rocch2eer, eer_rocch

  # This example will demonstrate and check the use of rocch(), rocch2eer()
  # and eer_rocch() to calculate the EER on the ROC Convex Hull

  # This test set is separable.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))
  # References obtained using Bosaris 1.06
  pmiss_pfa_ref = numpy.array([[1., 0., 0.], [0., 0., 1.]])
  eer_ref = 0.
  # Computes
  pmiss_pfa = rocch(negatives, positives)
  assert numpy.allclose(pmiss_pfa, pmiss_pfa_ref, atol=1e-15)
  eer = rocch2eer(pmiss_pfa)
  assert abs(eer - eer_ref) < 1e-4
  eer = eer_rocch(negatives, positives)
  assert abs(eer - eer_ref) < 1e-4

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))
  # References obtained using Bosaris 1.06
  pmiss_pfa_ref = numpy.array([[1., 0.68, 0.28, 0.1, 0.06, 0., 0.], [
                              0, 0, 0.08, 0.12, 0.22, 0.48, 1.]])
  eer_ref = 0.116363636363636
  # Computes
  pmiss_pfa = rocch(negatives, positives)
  assert numpy.allclose(pmiss_pfa, pmiss_pfa_ref, atol=1e-15)
  eer = rocch2eer(pmiss_pfa)
  assert abs(eer - eer_ref) < 1e-4
  eer = eer_rocch(negatives, positives)
  assert abs(eer - eer_ref) < 1e-4


def test_cmc():
  """Checks recognition_rate() and cmc() on inline data and on score files
  read through the loaders of the ``load`` module."""

  from . import recognition_rate, cmc, load

  # tests the CMC calculation
  # test data: one (negatives, positives) pair per entry; it should give a
  # recognition rate of 1/2 and the CMC [1/2, 3/4, 1, 1, 1]
  test_data = [((0.3, 1.1, 0.5), (0.7,)), ((1.4, -1.3, 0.6), (0.2,)),
               ((0.8, 0., 1.5), (-0.8, 1.8)), ((2., 1.3, 1.6, 0.9), (2.4,))]
  # compute recognition rate
  rr = recognition_rate(test_data)
  nose.tools.eq_(rr, 0.5)
  # compute CMC
  cmc_ = cmc(test_data)
  assert (cmc_ == [0.5, 0.75, 1., 1., 1]).all()

  # load test data
  desired_rr = 0.76
  desired_cmc = [0.76, 0.89, 0.96, 0.98, 1., 1., 1.,
                 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]
  data = load.cmc_four_column(F('scores-cmc-4col.txt'))
  rr = recognition_rate(data)
  nose.tools.eq_(rr, desired_rr)
  cmc_ = cmc(data)
  assert (cmc_ == desired_cmc).all()

  data = load.cmc_five_column(F('scores-cmc-5col.txt'))
  rr = recognition_rate(data)
  nose.tools.eq_(rr, desired_rr)
  cmc_ = cmc(data)
  assert (cmc_ == desired_cmc).all()

  # the generic loader must give the same result on the five-column file
  data = load.cmc(F('scores-cmc-5col.txt'))
  rr = recognition_rate(data)
  nose.tools.eq_(rr, desired_rr)
  cmc_ = cmc(data)
  assert (cmc_ == desired_cmc).all()


def test_calibration():
  """Checks calibration.cllr() and calibration.min_cllr() on the separable
  and on the non-separable score set."""

  from . import calibration

  # Tests the cllr and min_cllr measures
  # This test set is separable.
  positives = bob.io.base.load(F('linsep-positives.hdf5'))
  negatives = bob.io.base.load(F('linsep-negatives.hdf5'))

  cllr = calibration.cllr(negatives, positives)
  min_cllr = calibration.min_cllr(negatives, positives)

  assert min_cllr <= cllr
  nose.tools.assert_almost_equal(cllr, 1.2097942129)
  # Since the test set is separable, the min_cllr needs to be zero
  nose.tools.assert_almost_equal(min_cllr, 0.)

  # This test set is not separable.
  positives = bob.io.base.load(F('nonsep-positives.hdf5'))
  negatives = bob.io.base.load(F('nonsep-negatives.hdf5'))

  cllr = calibration.cllr(negatives, positives)
  min_cllr = calibration.min_cllr(negatives, positives)

  assert min_cllr <= cllr
  assert abs(cllr - 3.61833) < 1e-5, cllr
  assert abs(min_cllr - 0.33736) < 1e-5, min_cllr


def test_open_set_rates():
  """Checks detection_identification_rate(), false_alarm_rate() and
  recognition_rate() on open-set score files with zero, one and two errors."""

  # imported explicitly, like in the other tests, instead of relying on
  # ``bob.measure`` being reachable through the module-level ``bob`` name
  from . import detection_identification_rate, false_alarm_rate, \
      recognition_rate, load

  # No error files
  cmc_scores = load.cmc(F("scores-cmc-4col-open-set.txt"))
  assert abs(detection_identification_rate(
      cmc_scores, threshold=0.5) - 1.0) < 1e-8
  assert abs(false_alarm_rate(cmc_scores, threshold=0.5)) < 1e-8

  assert abs(recognition_rate(cmc_scores) - 7. / 9.) < 1e-8
  assert abs(recognition_rate(
      cmc_scores, threshold=0.5) - 1.0) < 1e-8

  # One error
  cmc_scores = load.cmc(
      F("scores-cmc-4col-open-set-one-error.txt"))
  assert abs(detection_identification_rate(
      cmc_scores, threshold=0.5) - 6. / 7.) < 1e-8
  assert abs(false_alarm_rate(cmc_scores, threshold=0.5)) < 1e-8

  assert abs(recognition_rate(cmc_scores) - 6. / 9.) < 1e-8
  assert abs(recognition_rate(
      cmc_scores, threshold=0.5) - 6. / 7.) < 1e-8

  # Two errors
  cmc_scores = load.cmc_four_column(
      F("scores-cmc-4col-open-set-two-errors.txt"))
  assert abs(detection_identification_rate(
      cmc_scores, threshold=0.5) - 6. / 7.) < 1e-8
  assert abs(false_alarm_rate(
      cmc_scores, threshold=0.5) - 0.5) < 1e-8

  assert abs(recognition_rate(cmc_scores) - 6. / 9.) < 1e-8
  assert abs(recognition_rate(
      cmc_scores, threshold=0.5) - 6. / 8.) < 1e-8