diff --git a/bob/measure/credible_region.py b/bob/measure/credible_region.py
index 0596e01b0ed45261b3e5903fb70ab0da40ca51cc..4eaeb844475405d0f36a79822e54e093a3508c8a 100644
--- a/bob/measure/credible_region.py
+++ b/bob/measure/credible_region.py
@@ -18,11 +18,9 @@ methods.
 
 .. note::
 
-   For a disambiguation with `Confidence Interval <confidence interval_>`_ (the
+   For a disambiguation with `Confidence Interval <confidence-interval_>`_ (the
    frequentist approach), read `Credible Regions or Intervals
-   <credible interval_>`_.
-
-.. include:: ../links.rst
+   <credible-interval_>`_.
 """
 
 import numbers
@@ -173,7 +171,85 @@ def beta(k, l, lambda_, coverage):
     else:
         return E, mode, lower, upper
+
+
+def randomgamma(nbsamples, shape, scale):
+    """Draws ``nbsamples`` samples from a Gamma(``shape``, ``scale``) distribution"""
+
+    return numpy.random.gamma(shape, scale, size=nbsamples)
+
+
+def f1score(tp, fp, tn, fn, lambda_, coverage):
+    """Returns the mean, mode, lower and upper bounds of the credible region
+    of the F1 score.
+
+    This implementation is based on [GOUTTE-2005]_: the posterior of the F1
+    score is approximated by Monte-Carlo sampling, using the fact that it is
+    distributed as :math:`U/(U+V)`, with :math:`U \\sim
+    \\Gamma(tp+\\lambda, 2)` and :math:`V \\sim \\Gamma(fp+fn+2\\lambda, 1)`.
+
+
+    Parameters
+    ----------
+
+    tp : int
+        True positive count, AKA "hit"
+
+    fp : int
+        False positive count, AKA "false alarm", or "Type I error"
+
+    tn : int
+        True negative count, AKA "correct rejection"
+
+    fn : int
+        False Negative count, AKA "miss", or "Type II error"
+
+    lambda_ : float
+        The parameterisation of the Beta prior to consider.  Use
+        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
+        Jeffrey's prior.
+
+    coverage : float
+        A floating-point number between 0 and 1.0 indicating the coverage
+        you're expecting.  A value of 0.95 will ensure 95% of the area under
+        the probability density of the posterior is covered by the returned
+        equal-tailed interval.
+
+
+    Returns
+    -------
+
+    f1_score : (float, float, float, float)
+        Mean, mode, lower and upper bounds of the credible region of the
+        `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
+        arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  The F1 or
+        Dice score depends on a TP-only numerator, similarly to the Jaccard
+        index.  For regions where there are no annotations, the F1-score will
+        always be zero, irrespective of the model output.  Accuracy may be a
+        better proxy if one needs to consider the true absence of annotations
+        in a region as part of the measure.
+ + """ + nbsample = 100000 + U = randomgamma(nbsample, shape=tp+lambda_, scale=2) + V = randomgamma(nbsample, fp+fn+2*lambda_, scale=1) + F1scores = U/(U+V) + F1scores = numpy.around(F1scores, 3) + coverageLeftHalf = (1-coverage)/2 + sortedScores = numpy.sort(F1scores) + lowerIndex = int(round(nbsample * coverageLeftHalf)) + upperIndex = int(round(nbsample * (1 - coverageLeftHalf))) + lower = sortedScores[lowerIndex - 1] + upper = sortedScores[upperIndex - 1] + return numpy.mean(F1scores), scipy.stats.mode(F1scores)[0][0], lower, upper + + def measures(tp, fp, tn, fn, lambda_, coverage): """Calculates mean and mode from true/false positive and negative counts with credible regions @@ -268,5 +335,69 @@ def measures(tp, fp, tn, fn, lambda_, coverage): beta(tn, fp, lambda_, coverage), #specificity beta(tp+tn, fp+fn, lambda_, coverage), #accuracy beta(tp, fp+fn, lambda_, coverage), #jaccard index - beta(2*tp, fp+fn, lambda_, coverage), #f1-score + f1score(tp, fp, tn, fn, lambda_, coverage), #f1-score ) + +def compare(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2, lambda_): + """Compare the credible regions of 2 systems + + + Parameters + ---------- + + tp1/2 : int + True positive count for 1st/2nd system, AKA "hit" + + fp1/2 : int + False positive count for 1st/2nd system, AKA "false alarm", or "Type I error" + + tn1/2 : int + True negative count for 1st/2nd system, AKA "correct rejection" + + fn1/2 : int + False Negative count for 1st/2nd system, AKA "miss", or "Type II error" + + lambda_ : float + The parameterisation of the Beta prior to consider. Use + :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for + Jeffrey's prior. + + + Returns + ------- + + result : string + A string describing the statistical comparison between the two systems for + the different performance measures + + """ + + coverage = 0.95 + system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage) + system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage) + measure = ['precision', 'recall', 'specificity', 'accuracy', 'Jaccard index', 'F1 score'] + result = "" + for i in range(len(measure)): + result += "For the %s we can say that : \n " % (measure[i]) + if system1[i][2] > system2[i][3]: + # lower bound from system 1 is greater than the upper bound from system 2 + result += "System 1 is better than system 2 with convincing evidence \n" + elif system2[i][2] > system1[i][3]: + # lower bound from system 2 is greater than the upper bound from system 1 + result += "System 2 is better than system 1 with convincing evidence \n" + else : + # the confidence intervals overlap so we compute the 85% confidence intervals to compare them + # (cf. https://mmeredith.net/blog/2013/1303_Comparison_of_confidence_intervals.htm and + # https://statisticsbyjim.com/hypothesis-testing/confidence-intervals-compare-means) + coverage = 0.85 + system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage) + system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage) + if system1[i][2] > system2[i][3]: + # lower bound from system 1 is greater than the upper bound from system 2 + result += "System 1 is better than system 2 with \"significance\" at the 5% level. \n" + elif system2[i][2] > system1[i][3]: + # lower bound from system 2 is greater than the upper bound from system 1 + result += "System 2 is better than system 1 with \"significance\" at the 5% level. \n" + else : + result += "There is no statistical difference between the 2 CIs \n" + return result \ No newline at end of file