[credible region] Added implementation to return the credible region for the F1 score and compare 2 systems
Merge request reports
Activity
    Returns
    -------

    f1_score : (float, float, float, float)
        F1, mean, mode and credible intervals (95% CI). See `F1-score
        <https://en.wikipedia.org/wiki/F1_score>`_. It corresponds
        arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``. The F1 or
        Dice score depends on a TP-only numerator, similarly to the Jaccard
        index. For regions where there are no annotations, the F1-score will
        always be zero, irrespective of the model output. Accuracy may be a
        better proxy if one needs to consider the true absence of annotations
        in a region as part of the measure.

    """

    nbsample = 100000
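Since the F1 score is a ratio of two correlated beta-distributed quantities, its credible region has no simple closed form; the ``nbsample = 100000`` hint in the diff suggests a Monte Carlo estimate. The sketch below illustrates one plausible approach (the function name, seed, and sampling strategy are assumptions for illustration, not the actual bob.measure implementation): draw posterior samples for precision and recall from beta distributions and take quantiles of the induced F1 samples.

```python
import numpy

def f1_credible_region(tp, fp, fn, lambda_=0.5, coverage=0.95, nbsample=100000):
    """Monte Carlo credible interval for the F1 score (illustrative sketch).

    Precision and recall posteriors are modelled as Beta distributions
    with a flat prior controlled by ``lambda_``; the F1 distribution is
    obtained by transforming the samples via F1 = 2*P*R/(P+R).
    """
    rng = numpy.random.default_rng(42)  # fixed seed for reproducibility
    # posterior samples for precision and recall
    p = rng.beta(tp + lambda_, fp + lambda_, size=nbsample)
    r = rng.beta(tp + lambda_, fn + lambda_, size=nbsample)
    f1 = 2 * p * r / (p + r)  # per-sample F1 score
    left = (1.0 - coverage) / 2.0
    lower, upper = numpy.quantile(f1, [left, 1.0 - left])
    return f1.mean(), lower, upper
```

For example, ``f1_credible_region(10, 2, 3)`` yields a mean close to the point estimate ``2*10/(2*10+2+3) = 0.8`` together with a 95% interval around it.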
    coverage = 0.95
    system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage)
    system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage)
    measure = ['precision', 'recall', 'specificity', 'accuracy', 'Jaccard index', 'F1 score']
    result = ""
    for i in range(len(measure)):
        result += "For the %s we can say that : \n " % (measure[i])
        if system1[i][2] > system2[i][3]:
            # lower bound from system 1 is greater than the upper bound from system 2
            result += "System 1 is better than system 2 with convincing evidence \n"
        elif system2[i][2] > system1[i][3]:
            # lower bound from system 2 is greater than the upper bound from system 1
            result += "System 2 is better than system 1 with convincing evidence \n"
        else:
            # the confidence intervals overlap so we compute the 85% confidence intervals to compare them
            # (cf. https://mmeredith.net/blog/2013/1303_Comparison_of_confidence_intervals.htm and
        beta(tn, fp, lambda_, coverage),            # specificity
        beta(tp+tn, fp+fn, lambda_, coverage),      # accuracy
        beta(tp, fp+fn, lambda_, coverage),         # jaccard index
-       beta(2*tp, fp+fn, lambda_, coverage),       # f1-score
+       f1score(tp, fp, tn, fn, lambda_, coverage), # f1-score
    )

def compare(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2, lambda_):
        else:
            # the confidence intervals overlap so we compute the 85% confidence intervals to compare them
            # (cf. https://mmeredith.net/blog/2013/1303_Comparison_of_confidence_intervals.htm and
            # https://statisticsbyjim.com/hypothesis-testing/confidence-intervals-compare-means)
            coverage = 0.85
            system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage)
            system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage)
            if system1[i][2] > system2[i][3]:
                # lower bound from system 1 is greater than the upper bound from system 2
                result += "System 1 is better than system 2 with \"significance\" at the 5% level. \n"
            elif system2[i][2] > system1[i][3]:
                # lower bound from system 2 is greater than the upper bound from system 1
                result += "System 2 is better than system 1 with \"significance\" at the 5% level. \n"
            else:
                result += "There is no statistical difference between the 2 CIs \n"
    return result

Instead of returning a string, let's return a dictionary where the keys are the various measures, and the values encode the condition that is fulfilled (one of 5 possibilities): a tuple with the direction (">", "<" or "=") and the coverage of the credible interval used for the decision (0.95, 0.85 or None). For example:
retval["f1-score"] = (">", 0.85)
means that system 1 is better than system 2 with a 5% uncertainty considering a 0.85 CI, while

retval["precision"] = ("=", None)

means that system 1 and system 2 are comparable according to that metric.
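The suggested dictionary-based return value could be sketched as follows. Everything here is an illustrative assumption rather than the final implementation: ``beta_summary`` stands in for the package's beta credible-interval helper (using Monte Carlo quantiles instead of its actual computation, and the beta approximation of the F1 score from the old code), and the tuple layout mirrors the diff, where items ``[2]`` and ``[3]`` of each measure are the lower and upper bounds.

```python
import numpy

def beta_summary(k, l, lambda_, coverage, nbsample=100000):
    # Monte Carlo summary of a Beta(k+lambda, l+lambda) posterior:
    # returns (mean, median, lower, upper) for the requested coverage.
    a, b = k + lambda_, l + lambda_
    samples = numpy.random.default_rng(0).beta(a, b, nbsample)
    left = (1.0 - coverage) / 2.0
    lower, median, upper = numpy.quantile(samples, [left, 0.5, 1.0 - left])
    return samples.mean(), median, lower, upper

def measures(tp, fp, tn, fn, lambda_, coverage):
    # Same tuple layout as in the diff: per measure, [2]/[3] are the
    # lower/upper credible-interval bounds.
    return (
        beta_summary(tp, fp, lambda_, coverage),            # precision
        beta_summary(tp, fn, lambda_, coverage),            # recall
        beta_summary(tn, fp, lambda_, coverage),            # specificity
        beta_summary(tp + tn, fp + fn, lambda_, coverage),  # accuracy
        beta_summary(tp, fp + fn, lambda_, coverage),       # jaccard
        beta_summary(2 * tp, fp + fn, lambda_, coverage),   # f1 (beta approx.)
    )

def compare(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2, lambda_):
    names = ["precision", "recall", "specificity", "accuracy",
             "jaccard", "f1-score"]
    retval = {}
    # try the wide (0.95) interval first, fall back to 0.85 on overlap
    for coverage in (0.95, 0.85):
        s1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage)
        s2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage)
        for i, name in enumerate(names):
            if name in retval:
                continue  # already decided at the wider interval
            if s1[i][2] > s2[i][3]:    # 1's lower bound > 2's upper bound
                retval[name] = (">", coverage)
            elif s2[i][2] > s1[i][3]:  # 2's lower bound > 1's upper bound
                retval[name] = ("<", coverage)
    for name in names:
        retval.setdefault(name, ("=", None))  # intervals overlap at 0.85 too
    return retval
```

With a clearly superior system 1 (e.g. ``tp1=90, fp1=10`` vs ``tp2=50, fp2=50``) the dictionary maps each measure to ``(">", 0.95)``; identical counts yield ``("=", None)`` throughout.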
added 17 commits
- 540cebe9...d654dd55 - 16 commits from branch andres-upgrades
- fafe85a4 - [credible region] Added implementation to return the credible region for the...
@amorais: I fixed the pipeline errors (a rebase to master was required). The problems listed are real problems with the commit. In particular, this one should be addressed:
bob/measure/credible_region.py:docstring of bob.measure.credible_region:12:Indirect hyperlink target "five confidence intervals for proportions that you should know about" refers to target "ci-evaluation", which does not exist.
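The Sphinx error above means the docstring uses an indirect (named) reference whose matching ``.. _ci-evaluation:`` target definition is missing. A sketch of the shape of the fix in reStructuredText (the URL below is a placeholder, since the real target is not shown in this thread):

```
See the `five confidence intervals for proportions that you should know
about <ci-evaluation_>`_ for a discussion of interval choices.

.. _ci-evaluation: https://example.com/placeholder-for-real-target
```

Alternatively, the named reference in the docstring can be turned into an inline link so no separate target is needed.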
added 1 commit
- 7ba50ade - [credible region] fixed the modifications of the implementation for the F1score
@amorais: I just pushed some fixes for the documentation strings.
added 1 commit
- cfc865aa - Finalize first draft of ROC/PR curve with CI visualization (and AUC estimation)
added 1 commit
- 602d5757 - Implements paired comparison; Externalize examples; Improve ROC bound...
mentioned in commit a93a031b