Commit b3ace5aa authored by Antonio MORAIS, committed by André Anjos

[credible region] fixed the modified implementation of the F1 score

parent ceeee96a
Part of merge request !103 (WIP: Breaking changes).
@@ -18,6 +18,7 @@ Should Know About <ci-evaluation_>`_.
import numbers
import numpy
import scipy.stats
from credible_region import measures
def clopper_pearson(k, l, coverage=0.95):
@@ -188,3 +189,110 @@ def wilson(k, l, coverage=0.95):
        return lower[0], upper[0]
    else:
        return lower, upper

def compare(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2, lambda_):
"""
Compare the credible regions of 2 systems for different performance measures
'precision', 'recall', 'specificity', 'accuracy', 'Jaccard index' and 'F1 score'.
The method used to compare them is described in this two articles
https://mmeredith.net/blog/2013/1303_Comparison_of_confidence_intervals.htm and
https://statisticsbyjim.com/hypothesis-testing/confidence-intervals-compare-means
Parameters
----------
tp1/2 : int
True positive count for 1st/2nd system, AKA "hit"
fp1/2 : int
False positive count for 1st/2nd system, AKA "false alarm", or "Type I error"
tn1/2 : int
True negative count for 1st/2nd system, AKA "correct rejection"
fn1/2 : int
False Negative count for 1st/2nd system, AKA "miss", or "Type II error"
lambda_ : float
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior.
Returns
-------
dictionary : a dictionary indicating which confidence interval is better for each measure.
It returns a tuple that indicates the direction ">", "<", and the CI (0.95, 0.85).
retval["F1-score"] = (">", 0.85) means that system 1 is better than system 2 with a 5% uncertainty considering a 0.85 CI.
retval["precision"] = ("=", None) means that system 1 and system 2 are comparable according to that metric.
"""
    coverage = 0.95
    system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage)
    system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage)
    # The 0.85 intervals are computed lazily, and kept in separate variables
    # so that every measure is still compared at the 0.95 level first.
    system1_85 = system2_85 = None
    measure = ['precision', 'recall', 'specificity', 'accuracy',
               'Jaccard index', 'F1 score']
    dictionary = {}
    for i, name in enumerate(measure):
        if system1[i][2] > system2[i][3]:
            # lower bound from system 1 is greater than the upper bound from system 2
            dictionary[name] = ('>', 0.95)
        elif system2[i][2] > system1[i][3]:
            # lower bound from system 2 is greater than the upper bound from system 1
            dictionary[name] = ('<', 0.95)
        else:
            # the 0.95 credible intervals overlap, so we compare the 0.85
            # intervals instead
            if system1_85 is None:
                system1_85 = measures(tp1, fp1, tn1, fn1, lambda_, 0.85)
                system2_85 = measures(tp2, fp2, tn2, fn2, lambda_, 0.85)
            if system1_85[i][2] > system2_85[i][3]:
                dictionary[name] = ('>', 0.85)
            elif system2_85[i][2] > system1_85[i][3]:
                dictionary[name] = ('<', 0.85)
            else:
                dictionary[name] = ('=', None)
    return dictionary

def compareToString(dictionary):
    """
    Returns a string describing the comparison between the two credible
    intervals of each measure.  This function should be used with the
    dictionary output of the ``compare`` function; for any other input the
    output is undefined.

    Parameters
    ----------
    dictionary : dict
        The dictionary describing the comparison of the credible intervals of
        the different metrics, e.g. ``dictionary["precision"] = ("=", None)``

    Returns
    -------
    result : str
        A string explaining which credible interval is better for each
        measure, translating the tuples produced by ``compare``.
        ``dictionary["F1 score"] = (">", 0.85)`` means system 1 is better
        than system 2 with "significance" at the 5% level for the F1 score.
        ``dictionary["accuracy"] = ("<", 0.95)`` means system 2 is better
        than system 1 with convincing evidence for the accuracy.
    """
result = ""
for key in dictionary:
result += "For the %s we can say that : \n " % (key)
if dictionary[key][0] == '>' :
if dictionary[key][1] == 0.95 :
result += "System 1 is better than system 2 with convincing evidence \n"
else :
result += "System 1 is better than system 2 with \"significance\" at the 5% level. \n"
elif dictionary[key][0] == '<' :
if dictionary[key][1] == 0.95 :
result += "System 2 is better than system 1 with convincing evidence \n"
else :
result += "System 2 is better than system 1 with \"significance\" at the 5% level. \n"
else :
result += "There is no statistical difference between the 2 CIs \n"
return result
\ No newline at end of file
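
A minimal usage sketch for the two helpers above (illustrative only: the counts are made up, the import path is a guess, and ``measures`` is assumed to return one (mean, mode, lower, upper) tuple per metric):

    # Hypothetical example -- counts and import path are assumptions.
    from credible_region import compare, compareToString

    # system 1: tp=90, fp=10, tn=85, fn=15; system 2: tp=80, fp=20, tn=80, fn=20
    verdict = compare(90, 10, 85, 15, 80, 20, 80, 20, lambda_=1.0)
    print(verdict["precision"])      # e.g. ('>', 0.95), ('<', 0.85) or ('=', None)
    print(compareToString(verdict))  # human-readable version of the same result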
@@ -26,6 +26,7 @@ methods.
import numbers
import numpy
import scipy.special
import random
def beta(k, l, lambda_, coverage):
@@ -177,7 +178,55 @@ def randomgamma(nbsamples, shape, scale):
        gammageneration[i] = random.gammavariate(shape, scale)
    return gammageneration

def f1score(tp, fp, tn, fn, lambda_, coverage):
def comparef1score(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2, lambda_, nbsamples):
    r"""
    Returns the probability that the F1 score of one system is greater than
    the F1 score of a second system.  This implementation is based on
    [GOUTTE-2005]_.

    Parameters
    ----------
    tp1/2 : int
        True positive count, AKA "hit"
    fp1/2 : int
        False positive count, AKA "false alarm", or "Type I error"
    tn1/2 : int
        True negative count, AKA "correct rejection"
    fn1/2 : int
        False negative count, AKA "miss", or "Type II error"
    lambda_ : float
        The parameterisation of the Beta prior to consider.  Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    nbsamples : int
        Number of Monte Carlo samples drawn from the Gamma distributions

    Returns
    -------
    probability : float
        A number between 0.0 and 1.0: the estimated probability that the
        first system's F1 score is greater than the second's
    """
    # Per [GOUTTE-2005]_, F1 = U/(U+V) with U ~ Gamma(tp+lambda, scale=2)
    # and V ~ Gamma(fp+fn+2*lambda, scale=1): sample both posteriors and
    # count how often system 1 comes out on top.
    U1 = randomgamma(nbsamples, shape=tp1+lambda_, scale=2)
    V1 = randomgamma(nbsamples, shape=fp1+fn1+2*lambda_, scale=1)
    F1scores1 = U1/(U1+V1)
    U2 = randomgamma(nbsamples, shape=tp2+lambda_, scale=2)
    V2 = randomgamma(nbsamples, shape=fp2+fn2+2*lambda_, scale=1)
    F1scores2 = U2/(U2+V2)
    return numpy.count_nonzero(F1scores1 > F1scores2) / nbsamples
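
For reference, the same estimate can be written directly with numpy's vectorised Gamma sampler (a sketch of the [GOUTTE-2005]_ model, not part of this module; the counts are illustrative):

    # F1 = U/(U+V), U ~ Gamma(tp+lambda, scale=2), V ~ Gamma(fp+fn+2*lambda, scale=1)
    rng = numpy.random.default_rng()
    u1 = rng.gamma(shape=90 + 1.0, scale=2, size=100000)           # tp1 + lambda_
    v1 = rng.gamma(shape=10 + 15 + 2 * 1.0, scale=1, size=100000)  # fp1 + fn1 + 2*lambda_
    u2 = rng.gamma(shape=80 + 1.0, scale=2, size=100000)
    v2 = rng.gamma(shape=20 + 20 + 2 * 1.0, scale=1, size=100000)
    p = numpy.mean(u1 / (u1 + v1) > u2 / (u2 + v2))                # P(F1_1 > F1_2)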

def f1score(tp, fp, tn, fn, lambda_, coverage, nbsample):
    """
    Returns the mean, mode, upper and lower bounds of the credible
    region of the F1 score.
@@ -211,6 +260,9 @@ def f1score(tp, fp, tn, fn, lambda_, coverage):
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    nbsample : int
        Number of Monte Carlo samples drawn from the Gamma distributions

    Returns
    -------
@@ -227,7 +279,6 @@ def f1score(tp, fp, tn, fn, lambda_, coverage):
"""
nbsample = 100000
U = randomgamma(nbsample, shape=tp+lambda_, scale=2)
V = randomgamma(nbsample, fp+fn+2*lambda_, scale=1)
F1scores = U/(U+V)
@@ -335,69 +386,6 @@ def measures(tp, fp, tn, fn, lambda_, coverage):
        beta(tn, fp, lambda_, coverage),  # specificity
        beta(tp+tn, fp+fn, lambda_, coverage),  # accuracy
        beta(tp, fp+fn, lambda_, coverage),  # jaccard index
        f1score(tp, fp, tn, fn, lambda_, coverage),  # f1-score
        f1score(tp, fp, tn, fn, lambda_, coverage, 100000),  # f1-score
    )
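
The ``compare`` function in the first file indexes each entry of this tuple as ``[2]``/``[3]`` for the lower/upper bound, i.e. it relies on the layout sketched below (inferred from that indexing; not spelled out in the diff):

    # Assumed layout of measures(tp, fp, tn, fn, lambda_, coverage):
    # a 6-tuple ordered (precision, recall, specificity, accuracy,
    # jaccard index, f1-score), each entry a (mean, mode, lower, upper) tuple.
    precision = measures(90, 10, 85, 15, 1.0, 0.95)[0]
    lower, upper = precision[2], precision[3]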
def compare(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2, lambda_):
    """Compare the credible regions of 2 systems

    Parameters
    ----------
    tp1/2 : int
        True positive count for 1st/2nd system, AKA "hit"
    fp1/2 : int
        False positive count for 1st/2nd system, AKA "false alarm", or "Type I error"
    tn1/2 : int
        True negative count for 1st/2nd system, AKA "correct rejection"
    fn1/2 : int
        False Negative count for 1st/2nd system, AKA "miss", or "Type II error"
    lambda_ : float
        The parameterisation of the Beta prior to consider.  Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.

    Returns
    -------
    result : string
        A string describing the statistical comparison between the two systems
        for the different performance measures
    """
    coverage = 0.95
    system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage)
    system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage)
    measure = ['precision', 'recall', 'specificity', 'accuracy', 'Jaccard index', 'F1 score']
    result = ""
    for i in range(len(measure)):
        result += "For the %s we can say that : \n " % (measure[i])
        if system1[i][2] > system2[i][3]:
            # lower bound from system 1 is greater than the upper bound from system 2
            result += "System 1 is better than system 2 with convincing evidence \n"
        elif system2[i][2] > system1[i][3]:
            # lower bound from system 2 is greater than the upper bound from system 1
            result += "System 2 is better than system 1 with convincing evidence \n"
        else:
            # the confidence intervals overlap so we compute the 85% confidence intervals to compare them
            # (cf. https://mmeredith.net/blog/2013/1303_Comparison_of_confidence_intervals.htm and
            # https://statisticsbyjim.com/hypothesis-testing/confidence-intervals-compare-means)
            coverage = 0.85
            system1 = measures(tp1, fp1, tn1, fn1, lambda_, coverage)
            system2 = measures(tp2, fp2, tn2, fn2, lambda_, coverage)
            if system1[i][2] > system2[i][3]:
                # lower bound from system 1 is greater than the upper bound from system 2
                result += "System 1 is better than system 2 with \"significance\" at the 5% level. \n"
            elif system2[i][2] > system1[i][3]:
                # lower bound from system 2 is greater than the upper bound from system 1
                result += "System 2 is better than system 1 with \"significance\" at the 5% level. \n"
            else:
                result += "There is no statistical difference between the 2 CIs \n"
    return result
\ No newline at end of file
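
Finally, a usage sketch for the reworked ``f1score``, which now takes the Monte Carlo sample count explicitly (values are illustrative; the unpacking assumes the (mean, mode, lower, upper) field order implied by ``compare``):

    # Hypothetical example: 95% credible region of the F1 score, flat prior.
    mean, mode, lower, upper = f1score(90, 10, 85, 15, lambda_=1.0,
                                       coverage=0.95, nbsample=100000)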