diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py index 8655b6b2627d0883427a2c2cd7a5b3efbd659fe7..ee1c7bc1710514490c00bb3f1f0b9fd8f791baf7 100644 --- a/bob/ip/binseg/engine/evaluator.py +++ b/bob/ip/binseg/engine/evaluator.py @@ -16,7 +16,7 @@ import torchvision.transforms.functional as VF import h5py -from ..utils.measure import base_measures +from ..utils.measure import base_measures, bayesian_measures import logging @@ -84,7 +84,7 @@ def _posneg(pred, gt, threshold): def sample_measures_for_threshold(pred, gt, mask, threshold): """ - Calculates measures on one single sample, for a specific threshold + Calculates counts on one single sample, for a specific threshold Parameters @@ -107,17 +107,13 @@ def sample_measures_for_threshold(pred, gt, mask, threshold): Returns ------- - precision: float + tp : int - recall: float + fp : int - specificity: float + tn : int - accuracy: float - - jaccard: float - - f1_score: float + fn : int """ @@ -138,7 +134,7 @@ def sample_measures_for_threshold(pred, gt, mask, threshold): tn_count = torch.sum(tn_tensor).item() fn_count = torch.sum(fn_tensor).item() - return base_measures(tp_count, fp_count, tn_count, fn_count) + return tp_count, fp_count, tn_count, fn_count def _sample_measures(pred, gt, mask, steps): @@ -170,36 +166,33 @@ def _sample_measures(pred, gt, mask, steps): A pandas dataframe with the following columns: - * threshold: float - * precision: float - * recall: float - * specificity: float - * accuracy: float - * jaccard: float - * f1_score: float + * tp: int + * fp: int + * tn: int + * fn: int """ step_size = 1.0 / steps data = [ - (index, threshold) + sample_measures_for_threshold(pred, gt, mask, - threshold) + (index, threshold) + + sample_measures_for_threshold(pred, gt, mask, threshold) for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)) ] - return pandas.DataFrame( + retval = pandas.DataFrame( data, columns=( "index", "threshold", - "precision", - "recall", - "specificity", - "accuracy", - "jaccard", - "f1_score", + "tp", + "fp", + "tn", + "fn", ), ) + retval.set_index("index", inplace=True) + return retval def _sample_analysis( @@ -291,6 +284,73 @@ def _sample_analysis( return tp_pil_colored +def _summarize(data): + """Summarizes collected dataframes and adds bayesian figures""" + + _entries = ( + "mean_precision", + "mode_precision", + "lower_precision", + "upper_precision", + "mean_recall", + "mode_recall", + "lower_recall", + "upper_recall", + "mean_specificity", + "mode_specificity", + "lower_specificity", + "upper_specificity", + "mean_accuracy", + "mode_accuracy", + "lower_accuracy", + "upper_accuracy", + "mean_jaccard", + "mode_jaccard", + "lower_jaccard", + "upper_jaccard", + "mean_f1_score", + "mode_f1_score", + "lower_f1_score", + "upper_f1_score", + "frequentist_precision", + "frequentist_recall", + "frequentist_specificity", + "frequentist_accuracy", + "frequentist_jaccard", + "frequentist_f1_score", + ) + + def _row_summary(r): + + # run bayesian_measures(), flatten tuple of tuples, name entries + bayesian = [ + item + for sublist in bayesian_measures( + r.tp, + r.fp, + r.tn, + r.fn, + lambda_=0.5, + coverage=0.95, + ) + for item in sublist + ] + + # evaluate frequentist measures + frequentist = base_measures(r.tp, r.fp, r.tn, r.fn) + return pandas.Series(bayesian + list(frequentist), index=_entries) + + # Merges all dataframes together + sums = pandas.concat(data.values()).groupby("index").sum() + sums["threshold"] /= len(data) + + # create a new dataframe with these + measures = sums.apply(lambda r: _row_summary(r), axis=1) + + ## merge sums and measures into a single dataframe + return pandas.concat([sums, measures.reindex(sums.index)], axis=1).copy() + + def run( dataset, name, @@ -384,35 +444,11 @@ def run( overlay_image.save(fullpath) # Merges all dataframes together - df_measures = pandas.concat(data.values()) - - # Report and Averages - avg_measures = df_measures.groupby("index").mean() - std_measures = df_measures.groupby("index").std() - - # Uncomment below for F1-score calculation based on average precision and - # measures instead of F1-scores of individual images. This method is in line - # with Maninis et. al. (2016) - # - # avg_measures["f1_score"] = \ - # (2* avg_measures["precision"]*avg_measures["recall"])/ \ - # (avg_measures["precision"]+avg_measures["recall"]) - - avg_measures["std_pr"] = std_measures["precision"] - avg_measures["pr_upper"] = ( - avg_measures["precision"] + std_measures["precision"] - ) - avg_measures["pr_lower"] = ( - avg_measures["precision"] - std_measures["precision"] - ) - avg_measures["std_re"] = std_measures["recall"] - avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"] - avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"] - avg_measures["std_f1"] = std_measures["f1_score"] + measures = _summarize(data) - maxf1 = avg_measures["f1_score"].max() - maxf1_index = avg_measures["f1_score"].idxmax() - maxf1_threshold = avg_measures["threshold"][maxf1_index] + maxf1 = measures["mean_f1_score"].max() + maxf1_index = measures["mean_f1_score"].idxmax() + maxf1_threshold = measures["threshold"][maxf1_index] logger.info( f"Maximum F1-score of {maxf1:.5f}, achieved at " @@ -423,8 +459,12 @@ def run( # get the closest possible threshold we have index = int(round(steps * threshold)) - f1_a_priori = avg_measures["f1_score"][index] - actual_threshold = avg_measures["threshold"][index] + f1_a_priori = measures["mean_f1_score"][index] + actual_threshold = measures["threshold"][index] + + # mark threshold a priori chosen on this dataset + measures["threshold_a_priori"] = False + measures["threshold_a_priori", index] = True logger.info( f"F1-score of {f1_a_priori:.5f}, at threshold " @@ -436,9 +476,9 @@ def run( os.makedirs(output_folder, exist_ok=True) measures_path = os.path.join(output_folder, f"{name}.csv") logger.info( - f"Saving averages over all input images at {measures_path}..." + f"Saving measures over all input images at {measures_path}..." ) - avg_measures.to_csv(measures_path) + measures.to_csv(measures_path) return maxf1_threshold @@ -521,40 +561,15 @@ def compare_annotators( os.makedirs(os.path.dirname(fullpath), exist_ok=True) overlay_image.save(fullpath) - # Merges all dataframes together - df_measures = pandas.concat(data.values()) - df_measures.drop(0, inplace=True) - - # Report and Averages - avg_measures = df_measures.groupby("index").mean() - std_measures = df_measures.groupby("index").std() - - # Uncomment below for F1-score calculation based on average precision and - # {name} instead of F1-scores of individual images. This method is in line - # with Maninis et. al. (2016) - # - # avg_measures["f1_score"] = \ - # (2* avg_measures["precision"]*avg_measures["recall"])/ \ - # (avg_measures["precision"]+avg_measures["recall"]) - - avg_measures["std_pr"] = std_measures["precision"] - avg_measures["pr_upper"] = ( - avg_measures["precision"] + std_measures["precision"] - ) - avg_measures["pr_lower"] = ( - avg_measures["precision"] - std_measures["precision"] - ) - avg_measures["std_re"] = std_measures["recall"] - avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"] - avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"] - avg_measures["std_f1"] = std_measures["f1_score"] + measures = _summarize(data) + measures.drop(0, inplace=True) #removes threshold == 0.0, keeps 0.5 only measures_path = os.path.join( output_folder, "second-annotator", f"{name}.csv" ) os.makedirs(os.path.dirname(measures_path), exist_ok=True) - logger.info(f"Saving averages over all input images at {measures_path}...") - avg_measures.to_csv(measures_path) + logger.info(f"Saving summaries over all input images at {measures_path}...") + measures.to_csv(measures_path) - maxf1 = avg_measures["f1_score"].max() + maxf1 = measures["mean_f1_score"].max() logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)") diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py index cc899ffb6a61b7debff1fc0fa8bc48a309c71142..43423bbb3ba2b3b6d408284e21653ab5ca80d871 100644 --- a/bob/ip/binseg/script/compare.py +++ b/bob/ip/binseg/script/compare.py @@ -86,7 +86,7 @@ def _load(data, threshold=None): f"'{threshold}' dataset...") measures_path = data[threshold] df = pandas.read_csv(measures_path) - use_threshold = df.threshold[df.f1_score.idxmax()] + use_threshold = df.threshold[df.mean_f1_score.idxmax()] logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'") elif isinstance(threshold, float): @@ -105,8 +105,15 @@ def _load(data, threshold=None): df = pandas.read_csv(measures_path) if threshold is None: - use_threshold = df.threshold[df.f1_score.idxmax()] - logger.info(f"Dataset '{name}': threshold = {use_threshold:.3f}'") + + if 'threshold_a_priori' in df: + use_threshold = df.threshold[df.threshold_a_priori.idxmax()] + logger.info(f"Dataset '{name}': threshold (a priori) = " \ + f"{use_threshold:.3f}'") + else: + use_threshold = df.threshold[df.mean_f1_score.idxmax()] + logger.info(f"Dataset '{name}': threshold (a posteriori) = " \ + f"{use_threshold:.3f}'") retval[name] = dict(df=df, threshold=use_threshold) @@ -159,7 +166,10 @@ def _load(data, threshold=None): help="This number is used to select which F1-score to use for " "representing a system performance. If not set, we report the maximum " "F1-score in the set, which is equivalent to threshold selection a " - "posteriori (biased estimator). You can either set this value to a " + "posteriori (biased estimator), unless the performance file being " + "considered already was pre-tunned, and contains a 'threshold_a_priori' " + "column which we then use to pick a threshold for the dataset. " + "You can override this behaviour by either setting this value to a " "floating-point number in the range [0.0, 1.0], or to a string, naming " "one of the systems which will be used to calculate the threshold " "leading to the maximum F1-score and then applied to all other sets.", @@ -187,7 +197,7 @@ def compare(label_path, output_figure, table_format, output_table, threshold, output_figure = os.path.realpath(output_figure) logger.info(f"Creating and saving plot at {output_figure}...") os.makedirs(os.path.dirname(output_figure), exist_ok=True) - fig = precision_recall_f1iso(data, confidence=True) + fig = precision_recall_f1iso(data, credible=True) fig.savefig(output_figure) logger.info("Tabulating performance summary...") diff --git a/bob/ip/binseg/test/test_batchmeasures.py b/bob/ip/binseg/test/test_batchmeasures.py deleted file mode 100644 index 096b74ba87c479acba4f5aaf4823d6ad45a4c6c3..0000000000000000000000000000000000000000 --- a/bob/ip/binseg/test/test_batchmeasures.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import unittest -import random -import shutil - -import torch -import pandas -import numpy - -from ..engine.evaluator import _sample_measures - -import logging -logger = logging.getLogger(__name__) - - -class Tester(unittest.TestCase): - """ - Unit test for batch measures - """ - - def setUp(self): - self.tp = random.randint(1, 100) - self.fp = random.randint(1, 100) - self.tn = random.randint(1, 100) - self.fn = random.randint(1, 100) - self.predictions = torch.rand(size=(2, 1, 420, 420)) - self.ground_truths = torch.randint(low=0, high=2, size=(2, 1, 420, 420)) - self.names = ["Bob", "Tim"] - - def test_batch_measures(self): - dfs = [] - for pred, gt in zip(self.predictions, self.ground_truths): - dfs.append(_sample_measures(pred, gt, None, 100)) - bm = pandas.concat(dfs) - - self.assertEqual(len(bm), 2 * 100) - # check whether f1 score agree - calculated = bm.f1_score.to_numpy() - ours = (2*(bm.precision*bm.recall)/(bm.precision+bm.recall)).to_numpy() - assert numpy.isclose(calculated, ours).all() diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py index 8c959a2d63a774a24d02dd03573818e5bbdf141a..a650ea7612b3646a3783b28cd6349d987ab6049e 100644 --- a/bob/ip/binseg/test/test_cli.py +++ b/bob/ip/binseg/test/test_cli.py @@ -88,7 +88,7 @@ def _check_experiment_stare(overlay): output_folder = "results" options = [ - "m2unet", + "lwnet", config.name, "-vv", "--epochs=1", @@ -293,7 +293,7 @@ def _check_train(runner): result = runner.invoke( train, [ - "m2unet", + "lwnet", config.name, "-vv", "--epochs=1", @@ -359,7 +359,7 @@ def _check_predict(runner): result = runner.invoke( predict, [ - "m2unet", + "lwnet", config.name, "-vv", "--batch-size=1", @@ -458,7 +458,7 @@ def _check_evaluate(runner): keywords = { r"^Skipping dataset '__train__'": 0, - r"^Saving averages over all input images.*$": 2, + r"^Saving summaries over all input images.*$": 1, r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 1, r"^F1-score of.*\(chosen \*a priori\*\)$": 1, r"^F1-score of.*\(second annotator; threshold=0.5\)$": 1, @@ -597,7 +597,7 @@ def test_discrete_experiment_stare(): _check_predict(runner) _check_evaluate(runner) _check_compare(runner) - _check_significance(runner) + #_check_significance(runner) def test_train_help(): diff --git a/bob/ip/binseg/test/test_measures.py b/bob/ip/binseg/test/test_measures.py index cddcd0f4c30513adee22d07b411839469257d57f..b0d81e39760d967c2dd8110f8da5e9ece6869c5d 100644 --- a/bob/ip/binseg/test/test_measures.py +++ b/bob/ip/binseg/test/test_measures.py @@ -3,18 +3,24 @@ import random import unittest - import math + +import numpy import torch import nose.tools -from ..utils.measure import base_measures, auc +from ..utils.measure import ( + base_measures, + bayesian_measures, + beta_credible_region, + auc, +) from ..engine.evaluator import sample_measures_for_threshold -class Tester(unittest.TestCase): +class TestFrequentist(unittest.TestCase): """ - Unit test for base measures + Unit test for frequentist base measures """ def setUp(self): @@ -54,6 +60,121 @@ class Tester(unittest.TestCase): self.assertAlmostEqual((2 * p * r) / (p + r), f1) # base definition +class TestBayesian: + """ + Unit test for bayesian base measures + """ + + def mean(self, k, l, lambda_): + return (k + lambda_) / (k + l + 2 * lambda_) + + def mode1(self, k, l, lambda_): # (k+lambda_), (l+lambda_) > 1 + return (k + lambda_ - 1) / (k + l + 2 * lambda_ - 2) + + def test_beta_credible_region_base(self): + k = 40 + l = 10 + lambda_ = 0.5 + cover = 0.95 + got = beta_credible_region(k, l, lambda_, cover) + # mean, mode, lower, upper + exp = ( + self.mean(k, l, lambda_), + self.mode1(k, l, lambda_), + 0.6741731038857685, + 0.8922659692341358, + ) + assert numpy.isclose(got, exp).all(), f"{got} <> {exp}" + + def test_beta_credible_region_small_k(self): + + k = 4 + l = 1 + lambda_ = 0.5 + cover = 0.95 + got = beta_credible_region(k, l, lambda_, cover) + # mean, mode, lower, upper + exp = ( + self.mean(k, l, lambda_), + self.mode1(k, l, lambda_), + 0.37137359936800574, + 0.9774872340008449, + ) + assert numpy.isclose(got, exp).all(), f"{got} <> {exp}" + + def test_beta_credible_region_precision_jeffrey(self): + + # simulation of situation for precision TP == FP == 0, Jeffrey's prior + k = 0 + l = 0 + lambda_ = 0.5 + cover = 0.95 + got = beta_credible_region(k, l, lambda_, cover) + # mean, mode, lower, upper + exp = ( + self.mean(k, l, lambda_), + 0.0, + 0.0015413331334360135, + 0.998458666866564, + ) + assert numpy.isclose(got, exp).all(), f"{got} <> {exp}" + + def test_beta_credible_region_precision_flat(self): + + # simulation of situation for precision TP == FP == 0, flat prior + k = 0 + l = 0 + lambda_ = 1.0 + cover = 0.95 + got = beta_credible_region(k, l, lambda_, cover) + # mean, mode, lower, upper + exp = (self.mean(k, l, lambda_), 0.0, 0.025000000000000022, 0.975) + assert numpy.isclose(got, exp).all(), f"{got} <> {exp}" + + def test_bayesian_measures(self): + + tp = random.randint(100000, 1000000) + fp = random.randint(100000, 1000000) + tn = random.randint(100000, 1000000) + fn = random.randint(100000, 1000000) + + _prec, _rec, _spec, _acc, _jac, _f1 = base_measures(tp, fp, tn, fn) + prec, rec, spec, acc, jac, f1 = bayesian_measures( + tp, fp, tn, fn, 0.5, 0.95 + ) + + # Notice that for very large k and l, the base frequentist measures + # should be approximately the same as the bayesian mean and mode + # extracted from the beta posterior. We test that here. + assert numpy.isclose(_prec, prec[0]), f"freq: {_prec} <> bays: {prec[0]}" + assert numpy.isclose(_prec, prec[1]), f"freq: {_prec} <> bays: {prec[1]}" + assert numpy.isclose(_rec, rec[0]), f"freq: {_rec} <> bays: {rec[0]}" + assert numpy.isclose(_rec, rec[1]), f"freq: {_rec} <> bays: {rec[1]}" + assert numpy.isclose(_spec, spec[0]), f"freq: {_spec} <> bays: {spec[0]}" + assert numpy.isclose(_spec, spec[1]), f"freq: {_spec} <> bays: {spec[1]}" + assert numpy.isclose(_acc, acc[0]), f"freq: {_acc} <> bays: {acc[0]}" + assert numpy.isclose(_acc, acc[1]), f"freq: {_acc} <> bays: {acc[1]}" + assert numpy.isclose(_jac, jac[0]), f"freq: {_jac} <> bays: {jac[0]}" + assert numpy.isclose(_jac, jac[1]), f"freq: {_jac} <> bays: {jac[1]}" + assert numpy.isclose(_f1, f1[0]), f"freq: {_f1} <> bays: {f1[0]}" + assert numpy.isclose(_f1, f1[1]), f"freq: {_f1} <> bays: {f1[1]}" + + # We also test that the interval in question includes the mode and the + # mean in this case. + assert (prec[2] < prec[1]) and (prec[1] < prec[3]), \ + f"precision is out of bounds {_prec[2]} < {_prec[1]} < {_prec[3]}" + assert (rec[2] < rec[1]) and (rec[1] < rec[3]), \ + f"recall is out of bounds {_rec[2]} < {_rec[1]} < {_rec[3]}" + assert (spec[2] < spec[1]) and (spec[1] < spec[3]), \ + f"specif. is out of bounds {_spec[2]} < {_spec[1]} < {_spec[3]}" + assert (acc[2] < acc[1]) and (acc[1] < acc[3]), \ + f"accuracy is out of bounds {_acc[2]} < {_acc[1]} < {_acc[3]}" + assert (jac[2] < jac[1]) and (jac[1] < jac[3]), \ + f"jaccard is out of bounds {_jac[2]} < {_jac[1]} < {_jac[3]}" + assert (f1[2] < f1[1]) and (f1[1] < f1[3]), \ + f"f1-score is out of bounds {_f1[2]} < {_f1[1]} < {_f1[3]}" + + def test_auc(): # basic tests @@ -124,7 +245,7 @@ def test_sample_measures_mask_checkerbox(): fn = 0 nose.tools.eq_( - base_measures(tp, fp, tn, fn), + (tp, fp, tn, fn), sample_measures_for_threshold( prediction, ground_truth, mask, threshold ), @@ -134,13 +255,15 @@ def test_sample_measures_mask_checkerbox(): def test_sample_measures_mask_cross(): prediction = torch.ones((10, 10), dtype=float) - prediction[0,:] = 0.0 - prediction[9,:] = 0.0 + prediction[0, :] = 0.0 + prediction[9, :] = 0.0 ground_truth = torch.ones((10, 10), dtype=float) - ground_truth[:5,] = 0.0 #lower part is not to be set + ground_truth[ + :5, + ] = 0.0 # lower part is not to be set mask = torch.zeros((10, 10), dtype=float) - mask[(0,1,2,3,4,5,6,7,8,9),(0,1,2,3,4,5,6,7,8,9)] = 1.0 - mask[(0,1,2,3,4,5,6,7,8,9),(9,8,7,6,5,4,3,2,1,0)] = 1.0 + mask[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)] = 1.0 + mask[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), (9, 8, 7, 6, 5, 4, 3, 2, 1, 0)] = 1.0 threshold = 0.5 # with this configuration, this should be the correct count @@ -150,7 +273,7 @@ def test_sample_measures_mask_cross(): fn = 2 nose.tools.eq_( - base_measures(tp, fp, tn, fn), + (tp, fp, tn, fn), sample_measures_for_threshold( prediction, ground_truth, mask, threshold ), @@ -160,19 +283,19 @@ def test_sample_measures_mask_cross(): def test_sample_measures_mask_border(): prediction = torch.zeros((10, 10), dtype=float) - prediction[:,4] = 1.0 - prediction[:,5] = 1.0 - prediction[0,4] = 0.0 - prediction[8,4] = 0.0 - prediction[1,6] = 1.0 + prediction[:, 4] = 1.0 + prediction[:, 5] = 1.0 + prediction[0, 4] = 0.0 + prediction[8, 4] = 0.0 + prediction[1, 6] = 1.0 ground_truth = torch.zeros((10, 10), dtype=float) - ground_truth[:,4] = 1.0 - ground_truth[:,5] = 1.0 + ground_truth[:, 4] = 1.0 + ground_truth[:, 5] = 1.0 mask = torch.ones((10, 10), dtype=float) - mask[:,0] = 0.0 - mask[0,:] = 0.0 - mask[:,9] = 0.0 - mask[9,:] = 0.0 + mask[:, 0] = 0.0 + mask[0, :] = 0.0 + mask[:, 9] = 0.0 + mask[9, :] = 0.0 threshold = 0.5 # with this configuration, this should be the correct count @@ -182,7 +305,7 @@ def test_sample_measures_mask_border(): fn = 1 nose.tools.eq_( - base_measures(tp, fp, tn, fn), + (tp, fp, tn, fn), sample_measures_for_threshold( prediction, ground_truth, mask, threshold ), diff --git a/bob/ip/binseg/utils/measure.py b/bob/ip/binseg/utils/measure.py index 84eb811ca9a08bdb259998777b77107250119a50..e7d14da6d714a3f6216121149ccc767a6dffe176 100644 --- a/bob/ip/binseg/utils/measure.py +++ b/bob/ip/binseg/utils/measure.py @@ -4,6 +4,7 @@ from collections import deque import numpy import torch +import scipy.special class SmoothedValue: @@ -35,12 +36,13 @@ def tricky_division(n, d): def base_measures(tp, fp, tn, fn): - """Calculates measures from true/false positive and negative counts + """Calculates frequentist measures from true/false positive and negative + counts - This function can return standard machine learning measures from true and - false positive counts of positives and negatives. For a thorough look into - these and alternate names for the returned values, please check Wikipedia's - entry on `Precision and Recall + This function can return (frequentist versions of) standard machine + learning measures from true and false positive counts of positives and + negatives. For a thorough look into these and alternate names for the + returned values, please check Wikipedia's entry on `Precision and Recall <https://en.wikipedia.org/wiki/Precision_and_recall>`_. @@ -51,10 +53,10 @@ def base_measures(tp, fp, tn, fn): True positive count, AKA "hit" fp : int - False positive count, AKA, "correct rejection" + False positive count, AKA "false alarm", or "Type I error" tn : int - True negative count, AKA "false alarm", or "Type I error" + True negative count, AKA "correct rejection" fn : int False Negative count, AKA "miss", or "Type II error" @@ -121,6 +123,249 @@ def base_measures(tp, fp, tn, fn): ) +def beta_credible_region(k, l, lambda_, coverage): + """ + Returns the mode, upper and lower bounds of the equal-tailed credible + region of a probability estimate following Bernoulli trials. + + This implemetnation is based on [GOUTTE-2005]_. It assumes :math:`k` + successes and :math:`l` failures (:math:`n = k+l` total trials) are issued + from a series of Bernoulli trials (likelihood is binomial). The posterior + is derivated using the Bayes Theorem with a beta prior. As there is no + reason to favour high vs. low precision, we use a symmetric Beta prior + (:math:`\alpha=\beta`): + + .. math:: + + P(p|k,n) &= \frac{P(k,n|p)P(p)}{P(k,n)} \\ + P(p|k,n) &= \frac{\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\ + P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\beta)}p^{k+\alpha-1}(1-p)^{n-k+\beta-1} + P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\alpha)}p^{k+\alpha-1}(1-p)^{n-k+\alpha-1} + + The mode for this posterior (also the maximum a posteriori) is: + + .. math:: + + mode(p) = \frac{k+\lambda-1}{n+2\lambda-2} + + Concretely, the prior may be flat (all rates are equally likely, + :math:`\lambda=1`) or we may use Jeoffrey's prior (:math:`\lambda=0.5`), + that is invariant through re-parameterisation. Jeffrey's prior indicate + that rates close to zero or one are more likely. + + The mode above works if :math:`k+\alpha,n-k+\alpha > 1`, which is usually + the case for a resonably well tunned system, with more than a few samples + for analysis. In the limit of the system performance, :math:`k` may be 0, + which will make the mode become zero. + + For our purposes, it may be more suitable to represent :math:`n = k + l`, + with :math:`k`, the number of successes and :math:`l`, the number of + failures in the binomial experiment, and find this more suitable + representation: + + .. math:: + + P(p|k,l) &= \frac{1}{B(k+\alpha, l+\alpha)}p^{k+\alpha-1}(1-p)^{l+\alpha-1} \\ + mode(p) &= \frac{k+\lambda-1}{k+l+2\lambda-2} + + This can be mapped to most rates calculated in the context of binary + classification this way: + + * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP + * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN + * Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP + * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN + * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN + * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN + + Contrary to frequentist approaches, in which one can only + say that if the test were repeated an infinite number of times, + and one constructed a confidence interval each time, then X% + of the confidence intervals would contain the true rate, here + we can say that given our observed data, there is a X% probability + that the true value of :math:`k/n` falls within the provided + interval. + + + .. note:: + + For a disambiguation with Confidence Interfval, read + https://en.wikipedia.org/wiki/Credible_interval. + + + Parameters + ========== + + k : int + Number of successes observed on the experiment + + l : int + Number of failures observed on the experiment + + lambda_ : :py:class:`float`, Optional + The parameterisation of the Beta prior to consider. Use + :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for + Jeffrey's prior (the default). + + coverage : :py:class:`float`, Optional + A floating-point number between 0 and 1.0 indicating the + coverage you're expecting. A value of 0.95 will ensure 95% + of the area under the probability density of the posterior + is covered by the returned equal-tailed interval. + + + Returns + ======= + + mean : float + The mean of the posterior distribution + + mode : float + The mode of the posterior distribution + + lower, upper: float + The lower and upper bounds of the credible region + + """ + + # we return the equally-tailed range + right = (1.0-coverage)/2 #half-width in each side + lower = scipy.special.betaincinv(k+lambda_, l+lambda_, right) + upper = scipy.special.betaincinv(k+lambda_, l+lambda_, 1.0-right) + + # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution) + alpha = k+lambda_ + beta = l+lambda_ + + E = alpha / (alpha + beta) + + # the mode of a beta distribution is a bit tricky + if alpha > 1 and beta > 1: + mode = (alpha-1) / (alpha+beta-2) + elif alpha == 1 and beta == 1: + # In the case of precision, if the threshold is close to 1.0, both TP + # and FP can be zero, which may cause this condition to be reached, if + # the prior is exactly 1 (flat prior). This is a weird situation, + # because effectively we are trying to compute the posterior when the + # total number of experiments is zero. So, only the prior counts - but + # the prior is flat, so we should just pick a value. We choose the + # middle of the range. + mode = 0.0 #any value would do, we just pick this one + elif alpha <= 1 and beta > 1: + mode = 0.0 + elif alpha > 1 and beta <= 1: + mode = 1.0 + else: #elif alpha < 1 and beta < 1: + # in the case of precision, if the threshold is close to 1.0, both TP + # and FP can be zero, which may cause this condition to be reached, if + # the prior is smaller than 1. This is a weird situation, because + # effectively we are trying to compute the posterior when the total + # number of experiments is zero. So, only the prior counts - but the + # prior is bimodal, so we should just pick a value. We choose the + # left of the range. + mode = 0.0 #could also be 1.0 as the prior is bimodal + + return E, mode, lower, upper + + +def bayesian_measures(tp, fp, tn, fn, lambda_, coverage): + """Calculates mean and mode from true/false positive and negative counts + with credible regions + + This function can return bayesian estimates of standard machine learning + measures from true and false positive counts of positives and negatives. + For a thorough look into these and alternate names for the returned values, + please check Wikipedia's entry on `Precision and Recall + <https://en.wikipedia.org/wiki/Precision_and_recall>`_. See + :py:func:`beta_credible_region` for details on the calculation of returned + values. + + + Parameters + ---------- + + tp : int + True positive count, AKA "hit" + + fp : int + False positive count, AKA "false alarm", or "Type I error" + + tn : int + True negative count, AKA "correct rejection" + + fn : int + False Negative count, AKA "miss", or "Type II error" + + lambda_ : float + The parameterisation of the Beta prior to consider. Use + :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for + Jeffrey's prior. + + coverage : float + A floating-point number between 0 and 1.0 indicating the + coverage you're expecting. A value of 0.95 will ensure 95% + of the area under the probability density of the posterior + is covered by the returned equal-tailed interval. + + + + Returns + ------- + + precision : (float, float, float, float) + P, AKA positive predictive value (PPV), mean, mode and credible + intervals (95% CI). It corresponds arithmetically + to ``tp/(tp+fp)``. + + recall : (float, float, float, float) + R, AKA sensitivity, hit rate, or true positive rate (TPR), mean, mode + and credible intervals (95% CI). It corresponds arithmetically to + ``tp/(tp+fn)``. + + specificity : (float, float, float, float) + S, AKA selectivity or true negative rate (TNR), mean, mode and credible + intervals (95% CI). It corresponds arithmetically to ``tn/(tn+fp)``. + + accuracy : (float, float, float, float) + A, mean, mode and credible intervals (95% CI). See `Accuracy + <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_. is + the proportion of correct predictions (both true positives and true + negatives) among the total number of pixels examined. It corresponds + arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``. This measure includes + both true-negatives and positives in the numerator, what makes it + sensitive to data or regions without annotations. + + jaccard : (float, float, float, float) + J, mean, mode and credible intervals (95% CI). See `Jaccard Index or + Similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_. It + corresponds arithmetically to ``tp/(tp+fp+fn)``. The Jaccard index + depends on a TP-only numerator, similarly to the F1 score. For regions + where there are no annotations, the Jaccard index will always be zero, + irrespective of the model output. Accuracy may be a better proxy if + one needs to consider the true abscence of annotations in a region as + part of the measure. + + f1_score : (float, float, float, float) + F1, mean, mode and credible intervals (95% CI). See `F1-score + <https://en.wikipedia.org/wiki/F1_score>`_. It corresponds + arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``. The F1 or + Dice score depends on a TP-only numerator, similarly to the Jaccard + index. For regions where there are no annotations, the F1-score will + always be zero, irrespective of the model output. Accuracy may be a + better proxy if one needs to consider the true abscence of annotations + in a region as part of the measure. + """ + + return ( + beta_credible_region(tp, fp, lambda_, coverage), #precision + beta_credible_region(tp, fn, lambda_, coverage), #recall + beta_credible_region(tn, fp, lambda_, coverage), #specificity + beta_credible_region(tp+tn, fp+fn, lambda_, coverage), #accuracy + beta_credible_region(tp, fp+fn, lambda_, coverage), #jaccard index + beta_credible_region(2*tp, fp+fn, lambda_, coverage), #f1-score + ) + + def auc(x, y): """Calculates the area under the precision-recall curve (AUC) diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py index 6a993aae6cc8179c990e9510198b55dace41b168..910c795ff023c4526775c173f14f1bdad71f2e8e 100644 --- a/bob/ip/binseg/utils/plot.py +++ b/bob/ip/binseg/utils/plot.py @@ -12,15 +12,15 @@ import matplotlib matplotlib.use("agg") import matplotlib.pyplot as plt -from matplotlib.patches import Ellipse +from matplotlib.patches import Arc import logging logger = logging.getLogger(__name__) -def _concave_hull(x, y, hw, hh): - """Calculates a approximate (concave) hull from ellipse centers and sizes +def _concave_hull(x, y, lx, ux, ly, uy): + """Calculates a approximate (concave) hull from arc centers and sizes Each ellipse is approximated as a number of discrete points distributed over the ellipse border following an homogeneous angle distribution. @@ -35,11 +35,9 @@ def _concave_hull(x, y, hw, hh): y : numpy.ndarray 1D array with y coordinates of ellipse centers - hw : numpy.ndarray - 1D array with half-widths for each ellipse - - hh : numpy.ndarray - 1D array with half-heights for each ellipse + lx, ux, ly, uy : numpy.ndarray + 1D array(s) with upper and lower widths and heights for your deformed + ellipse Returns @@ -47,33 +45,58 @@ def _concave_hull(x, y, hw, hh): points : numpy.ndarray 2D array containing the ``(x, y)`` coordinates of the concave hull - encompassing all defined ellipses. + encompassing all defined arcs. """ - def _ellipse_points(_x, _y, _hw, _hh, steps=100): - """Generates border points for an ellipse + def _irregular_ellipse_points(_x, _y, _lx, _ux, _ly, _uy, steps=100): + """Generates border points for an irregular ellipse This functions distributes points according to a rotation angle rather than uniformily with respect to a particular axis. The result is a more homogeneous border representation for the ellipse. """ - _hw = _hw or 1e-8 - _hh = _hh or 1e-8 - angles = numpy.arange(0, numpy.pi, step=numpy.pi / (steps / 2)) - rx1 = _hw*numpy.cos(angles) - rx2 = _hw*numpy.cos(angles + numpy.pi) - ry1 = (_hh/_hw) * numpy.sqrt(numpy.square(_hw) - numpy.square(rx1)) - ry2 = -(_hh/_hw) * numpy.sqrt(numpy.square(_hw) - numpy.square(rx2)) - return numpy.vstack(( - numpy.hstack((rx1+_x, rx2+_x)), - numpy.hstack((ry1+_y, ry2+_y)), - )).T - - retval = numpy.ndarray((0,2)) - for (k, l, m, n) in zip(x, y, hw, hh): - retval = numpy.vstack((retval, [numpy.nan, numpy.nan], - _ellipse_points(k, l, m, n))) + up = _uy - _y + down = _y - _ly + left = _x - _lx + right = _ux - _x + + angles = numpy.arange(0, numpy.pi/2, step=2*numpy.pi/steps) + points = numpy.ndarray((0, 2)) + + # upper left part (90 -> 180 degrees) + px = 2*left * numpy.cos(angles) + py = (up/left) * numpy.sqrt(numpy.square(2*left) - numpy.square(px)) + # order: x and y increase + points = numpy.vstack((points, numpy.array([_x-px, _y+py]).T)) + + # upper right part (0 -> 90 degrees) + px = 2*right * numpy.cos(angles) + py = (up/right) * numpy.sqrt(numpy.square(2*right) - numpy.square(px)) + # order: x increases and y decreases + points = numpy.vstack((points, numpy.flipud(numpy.array([_x+px, + _y+py]).T))) + + # lower right part (180 -> 270 degrees) + px = 2*right * numpy.cos(angles) + py = (down/right) * numpy.sqrt(numpy.square(2*right) - numpy.square(px)) + # order: x increases and y decreases + points = numpy.vstack((points, numpy.array([_x+px, _y-py]).T)) + + # lower left part (180 -> 270 degrees) + px = 2*left * numpy.cos(angles) + py = (down/left) * numpy.sqrt(numpy.square(2*left) - numpy.square(px)) + # order: x decreases and y increases + points = numpy.vstack((points, numpy.flipud(numpy.array([_x-px, + _y-py]).T))) + + return points + + retval = numpy.ndarray((0, 2)) + for (k, l, m, n, o, p) in zip(x, y, lx, ux, ly, uy): + retval = numpy.vstack( + (retval, [numpy.nan, numpy.nan], _irregular_ellipse_points(k, l, m, n, o, p)) + ) return retval @@ -158,19 +181,18 @@ def _precision_recall_canvas(title=None): plt.tight_layout() -def precision_recall_f1iso(data, confidence=True): - """Creates a precision-recall plot with confidence intervals +def precision_recall_f1iso(data, credible=True): + """Creates a precision-recall plot with credible intervals This function creates and returns a Matplotlib figure with a - precision-recall plot containing shaded confidence intervals (standard - deviation on the precision-recall measurements). The plot will be - annotated with F1-score iso-lines (in which the F1-score maintains the same - value). + precision-recall plot containing shaded credible intervals (on the + precision-recall measurements). The plot will be annotated with F1-score + iso-lines (in which the F1-score maintains the same value). This function specially supports "second-annotator" entries by plotting a line showing the comparison between the default annotator being analyzed - and a second "opinion". Second annotator dataframes contain a single - entry (threshold=0.5), given the nature of the binary map comparisons. + and a second "opinion". Second annotator dataframes contain a single entry + (threshold=0.5), given the nature of the binary map comparisons. Parameters @@ -183,20 +205,28 @@ def precision_recall_f1iso(data, confidence=True): * ``df``: :py:class:`pandas.DataFrame` A dataframe that is produced by our evaluator engine, indexed by - integer "thresholds", containing the following columns: ``threshold`` - (sorted ascending), ``precision``, ``recall``, ``pr_upper`` (upper - precision bounds), ``pr_lower`` (lower precision bounds), - ``re_upper`` (upper recall bounds), ``re_lower`` (lower recall - bounds). + integer "thresholds", containing the following columns: + ``threshold``, ``tp``, ``fp``, ``tn``, ``fn``, ``mean_precision``, + ``mode_precision``, ``lower_precision``, ``upper_precision``, + ``mean_recall``, ``mode_recall``, ``lower_recall``, ``upper_recall``, + ``mean_specificity``, ``mode_specificity``, ``lower_specificity``, + ``upper_specificity``, ``mean_accuracy``, ``mode_accuracy``, + ``lower_accuracy``, ``upper_accuracy``, ``mean_jaccard``, + ``mode_jaccard``, ``lower_jaccard``, ``upper_jaccard``, + ``mean_f1_score``, ``mode_f1_score``, ``lower_f1_score``, + ``upper_f1_score``, ``frequentist_precision``, + ``frequentist_recall``, ``frequentist_specificity``, + ``frequentist_accuracy``, ``frequentist_jaccard``, + ``frequentist_f1_score``. * ``threshold``: :py:class:`list` A threshold to graph with a dot for each set. Specific threshold values do not affect "second-annotator" dataframes. - confidence : :py:class:`bool`, Optional - If set, draw confidence intervals for each line, using ``*_upper`` and - ``*_lower`` entries. + credible : :py:class:`bool`, Optional + If set, draw credible intervals for each line, using ``upper_*`` and + ``lower_*`` entries. Returns @@ -234,12 +264,10 @@ def precision_recall_f1iso(data, confidence=True): # plots only from the point where recall reaches its maximum, # otherwise, we don't see a curve... - max_recall = df["recall"].idxmax() - pi = df.precision[max_recall:] - ri = df.recall[max_recall:] - + max_recall = df.mean_recall.idxmax() + pi = df.mean_precision[max_recall:] + ri = df.mean_recall[max_recall:] valid = (pi + ri) > 0 - f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid]) # optimal point along the curve bins = len(df) @@ -247,14 +275,14 @@ def precision_recall_f1iso(data, confidence=True): index = min(index, len(df) - 1) # avoids out of range indexing # plots Recall/Precision as threshold changes - label = f"{name} (F1={df.f1_score[index]:.4f})" + label = f"{name} (F1={df.mean_f1_score[index]:.4f})" color = next(colorcycler) if len(df) == 1: # plot black dot for F1-score at select threshold (marker,) = axes.plot( - df.recall[index], - df.precision[index], + df.mean_recall[index], + df.mean_precision[index], marker="*", markersize=6, color=color, @@ -262,8 +290,8 @@ def precision_recall_f1iso(data, confidence=True): linestyle="None", ) (line,) = axes.plot( - df.recall[index], - df.precision[index], + df.mean_recall[index], + df.mean_precision[index], linestyle="None", color=color, alpha=0.2, @@ -276,8 +304,8 @@ def precision_recall_f1iso(data, confidence=True): ri[pi > 0], pi[pi > 0], color=color, linestyle=style ) (marker,) = axes.plot( - df.recall[index], - df.precision[index], + df.mean_recall[index], + df.mean_precision[index], marker="o", linestyle=style, markersize=4, @@ -286,29 +314,19 @@ def precision_recall_f1iso(data, confidence=True): ) legend.append(([marker, line], label)) - if confidence: - - # hacky workaround to plot 2nd human - if len(df) == 1: # binary system, very likely - logger.warning("Found 2nd human annotator - patching...") - p = Ellipse( - (df.recall.iloc[0], df.precision.iloc[0]), - 2 * df.std_re.iloc[0], - 2 * df.std_pr.iloc[0], - angle=0, - color=color, - alpha=0.1, - linewidth=0, - ) - else: - hull = _concave_hull( - df.recall, df.precision, df.std_re, df.std_pr - ) - p = plt.Polygon(hull, + if credible: + + hull = _concave_hull(df.mean_recall, df.mean_precision, + df.lower_recall, df.upper_recall, + df.lower_precision, df.upper_precision, + ) + p = plt.Polygon( + hull, facecolor=color, alpha=0.2, edgecolor="none", lw=0.2, + closed=True, ) axes.add_patch(p) legend[-1][0].append(p) @@ -346,7 +364,11 @@ def loss_curve(df): ax1 = df.plot(x="epoch", y="median-loss", grid=True) ax1.set_ylabel("Median Loss") ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2) - ax2 = df["learning-rate"].plot(secondary_y=True, legend=True, grid=True,) + ax2 = df["learning-rate"].plot( + secondary_y=True, + legend=True, + grid=True, + ) ax2.set_ylabel("Learning Rate") ax1.set_xlabel("Epoch") plt.tight_layout() diff --git a/bob/ip/binseg/utils/table.py b/bob/ip/binseg/utils/table.py index 097891260e9171b19b1f89fc40246b344e7815ad..d65de645a9113ddf369fa6bce670be7820938c3e 100644 --- a/bob/ip/binseg/utils/table.py +++ b/bob/ip/binseg/utils/table.py @@ -20,11 +20,19 @@ def performance_table(data, fmt): * ``df``: :py:class:`pandas.DataFrame` A dataframe that is produced by our evaluator engine, indexed by - integer "thresholds", containing the following columns: ``threshold`` - (sorted ascending), ``precision``, ``recall``, ``pr_upper`` (upper - precision bounds), ``pr_lower`` (lower precision bounds), - ``re_upper`` (upper recall bounds), ``re_lower`` (lower recall - bounds). + integer "thresholds", containing the following columns: + ``threshold``, ``tp``, ``fp``, ``tn``, ``fn``, ``mean_precision``, + ``mode_precision``, ``lower_precision``, ``upper_precision``, + ``mean_recall``, ``mode_recall``, ``lower_recall``, ``upper_recall``, + ``mean_specificity``, ``mode_specificity``, ``lower_specificity``, + ``upper_specificity``, ``mean_accuracy``, ``mode_accuracy``, + ``lower_accuracy``, ``upper_accuracy``, ``mean_jaccard``, + ``mode_jaccard``, ``lower_jaccard``, ``upper_jaccard``, + ``mean_f1_score``, ``mode_f1_score``, ``lower_f1_score``, + ``upper_f1_score``, ``frequentist_precision``, + ``frequentist_recall``, ``frequentist_specificity``, + ``frequentist_accuracy``, ``frequentist_jaccard``, + ``frequentist_f1_score``. * ``threshold``: :py:class:`list` @@ -47,14 +55,10 @@ def performance_table(data, fmt): headers = [ "Dataset", "T", - "F1", - "F1\nstd", - "P", - "R", - "F1\nmax", - "P\nmax", - "R\nmax", + "E(F1)", + "CI(F1)", "AUC", + "CI(AUC)", ] table = [] @@ -65,19 +69,19 @@ def performance_table(data, fmt): bins = len(v["df"]) index = int(round(bins*v["threshold"])) index = min(index, len(v["df"])-1) #avoids out of range indexing - entry.append(v["df"].f1_score[index]) - entry.append(v["df"].std_f1[index]) - entry.append(v["df"].precision[index]) - entry.append(v["df"].recall[index]) - - # statistics based on the best threshold (a posteriori, biased) - entry.append(v["df"].f1_score.max()) - f1max_idx = v["df"].f1_score.idxmax() - entry.append(v["df"].precision[f1max_idx]) - entry.append(v["df"].recall[f1max_idx]) - entry.append(auc(v["df"]["recall"].to_numpy(), - v["df"]["precision"].to_numpy())) + entry.append(v["df"].mean_f1_score[index]) + entry.append(f"{v['df'].lower_f1_score[index]:.3f}-{v['df'].upper_f1_score[index]:.3f}") + + # AUC PR curve + entry.append(auc(v["df"]["mean_recall"].to_numpy(), + v["df"]["mean_precision"].to_numpy())) + lower_auc = auc(v["df"]["lower_recall"].to_numpy(), + v["df"]["lower_precision"].to_numpy()) + upper_auc = auc(v["df"]["upper_recall"].to_numpy(), + v["df"]["upper_precision"].to_numpy()) + entry.append(f"{lower_auc:.3f}-{upper_auc:.3f}") table.append(entry) - return tabulate.tabulate(table, headers, tablefmt=fmt, floatfmt=".3f") + return tabulate.tabulate(table, headers, tablefmt=fmt, floatfmt=".3f", + stralign="right") diff --git a/doc/references.rst b/doc/references.rst index df7bce250328e1b985b14051c5b20b4de74034a3..a832b08923e7fc85f23b149f86a6413faefd81e9 100644 --- a/doc/references.rst +++ b/doc/references.rst @@ -112,3 +112,8 @@ .. [SMITH-2017] *L. N. Smith*, **Cyclical Learning Rates for Training Neural Networks**, 2017. https://arxiv.org/abs/1506.01186 + +.. [GOUTTE-2005] *C. Goutte and E. Gaussier*, **A probabilistic interpretation + of precision, recall and F-score, with implication for evaluation**, + European conference on Advances in Information Retrieval Research, 2005. + https://doi.org/10.1007/978-3-540-31865-1_25