diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 2901305574a55af3afdab3fd93302ee14ee86d04..0714289b9894f1d82aa8cfc1a229356971488f8b 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -50,6 +50,51 @@ def _posneg(pred, gt, threshold):
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
+def _sample_measures_for_threshold(pred, gt, threshold):
+    """
+    Calculates measures on one single sample, for a specific threshold
+
+
+    Parameters
+    ----------
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+    threshold : float
+        a particular threshold in which to calculate the performance
+        measures
+
+
+    Returns
+    -------
+
+    measures : list
+
+        A list containing the following values:
+
+        * precision: float
+        * recall: float
+        * specificity: float
+        * accuracy: float
+        * jaccard: float
+        * f1_score: float
+
+    """
+
+    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
+
+    # calc measures from scalars
+    tp_count = torch.sum(tp_tensor).item()
+    fp_count = torch.sum(fp_tensor).item()
+    tn_count = torch.sum(tn_tensor).item()
+    fn_count = torch.sum(fn_tensor).item()
+    return base_measures(tp_count, fp_count, tn_count, fn_count)
+
+
 def _sample_measures(pred, gt, steps):
     """
     Calculates measures on one single sample
@@ -87,40 +132,10 @@ def _sample_measures(pred, gt, steps):
     """
 
     step_size = 1.0 / steps
-    data = []
-
-    for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)):
-
-        tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
-            pred, gt, threshold
-        )
-
-        # calc measures from scalars
-        tp_count = torch.sum(tp_tensor).item()
-        fp_count = torch.sum(fp_tensor).item()
-        tn_count = torch.sum(tn_tensor).item()
-        fn_count = torch.sum(fn_tensor).item()
-        (
-            precision,
-            recall,
-            specificity,
-            accuracy,
-            jaccard,
-            f1_score,
-        ) = base_measures(tp_count, fp_count, tn_count, fn_count)
-
-        data.append(
-            [
-                index,
-                threshold,
-                precision,
-                recall,
-                specificity,
-                accuracy,
-                jaccard,
-                f1_score,
-            ]
-        )
+    data = [
+        [index, threshold] + _sample_measures_for_threshold(pred, gt, threshold)
+        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
+    ]
 
     return pandas.DataFrame(
         data,
@@ -304,7 +319,6 @@ def run(
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             overlay_image.save(fullpath)
 
-
     # Merges all dataframes together
     df_measures = pandas.concat(data.values())
 
@@ -321,8 +335,12 @@ def run(
     # (avg_measures["precision"]+avg_measures["recall"])
 
     avg_measures["std_pr"] = std_measures["precision"]
-    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
-    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["pr_upper"] = (
+        avg_measures["precision"] + std_measures["precision"]
+    )
+    avg_measures["pr_lower"] = (
+        avg_measures["precision"] - std_measures["precision"]
+    )
     avg_measures["std_re"] = std_measures["recall"]
     avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
     avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
@@ -361,8 +379,9 @@ def run(
     return maxf1_threshold
 
 
-def compare_annotators(baseline, other, name, output_folder,
-        overlayed_folder=None):
+def compare_annotators(
+    baseline, other, name, output_folder, overlayed_folder=None
+):
     """
     Compares annotations on the **same** dataset
 
@@ -398,13 +417,15 @@ def compare_annotators(baseline, other, name, output_folder,
     data = {}
 
     for baseline_sample, other_sample in tqdm(
-        list(zip(baseline, other)), desc="samples", leave=False, disable=None,
+        list(zip(baseline, other)), desc="samples", leave=False, disable=None
     ):
-        assert baseline_sample[0] == other_sample[0], f"Mismatch between " \
-                f"datasets for second-annotator analysis " \
-                f"({baseline_sample[0]} != {other_sample[0]}). This " \
-                f"typically occurs when the second annotator (`other`) " \
-                f"comes from a different dataset than the `baseline` dataset"
+        assert baseline_sample[0] == other_sample[0], (
+            f"Mismatch between "
+            f"datasets for second-annotator analysis "
+            f"({baseline_sample[0]} != {other_sample[0]}). This "
+            f"typically occurs when the second annotator (`other`) "
+            f"comes from a different dataset than the `baseline` dataset"
+        )
 
         stem = baseline_sample[0]
         image = baseline_sample[1]
@@ -417,8 +438,9 @@ def compare_annotators(baseline, other, name, output_folder,
         data[stem] = _sample_measures(pred, gt, 2)
 
         if output_folder is not None:
-            fullpath = os.path.join(output_folder, "second-annotator", name,
-                    f"{stem}.csv")
+            fullpath = os.path.join(
+                output_folder, "second-annotator", name, f"{stem}.csv"
+            )
             tqdm.write(f"Saving {fullpath}...")
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             data[stem].to_csv(fullpath)
@@ -427,8 +449,9 @@ def compare_annotators(baseline, other, name, output_folder,
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=0.5, overlay=True
             )
-            fullpath = os.path.join(overlayed_folder, "second-annotator",
-                    name, f"{stem}.png")
+            fullpath = os.path.join(
+                overlayed_folder, "second-annotator", name, f"{stem}.png"
+            )
             tqdm.write(f"Saving {fullpath}...")
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             overlay_image.save(fullpath)
@@ -450,14 +473,20 @@ def compare_annotators(baseline, other, name, output_folder,
     # (avg_measures["precision"]+avg_measures["recall"])
 
     avg_measures["std_pr"] = std_measures["precision"]
-    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
-    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["pr_upper"] = (
+        avg_measures["precision"] + std_measures["precision"]
+    )
+    avg_measures["pr_lower"] = (
+        avg_measures["precision"] - std_measures["precision"]
+    )
     avg_measures["std_re"] = std_measures["recall"]
     avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
     avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
     avg_measures["std_f1"] = std_measures["f1_score"]
 
-    measures_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
+    measures_path = os.path.join(
+        output_folder, "second-annotator", f"{name}.csv"
+    )
     os.makedirs(os.path.dirname(measures_path), exist_ok=True)
     logger.info(f"Saving averages over all input images at {measures_path}...")
     avg_measures.to_csv(measures_path)
diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 4586b9c2632f9fd5aa76e70d48b2457382313f66..68fae32d1f4ee35e7ad523dbf9eedb02ad943109 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -5,11 +5,11 @@ import os
 import itertools
 
 import h5py
-import tqdm
+from tqdm import tqdm
 import pandas
 import torch.nn
 
-from .evaluator import _sample_measures
+from .evaluator import _sample_measures_for_threshold
 
 
 def _patch_measures(pred, gt, threshold, size, stride):
@@ -56,8 +56,6 @@ def _patch_measures(pred, gt, threshold, size, stride):
 
     """
 
-    height, width, stride = size
-
     # we calculate the required padding so that the last windows on the left
     # and bottom size of predictions/ground-truth data are zero padded, and
     # torch unfolding works exactly.
@@ -76,26 +74,37 @@ def _patch_measures(pred, gt, threshold, size, stride):
     pred_patches = pred_padded.unfold(0, size[0], stride[0]).unfold(
         1, size[1], stride[1]
     )
-    gt_patches = gt_padded.unfold(0, size[0], stride).unfold(
-        1, size[1], stride[0]
+    gt_patches = gt_padded.unfold(0, size[0], stride[0]).unfold(
+        1, size[1], stride[1]
     )
     assert pred_patches.shape == gt_patches.shape
     ylen, xlen, _, _ = pred_patches.shape
 
-    dfs = []
-    for j, i in itertools.product(range(ylen), range(xlen)):
-        dfs.append(
-            _sample_measures(
-                pred_patches[j, i, :, :], gt_patches[j, i, :, :], steps
-            )
+    data = [
+        [j, i]
+        + _sample_measures_for_threshold(
+            pred_patches[j, i, :, :], gt_patches[j, i, :, :], threshold
         )
-        dfs[-1]["patch"] = i + (j * xlen)
-
-    return pandas.concat(dfs, ignore_index=True)
+        for j, i in itertools.product(range(ylen), range(xlen))
+    ]
+
+    return pandas.DataFrame(
+        data,
+        columns=(
+            "y",
+            "x",
+            "precision",
+            "recall",
+            "specificity",
+            "accuracy",
+            "jaccard",
+            "f1_score",
+        ),
+    )
 
 
 def patch_performances(
-    dataset, name, predictions_folder, threshold, size, stride,
+    dataset, name, predictions_folder, threshold, size, stride
 ):
     """
     Evaluates the performances for multiple image patches, for a whole dataset
@@ -144,7 +153,7 @@ def patch_performances(
     if not os.path.exists(use_predictions_folder):
         use_predictions_folder = predictions_folder
 
-    for sample in tqdm(dataset):
+    for sample in tqdm(dataset[name]):
         stem = sample[0]
         image = sample[1]
         gt = sample[2]
@@ -153,6 +162,6 @@ def patch_performances(
             pred = f["array"][:]
         pred = torch.from_numpy(pred)
         data.append(_patch_measures(pred, gt, threshold, size, stride))
-        data['stem'] = stem
+        data[-1]["stem"] = stem
 
     return pandas.concat(data, ignore_index=True)
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 2207738a52ed193db78e68c288ef681638fc3835..2700d0c2c7a2a7772bd6dc9b0b144b5c628bd30a 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -10,6 +10,7 @@ from bob.extension.scripts.click_helper import (
     ResourceOption,
 )
 
+import numpy
 import scipy.stats
 
 import logging
@@ -114,7 +115,7 @@ from ..engine.significance import patch_performances
     "respectively.",
     default=(128, 128),
     nargs=2,
-    type=float,
+    type=int,
    show_default=True,
     required=True,
     cls=ResourceOption,
@@ -127,7 +128,7 @@ from ..engine.significance import patch_performances
     "respectively.",
     default=(32, 32),
     nargs=2,
-    type=float,
+    type=int,
     show_default=True,
     required=True,
     cls=ResourceOption,
@@ -161,39 +162,46 @@ def significance(
 
     assert threshold in dataset, f"No dataset named '{threshold}'"
 
-    logger.info(f"Evaluating threshold on '{threshold}' set for system 1")
+    logger.info(f"Evaluating threshold on '{threshold}' set for system 1 using {steps} steps")
     threshold1 = run_evaluation(
         dataset[threshold], threshold, predictions_1, steps=steps
     )
-    logger.info(f"Set --threshold={threshold:.5f} for system 1")
+    logger.info(f"Set --threshold={threshold1:.5f} for system 1")
 
-    logger.info(f"Evaluating threshold on '{threshold}' set for system 2")
+    logger.info(f"Evaluating threshold on '{threshold}' set for system 2 using {steps} steps")
     threshold2 = run_evaluation(
         dataset[threshold], threshold, predictions_2, steps=steps
     )
-    logger.info(f"Set --threshold={threshold:.5f} for system 2")
+    logger.info(f"Set --threshold={threshold2:.5f} for system 2")
 
     # for a given threshold on each system, calculate patch performances
-    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1")
-    perf1 = patch_performances(data, evaluate, predictions_1, threshold1,
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1 using windows of size {size} and stride {stride}")
+    perf1 = patch_performances(dataset, evaluate, predictions_1, threshold1,
             size, stride)
 
-    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2")
-    perf2 = patch_performances(data, evaluate, predictions_2, threshold2,
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2 using windows of size {size} and stride {stride}")
+    perf2 = patch_performances(dataset, evaluate, predictions_2, threshold2,
            size, stride)
 
     ###### MAGIC STARTS #######
 
     # load all F1-scores for the given threshold
     da = perf1.f1_score
-    #import matplotlib
-    #matplotlib.use('macosx')
-    #import matplotlib.pyplot as plt
     db = perf2.f1_score
-    #plt.boxplot([da, db])
-    #plt.hist(numpy.array(da)-db, bins=6)
-    #plt.show()
     diff = da - db
+
+    import matplotlib
+    import matplotlib.pyplot as plt
+    plt.subplot(2,2,1)
+    plt.boxplot([da, db])
+    plt.title('Systems 1 and 2')
+    plt.subplot(2,2,2)
+    plt.boxplot(diff)
+    plt.title('Differences (1 - 2)')
+    plt.subplot(2,1,2)
+    plt.hist(diff, bins=50)
+    plt.title('Histogram (1 - 2)')
+    plt.savefig('analysis.pdf')
+
     #diff = diff[diff!=0.0]
     #click.echo(diff)
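
The evaluator.py half of the patch factors the per-threshold computation out of _sample_measures() into _sample_measures_for_threshold(), so the same helper can be reused by the patch-level code in significance.py. The self-contained sketch below mirrors that structure for illustration only: it uses a reduced precision/recall/F1 measure set and synthetic tensors instead of the project's _posneg() and base_measures() helpers, so the names and numbers here are assumptions, not the library's API.

# Standalone sketch of the evaluator.py refactor: a helper computes the scalar
# measures for one threshold, and the per-sample threshold sweep becomes a
# list comprehension feeding a DataFrame.  Illustrative only; the real code
# delegates to bob.ip.binseg's base_measures()/_posneg().

import numpy
import pandas
import torch


def measures_for_threshold(pred, gt, threshold):
    """Returns [precision, recall, f1_score] for a single threshold."""
    binary = pred >= threshold
    gt = gt.bool()
    tp = (binary & gt).sum().item()
    fp = (binary & ~gt).sum().item()
    fn = (~binary & gt).sum().item()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return [precision, recall, f1]


def sample_measures(pred, gt, steps):
    """Sweeps `steps` equally-spaced thresholds in [0, 1) for one sample."""
    step_size = 1.0 / steps
    data = [
        [index, threshold] + measures_for_threshold(pred, gt, threshold)
        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
    ]
    return pandas.DataFrame(
        data, columns=("index", "threshold", "precision", "recall", "f1_score")
    )


pred = torch.rand(64, 64)        # synthetic probability map
gt = torch.rand(64, 64) > 0.7    # synthetic binary ground-truth
print(sample_measures(pred, gt, steps=10))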
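
The script half of the patch stops inside the "MAGIC STARTS" block, before any statistical test is applied, but the scipy.stats import and the per-patch F1 differences (diff = da - db) point towards a paired comparison of the two systems. The sketch below is one guess at that missing step, not the author's code: the window size, stride, threshold and the synthetic predictions are invented, and the Wilcoxon signed-rank test is simply a reasonable choice of paired, non-parametric test over per-window differences.

# Hypothetical end-to-end illustration of the comparison that the new
# patch_performances()/_patch_measures() enable: binarize two probability maps
# at a fixed threshold, compute one F1 score per sliding window, then run a
# paired Wilcoxon signed-rank test on the per-window differences.

import torch
import scipy.stats


def window_f1(pred, gt, threshold, size=(32, 32), stride=(16, 16)):
    """One F1 score per (size, stride) window of a single image."""
    b = (pred >= threshold).float()
    g = gt.float()
    # unfold rows (dim 0) then columns (dim 1) into overlapping windows
    b = b.unfold(0, size[0], stride[0]).unfold(1, size[1], stride[1])
    g = g.unfold(0, size[0], stride[0]).unfold(1, size[1], stride[1])
    tp = (b * g).sum(dim=(-1, -2))
    fp = (b * (1 - g)).sum(dim=(-1, -2))
    fn = ((1 - b) * g).sum(dim=(-1, -2))
    return (2 * tp / (2 * tp + fp + fn + 1e-8)).flatten()


torch.manual_seed(42)
gt = (torch.rand(128, 128) > 0.7).float()         # synthetic ground-truth
system1 = gt * 0.8 + 0.2 * torch.rand(128, 128)   # close to the ground-truth
system2 = torch.rand(128, 128)                    # chance-level predictions

f1_1 = window_f1(system1, gt, threshold=0.5)
f1_2 = window_f1(system2, gt, threshold=0.5)

# paired, non-parametric test on the per-window differences
w, p = scipy.stats.wilcoxon(f1_1.numpy(), f1_2.numpy())
print(f"windows={len(f1_1)}  median diff={float((f1_1 - f1_2).median()):.3f}  p={p:.3e}")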