diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 37fb4e2b9f37ae6151fddf61bb141f498a33c4a0..2901305574a55af3afdab3fd93302ee14ee86d04 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -4,7 +4,6 @@
 """Defines functionality for the evaluation of predictions"""
 
 import os
-import itertools
 
 import PIL
 import numpy
@@ -23,9 +22,6 @@ import logging
 logger = logging.getLogger(__name__)
 
-_PATCH_CONFIG = (128, 128, 32)
-"""Stock configuration for patch analysis"""
-
 
 def _posneg(pred, gt, threshold):
     """Calculates true and false positives and negatives"""
@@ -141,81 +137,6 @@ def _sample_measures(pred, gt, steps):
     )
 
 
-def _patch_measures(pred, gt, steps, size):
-    """
-    Calculates measures on patches of a single sample
-
-
-    Parameters
-    ----------
-
-    pred : torch.Tensor
-        pixel-wise predictions
-
-    gt : torch.Tensor
-        ground-truth (annotations)
-
-    steps : int
-        number of steps to use for threshold analysis. The step size is
-        calculated from this by dividing ``1.0/steps``
-
-    size : :py:class:`tuple`
-        A tripplet with three integers indicating the height, width, and stride
-        of patches to break measure analysis into. In this case, the input
-        image and ground-truth will be cut into blocks of the provided height
-        and width, overlapping by the total overlap size, starting on the top
-        left corner and then moving right and to the bottom. Windows on the
-        left and bottom edge of the image may be incomplete.
-
-
-    Returns
-    -------
-
-    measures : pandas.DataFrame
-
-        A pandas dataframe with the following columns:
-
-        * patch: int
-        * threshold: float
-        * precision: float
-        * recall: float
-        * specificity: float
-        * accuracy: float
-        * jaccard: float
-        * f1_score: float
-
-    """
-
-    height, width, stride = size
-
-    # we calculate the required padding so that the last windows on the left
-    # and bottom size of predictions/ground-truth data are zero padded, and
-    # torch unfolding works exactly.
-    padding = (0, 0)
-    rem = (pred.shape[1] - width) % stride
-    if rem != 0:
-        padding = (0, (stride-rem))
-    rem = (pred.shape[0] - height) % stride
-    if rem != 0:
-        padding += (0, (stride-rem))
-
-    pred_padded = torch.nn.functional.pad(pred, padding)
-    gt_padded = torch.nn.functional.pad(gt.squeeze(0), padding)
-
-    # this will create as many views as required
-    pred_patches = pred_padded.unfold(0, height, stride).unfold(1, width, stride)
-    gt_patches = gt_padded.unfold(0, height, stride).unfold(1, width, stride)
-    assert pred_patches.shape == gt_patches.shape
-    ylen, xlen, _, _ = pred_patches.shape
-
-    dfs = []
-    for j, i in itertools.product(range(ylen), range(xlen)):
-        dfs.append(_sample_measures(pred_patches[j,i,:,:], gt_patches[j,i,:,:], steps))
-        dfs[-1]['patch'] = i+(j*xlen)
-
-    return pandas.concat(dfs, ignore_index=True)
-
-
 def _sample_analysis(
     img,
     pred,
@@ -374,12 +295,6 @@ def run(
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             data[stem].to_csv(fullpath)
 
-        # saves patch analysis
-        fullpath = os.path.join(output_folder, name, "patches", f"{stem}.csv")
-        tqdm.write(f"Saving {fullpath}...")
-        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
-        _patch_measures(pred, gt, steps, _PATCH_CONFIG).to_csv(fullpath)
-
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=threshold, overlay=True
@@ -508,13 +423,6 @@ def compare_annotators(baseline, other, name, output_folder,
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             data[stem].to_csv(fullpath)
 
-        # saves patch analysis
-        fullpath = os.path.join(output_folder, "second-annotator", name,
-                "patches", f"{stem}.csv")
-        tqdm.write(f"Saving {fullpath}...")
-        os.makedirs(os.path.dirname(fullpath), exist_ok=True)
-        _patch_measures(pred, gt, 2, _PATCH_CONFIG).to_csv(fullpath)
-
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=0.5, overlay=True
diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
new file mode 100644
index 0000000000000000000000000000000000000000..4586b9c2632f9fd5aa76e70d48b2457382313f66
--- /dev/null
+++ b/bob/ip/binseg/engine/significance.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import os
+import itertools
+
+import h5py
+import tqdm
+import pandas
+import torch.nn
+
+from .evaluator import _sample_measures
+
+
+def _patch_measures(pred, gt, threshold, size, stride):
+    """
+    Calculates measures on patches of a single sample
+
+
+    Parameters
+    ----------
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+    threshold : float
+        threshold to use for evaluating individual patch performances
+
+    size : tuple
+        size (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+    stride : tuple
+        strides (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+
+    Returns
+    -------
+
+    measures : pandas.DataFrame
+
+        A pandas dataframe with the following columns:
+
+        * patch: int
+        * threshold: float
+        * precision: float
+        * recall: float
+        * specificity: float
+        * accuracy: float
+        * jaccard: float
+        * f1_score: float
+
+    """
+
+    # we calculate the required padding so that the last windows on the right
+    # and bottom side of the predictions/ground-truth data are zero padded,
+    # and torch unfolding works exactly
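+    # (illustrative example, not from the original code: for a prediction of
+    # width 605 with size[1] == 128 and stride[1] == 32, (605 - 128) % 32 == 29,
+    # so we pad 32 - 29 == 3 columns on the right, reaching width 608 and
+    # yielding exactly ((608 - 128) // 32) + 1 == 16 windows per row)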
+    padding = (0, 0)
+    rem = (pred.shape[1] - size[1]) % stride[1]
+    if rem != 0:
+        padding = (0, (stride[1] - rem))
+    rem = (pred.shape[0] - size[0]) % stride[0]
+    if rem != 0:
+        padding += (0, (stride[0] - rem))
+
+    pred_padded = torch.nn.functional.pad(pred, padding)
+    gt_padded = torch.nn.functional.pad(gt.squeeze(0), padding)
+
+    # this will create as many views as required
+    pred_patches = pred_padded.unfold(0, size[0], stride[0]).unfold(
+        1, size[1], stride[1]
+    )
+    gt_patches = gt_padded.unfold(0, size[0], stride[0]).unfold(
+        1, size[1], stride[1]
+    )
+    assert pred_patches.shape == gt_patches.shape
+    ylen, xlen, _, _ = pred_patches.shape
+
+    # evaluates measures for each patch and keeps only the row whose threshold
+    # is closest to the requested one, so that every patch contributes exactly
+    # one entry to the returned dataframe.  NOTE: the number of scanning steps
+    # below is an assumed default, since the original code left ``steps``
+    # undefined at this point.
+    steps = 100
+    dfs = []
+    for j, i in itertools.product(range(ylen), range(xlen)):
+        measures = _sample_measures(
+            pred_patches[j, i, :, :], gt_patches[j, i, :, :], steps
+        )
+        closest = (measures.threshold - threshold).abs().idxmin()
+        measures = measures.loc[[closest]].copy()
+        measures["patch"] = i + (j * xlen)
+        dfs.append(measures)
+
+    return pandas.concat(dfs, ignore_index=True)
+
+
+def patch_performances(
+    dataset, name, predictions_folder, threshold, size, stride,
+):
+    """
+    Evaluates the performances for multiple image patches, for a whole dataset
+
+
+    Parameters
+    ----------
+
+    dataset : :py:class:`torch.utils.data.Dataset`
+        a dataset to iterate on
+
+    name : str
+        the local name of this dataset (e.g. ``train``, or ``test``), used to
+        locate the folder where predictions have been previously stored
+
+    predictions_folder : str
+        folder where predictions for the dataset images have been previously
+        stored
+
+    threshold : :py:class:`float`
+        this should be a threshold (floating point) to apply to prediction maps
+        to decide on positives and negatives.
+
+    size : tuple
+        size (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+    stride : tuple
+        strides (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+
+    Returns
+    -------
+
+    df : pandas.DataFrame
+        A dataframe with all the patch performances aggregated, for all input
+        images.
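+
+    Example
+    -------
+
+    A minimal sketch (the paths and split name below are hypothetical)::
+
+        perf = patch_performances(dataset["test"], "test",
+            "path/to/predictions", threshold=0.5, size=(128, 128),
+            stride=(32, 32))
+        print(perf.groupby("stem").f1_score.describe())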
+
+    """
+
+    # Collect overall measures
+    data = []
+
+    use_predictions_folder = os.path.join(predictions_folder, name)
+    if not os.path.exists(use_predictions_folder):
+        use_predictions_folder = predictions_folder
+
+    for sample in tqdm.tqdm(dataset):
+        stem = sample[0]
+        image = sample[1]
+        gt = sample[2]
+        pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
+        with h5py.File(pred_fullpath, "r") as f:
+            pred = f["array"][:]
+        pred = torch.from_numpy(pred)
+        measures = _patch_measures(pred, gt, threshold, size, stride)
+        measures["stem"] = stem
+        data.append(measures)
+
+    return pandas.concat(data, ignore_index=True)
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
old mode 100644
new mode 100755
index ff7dd440f6e88990f445416e68c8491e23ddbeeb..2207738a52ed193db78e68c288ef681638fc3835
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -2,154 +2,217 @@
 # coding=utf-8
 
 import os
-import pathlib
-
 import click
-import numpy
-import scipy.stats
 
 from bob.extension.scripts.click_helper import (
     verbosity_option,
-    AliasedGroup,
+    ConfigCommand,
+    ResourceOption,
 )
 
-import pandas
-
+import numpy
+import scipy.stats
 import logging
-logger = logging.getLogger(__name__)
-
-
-def _get_threshold(csvfile):
-    """Returns the thresholds that maximizes the F1-score from an aggreated scan
-
-    Parameters
-    ==========
-
-    csvfile : str
-        Path to a CSV file containing the aggreated performance across various
-        thresholds for a validation set.
-
-
-    Returns
-    =======
-
-    threshold : float
-        A float threshold
-
-    """
-
-    df = pandas.read_csv(csvfile)
-    threshold = df.threshold[df.f1_score.idxmax()]
-    logger.info(f"Threshold for '{csvfile}' = {threshold:.3f}'")
-    return threshold
-
-
-def _find_files(path, expression):
-    """Finds a files with names matching the glob expression recursively
-
-
-    Parameters
-    ==========
-
-    path : str
-        The base path where to search for files
-
-    expression : str
-        A glob expression to search for filenames (e.g. ``"*.csv"``)
-
-
-    Returns
-    =======
-
-    l : list
-        A sorted list (by name) of relative file paths found under ``path``
-
-    """
-
-    return sorted([k.name for k in pathlib.Path(path).rglob(expression)])
-
-def _load_score(path, threshold):
-    """Loads the specific score (at threshold "t") from an analysis file
-
-
-    Parameters
-    ==========
-
-    path : string
-        Path pointing to the CSV files from where to load the scores
-
-    threshold : float
-        Threshold value to use for picking the score
-
-
-    Returns
-    =======
-
-    value : float
-        Value representing the score at the given threshold inside the file
+logger = logging.getLogger(__name__)
 
-    """
-
-    df = pandas.read_csv(path)
-    bins = len(df)
-    index = int(round(bins * threshold))
-    index = min(index, len(df) - 1)  # avoids out of range indexing
-    return df.f1_score[index]
+from .evaluate import _validate_threshold, run as run_evaluation
+from ..engine.significance import patch_performances
 
 
 @click.command(
+    entry_point_group="bob.ip.binseg.config",
+    cls=ConfigCommand,
     epilog="""Examples:
 
 \b
-    1. Measures if systems A and B are significantly different for the same
-       input data, at a threshold optimized on specific validation sets.
+    1. Runs a significance test using as base the calculated predictions of two
+       different systems (1, the first and 2, the second), on the **same**
+       dataset:
+\b
+       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2
+\b
+    2. By default, we use a "validation" dataset if it is available, to infer
+       the a priori threshold for the comparison of two systems. Otherwise,
+       you may need to specify the name of a set to be used as validation set
+       for choosing a threshold. The same goes for the set to be used for
+       testing the hypothesis - by default we use the "test" dataset if it is
+       available, otherwise, specify.
 \b
-    $ bob binseg significance -vv A path/to/A/valid.csv path/to/A/test B path/to/B/valid.csv path/to/B/test
+       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
 """,
 )
-@click.argument(
-    'label_valid_path',
-    nargs=-1,
-    )
-@verbosity_option()
-def significance(label_valid_path, **kwargs):
-    """Check if systems A and B are significantly different in performance
-
-    Significance testing is done through a `Wilcoxon Test`_.
+@click.option(
+    "--predictions-1",
+    "-p",
+    help="Path where predictions of system 1 are currently stored. You may "
+    "also input predictions from a second-annotator. This application "
+    "will adequately handle it.",
+    required=True,
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    cls=ResourceOption,
+)
+@click.option(
+    "--predictions-2",
+    "-P",
+    help="Path where predictions of system 2 are currently stored. You may "
+    "also input predictions from a second-annotator. This application "
+    "will adequately handle it.",
+    required=True,
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    cls=ResourceOption,
+)
+@click.option(
+    "--dataset",
+    "-d",
+    help="A dictionary mapping string keys to "
+    "torch.utils.data.dataset.Dataset instances",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--threshold",
+    "-t",
+    help="This number is used to define positives and negatives from "
+    "probability maps, and report F1-scores (a priori). By default, we "
+    "expect a set named 'validation' to be available at the input data. "
+    "If that is not the case, we use 'train', if available. You may provide "
+    "the name of another dataset to be used for threshold tuning otherwise. "
+    "If a string is input, threshold tuning is done per system, "
+    "individually. Optionally, you may also provide a floating-point number "
+    "between [0.0, 1.0] as the threshold to use for both systems.",
+    default='validation',
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--evaluate",
+    "-e",
+    help="Name of the dataset to evaluate",
+    default='test',
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--steps",
+    "-S",
+    help="This number is used to define the number of threshold steps to "
+    "consider when evaluating the highest possible F1-score on train/test data.",
+    default=1000,
+    type=int,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--size",
+    "-s",
+    help="This is a tuple with two values indicating the size of windows to "
+    "be used for patch analysis. The values represent height and width "
+    "respectively.",
+    default=(128, 128),
+    nargs=2,
+    type=int,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--stride",
+    help="This is a tuple with two values indicating the stride of windows to "
+    "be used for patch analysis. The values represent height and width "
+    "respectively.",
+    default=(32, 32),
+    nargs=2,
+    type=int,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@verbosity_option(cls=ResourceOption)
+def significance(
+    predictions_1,
+    predictions_2,
+    dataset,
+    threshold,
+    evaluate,
+    steps,
+    size,
+    stride,
+    **kwargs,
+):
+    """Evaluates how significantly different two models are on the same dataset
+
+    This application calculates the statistical significance of the difference
+    between the results of two models operating on the same dataset, subject
+    to a priori threshold tuning.
     """
 
-    # hack to get a dictionary from arguments passed to input
-    if len(label_valid_path) % 3 != 0:
-        raise click.ClickException("Input label-validation-paths should be "
-                " tripplets composed of name-path-path entries")
+    # minimal validation to startup
+    threshold = _validate_threshold(threshold, dataset)
+    assert evaluate in dataset, f"No dataset named '{evaluate}'"
 
-    # split input into 2 systems
-    sa, sb = list(zip(*(iter(label_valid_path),)*3))
+    if isinstance(threshold, float):
+        threshold1 = threshold2 = threshold
 
-    # sanity check file lists
-    fa = _find_files(sa[2], "*.csv")
-    fb = _find_files(sb[2], "*.csv")
-    assert fa == fb, f"List of files mismatched between '{sa[2]}' " \
-            f"and '{sb[2]}' - please check"
+    else:  # it is a string, re-calculate it for each system individually
 
-    # now run analysis
-    ta = _get_threshold(sa[1])
-    tb = _get_threshold(sb[1])
+        assert threshold in dataset, f"No dataset named '{threshold}'"
 
-    # load all F1-scores for the given threshold
-    da = [_load_score(os.path.join(sa[2], k), ta) for k in fa]
-    db = [_load_score(os.path.join(sb[2], k), tb) for k in fb]
+        logger.info(f"Evaluating threshold on '{threshold}' set for system 1")
+        threshold1 = run_evaluation(
+            dataset[threshold], threshold, predictions_1, steps=steps
+        )
+        logger.info(f"Set --threshold={threshold1:.5f} for system 1")
+
+        logger.info(f"Evaluating threshold on '{threshold}' set for system 2")
+        threshold2 = run_evaluation(
+            dataset[threshold], threshold, predictions_2, steps=steps
+        )
+        logger.info(f"Set --threshold={threshold2:.5f} for system 2")
+
+    # for a given threshold on each system, calculate patch performances
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1")
+    perf1 = patch_performances(dataset[evaluate], evaluate, predictions_1,
+            threshold1, size, stride)
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2")
+    perf2 = patch_performances(dataset[evaluate], evaluate, predictions_2,
+            threshold2, size, stride)
 
-    click.echo("Median F1-scores:")
-    click.echo(f"* {sa[0]}: {numpy.median(da):.3f}")
-    click.echo(f"* {sb[0]}: {numpy.median(db):.3f}")
-
-    w, p = scipy.stats.wilcoxon(da, db)
+    # load all per-patch F1-scores for the chosen thresholds and compare the
+    # two distributions
+    da = perf1.f1_score
+    db = perf2.f1_score
+    diff = da - db
+
+    click.echo("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:")
+    click.echo(f"* system1: {len(da)} / {numpy.median(da):.3f} / {numpy.mean(da):.3f} / {numpy.std(da, ddof=1):.3f} / {scipy.stats.normaltest(da)[1]}")
+    click.echo(f"* system2: {len(db)} / {numpy.median(db):.3f} / {numpy.mean(db):.3f} / {numpy.std(db, ddof=1):.3f} / {scipy.stats.normaltest(db)[1]}")
+    click.echo(f"* system1-system2: {len(diff)} / {numpy.median(diff):.3f} / {numpy.mean(diff):.3f} / {numpy.std(diff, ddof=1):.3f} / {scipy.stats.normaltest(diff)[1]}")
+
+    w, p = scipy.stats.ttest_rel(da, db)
+    click.echo(f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}")
+
+    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
+    click.echo(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}")
+
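+    # The Wilcoxon signed-rank tests below complement the t-tests above: they
+    # do not assume that the per-patch F1-score differences are normally
+    # distributed (see the normality-test probabilities printed above).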
+    w, p = scipy.stats.wilcoxon(diff)
     click.echo(f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}")
 
-    w, p = scipy.stats.wilcoxon(da, db, alternative="greater")
-    click.echo(f"Wilcoxon test (md({sa[0]}) < md({sb[0]})?): W = {w:g}, p = {p:.5f}")
+    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+    click.echo(f"Wilcoxon test (md(system1) < md(system2)?): W = {w:g}, p = {p:.5f}")
 
-    w, p = scipy.stats.wilcoxon(da, db, alternative="less")
-    click.echo(f"Wilcoxon test (md({sa[0]}) > md({sb[0]})?): W = {w:g}, p = {p:.5f}")
+    w, p = scipy.stats.wilcoxon(diff, alternative="less")
+    click.echo(f"Wilcoxon test (md(system1) > md(system2)?): W = {w:g}, p = {p:.5f}")
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 2949d86fe83ecf184d21ba13316fc2dbec38dd07..97f3dc25e9685f4e78b7dcd37310edd3819de42d 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -151,11 +151,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10
         )
-        traindir = os.path.join(eval_folder, "train", "patches", "stare-images")
-        assert os.path.exists(traindir)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10
-        )
 
         assert os.path.exists(os.path.join(eval_folder, "test.csv"))
 
         # checks individual performance figures are there
@@ -164,11 +159,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
         )
-        testdir = os.path.join(eval_folder, "test", "patches", "stare-images")
-        assert os.path.exists(testdir)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
-        )
 
         assert os.path.exists(
             os.path.join(eval_folder, "second-annotator", "train.csv")
@@ -180,12 +170,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10
         )
-        traindir_sa = os.path.join(eval_folder, "second-annotator", "patches",
-                "train", "stare-images")
-        assert os.path.exists(traindir_sa)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10
-        )
 
         assert os.path.exists(
             os.path.join(eval_folder, "second-annotator", "test.csv")
@@ -196,12 +180,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
         )
-        testdir_sa = os.path.join(eval_folder, "second-annotator", "patches",
-                "test", "stare-images")
-        assert os.path.exists(testdir_sa)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
-        )
 
         overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
         traindir = os.path.join(overlay_folder, "train", "stare-images")
@@ -461,11 +439,6 @@ def _check_evaluate(runner):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
         )
-        testdir = os.path.join(output_folder, "test", "patches", "stare-images")
-        assert os.path.exists(testdir)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
-        )
 
         assert os.path.exists(
             os.path.join(output_folder, "second-annotator", "test.csv")
@@ -477,12 +450,6 @@ def _check_evaluate(runner):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
         )
-        testdir_sa = os.path.join(output_folder, "second-annotator", "test",
-                "patches", "stare-images")
-        assert os.path.exists(testdir_sa)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
-        )
 
         # check overlayed images are there (since we requested them)
         basedir = os.path.join(overlay_folder, "test", "stare-images")