diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 8d3b11955f194d82ca837c2f878320e3be475550..8667300ebc47a024a1d3a01b8d92fa0cd975de17 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -288,6 +288,12 @@ def run(
         )
         data[stem] = _sample_measures(pred, gt, steps)
 
+        if output_folder is not None:
+            fullpath = os.path.join(output_folder, name, f"{stem}.csv")
+            tqdm.write(f"Saving {fullpath}...")
+            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
+            data[stem].to_csv(fullpath)
+
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=threshold, overlay=True
@@ -297,6 +303,7 @@ def run(
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             overlay_image.save(fullpath)
 
+    # Merges all dataframes together
     df_measures = pandas.concat(data.values())
 
diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py
index dd06106de0a5a4ba7871a9a5910aaa4fa887f2f7..cc899ffb6a61b7debff1fc0fa8bc48a309c71142 100644
--- a/bob/ip/binseg/script/compare.py
+++ b/bob/ip/binseg/script/compare.py
@@ -86,8 +86,6 @@ def _load(data, threshold=None):
                     f"'{threshold}' dataset...")
             measures_path = data[threshold]
             df = pandas.read_csv(measures_path)
-
-            maxf1 = df.f1_score.max()
             use_threshold = df.threshold[df.f1_score.idxmax()]
             logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'")
 
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index cbd11aaeec69e001fd8edc18c1323bef38cec4fd..3e923636c872da06cd1684a324009eab82589bbf 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -131,7 +131,7 @@ def _validate_threshold(t, dataset):
     "dataset as input, this may also be the name of an existing set from "
    "which the threshold will be estimated (highest F1-score) and then "
    "applied to the subsequent sets. This number is also used to print "
-    "the test set F1-score a priori performance (default: 0.5)",
+    "the test set F1-score a priori performance",
    default=None,
    show_default=False,
    required=False,
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7dd440f6e88990f445416e68c8491e23ddbeeb
--- /dev/null
+++ b/bob/ip/binseg/script/significance.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import os
+import pathlib
+
+import click
+import numpy
+import scipy.stats
+
+from bob.extension.scripts.click_helper import verbosity_option
+
+import pandas
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def _get_threshold(csvfile):
+    """Returns the threshold that maximizes the F1-score from an aggregated scan
+
+    Parameters
+    ==========
+
+    csvfile : str
+        Path to a CSV file containing the aggregated performance across
+        various thresholds for a validation set.
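+
+        The file is expected to contain (at least) ``threshold`` and
+        ``f1_score`` columns, as read below.  For illustration only, a
+        compatible file could look like this::
+
+            threshold,f1_score
+            0.1,0.602
+            0.5,0.748
+            0.9,0.633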
+
+
+    Returns
+    =======
+
+    threshold : float
+        A float threshold
+
+    """
+
+    df = pandas.read_csv(csvfile)
+    threshold = df.threshold[df.f1_score.idxmax()]
+    logger.info(f"Threshold for '{csvfile}' = {threshold:.3f}")
+    return threshold
+
+
+def _find_files(path, expression):
+    """Finds files with names matching the glob expression, recursively
+
+
+    Parameters
+    ==========
+
+    path : str
+        The base path where to search for files
+
+    expression : str
+        A glob expression to search for filenames (e.g. ``"*.csv"``)
+
+
+    Returns
+    =======
+
+    l : list
+        A sorted list (by name) of file paths found under ``path``, relative
+        to it
+
+    """
+
+    return sorted([str(k.relative_to(path))
+        for k in pathlib.Path(path).rglob(expression)])
+
+
+def _load_score(path, threshold):
+    """Loads the F1-score at a given threshold from an analysis file
+
+
+    Parameters
+    ==========
+
+    path : str
+        Path to the CSV file from where to load the scores
+
+    threshold : float
+        Threshold value to use for picking the score
+
+
+    Returns
+    =======
+
+    value : float
+        The F1-score inside the file, at the given threshold
+
+    """
+
+    df = pandas.read_csv(path)
+    bins = len(df)
+    # assumes rows correspond to uniformly spaced thresholds in [0, 1]
+    index = int(round(bins * threshold))
+    index = min(index, len(df) - 1)  # avoids out-of-range indexing
+    return df.f1_score[index]
+
+
+@click.command(
+    epilog="""Examples:
+
+\b
+    1. Tests if systems A and B are significantly different for the same
+       input data, at thresholds optimized on their respective validation
+       sets:
+\b
+       $ bob binseg significance -vv A path/to/A/valid.csv path/to/A/test B path/to/B/valid.csv path/to/B/test
+""",
+)
+@click.argument(
+    'label_valid_path',
+    nargs=-1,
+    )
+@verbosity_option()
+def significance(label_valid_path, **kwargs):
+    """Check if systems A and B are significantly different in performance
+
+    Significance testing is done through a `Wilcoxon Test`_.
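+
+    Three tests are run on the paired per-image F1-scores of both systems:
+    a two-sided test (are the score differences centered on zero?) and two
+    one-sided tests (is the median F1-score of one system larger, or
+    smaller, than that of the other?).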
+ """ + + # hack to get a dictionary from arguments passed to input + if len(label_valid_path) % 3 != 0: + raise click.ClickException("Input label-validation-paths should be " + " tripplets composed of name-path-path entries") + + # split input into 2 systems + sa, sb = list(zip(*(iter(label_valid_path),)*3)) + + # sanity check file lists + fa = _find_files(sa[2], "*.csv") + fb = _find_files(sb[2], "*.csv") + assert fa == fb, f"List of files mismatched between '{sa[2]}' " \ + f"and '{sb[2]}' - please check" + + # now run analysis + ta = _get_threshold(sa[1]) + tb = _get_threshold(sb[1]) + + # load all F1-scores for the given threshold + da = [_load_score(os.path.join(sa[2], k), ta) for k in fa] + db = [_load_score(os.path.join(sb[2], k), tb) for k in fb] + + click.echo("Median F1-scores:") + click.echo(f"* {sa[0]}: {numpy.median(da):.3f}") + click.echo(f"* {sb[0]}: {numpy.median(db):.3f}") + + w, p = scipy.stats.wilcoxon(da, db) + click.echo(f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}") + + w, p = scipy.stats.wilcoxon(da, db, alternative="greater") + click.echo(f"Wilcoxon test (md({sa[0]}) < md({sb[0]})?): W = {w:g}, p = {p:.5f}") + + w, p = scipy.stats.wilcoxon(da, db, alternative="less") + click.echo(f"Wilcoxon test (md({sa[0]}) > md({sb[0]})?): W = {w:g}, p = {p:.5f}") diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py index 52b0864dcfbef95f2a1d4f1e3d4cfae639c2b013..97f3dc25e9685f4e78b7dcd37310edd3819de42d 100644 --- a/bob/ip/binseg/test/test_cli.py +++ b/bob/ip/binseg/test/test_cli.py @@ -145,13 +145,41 @@ def _check_experiment_stare(overlay): # check evaluation outputs eval_folder = os.path.join(output_folder, "analysis") assert os.path.exists(os.path.join(eval_folder, "train.csv")) + # checks individual performance figures are there + traindir = os.path.join(eval_folder, "train", "stare-images") + assert os.path.exists(traindir) + nose.tools.eq_( + len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10 + ) + assert os.path.exists(os.path.join(eval_folder, "test.csv")) + # checks individual performance figures are there + testdir = os.path.join(eval_folder, "test", "stare-images") + assert os.path.exists(testdir) + nose.tools.eq_( + len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10 + ) + assert os.path.exists( os.path.join(eval_folder, "second-annotator", "train.csv") ) + # checks individual performance figures are there + traindir_sa = os.path.join(eval_folder, "second-annotator", "train", + "stare-images") + assert os.path.exists(traindir_sa) + nose.tools.eq_( + len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10 + ) + assert os.path.exists( os.path.join(eval_folder, "second-annotator", "test.csv") ) + testdir_sa = os.path.join(eval_folder, "second-annotator", "test", + "stare-images") + assert os.path.exists(testdir_sa) + nose.tools.eq_( + len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10 + ) overlay_folder = os.path.join(output_folder, "overlayed", "analysis") traindir = os.path.join(overlay_folder, "train", "stare-images") @@ -405,9 +433,23 @@ def _check_evaluate(runner): _assert_exit_0(result) assert os.path.exists(os.path.join(output_folder, "test.csv")) + # checks individual performance figures are there + testdir = os.path.join(output_folder, "test", "stare-images") + assert os.path.exists(testdir) + nose.tools.eq_( + len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10 + ) + assert os.path.exists( os.path.join(output_folder, "second-annotator", "test.csv") ) + # checks individual performance figures are there 
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 52b0864dcfbef95f2a1d4f1e3d4cfae639c2b013..97f3dc25e9685f4e78b7dcd37310edd3819de42d 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -145,13 +145,41 @@ def _check_experiment_stare(overlay):
     # check evaluation outputs
     eval_folder = os.path.join(output_folder, "analysis")
     assert os.path.exists(os.path.join(eval_folder, "train.csv"))
+    # checks individual performance figures are there
+    traindir = os.path.join(eval_folder, "train", "stare-images")
+    assert os.path.exists(traindir)
+    nose.tools.eq_(
+        len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10
+    )
+
     assert os.path.exists(os.path.join(eval_folder, "test.csv"))
+    # checks individual performance figures are there
+    testdir = os.path.join(eval_folder, "test", "stare-images")
+    assert os.path.exists(testdir)
+    nose.tools.eq_(
+        len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
+    )
+
     assert os.path.exists(
         os.path.join(eval_folder, "second-annotator", "train.csv")
     )
+    # checks individual performance figures are there
+    traindir_sa = os.path.join(eval_folder, "second-annotator", "train",
+            "stare-images")
+    assert os.path.exists(traindir_sa)
+    nose.tools.eq_(
+        len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10
+    )
+
     assert os.path.exists(
         os.path.join(eval_folder, "second-annotator", "test.csv")
     )
+    testdir_sa = os.path.join(eval_folder, "second-annotator", "test",
+            "stare-images")
+    assert os.path.exists(testdir_sa)
+    nose.tools.eq_(
+        len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
+    )
 
     overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
     traindir = os.path.join(overlay_folder, "train", "stare-images")
@@ -405,9 +433,23 @@ def _check_evaluate(runner):
     _assert_exit_0(result)
 
     assert os.path.exists(os.path.join(output_folder, "test.csv"))
+    # checks individual performance figures are there
+    testdir = os.path.join(output_folder, "test", "stare-images")
+    assert os.path.exists(testdir)
+    nose.tools.eq_(
+        len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
+    )
+
     assert os.path.exists(
         os.path.join(output_folder, "second-annotator", "test.csv")
     )
+    # checks individual performance figures are there
+    testdir_sa = os.path.join(output_folder, "second-annotator", "test",
+            "stare-images")
+    assert os.path.exists(testdir_sa)
+    nose.tools.eq_(
+        len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
+    )
 
     # check overlayed images are there (since we requested them)
     basedir = os.path.join(overlay_folder, "test", "stare-images")
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 5756384d632a378f8479a30cdd451fbdf080519d..e46233992f3e7e3e2d6ca801ab22125b3a10b8ce 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -26,6 +26,7 @@ requirements:
     - python {{ python }}
     - setuptools {{ setuptools }}
     - numpy {{ numpy }}
+    - scipy {{ scipy }}
     - h5py {{ h5py }}
     - pytorch {{ pytorch }}
     - torchvision {{ torchvision }}  # [linux]
@@ -34,6 +35,7 @@ requirements:
     - python
     - setuptools
     - {{ pin_compatible('numpy') }}
+    - {{ pin_compatible('scipy') }}
     - {{ pin_compatible('pytorch') }}
     - {{ pin_compatible('torchvision') }}  # [linux]
     - matplotlib
diff --git a/doc/cli.rst b/doc/cli.rst
index 588cc997c85333e3fd2cf948dd9bd650f26b8a8e..13ded80d54a4abc1cf8204b53f351b581793435f 100644
--- a/doc/cli.rst
+++ b/doc/cli.rst
@@ -179,4 +179,15 @@ combined figures and tables that compare results of multiple systems.
 
 .. command-output:: bob binseg compare --help
 
+.. _bob.ip.binseg.cli.significance:
+
+Performance Difference Significance
+===================================
+
+Calculates the statistical significance of the performance difference between
+results obtained with two systems on the same dataset.
+
+.. command-output:: bob binseg significance --help
+
+
 .. include:: links.rst
diff --git a/doc/links.rst b/doc/links.rst
index 8a9234e4c07161768ae38cea20552079cb0eea08..02ce1acdab66867258b04ad9bc5ad10ba0790877 100644
--- a/doc/links.rst
+++ b/doc/links.rst
@@ -24,6 +24,8 @@
 .. Software Tools
 .. _maskrcnn-benchmark: https://github.com/facebookresearch/maskrcnn-benchmark
 
+.. _wilcoxon test: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
+
 
 .. Pretrained models
 
diff --git a/doc/usage.rst b/doc/usage.rst
index bb03e127a1289bf841ad9ef422d6dc26e561b002..abf252e39c78398a0df3d21168608d44119f3090 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -17,7 +17,9 @@ semantic binary segmentation with support for the following activities:
 * Evaluation: Vessel map predictions are used evaluate FCN performance against
   provided annotations, or visualize prediction results overlayed on the
   original raw images.
-* Comparison: Use evaluation results to compare performance as you like.
+* Comparison: Use evaluation results to compare performance as you like, or to
+  evaluate the statistical significance of the difference between the results
+  of two systems on the same dataset.
 
 Whereas we provide :ref:`command-line interfaces (CLI)
 <bob.ip.binseg.cli.single>` that implement each of the phases above, we also
diff --git a/requirements.txt b/requirements.txt
index 706cd10addac90fafe7f8720b1e2d7aacde87445..69459e575cb3ab50b620a6858db41c505414c919 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 bob.extension
 matplotlib
 numpy
+scipy
 pandas
 h5py
 pillow
diff --git a/setup.py b/setup.py
index bff45cf36196001418a51894c65f90f82d3066e2..bf6173fb6ce71a08973d5156e4698672c0640a16 100644
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ setup(
             "predict = bob.ip.binseg.script.predict:predict",
             "evaluate = bob.ip.binseg.script.evaluate:evaluate",
             "compare = bob.ip.binseg.script.compare:compare",
+            "significance = bob.ip.binseg.script.significance:significance",
             "analyze = bob.ip.binseg.script.analyze:analyze",
             "experiment = bob.ip.binseg.script.experiment:experiment",
         ],