diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 22c63c51c777de8d00f5e0f34f459a8458d0e9d5..54b2bb8685c9feb0f9c2b80c60d710c497f6a055 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -810,21 +810,27 @@ def write_analysis_text(names, da, db, f):
     w, p = scipy.stats.ttest_rel(da, db)
     f.write(f"  * Paired T (H0: same distro): S = {w:g}, p = {p:.5f}\n")

-    w, p = scipy.stats.wilcoxon(diff)
-    f.write("  * Wilcoxon:\n")
-    f.write(f"    * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
-
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f"    * H0 = med({names[0]}) < med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+    try:
+        f.write("  * Wilcoxon:\n")

-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f"    * H0 = med({names[0]}) > med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+        w, p = scipy.stats.wilcoxon(diff)
+        f.write(f"    * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+        f.write(
+            f"    * H0 = med({names[0]}) < med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="less")
+        f.write(
+            f"    * H0 = med({names[0]}) > med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+    except ValueError as e:
+        f.write(f"  ERROR: Differences are exactly zero between both "
+                f"patch distributions. The Wilcoxon test does not work in "
+                f"these conditions (review your prediction directories): {e}\n")


 def write_analysis_figures(names, da, db, fname):
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 661592416aea66138ca46a76d3b8f12cae003c17..9e409b412d7a42254d133e1373e9ed90653c9371 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -91,8 +91,8 @@ def _eval_patches(
         store performance visualizations.

     figure : str
-        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
-        use when comparing performances
+        The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
+        ``accuracy``) to use when comparing performances

     nproc : int
         Sets the number of parallel processes to use when running using
@@ -169,7 +169,7 @@ def _eval_patches(

     # for a given threshold on each system, calculate patch performances
     logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for "
+        f"Evaluating patch '{figure}' on '{evaluate}' set for "
         f"'{system_name}' using windows of size {size} and stride {stride}"
     )

@@ -292,6 +292,13 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
         for col in to_subtract:
             perf_diff[k][col] -= perfs[1][k]["df"][col]

+    # for a given threshold on each system, calculate performance differences
+    logger.info(
+        f"Evaluating patch '{figure}' differences on '{evaluate}' set on "
+        f"'{names[0]}-{names[1]}' using windows of size {size} and "
+        f"stride {stride}"
+    )
+
     retval = visual_performances(
         dataset,
         evaluate,
@@ -428,7 +435,7 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
     "-f",
     help="The name of a performance figure (e.g. f1_score, or jaccard) to "
     "use when comparing performances",
-    default="f1_score",
+    default="accuracy",
     type=str,
     show_default=True,
     required=True,
@@ -549,8 +556,8 @@ def significance(
     )

     perf_diff = _eval_differences(
-        perf1,
-        perf2,
+        names,
+        (perf1, perf2),
         evaluate,
         dataset,
         size,
@@ -586,12 +593,13 @@ def significance(
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.pdf")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
         write_analysis_figures(names, da, db, fname)

     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis summary to {fname}...")
         with open(fname, "wt") as f:
             write_analysis_text(names, da, db, f)
-    else:
-        write_analysis_text(names, da, db, sys.stdout)
+
+    write_analysis_text(names, da, db, sys.stdout)
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 97f3dc25e9685f4e78b7dcd37310edd3819de42d..b427185caf017651223457fdd44096c24f737fdc 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -524,6 +524,69 @@ def _check_compare(runner):
     )


+def _check_significance(runner):
+
+    from ..script.significance import significance
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)\n")
+        config.flush()
+
+        ofolder = "significance"
+        cfolder = os.path.join(ofolder, "caches")
+
+        result = runner.invoke(
+            significance,
+            [
+                "-vv",
+                config.name,
+                "--names=v1", "v2",
+                "--predictions=predictions", "predictions",
+                "--threshold=0.5",
+                "--size=64", "64",
+                "--stride=32", "32",
+                "--figure=accuracy",
+                f"--output-folder={ofolder}",
+                f"--checkpoint-folder={cfolder}",
+            ],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists(ofolder)
+        assert os.path.exists(cfolder)
+        assert os.path.exists(os.path.join(ofolder, "analysis.pdf"))
+        assert os.path.exists(os.path.join(ofolder, "analysis.txt"))
+
+        keywords = {
+            r"^Evaluating patch 'accuracy' on": 2,
+            r"^Evaluating patch 'accuracy' differences on": 1,
+            r"^#Samples/Median/Avg/Std.Dev.": 1,
+            r"^Paired T-test": 1,
+            r"^Wilcoxon test": 3,
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            # if _str_counter(k, logging_output) != v:
+            #    print(f"Count for string '{k}' appeared " \
+            #            f"({_str_counter(k, result.output)}) " \
+            #            f"instead of the expected {v}")
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )
+
+
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 def test_discrete_experiment_stare():

@@ -533,6 +596,7 @@ def test_discrete_experiment_stare():
     _check_predict(runner)
     _check_evaluate(runner)
     _check_compare(runner)
+    _check_significance(runner)


 def test_train_help():
@@ -559,6 +623,12 @@ def test_compare_help():
     _check_help(compare)


+def test_significance_help():
+    from ..script.significance import significance
+
+    _check_help(significance)
+
+
 def test_config_help():
     from ..script.config import config
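
Note (not part of the patch): a minimal sketch of the failure mode the new try/except in write_analysis_text() guards against. When the two prediction directories being compared yield identical patch performances, the difference vector is exactly zero everywhere and scipy.stats.wilcoxon() refuses to compute the statistic, raising the ValueError that the patch now reports (the exact message depends on the SciPy release). The array sizes and values below are hypothetical, chosen only for illustration:

    import numpy
    import scipy.stats

    # hypothetical per-patch performances for two systems that happen to agree
    da = numpy.full(30, 0.9)
    db = numpy.full(30, 0.9)

    try:
        # with the default zero_method="wilcox", an all-zero difference vector
        # cannot be ranked, and wilcoxon() raises ValueError in the SciPy
        # versions this package targets
        w, p = scipy.stats.wilcoxon(da - db)
    except ValueError as e:
        print(f"Wilcoxon test not applicable: {e}")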