From c2ad295206f7e479bd719b05a4047424ae65a83a Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Mon, 20 Jul 2020 18:37:40 +0200
Subject: [PATCH] [test] Added test for significance CLI app
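
Besides the new CLI test, this patch addresses issues uncovered while
writing it:

* write_analysis_text() now guards the Wilcoxon analysis against the
  ValueError scipy raises when all paired differences are zero, which
  happens when a system is compared against itself (as the new test
  does, by pointing both systems at the same prediction directory).
* The call to _eval_differences() now passes the system names and the
  (perf1, perf2) tuple, matching that function's signature.
* Log messages now state which figure is being evaluated and where the
  analysis outputs are written; the text analysis is always echoed to
  stdout, so tests can assert on it.
* The default --figure is now "accuracy".

For reference, a minimal sketch of the Wilcoxon failure mode being
guarded against, assuming scipy's default zero_method ("wilcox"),
which discards zero-valued differences before ranking:

    import scipy.stats

    # every paired difference is zero: all observations get discarded
    # and scipy raises ValueError instead of computing the statistic
    scipy.stats.wilcoxon([0.0] * 10)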

---
 bob/ip/binseg/engine/significance.py | 36 +++++++++------
 bob/ip/binseg/script/significance.py | 28 +++++++----
 bob/ip/binseg/test/test_cli.py       | 69 ++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 22c63c51..54b2bb86 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -810,21 +810,29 @@ def write_analysis_text(names, da, db, f):
     w, p = scipy.stats.ttest_rel(da, db)
     f.write(f"  * Paired T (H0: same distro): S = {w:g}, p = {p:.5f}\n")
 
-    w, p = scipy.stats.wilcoxon(diff)
-    f.write("  * Wilcoxon:\n")
-    f.write(f"    * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
-
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f"    * H0 = med({names[0]}) < med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
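+    # NOTE: scipy.stats.wilcoxon() raises ValueError if the paired
+    # differences are all zero (e.g. when comparing a system to itself)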
+    try:
+        f.write("  * Wilcoxon:\n")
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f"    * H0 = med({names[0]}) > med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+        w, p = scipy.stats.wilcoxon(diff)
+        f.write(f"    * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+        f.write(
+            f"    * H0 = med({names[0]}) < med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="less")
+        f.write(
+            f"    * H0 = med({names[0]}) > med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+    except ValueError as e:
+        f.write(f"    ERROR: Differences are exactly zero between both "
+                f"patch distributions.  The Wilcoxon test does not work in "
+                f"these conditions (review your prediction directories): {e}\n")
 
 
 def write_analysis_figures(names, da, db, fname):
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 66159241..9e409b41 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -91,8 +91,8 @@ def _eval_patches(
         store performance visualizations.
 
     figure : str
-        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
-        use when comparing performances
+        The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
+        ``accuracy``) to use when comparing performances
 
     nproc : int
         Sets the number of parallel processes to use when running using
@@ -169,7 +169,7 @@ def _eval_patches(
 
     # for a given threshold on each system, calculate patch performances
     logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for "
+        f"Evaluating patch '{figure}' on '{evaluate}' set for "
         f"'{system_name}' using windows of size {size} and stride {stride}"
     )
 
@@ -292,6 +292,13 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
         for col in to_subtract:
             perf_diff[k][col] -= perfs[1][k]["df"][col]
 
+    # for the chosen thresholds, calculate patch performance differences
+    logger.info(
+        f"Evaluating patch '{figure}' differences on '{evaluate}' set on "
+        f"'{names[0]}-{names[1]}' using windows of size {size} and "
+        f"stride {stride}"
+    )
+
     retval = visual_performances(
         dataset,
         evaluate,
@@ -428,7 +435,7 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
     "-f",
     help="The name of a performance figure (e.g. f1_score, or jaccard) to "
     "use when comparing performances",
-    default="f1_score",
+    default="accuracy",
     type=str,
     show_default=True,
     required=True,
@@ -549,8 +556,8 @@ def significance(
     )
 
     perf_diff = _eval_differences(
-            perf1,
-            perf2,
+            names,
+            (perf1, perf2),
             evaluate,
             dataset,
             size,
@@ -586,12 +593,13 @@ def significance(
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.pdf")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
         write_analysis_figures(names, da, db, fname)
 
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis summary to {fname}...")
         with open(fname, "wt") as f:
             write_analysis_text(names, da, db, f)
-    else:
-        write_analysis_text(names, da, db, sys.stdout)
+    write_analysis_text(names, da, db, sys.stdout)
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 97f3dc25..b427185c 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -524,6 +524,68 @@ def _check_compare(runner):
             )
 
 
+def _check_significance(runner):
+
+    from ..script.significance import significance
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)\n")
+        config.flush()
+
+        ofolder = "significance"
+        cfolder = os.path.join(ofolder, "caches")
+
+        result = runner.invoke(
+            significance,
+            [
+                "-vv",
+                config.name,
+                "--names=v1", "v2",
+                "--predictions=predictions", "predictions",
+                "--threshold=0.5",
+                "--size=64", "64",
+                "--stride=32", "32",
+                "--figure=accuracy",
+                f"--output-folder={ofolder}",
+                f"--checkpoint-folder={cfolder}",
+            ],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists(ofolder)
+        assert os.path.exists(cfolder)
+        assert os.path.exists(os.path.join(ofolder, "analysis.pdf"))
+        assert os.path.exists(os.path.join(ofolder, "analysis.txt"))
+
+        keywords = {
+            r"^Evaluating patch 'accuracy' on": 2,
+            r"^Evaluating patch 'accuracy' differences on": 1,
+            r"^#Samples/Median/Avg/Std.Dev.": 1,
+            r"^Paired T-test": 1,
+            r"^Wilcoxon test": 3,
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            assert _str_counter(k, logging_output) == v, (
+                f"String '{k}' appeared {_str_counter(k, logging_output)} "
+                f"time(s) instead of the expected {v}:\n"
+                f"Output:\n{logging_output}"
+            )
+
+
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 def test_discrete_experiment_stare():
 
@@ -533,6 +595,7 @@ def test_discrete_experiment_stare():
         _check_predict(runner)
         _check_evaluate(runner)
         _check_compare(runner)
+        _check_significance(runner)
 
 
 def test_train_help():
@@ -559,6 +622,12 @@ def test_compare_help():
     _check_help(compare)
 
 
+def test_significance_help():
+    from ..script.significance import significance
+
+    _check_help(significance)
+
+
 def test_config_help():
     from ..script.config import config
 
-- 
GitLab