diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 22c63c51c777de8d00f5e0f34f459a8458d0e9d5..54b2bb8685c9feb0f9c2b80c60d710c497f6a055 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -810,21 +810,29 @@ def write_analysis_text(names, da, db, f):
     w, p = scipy.stats.ttest_rel(da, db)
     f.write(f"  * Paired T (H0: same distro): S = {w:g}, p = {p:.5f}\n")
 
-    w, p = scipy.stats.wilcoxon(diff)
-    f.write("  * Wilcoxon:\n")
-    f.write(f"    * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
-
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f"    * H0 = med({names[0]}) < med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
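+    # scipy.stats.wilcoxon() raises ValueError when the differences between
+    # the two patch distributions are all exactly zero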
+    try:
+        f.write("  * Wilcoxon:\n")
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f"    * H0 = med({names[0]}) > med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+        w, p = scipy.stats.wilcoxon(diff)
+        f.write(f"    * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+        f.write(
+            f"    * H0 = med({names[0]}) < med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="less")
+        f.write(
+            f"    * H0 = med({names[0]}) > med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+    except ValueError as e:
+        f.write(f"    ERROR: Differences are exactly zero between both "
+                f"patch distributions.  The Wilcoxon test does not work in "
+                f"these conditions (review your prediction directories): {e}\n")
 
 
 def write_analysis_figures(names, da, db, fname):
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 661592416aea66138ca46a76d3b8f12cae003c17..9e409b412d7a42254d133e1373e9ed90653c9371 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -91,8 +91,8 @@ def _eval_patches(
         store performance visualizations.
 
     figure : str
-        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
-        use when comparing performances
+        The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
+        ``accuracy``) to use when comparing performances
 
     nproc : int
         Sets the number of parallel processes to use when running using
@@ -169,7 +169,7 @@ def _eval_patches(
 
     # for a given threshold on each system, calculate patch performances
     logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for "
+        f"Evaluating patch '{figure}' on '{evaluate}' set for "
         f"'{system_name}' using windows of size {size} and stride {stride}"
     )
 
@@ -292,6 +292,13 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
         for col in to_subtract:
             perf_diff[k][col] -= perfs[1][k]["df"][col]
 
+    # for a given threshold on each system, calculate patch differences
+    logger.info(
+        f"Evaluating patch '{figure}' differences on '{evaluate}' set for "
+        f"'{names[0]}-{names[1]}' using windows of size {size} and "
+        f"stride {stride}"
+    )
+
     retval = visual_performances(
         dataset,
         evaluate,
@@ -428,7 +435,7 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
     "-f",
     help="The name of a performance figure (e.g. f1_score, or jaccard) to "
     "use when comparing performances",
-    default="f1_score",
+    default="accuracy",
     type=str,
     show_default=True,
     required=True,
@@ -549,8 +556,8 @@ def significance(
     )
 
     perf_diff = _eval_differences(
-            perf1,
-            perf2,
+            names,
+            (perf1, perf2),
             evaluate,
             dataset,
             size,
@@ -586,12 +593,14 @@ def significance(
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.pdf")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
         write_analysis_figures(names, da, db, fname)
 
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis summary to {fname}...")
         with open(fname, "wt") as f:
             write_analysis_text(names, da, db, f)
-    else:
-        write_analysis_text(names, da, db, sys.stdout)
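+    # always echo the textual analysis to the screen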
+    write_analysis_text(names, da, db, sys.stdout)
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 97f3dc25e9685f4e78b7dcd37310edd3819de42d..b427185caf017651223457fdd44096c24f737fdc 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -524,6 +524,67 @@ def _check_compare(runner):
             )
 
 
+def _check_significance(runner):
+
+    from ..script.significance import significance
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
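+        # write a minimal STARE dataset configuration pointing at the test data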
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)\n")
+        config.flush()
+
+        ofolder = "significance"
+        cfolder = os.path.join(ofolder, "caches")
+
+        result = runner.invoke(
+            significance,
+            [
+                "-vv",
+                config.name,
+                "--names=v1", "v2",
+                "--predictions=predictions", "predictions",
+                "--threshold=0.5",
+                "--size=64", "64",
+                "--stride=32", "32",
+                "--figure=accuracy",
+                f"--output-folder={ofolder}",
+                f"--checkpoint-folder={cfolder}",
+            ],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists(ofolder)
+        assert os.path.exists(cfolder)
+        assert os.path.exists(os.path.join(ofolder, "analysis.pdf"))
+        assert os.path.exists(os.path.join(ofolder, "analysis.txt"))
+
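+        # expected number of occurrences of each log message in the output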
+        keywords = {
+            r"^Evaluating patch 'accuracy' on": 2,
+            r"^Evaluating patch 'accuracy' differences on": 1,
+            r"^#Samples/Median/Avg/Std.Dev.": 1,
+            r"^Paired T-test": 1,
+            r"^Wilcoxon test": 3,
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )
+
+
 @rc_variable_set("bob.ip.binseg.stare.datadir")
 def test_discrete_experiment_stare():
 
@@ -533,6 +594,7 @@ def test_discrete_experiment_stare():
         _check_predict(runner)
         _check_evaluate(runner)
         _check_compare(runner)
+        _check_significance(runner)
 
 
 def test_train_help():
@@ -559,6 +621,12 @@ def test_compare_help():
     _check_help(compare)
 
 
+def test_significance_help():
+    from ..script.significance import significance
+
+    _check_help(significance)
+
+
 def test_config_help():
     from ..script.config import config