diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 8d3b11955f194d82ca837c2f878320e3be475550..8667300ebc47a024a1d3a01b8d92fa0cd975de17 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -288,6 +288,12 @@ def run(
             )
         data[stem] = _sample_measures(pred, gt, steps)
 
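+        # also saves a per-sample CSV with the measures for every threshold
+        # step, so individual performances can be inspected later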
+        if output_folder is not None:
+            fullpath = os.path.join(output_folder, name, f"{stem}.csv")
+            tqdm.write(f"Saving {fullpath}...")
+            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
+            data[stem].to_csv(fullpath)
+
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=threshold, overlay=True
@@ -297,6 +303,7 @@ def run(
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             overlay_image.save(fullpath)
 
+
     # Merges all dataframes together
     df_measures = pandas.concat(data.values())
 
@@ -408,6 +415,13 @@ def compare_annotators(baseline, other, name, output_folder,
             )
         data[stem] = _sample_measures(pred, gt, 2)
 
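+        # also saves a per-sample CSV with the second-annotator measures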
+        if output_folder is not None:
+            fullpath = os.path.join(output_folder, "second-annotator", name,
+                    f"{stem}.csv")
+            tqdm.write(f"Saving {fullpath}...")
+            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
+            data[stem].to_csv(fullpath)
+
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=0.5, overlay=True
diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py
index dd06106de0a5a4ba7871a9a5910aaa4fa887f2f7..cc899ffb6a61b7debff1fc0fa8bc48a309c71142 100644
--- a/bob/ip/binseg/script/compare.py
+++ b/bob/ip/binseg/script/compare.py
@@ -86,8 +86,6 @@ def _load(data, threshold=None):
                 f"'{threshold}' dataset...")
         measures_path = data[threshold]
         df = pandas.read_csv(measures_path)
-
-        maxf1 = df.f1_score.max()
         use_threshold = df.threshold[df.f1_score.idxmax()]
         logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'")
 
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index cbd11aaeec69e001fd8edc18c1323bef38cec4fd..3e923636c872da06cd1684a324009eab82589bbf 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -131,7 +131,7 @@ def _validate_threshold(t, dataset):
     "dataset as input, this may also be the name of an existing set from "
     "which the threshold will be estimated (highest F1-score) and then "
     "applied to the subsequent sets.  This number is also used to print "
-    "the test set F1-score a priori performance (default: 0.5)",
+    "the test set F1-score a priori performance",
     default=None,
     show_default=False,
     required=False,
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7dd440f6e88990f445416e68c8491e23ddbeeb
--- /dev/null
+++ b/bob/ip/binseg/script/significance.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import os
+import pathlib
+
+import click
+import numpy
+import scipy.stats
+
+from bob.extension.scripts.click_helper import (
+    verbosity_option,
+    AliasedGroup,
+)
+
+import pandas
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def _get_threshold(csvfile):
+    """Returns the thresholds that maximizes the F1-score from an aggreated scan
+
+    Parameters
+    ==========
+
+    csvfile : str
+        Path to a CSV file containing the aggregated performance across various
+        thresholds for a validation set.
+
+
+    Returns
+    =======
+
+    threshold : float
+        A float threshold
+
+    """
+
+    df = pandas.read_csv(csvfile)
+    threshold = df.threshold[df.f1_score.idxmax()]
+    logger.info(f"Threshold for '{csvfile}' = {threshold:.3f}")
+    return threshold
+
+
+def _find_files(path, expression):
+    """Finds a files with names matching the glob expression recursively
+
+
+    Parameters
+    ==========
+
+    path : str
+        The base path where to search for files
+
+    expression : str
+        A glob expression to search for filenames (e.g. ``"*.csv"``)
+
+
+    Returns
+    =======
+
+    l : list
+        A sorted list (by name) of relative file paths found under ``path``
+
+    """
+
+    return sorted(
+        str(k.relative_to(path)) for k in pathlib.Path(path).rglob(expression)
+    )
+
+
+def _load_score(path, threshold):
+    """Loads the specific score (at threshold "t") from an analysis file
+
+
+    Parameters
+    ==========
+
+    path : str
+        Path pointing to the CSV files from where to load the scores
+
+    threshold : float
+        Threshold value to use for picking the score
+
+
+    Returns
+    =======
+
+    value : float
+        Value representing the score at the given threshold inside the file
+
+    """
+
+    df = pandas.read_csv(path)
+    bins = len(df)
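+    # assumes rows correspond to equally spaced thresholds covering [0, 1],
+    # so ``round(bins * threshold)`` approximates the matching row index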
+    index = int(round(bins * threshold))
+    index = min(index, len(df) - 1)  # avoids out of range indexing
+    return df.f1_score[index]
+
+
+@click.command(
+    epilog="""Examples:
+
+\b
+    1. Tests whether systems A and B are significantly different on the same
+       input data, at thresholds optimized on their respective validation sets.
+\b
+       $ bob binseg significance -vv A path/to/A/valid.csv path/to/A/test B path/to/B/valid.csv path/to/B/test
+""",
+)
+@click.argument(
+        'label_valid_path',
+        nargs=-1,
+        )
+@verbosity_option()
+def significance(label_valid_path, **kwargs):
+    """Check if systems A and B are significantly different in performance
+
+    Significance testing is done through a `Wilcoxon Test`_.
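+
+    The test is paired: for every test image, the per-image F1-score of system
+    A, computed at A's validation-optimal threshold, is compared to the
+    per-image F1-score of system B, computed at B's threshold.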
+    """
+
+    # arguments arrive as a flat list; we expect exactly two
+    # name/validation-path/test-path triplets (one per system)
+    if len(label_valid_path) != 6:
+        raise click.ClickException("Input should be exactly 2 triplets "
+                "composed of name-path-path entries (6 arguments in total)")
+
+    # groups the flat argument list into two (name, validation-csv, test-folder)
+    # triplets, one per system
+    sa, sb = list(zip(*(iter(label_valid_path),)*3))
+
+    # sanity check file lists
+    fa = _find_files(sa[2], "*.csv")
+    fb = _find_files(sb[2], "*.csv")
+    assert fa == fb, f"List of files mismatched between '{sa[2]}' " \
+        f"and '{sb[2]}' - please check"
+
+    # now run analysis
+    ta = _get_threshold(sa[1])
+    tb = _get_threshold(sb[1])
+
+    # load all F1-scores for the given threshold
+    da = [_load_score(os.path.join(sa[2], k), ta) for k in fa]
+    db = [_load_score(os.path.join(sb[2], k), tb) for k in fb]
+
+    click.echo("Median F1-scores:")
+    click.echo(f"* {sa[0]}: {numpy.median(da):.3f}")
+    click.echo(f"* {sb[0]}: {numpy.median(db):.3f}")
+
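+    # paired Wilcoxon signed-rank tests on the per-image F1-score differences:
+    # first two-sided, then each one-sided alternative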
+    w, p = scipy.stats.wilcoxon(da, db)
+    click.echo(f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}")
+
+    w, p = scipy.stats.wilcoxon(da, db, alternative="greater")
+    click.echo(f"Wilcoxon test (md({sa[0]}) < md({sb[0]})?): W = {w:g}, p = {p:.5f}")
+
+    w, p = scipy.stats.wilcoxon(da, db, alternative="less")
+    click.echo(f"Wilcoxon test (md({sa[0]}) > md({sb[0]})?): W = {w:g}, p = {p:.5f}")
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 52b0864dcfbef95f2a1d4f1e3d4cfae639c2b013..97f3dc25e9685f4e78b7dcd37310edd3819de42d 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -145,13 +145,41 @@ def _check_experiment_stare(overlay):
         # check evaluation outputs
         eval_folder = os.path.join(output_folder, "analysis")
         assert os.path.exists(os.path.join(eval_folder, "train.csv"))
+        # checks individual performance figures are there
+        traindir = os.path.join(eval_folder, "train", "stare-images")
+        assert os.path.exists(traindir)
+        nose.tools.eq_(
+            len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10
+        )
+
         assert os.path.exists(os.path.join(eval_folder, "test.csv"))
+        # checks individual performance figures are there
+        testdir = os.path.join(eval_folder, "test", "stare-images")
+        assert os.path.exists(testdir)
+        nose.tools.eq_(
+            len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
+        )
+
         assert os.path.exists(
             os.path.join(eval_folder, "second-annotator", "train.csv")
         )
+        # checks individual performance figures are there
+        traindir_sa = os.path.join(eval_folder, "second-annotator", "train",
+                "stare-images")
+        assert os.path.exists(traindir_sa)
+        nose.tools.eq_(
+            len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10
+        )
+
         assert os.path.exists(
             os.path.join(eval_folder, "second-annotator", "test.csv")
         )
+        testdir_sa = os.path.join(eval_folder, "second-annotator", "test",
+                "stare-images")
+        assert os.path.exists(testdir_sa)
+        nose.tools.eq_(
+            len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
+        )
 
         overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
         traindir = os.path.join(overlay_folder, "train", "stare-images")
@@ -405,9 +433,23 @@ def _check_evaluate(runner):
         _assert_exit_0(result)
 
         assert os.path.exists(os.path.join(output_folder, "test.csv"))
+        # checks individual performance figures are there
+        testdir = os.path.join(output_folder, "test", "stare-images")
+        assert os.path.exists(testdir)
+        nose.tools.eq_(
+            len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
+        )
+
         assert os.path.exists(
             os.path.join(output_folder, "second-annotator", "test.csv")
         )
+        # checks individual performance figures are there
+        testdir_sa = os.path.join(output_folder, "second-annotator", "test",
+                "stare-images")
+        assert os.path.exists(testdir_sa)
+        nose.tools.eq_(
+            len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
+        )
 
         # check overlayed images are there (since we requested them)
         basedir = os.path.join(overlay_folder, "test", "stare-images")
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 5756384d632a378f8479a30cdd451fbdf080519d..e46233992f3e7e3e2d6ca801ab22125b3a10b8ce 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -26,6 +26,7 @@ requirements:
     - python {{ python }}
     - setuptools {{ setuptools }}
     - numpy {{ numpy }}
+    - scipy {{ scipy }}
     - h5py {{ h5py }}
     - pytorch {{ pytorch }}
     - torchvision  {{ torchvision }} # [linux]
@@ -34,6 +35,7 @@ requirements:
     - python
     - setuptools
     - {{ pin_compatible('numpy') }}
+    - {{ pin_compatible('scipy') }}
     - {{ pin_compatible('pytorch') }}
     - {{ pin_compatible('torchvision') }} # [linux]
     - matplotlib
diff --git a/doc/cli.rst b/doc/cli.rst
index 588cc997c85333e3fd2cf948dd9bd650f26b8a8e..13ded80d54a4abc1cf8204b53f351b581793435f 100644
--- a/doc/cli.rst
+++ b/doc/cli.rst
@@ -179,4 +179,15 @@ combined figures and tables that compare results of multiple systems.
 .. command-output:: bob binseg compare --help
 
 
+.. _bob.ip.binseg.cli.significance:
+
+Performance Difference Significance
+===================================
+
+Calculates the statistical significance of the difference between results
+obtained by two systems on the same dataset.
+
+.. command-output:: bob binseg significance --help
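+
+For example, assuming two systems ``A`` and ``B`` whose evaluation outputs
+(one CSV per test image, plus an aggregated validation CSV used to pick each
+threshold) live under the placeholder folders ``path/to/A`` and ``path/to/B``:
+
+.. code-block:: sh
+
+   $ bob binseg significance -vv \
+       A path/to/A/valid.csv path/to/A/test \
+       B path/to/B/valid.csv path/to/B/test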
+
+
 .. include:: links.rst
diff --git a/doc/links.rst b/doc/links.rst
index 8a9234e4c07161768ae38cea20552079cb0eea08..02ce1acdab66867258b04ad9bc5ad10ba0790877 100644
--- a/doc/links.rst
+++ b/doc/links.rst
@@ -24,6 +24,8 @@
 .. Software Tools
 .. _maskrcnn-benchmark: https://github.com/facebookresearch/maskrcnn-benchmark
 
+.. _wilcoxon test: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
+
 
 .. Pretrained models
 
diff --git a/doc/usage.rst b/doc/usage.rst
index bb03e127a1289bf841ad9ef422d6dc26e561b002..abf252e39c78398a0df3d21168608d44119f3090 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -17,7 +17,9 @@ semantic binary segmentation with support for the following activities:
 * Evaluation: Vessel map predictions are used evaluate FCN performance against
   provided annotations, or visualize prediction results overlayed on
   the original raw images.
-* Comparison: Use evaluation results to compare performance as you like.
+* Comparison: Use evaluation results to compare performance as you like, or to
+  assess the statistical significance of the difference between the results of
+  two systems on the same dataset.
 
 Whereas we provide :ref:`command-line interfaces (CLI)
 <bob.ip.binseg.cli.single>` that implement each of the phases above, we also
diff --git a/requirements.txt b/requirements.txt
index 706cd10addac90fafe7f8720b1e2d7aacde87445..69459e575cb3ab50b620a6858db41c505414c919 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 bob.extension
 matplotlib
 numpy
+scipy
 pandas
 h5py
 pillow
diff --git a/setup.py b/setup.py
index bff45cf36196001418a51894c65f90f82d3066e2..bf6173fb6ce71a08973d5156e4698672c0640a16 100644
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ setup(
             "predict = bob.ip.binseg.script.predict:predict",
             "evaluate = bob.ip.binseg.script.evaluate:evaluate",
             "compare =  bob.ip.binseg.script.compare:compare",
+            "significance =  bob.ip.binseg.script.significance:significance",
             "analyze =  bob.ip.binseg.script.analyze:analyze",
             "experiment =  bob.ip.binseg.script.experiment:experiment",
         ],