diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 2901305574a55af3afdab3fd93302ee14ee86d04..0714289b9894f1d82aa8cfc1a229356971488f8b 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -50,6 +50,51 @@ def _posneg(pred, gt, threshold):
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
+def _sample_measures_for_threshold(pred, gt, threshold):
+    """
+    Calculates measures on a single sample, for a specific threshold
+
+
+    Parameters
+    ----------
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+    threshold : float
+        a particular threshold at which to calculate the performance
+        measures
+
+
+    Returns
+    -------
+
+    measures : list
+
+        A list containing the following values:
+
+        * precision: float
+        * recall: float
+        * specificity: float
+        * accuracy: float
+        * jaccard: float
+        * f1_score: float
+
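+    Example
+    -------
+
+    A minimal, illustrative sketch (the tensors below are random stand-ins,
+    not data produced by this package)::
+
+        import torch
+
+        pred = torch.rand(32, 32)                # pixel-wise probabilities
+        gt = (torch.rand(32, 32) > 0.5).float()  # binary annotations
+        measures = _sample_measures_for_threshold(pred, gt, 0.5)
+        precision, recall, specificity, accuracy, jaccard, f1_score = measures
+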
+    """
+
+    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
+
+    # calc measures from scalars
+    tp_count = torch.sum(tp_tensor).item()
+    fp_count = torch.sum(fp_tensor).item()
+    tn_count = torch.sum(tn_tensor).item()
+    fn_count = torch.sum(fn_tensor).item()
+    return base_measures(tp_count, fp_count, tn_count, fn_count)
+
+
 def _sample_measures(pred, gt, steps):
     """
     Calculates measures on one single sample
@@ -87,40 +132,10 @@ def _sample_measures(pred, gt, steps):
     """
 
     step_size = 1.0 / steps
-    data = []
-
-    for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)):
-
-        tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
-            pred, gt, threshold
-        )
-
-        # calc measures from scalars
-        tp_count = torch.sum(tp_tensor).item()
-        fp_count = torch.sum(fp_tensor).item()
-        tn_count = torch.sum(tn_tensor).item()
-        fn_count = torch.sum(fn_tensor).item()
-        (
-            precision,
-            recall,
-            specificity,
-            accuracy,
-            jaccard,
-            f1_score,
-        ) = base_measures(tp_count, fp_count, tn_count, fn_count)
-
-        data.append(
-            [
-                index,
-                threshold,
-                precision,
-                recall,
-                specificity,
-                accuracy,
-                jaccard,
-                f1_score,
-            ]
-        )
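+    # one row per threshold: [index, threshold, precision, recall,
+    # specificity, accuracy, jaccard, f1_score]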
+    data = [
+        [index, threshold] + _sample_measures_for_threshold(pred, gt, threshold)
+        for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
+    ]
 
     return pandas.DataFrame(
         data,
@@ -304,7 +319,6 @@ def run(
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             overlay_image.save(fullpath)
 
-
     # Merges all dataframes together
     df_measures = pandas.concat(data.values())
 
@@ -321,8 +335,12 @@ def run(
     #         (avg_measures["precision"]+avg_measures["recall"])
 
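+    # keep per-measure standard deviations and mean +/- one standard
+    # deviation (upper/lower) bands alongside the averages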
     avg_measures["std_pr"] = std_measures["precision"]
-    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
-    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["pr_upper"] = (
+        avg_measures["precision"] + std_measures["precision"]
+    )
+    avg_measures["pr_lower"] = (
+        avg_measures["precision"] - std_measures["precision"]
+    )
     avg_measures["std_re"] = std_measures["recall"]
     avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
     avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
@@ -361,8 +379,9 @@ def run(
     return maxf1_threshold
 
 
-def compare_annotators(baseline, other, name, output_folder,
-        overlayed_folder=None):
+def compare_annotators(
+    baseline, other, name, output_folder, overlayed_folder=None
+):
     """
     Compares annotations on the **same** dataset
 
@@ -398,13 +417,15 @@ def compare_annotators(baseline, other, name, output_folder,
     data = {}
 
     for baseline_sample, other_sample in tqdm(
-        list(zip(baseline, other)), desc="samples", leave=False, disable=None,
+        list(zip(baseline, other)), desc="samples", leave=False, disable=None
     ):
-        assert baseline_sample[0] == other_sample[0], f"Mismatch between " \
-                f"datasets for second-annotator analysis " \
-                f"({baseline_sample[0]} != {other_sample[0]}).  This " \
-                f"typically occurs when the second annotator (`other`) " \
-                f"comes from a different dataset than the `baseline` dataset"
+        assert baseline_sample[0] == other_sample[0], (
+            f"Mismatch between "
+            f"datasets for second-annotator analysis "
+            f"({baseline_sample[0]} != {other_sample[0]}).  This "
+            f"typically occurs when the second annotator (`other`) "
+            f"comes from a different dataset than the `baseline` dataset"
+        )
 
         stem = baseline_sample[0]
         image = baseline_sample[1]
@@ -417,8 +438,9 @@ def compare_annotators(baseline, other, name, output_folder,
         data[stem] = _sample_measures(pred, gt, 2)
 
         if output_folder is not None:
-            fullpath = os.path.join(output_folder, "second-annotator", name,
-                    f"{stem}.csv")
+            fullpath = os.path.join(
+                output_folder, "second-annotator", name, f"{stem}.csv"
+            )
             tqdm.write(f"Saving {fullpath}...")
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             data[stem].to_csv(fullpath)
@@ -427,8 +449,9 @@ def compare_annotators(baseline, other, name, output_folder,
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=0.5, overlay=True
             )
-            fullpath = os.path.join(overlayed_folder, "second-annotator",
-                    name, f"{stem}.png")
+            fullpath = os.path.join(
+                overlayed_folder, "second-annotator", name, f"{stem}.png"
+            )
             tqdm.write(f"Saving {fullpath}...")
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             overlay_image.save(fullpath)
@@ -450,14 +473,20 @@ def compare_annotators(baseline, other, name, output_folder,
     #         (avg_measures["precision"]+avg_measures["recall"])
 
     avg_measures["std_pr"] = std_measures["precision"]
-    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
-    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["pr_upper"] = (
+        avg_measures["precision"] + std_measures["precision"]
+    )
+    avg_measures["pr_lower"] = (
+        avg_measures["precision"] - std_measures["precision"]
+    )
     avg_measures["std_re"] = std_measures["recall"]
     avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
     avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
     avg_measures["std_f1"] = std_measures["f1_score"]
 
-    measures_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
+    measures_path = os.path.join(
+        output_folder, "second-annotator", f"{name}.csv"
+    )
     os.makedirs(os.path.dirname(measures_path), exist_ok=True)
     logger.info(f"Saving averages over all input images at {measures_path}...")
     avg_measures.to_csv(measures_path)
diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 4586b9c2632f9fd5aa76e70d48b2457382313f66..68fae32d1f4ee35e7ad523dbf9eedb02ad943109 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -5,11 +5,11 @@ import os
 import itertools
 
 import h5py
-import tqdm
+from tqdm import tqdm
 import pandas
 import torch.nn
 
-from .evaluator import _sample_measures
+from .evaluator import _sample_measures_for_threshold
 
 
 def _patch_measures(pred, gt, threshold, size, stride):
@@ -56,8 +56,6 @@ def _patch_measures(pred, gt, threshold, size, stride):
 
     """
 
-    height, width, stride = size
-
     # we calculate the required padding so that the last windows on the left
     # and bottom size of predictions/ground-truth data are zero padded, and
     # torch unfolding works exactly.
@@ -76,26 +74,37 @@ def _patch_measures(pred, gt, threshold, size, stride):
     pred_patches = pred_padded.unfold(0, size[0], stride[0]).unfold(
         1, size[1], stride[1]
     )
-    gt_patches = gt_padded.unfold(0, size[0], stride).unfold(
-        1, size[1], stride[0]
+    gt_patches = gt_padded.unfold(0, size[0], stride[0]).unfold(
+        1, size[1], stride[1]
     )
     assert pred_patches.shape == gt_patches.shape
     ylen, xlen, _, _ = pred_patches.shape
 
-    dfs = []
-    for j, i in itertools.product(range(ylen), range(xlen)):
-        dfs.append(
-            _sample_measures(
-                pred_patches[j, i, :, :], gt_patches[j, i, :, :], steps
-            )
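+    # one row per sliding-window position: [y, x, precision, recall,
+    # specificity, accuracy, jaccard, f1_score]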
+    data = [
+        [j, i]
+        + _sample_measures_for_threshold(
+            pred_patches[j, i, :, :], gt_patches[j, i, :, :], threshold
         )
-        dfs[-1]["patch"] = i + (j * xlen)
-
-    return pandas.concat(dfs, ignore_index=True)
+        for j, i in itertools.product(range(ylen), range(xlen))
+    ]
+
+    return pandas.DataFrame(
+        data,
+        columns=(
+            "y",
+            "x",
+            "precision",
+            "recall",
+            "specificity",
+            "accuracy",
+            "jaccard",
+            "f1_score",
+        ),
+    )
 
 
 def patch_performances(
-    dataset, name, predictions_folder, threshold, size, stride,
+    dataset, name, predictions_folder, threshold, size, stride
 ):
     """
     Evaluates the performances for multiple image patches, for a whole dataset
@@ -144,7 +153,7 @@ def patch_performances(
     if not os.path.exists(use_predictions_folder):
         use_predictions_folder = predictions_folder
 
-    for sample in tqdm(dataset):
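+    # "dataset" is a dict of splits - iterate only over the named subset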
+    for sample in tqdm(dataset[name]):
         stem = sample[0]
         image = sample[1]
         gt = sample[2]
@@ -153,6 +162,6 @@ def patch_performances(
             pred = f["array"][:]
         pred = torch.from_numpy(pred)
         data.append(_patch_measures(pred, gt, threshold, size, stride))
-        data['stem'] = stem
+        data[-1]["stem"] = stem
 
     return pandas.concat(data, ignore_index=True)
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 2207738a52ed193db78e68c288ef681638fc3835..2700d0c2c7a2a7772bd6dc9b0b144b5c628bd30a 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -10,6 +10,7 @@ from bob.extension.scripts.click_helper import (
     ResourceOption,
 )
 
+import numpy
 import scipy.stats
 import logging
 
@@ -114,7 +115,7 @@ from ..engine.significance import patch_performances
          "respectively.",
     default=(128, 128),
     nargs=2,
-    type=float,
+    type=int,
     show_default=True,
     required=True,
     cls=ResourceOption,
@@ -127,7 +128,7 @@ from ..engine.significance import patch_performances
          "respectively.",
     default=(32, 32),
     nargs=2,
-    type=float,
+    type=int,
     show_default=True,
     required=True,
     cls=ResourceOption,
@@ -161,39 +162,46 @@ def significance(
 
         assert threshold in dataset, f"No dataset named '{threshold}'"
 
-        logger.info(f"Evaluating threshold on '{threshold}' set for system 1")
+        logger.info(f"Evaluating threshold on '{threshold}' set for system 1 using {steps} steps")
         threshold1 = run_evaluation(
             dataset[threshold], threshold, predictions_1, steps=steps
         )
-        logger.info(f"Set --threshold={threshold:.5f} for system 1")
+        logger.info(f"Set --threshold={threshold1:.5f} for system 1")
 
-        logger.info(f"Evaluating threshold on '{threshold}' set for system 2")
+        logger.info(f"Evaluating threshold on '{threshold}' set for system 2 using {steps} steps")
         threshold2 = run_evaluation(
             dataset[threshold], threshold, predictions_2, steps=steps
         )
-        logger.info(f"Set --threshold={threshold:.5f} for system 2")
+        logger.info(f"Set --threshold={threshold2:.5f} for system 2")
 
     # for a given threshold on each system, calculate patch performances
-    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1")
-    perf1 = patch_performances(data, evaluate, predictions_1, threshold1,
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1 using windows of size {size} and stride {stride}")
+    perf1 = patch_performances(dataset, evaluate, predictions_1, threshold1,
             size, stride)
-    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2")
-    perf2 = patch_performances(data, evaluate, predictions_2, threshold2,
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2 using windows of size {size} and stride {stride}")
+    perf2 = patch_performances(dataset, evaluate, predictions_2, threshold2,
             size, stride)
 
     ###### MAGIC STARTS #######
 
     # load all F1-scores for the given threshold
     da = perf1.f1_score
-    #import matplotlib
-    #matplotlib.use('macosx')
-    #import matplotlib.pyplot as plt
     db = perf2.f1_score
-    #plt.boxplot([da, db])
-    #plt.hist(numpy.array(da)-db, bins=6)
-    #plt.show()
-
     diff = da - db
+
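+    # quick visual sanity check: per-patch F1 distributions of both systems
+    # and their paired differences, saved to "analysis.pdf"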
+    import matplotlib
+
+    matplotlib.use("agg")  # non-interactive backend: figures are only saved
+    import matplotlib.pyplot as plt
+
+    plt.subplot(2, 2, 1)
+    plt.boxplot([da, db])
+    plt.title("Systems 1 and 2")
+    plt.subplot(2, 2, 2)
+    plt.boxplot(diff)
+    plt.title("Differences (1 - 2)")
+    plt.subplot(2, 1, 2)
+    plt.hist(diff, bins=50)
+    plt.title("Histogram (1 - 2)")
+    plt.savefig("analysis.pdf")
+
     #diff = diff[diff!=0.0]
     #click.echo(diff)