diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 287ce0fc81d42d2f8929f3919a46fb9a2b4a366e..1e4d5e8e0c9d907d5e5af990a6610db713a878fa 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -4,7 +4,6 @@
 import os
 import sys
 import click
-import typing
 
 from bob.extension.scripts.click_helper import (
     verbosity_option,
@@ -13,142 +12,232 @@ from bob.extension.scripts.click_helper import (
 )
 
 import numpy
-import scipy.stats
-
 import logging
 
 logger = logging.getLogger(__name__)
 
 
 from .evaluate import _validate_threshold, run as run_evaluation
-from ..engine.significance import patch_performances, visual_performances
+from ..engine.significance import (
+    patch_performances,
+    visual_performances,
+    write_analysis_text,
+    write_analysis_figures,
+    index_of_outliers,
+)
 
 
-def _index_of_outliers(c):
-    """Finds indexes of outlines (+/- 1.5*IQR) on a pandas dataframe column"""
+def _eval_patches(
+    system_name,
+    threshold,
+    evaluate,
+    preddir,
+    dataset,
+    steps,
+    size,
+    stride,
+    outdir,
+    figure,
+    nproc,
+):
+    """Calculates the patch performances on a dataset
 
-    iqr = c.quantile(0.75) - c.quantile(0.25)
-    limits = (c.quantile(0.25) - 1.5 * iqr, c.quantile(0.75) + 1.5 * iqr)
-    return (c < limits[0]) | (c > limits[1])
 
+    Parameters
+    ==========
 
-def _write_analysis_text(names, da, db, f):
-    """Writes a text file containing the most important statistics"""
+    system_name : str
+        The name of the current system being analyzed
 
-    diff = da - db
-    f.write("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:\n")
-    f.write(
-        f"* {names[0]}: {len(da)}" \
-        f" / {numpy.median(da):.3f}" \
-        f" / {numpy.mean(da):.3f}" \
-        f" / {numpy.std(da, ddof=1):.3f}\n"
-    )
-    f.write(
-        f"* {names[1]}: {len(db)}" \
-        f" / {numpy.median(db):.3f}" \
-        f" / {numpy.mean(db):.3f}" \
-        f" / {numpy.std(db, ddof=1):.3f}\n"
-    )
-    f.write(
-        f"* {names[0]}-{names[1]}: {len(diff)}" \
-        f" / {numpy.median(diff):.3f}" \
-        f" / {numpy.mean(diff):.3f}" \
-        f" / {numpy.std(diff, ddof=1):.3f}" \
-        f" / gaussian? p={scipy.stats.normaltest(diff)[1]:.3f}\n"
-    )
+    threshold : :py:class:`float` or :py:class:`str`
+        This value is used to define positives and negatives from probability
+        maps, and to report F1-scores (a priori).  By default, we expect a set
+        named 'validation' to be available at the input data.  If that is not
+        the case, we use 'train', if available.  You may provide the name of
+        another dataset to be used for threshold tuning otherwise.  If a
+        string is input, threshold tuning is done per system, individually.
+        Optionally, you may also provide a floating-point number between
+        [0.0, 1.0] as the threshold to use for both systems.
 
-    w, p = scipy.stats.ttest_rel(da, db)
-    f.write(
-        f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n"
-    )
+    evaluate : str
+        Name of the dataset key to use from ``dataset`` to evaluate (typically,
+        ``test``)
 
-    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
-    f.write(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n")
+    preddir : str
+        Root path to the predictions generated by system ``system_name``.  The
+        subpath actually used is this value suffixed with the value of
+        ``evaluate``: we search for ``<preddir>/<evaluate>/<stems>.hdf5``.
 
-    w, p = scipy.stats.wilcoxon(diff)
-    f.write(
-        f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}\n"
-    )
+    dataset : dict
+        A dictionary mapping string keys to
+        :py:class:`torch.utils.data.dataset.Dataset` instances
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f"Wilcoxon test (md({names[0]}) < md({names[1]})?): " \
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+    steps : int
+        The number of threshold steps to consider when evaluating the highest
+        possible F1-score on train/test data.
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f"Wilcoxon test (md({names[0]}) > md({names[1]})?): " \
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+    size : tuple
+        Two values indicating the size of windows to be used for patch
+        analysis.  The values represent height and width respectively
 
+    stride : tuple
+        Two values indicating the stride of windows to be used for patch
+        analysis.  The values represent height and width respectively
 
-def _write_analysis_figures(names, da, db, folder):
-    """Writes a PDF containing most important plots for analysis"""
+    outdir : str
+        Path where to store visualizations.  If set to ``None``, then do not
+        store performance visualizations.
 
-    from matplotlib.backends.backend_pdf import PdfPages
-    import matplotlib.pyplot as plt
+    figure : str
+        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
+        use when comparing performances
 
-    diff = da - db
-    bins = 50
-
-    fname = os.path.join(folder, "statistics.pdf")
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    with PdfPages(fname) as pdf:
-
-        plt.figure()
-        plt.grid()
-        plt.hist(da, bins=bins)
-        plt.title(
-            f"{names[0]} - scores (N={len(da)}; M={numpy.median(da):.3f}; "
-            f"$\mu$={numpy.mean(da):.3f}; $\sigma$={numpy.std(da, ddof=1):.3f})"
-        )
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.grid()
-        plt.hist(db, bins=bins)
-        plt.title(
-            f"{names[1]} - scores (N={len(db)}; M={numpy.median(db):.3f}; "
-            f"$\mu$={numpy.mean(db):.3f}; $\sigma$={numpy.std(db, ddof=1):.3f})"
+    nproc : int
+        Sets the number of parallel processes to use when running with
+        multiprocessing.  A value of zero uses all reported cores.  A value of
+        ``1`` completely avoids multiprocessing and runs all chores in the
+        current processing context.
+
+
+    Returns
+    =======
+
+    d : dict
+        A dictionary in which keys are filename stems and values are
+        dictionaries with the following contents:
+
+        ``df`` : :py:class:`pandas.DataFrame`
+            A dataframe with all the patch performances aggregated, for all
+            input images.
+
+        ``n`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the number of performance scores for
+            every pixel in the original image
+
+        ``avg`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the average performances for every
+            pixel on the input image considering the patch sizes and strides
+            applied when windowing the image
+
+        ``std`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the (unbiased) standard deviations for
+            the provided performance figure, for every pixel on the input image
+            considering the patch sizes and strides applied when windowing the
+            image
+
+    """
+
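+    # a non-float threshold names a dataset split (e.g. "validation") on
+    # which the threshold is tuned for this particular system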
+    if not isinstance(threshold, float):
+
+        assert threshold in dataset, f"No dataset named '{threshold}'"
+
+        logger.info(
+            f"Evaluating threshold on '{threshold}' set for "
+            f"'{system_name}' using {steps} steps"
         )
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.boxplot([da, db])
-        plt.title(f"{names[0]} and {names[1]} (N={len(da)})")
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.boxplot(diff)
-        plt.title(f"Differences ({names[0]} - {names[1]}) (N={len(da)})")
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.grid()
-        plt.hist(diff, bins=bins)
-        plt.title(
-            f"Systems ({names[0]} - {names[1]}) " \
-            f"(N={len(diff)}; M={numpy.median(diff):.3f}; " \
-            f"$\mu$={numpy.mean(diff):.3f}; " \
-            f"$\sigma$={numpy.std(diff, ddof=1):.3f})"
+        threshold = run_evaluation(
+            dataset[threshold], threshold, preddir, steps=steps
         )
-        pdf.savefig()
-        plt.close()
+        logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")
+
+    # for a given threshold on each system, calculate patch performances
+    logger.info(
+        f"Evaluating patch performances on '{evaluate}' set for "
+        f"'{system_name}' using windows of size {size} and stride {stride}"
+    )
+
+    return patch_performances(
+        dataset,
+        evaluate,
+        preddir,
+        threshold,
+        size,
+        stride,
+        figure,
+        nproc,
+        outdir,
+    )
+
+
+def _eval_differences(
+    perf1, perf2, evaluate, dataset, size, stride, outdir, figure, nproc
+):
+    """Evaluates the differences in patch performances between two systems
+
+    Parameters
+    ----------
+
+    perf1, perf2 : dict
+        A dictionary as returned by :py:func:`_eval_patches`
+
+    evaluate : str
+        Name of the dataset key to use from ``dataset`` to evaluate (typically,
+        ``test``)
 
-        p = scipy.stats.pearsonr(da, db)
-        plt.figure()
-        plt.grid()
-        plt.scatter(da, db, marker=".", color="black")
-        plt.xlabel("{names[0]}")
-        plt.ylabel("{names[1]}")
-        plt.title(f"Scatter (p={p[0]:.3f})")
-        pdf.savefig()
-        plt.close()
+    dataset : dict
+        A dictionary mapping string keys to
+        :py:class:`torch.utils.data.dataset.Dataset` instances
+
+    size : tuple
+        Two values indicating the size of windows to be used for patch
+        analysis.  The values represent height and width respectively
+
+    stride : tuple
+        Two values indicating the stride of windows to be used for patch
+        analysis.  The values represent height and width respectively
+
+    outdir : str
+        If set to ``None``, then do not output performance visualizations.
+        Otherwise, dumps the visualizations of the performance differences
+        between both systems into directory ``outdir``.
+
+    figure : str
+        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
+        use when comparing performances
+
+    nproc : int
+        Sets the number of parallel processes to use when running with
+        multiprocessing.  A value of zero uses all reported cores.  A value of
+        ``1`` completely avoids multiprocessing and runs all chores in the
+        current processing context.
+
+
+    Returns
+    -------
+
+    d : dict
+        A dictionary representing patch performance differences across all
+        files and patches.  The format of this is similar to the individual
+        inputs ``perf1`` and ``perf2``.
+
+    """
+
+    perf_diff = {k: perf1[k]["df"].copy() for k in perf1}
+
+    # performance figures that can be meaningfully subtracted, column by column
+    to_subtract = (
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score",
+    )
+
+    for k in perf_diff:
+        for col in to_subtract:
+            perf_diff[k][col] -= perf2[k]["df"][col]
+
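+    # dump visualizations of the differences (if ``outdir`` is set) and
+    # return the per-file difference performances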
+    return visual_performances(
+        dataset,
+        evaluate,
+        perf_diff,
+        size,
+        stride,
+        figure,
+        nproc,
+        outdir,
+    )
 
 
 @click.command(
@@ -265,8 +354,8 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--figure",
     "-f",
-    help="The name of a performance figure (e.g. f1_score) to use for "
-    "for comparing performances",
+    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
+    "use when comparing performances",
     default="f1_score",
     type=str,
     show_default=True,
@@ -285,8 +374,21 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--remove-outliers/--no-remove-outliers",
     "-R",
-    help="If set, removes outliers from both score distributions before " \
-         "running statistical analysis",
+    help="If set, removes outliers from both score distributions before "
+    "running statistical analysis.  Outlier removal follows a 1.5 IQR range "
+    "check from the difference in figures between both systems and assumes "
+    "most of the distribution is contained within that range (like in a "
+    "normal distribution)",
+    default=False,
+    required=True,
+    show_default=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--remove-zeros/--no-remove-zeros",
+    "-R",
+    help="If set, removes instances from the statistical analysis in which "
+    "both systems had a performance equal to zero.",
     default=False,
     required=True,
     show_default=True,
@@ -295,7 +397,7 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--parallel",
     "-x",
-    help="Set the number of parallel processes to use when running using " \
+    help="Set the number of parallel processes to use when running using "
     "multiprocessing.  A value of zero uses all reported cores.",
     default=1,
     type=int,
@@ -316,6 +418,7 @@ def significance(
     figure,
     output_folder,
     remove_outliers,
+    remove_zeros,
     parallel,
     **kwargs,
 ):
@@ -329,125 +432,81 @@ def significance(
     threshold = _validate_threshold(threshold, dataset)
     assert evaluate in dataset, f"No dataset named '{evaluate}'"
 
-    if isinstance(threshold, float):
-        threshold1 = threshold2 = threshold
-
-    else:  # it is a string, re-calculate it for each system individually
-
-        assert threshold in dataset, f"No dataset named '{threshold}'"
-
-        logger.info(
-            f"Evaluating threshold on '{threshold}' set for '{names[0]}' using {steps} steps"
-        )
-        threshold1 = run_evaluation(
-            dataset[threshold], threshold, predictions[0], steps=steps
-        )
-        logger.info(f"Set --threshold={threshold1:.5f} for '{names[0]}'")
-
-        logger.info(
-            f"Evaluating threshold on '{threshold}' set for '{names[1]}' using {steps} steps"
-        )
-        threshold2 = run_evaluation(
-            dataset[threshold], threshold, predictions[1], steps=steps
-        )
-        logger.info(f"Set --threshold={threshold2:.5f} for '{names[1]}'")
-
-    # for a given threshold on each system, calculate patch performances
-    logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for '{names[0]}' using windows of size {size} and stride {stride}"
-    )
-    dir1 = (
-        os.path.join(output_folder, names[0])
-        if output_folder is not None
-        else None
-    )
-    perf1 = patch_performances(
-        dataset,
+    perf1 = _eval_patches(
+        names[0],
+        threshold,
         evaluate,
         predictions[0],
-        threshold1,
+        dataset,
+        steps,
         size,
         stride,
+        (output_folder
+        if output_folder is None
+        else os.path.join(output_folder, names[0])),
         figure,
-        nproc=parallel,
-        outdir=dir1,
+        parallel,
     )
 
-    logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for '{names[1]}' using windows of size {size} and stride {stride}"
-    )
-    dir2 = (
-        os.path.join(output_folder, names[1])
-        if output_folder is not None
-        else None
-    )
-    perf2 = patch_performances(
-        dataset,
+    perf2 = _eval_patches(
+        names[1],
+        threshold,
         evaluate,
         predictions[1],
-        threshold2,
-        size,
-        stride,
-        figure,
-        nproc=parallel,
-        outdir=dir2,
-    )
-
-    perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1])
-    to_subtract = (
-        "precision",
-        "recall",
-        "specificity",
-        "accuracy",
-        "jaccard",
-        "f1_score",
-    )
-    for k in perf_diff:
-        for col in to_subtract:
-            perf_diff[k][col] -= perf2[k]["df"][col]
-    dirdiff = (
-        os.path.join(output_folder, "diff")
-        if output_folder is not None
-        else None
-    )
-    perf_diff = visual_performances(
         dataset,
-        evaluate,
-        perf_diff,
+        steps,
         size,
         stride,
+        (output_folder
+        if output_folder is None
+        else os.path.join(output_folder, names[1])),
         figure,
-        nproc=parallel,
-        outdir=dirdiff,
+        parallel,
     )
 
-    # loads all F1-scores for the given threshold
+    perf_diff = _eval_differences(
+        perf1,
+        perf2,
+        evaluate,
+        dataset,
+        size,
+        stride,
+        (output_folder
+        if output_folder is None
+        else os.path.join(output_folder, "diff")),
+        figure,
+        parallel,
+    )
+
+    # gathers the selected performance figure for every patch, at the tuned threshold
     stems = list(perf1.keys())
-    da = numpy.array([perf1[k]["df"].f1_score for k in stems]).flatten()
-    db = numpy.array([perf2[k]["df"].f1_score for k in stems]).flatten()
+    da = numpy.array([perf1[k]["df"][figure] for k in stems]).flatten()
+    db = numpy.array([perf2[k]["df"][figure] for k in stems]).flatten()
     diff = da - db
 
     while remove_outliers:
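+        # strip points beyond 1.5*IQR of the score differences and re-check,
+        # stopping once no further outliers are detected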
-        outliers_diff = _index_of_outliers(diff)
+        outliers_diff = index_of_outliers(diff)
         if sum(outliers_diff) == 0:
             break
         diff = diff[~outliers_diff]
         da = da[~outliers_diff]
         db = db[~outliers_diff]
 
-    # also remove cases in which both da and db are zero
-    remove_zeros = (da == 0) & (db == 0)
-    diff = diff[~remove_zeros]
-    da = da[~remove_zeros]
-    db = db[~remove_zeros]
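+    # optionally drop patches on which both systems scored exactly zero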
+    if remove_zeros:
+        zero_mask = (da == 0) & (db == 0)
+        diff = diff[~zero_mask]
+        da = da[~zero_mask]
+        db = db[~zero_mask]
 
     if output_folder is not None:
-        _write_analysis_figures(names, da, db, output_folder)
+        fname = os.path.join(output_folder, "analysis.pdf")
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+        write_analysis_figures(names, da, db, fname)
 
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
         with open(fname, "wt") as f:
-            _write_analysis_text(names, da, db, f)
+            write_analysis_text(names, da, db, f)
     else:
-        _write_analysis_text(names, da, db, sys.stdout)
+        write_analysis_text(names, da, db, sys.stdout)