diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 6bbed5c5520b97597ce9d27f37177ef64ed84ff2..a699cb5b29885f50c9ab654415262af308161aef 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -2,7 +2,6 @@
 # coding=utf-8
 
 import os
-import itertools
 import textwrap
 import multiprocessing
 
@@ -12,7 +11,6 @@ logger = logging.getLogger(__name__)
 import h5py
 from tqdm import tqdm
 import numpy
-import pandas
 import torch.nn
 import scipy.stats
 import tabulate
@@ -20,13 +18,24 @@ import tabulate
 from .evaluator import _sample_measures_for_threshold
 
 
-def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
+PERFORMANCE_FIGURES = [
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score",
+        ]
+"""List of performance figures supported by this module, in order"""
+
+
+def _performance_summary(size, winperf, winsize, winstride, figure):
     """Generates an array that represents the performance per pixel of the
     original image
 
     The returned array corresponds to a stacked version of performances for
-    each pixel in the original image taking into consideration the patch
-    performances, their size and stride.
+    each pixel in the original image taking into consideration the sliding
+    window performances, their size and stride.
 
 
     Parameters
@@ -36,17 +45,20 @@ def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
         A two tuple with the original height and width of the image being
         analyzed
 
-    patch_perf : typing.Sequence
-        An ordered sequence of patch performances (in raster direction - every
-        row, from left to right and then rows from top to bottom).
+    winperf : numpy.ndarray
+        A 3D array with shape ``(N, H, W)``, where ``N`` is the number of
+        performance measures supported by this module and ``(H, W)`` are the
+        number of vertical and horizontal sliding windows, respectively.
 
-    patch_size : tuple
-        A two tuple that indicates the size of each patch (height, width)
+    winsize : tuple
+        A two tuple that indicates the size of the sliding window (height,
+        width)
 
-    patch_stride: tuple
-        A two tuple that indicates the stride of each patch (height, width)
+    winstride : tuple
+        A two tuple that indicates the stride of the sliding window (height,
+        width)
 
-    figure: str
+    figure : str
         Name of the performance figure to use for the summary
 
 
@@ -59,14 +71,13 @@ def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
 
     avg : numpy.ndarray
         A 2D numpy array containing the average performances for every pixel on
-        the input image considering the patch sizes and strides applied when
-        windowing the image
+        the input image considering the sliding window sizes and strides
+        applied to the image
 
     std : numpy.ndarray
         A 2D numpy array containing the (unbiased) standard deviations for the
         provided performance figure, for every pixel on the input image
-        considering the patch sizes and strides applied when windowing the
-        image
+        considering the sliding window sizes and strides applied to the image
 
     """
 
@@ -77,33 +88,30 @@ def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
     # torch unfolding works exactly.  The last windows on the left and bottom
     # parts of the image may be extended with zeros.
     final_size = list(size)
-    rem = (size[0] - patch_size[0]) % patch_stride[0]
+    rem = (size[0] - winsize[0]) % winstride[0]
     if rem != 0:
-        final_size[0] += patch_stride[0] - rem
-    rem = (size[1] - patch_size[1]) % patch_stride[1]
+        final_size[0] += winstride[0] - rem
+    rem = (size[1] - winsize[1]) % winstride[1]
     if rem != 0:
-        final_size[1] += patch_stride[1] - rem
+        final_size[1] += winstride[1] - rem
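+    # example: for a (5, 5) image with winsize (4, 4) and winstride (2, 2),
+    # rem = (5 - 4) % 2 = 1 on each axis, so final_size becomes (6, 6), the
+    # same zero-padded extent used when unfolding the image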
     n = numpy.zeros(final_size, dtype=int)
-    ylen = ((final_size[0] - patch_size[0]) // patch_stride[0]) + 1
-    xlen = ((final_size[1] - patch_size[1]) // patch_stride[1]) + 1
 
     # calculates the stacked performance
     layers = int(
-        numpy.ceil(patch_size[0] / patch_stride[0])
-        * numpy.ceil(patch_size[1] / patch_stride[1])
-    )
-    perf = numpy.zeros(
-        [layers] + final_size, dtype=patch_perf[figure].iloc[0].dtype
+        numpy.ceil(winsize[0] / winstride[0])
+        * numpy.ceil(winsize[1] / winstride[1])
     )
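+    # ``layers`` is the maximum number of sliding windows that can overlap any
+    # single pixel, e.g. winsize (2, 2) with winstride (1, 1) gives
+    # ceil(2/1) * ceil(2/1) = 4 overlapping windows at most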
+    figindex = PERFORMANCE_FIGURES.index(figure)
+    perf = numpy.zeros([layers] + final_size, dtype=winperf.dtype)
     n = -1 * numpy.ones(final_size, dtype=int)
-    col = numpy.array(patch_perf[figure])
-    for j in range(ylen):
+    data = winperf[figindex]
+    for j in range(data.shape[0]):
         yup = slice(
-            patch_stride[0] * j, (patch_stride[0] * j) + patch_size[0], 1
+            winstride[0] * j, (winstride[0] * j) + winsize[0], 1
         )
-        for i in range(xlen):
+        for i in range(data.shape[1]):
             xup = slice(
-                patch_stride[1] * i, (patch_stride[1] * i) + patch_size[1], 1
+                winstride[1] * i, (winstride[1] * i) + winsize[1], 1
             )
             nup = n[yup, xup]
             nup += 1
@@ -112,7 +120,7 @@ def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
                 range(xup.start, xup.stop, xup.step),
                 indexing="ij",
             )
-            perf[nup.flat, yr.flat, xr.flat] = col[(j * xlen) + i]
+            perf[nup.flat, yr.flat, xr.flat] = data[j, i]
 
     # for each element in the ``perf``matrix, calculates avg and std.
     n += 1  # adjust for starting at -1 before
@@ -125,9 +133,9 @@ def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
     return n, avg, std
 
 
-def _patch_measures(pred, gt, threshold, size, stride):
+def _winperf_measures(pred, gt, threshold, size, stride):
     """
-    Calculates measures on patches of a single sample
+    Calculates measures on sliding windows of a single sample
 
 
     Parameters
@@ -140,7 +148,7 @@ def _patch_measures(pred, gt, threshold, size, stride):
         ground-truth (annotations)
 
     threshold : float
-        threshold to use for evaluating individual patch performances
+        threshold to use for evaluating individual sliding window performances
 
     size : tuple
         size (vertical, horizontal) for windows for which we will calculate
@@ -154,18 +162,15 @@ def _patch_measures(pred, gt, threshold, size, stride):
     Returns
     -------
 
-    measures : pandas.DataFrame
+    measures : numpy.ndarray
 
-        A pandas dataframe with the following columns:
+        A 3D float array with all supported performance entries for each
+        sliding window.
 
-        * patch: int
-        * threshold: float
-        * precision: float
-        * recall: float
-        * specificity: float
-        * accuracy: float
-        * jaccard: float
-        * f1_score: float
+        The first dimension of the array is therefore 6 (the number of
+        supported performance figures).  The other two dimensions correspond
+        to the number of vertical and horizontal sliding windows that result
+        from applying the given window size and stride to the input data.
 
     """
 
@@ -185,36 +190,20 @@ def _patch_measures(pred, gt, threshold, size, stride):
     gt_padded = torch.nn.functional.pad(gt.squeeze(0), padding)
 
     # this will create as many views as required
-    pred_patches = pred_padded.unfold(0, size[0], stride[0]).unfold(
+    pred_windows = pred_padded.unfold(0, size[0], stride[0]).unfold(
         1, size[1], stride[1]
     )
-    gt_patches = gt_padded.unfold(0, size[0], stride[0]).unfold(
+    gt_windows = gt_padded.unfold(0, size[0], stride[0]).unfold(
         1, size[1], stride[1]
     )
-    assert pred_patches.shape == gt_patches.shape
-    ylen, xlen, _, _ = pred_patches.shape
+    assert pred_windows.shape == gt_windows.shape
+    ylen, xlen, _, _ = pred_windows.shape
 
-    data = [
-        (j, i)
-        + _sample_measures_for_threshold(
-            pred_patches[j, i, :, :], gt_patches[j, i, :, :], threshold
+    retval = numpy.array([_sample_measures_for_threshold(
+            pred_windows[j, i, :, :], gt_windows[j, i, :, :], threshold
         )
-        for j, i in itertools.product(range(ylen), range(xlen))
-    ]
-
-    return pandas.DataFrame(
-        data,
-        columns=(
-            "y",
-            "x",
-            "precision",
-            "recall",
-            "specificity",
-            "accuracy",
-            "jaccard",
-            "f1_score",
-        ),
-    )
+        for j in range(ylen) for i in range(xlen)])
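+    # ``retval`` has shape (windows, measures); move the measures axis to the
+    # front and unflatten the window grid to obtain (measures, ylen, xlen)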
+    return retval.transpose(1, 0).reshape(len(PERFORMANCE_FIGURES), ylen, xlen)
 
 
 def _visual_dataset_performance(stem, img, n, avg, std, outdir):
@@ -286,11 +275,11 @@ def _visual_dataset_performance(stem, img, n, avg, std, outdir):
     plt.close(fig)
 
 
-def _patch_performances_for_sample(
+def _winperf_for_sample(
     basedir, threshold, size, stride, dataset, k, figure, outdir,
 ):
     """
-    Evaluates patch performances per sample
+    Evaluates sliding window performances per sample
 
 
     Parameters
@@ -317,16 +306,17 @@ def _patch_performances_for_sample(
 
     k : int
         the sample number (order inside the dataset, starting from zero), to
-        calculate patch performances for
+        calculate sliding window performances for
 
     figure : str
-        the performance figure to use for calculating patch micro performances
-        (e.g. `accuracy`, `f1_score` or `jaccard`).  Must be available on the
-        produced performance dataframe.
+        the performance figure to use for calculating sliding window micro
+        performances (e.g. `accuracy`, `f1_score` or `jaccard`).  Must be
+        a supported performance figure as defined in
+        :py:attr:`PERFORMANCE_FIGURES`
 
     outdir : str
-        path were to save a visual representation of patch performances.  If
-        set to ``None``, then do not save those to disk.
+        path where to save a visual representation of sliding window
+        performances.  If set to ``None``, then do not save those to disk.
 
 
     Returns
@@ -338,8 +328,8 @@ def _patch_performances_for_sample(
     data : dict
         A dictionary containing the following fields:
 
-        * ``df``: a :py:class:`pandas.DataFrame` with the patch performance
-          figures in raster scan order.
+        * ``winperf``: a 3D :py:class:`numpy.ndarray` with the sliding window
+          performance figures
         * ``n``: a 2D integer :py:class:`numpy.ndarray` with the same size as
           the original image pertaining to the analyzed sample, that indicates
           how many overlapping windows are available for each pixel in the
@@ -356,16 +346,16 @@ def _patch_performances_for_sample(
     sample = dataset[k]
     with h5py.File(os.path.join(basedir, sample[0] + ".hdf5"), "r") as f:
         pred = torch.from_numpy(f["array"][:])
-    df = _patch_measures(pred, sample[2], threshold, size, stride)
+    winperf = _winperf_measures(pred, sample[2], threshold, size, stride)
     n, avg, std = _performance_summary(
-        sample[1].shape[1:], df, size, stride, figure
+        sample[1].shape[1:], winperf, size, stride, figure
     )
     if outdir is not None:
         _visual_dataset_performance(sample[0], sample[1], n, avg, std, outdir)
-    return sample[0], dict(df=df, n=n, avg=avg, std=std)
+    return sample[0], dict(winperf=winperf, n=n, avg=avg, std=std)
 
 
-def patch_performances(
+def sliding_window_performances(
     dataset,
     name,
     predictions_folder,
@@ -377,7 +367,7 @@ def patch_performances(
     outdir=None,
 ):
     """
-    Evaluates the performances for multiple image patches, for a whole dataset
+    Evaluates sliding window performances for a whole dataset
 
 
     Parameters
@@ -407,7 +397,8 @@ def patch_performances(
         partial performances based on the threshold and existing ground-truth
 
     figure : str
-        the performance figure to use for calculating patch micro performances
+        the performance figure to use for calculating sliding window micro
+        performances
 
     nproc : :py:class:`int`, Optional
         the number of processing cores to use for performance evaluation.
@@ -417,8 +408,8 @@ def patch_performances(
         multiprocessing.
 
     outdir : :py:class:`str`, Optional
-        path were to save a visual representation of patch performances.  If
-        set to ``None``, then do not save those to disk.
+        path where to save a visual representation of sliding window
+        performances.  If set to ``None``, then do not save those to disk.
 
 
     Returns
@@ -428,23 +419,21 @@ def patch_performances(
         A dictionary in which keys are filename stems and values are
         dictionaries with the following contents:
 
-        ``df``: :py:class:`pandas.DataFrame`
-            A dataframe with all the patch performances aggregated, for all
-            input images.
+        ``winperf``: numpy.ndarray
+            A 3D float array with all the sliding window performances for the
+            input image
 
-        ``n`` : :py:class:`numpy.ndarray`
+        ``n`` : numpy.ndarray
             A 2D numpy array containing the number of performance scores for
             every pixel in the original image
 
         ``avg`` : :py:class:`numpy.ndarray`
             A 2D numpy array containing the average performances for every
-            pixel on the input image considering the patch sizes and strides
-            applied when windowing the image
+            pixel on the input image considering the sliding window sizes and
+            strides applied to the image
 
         ``std`` : :py:class:`numpy.ndarray`
             A 2D numpy array containing the (unbiased) standard deviations for
             the provided performance figure, for every pixel on the input image
-            considering the patch sizes and strides applied when windowing the
+            considering the sliding window sizes and strides applied to the
             image
 
     """
@@ -453,7 +442,7 @@ def patch_performances(
     if not os.path.exists(use_predictions_folder):
         use_predictions_folder = predictions_folder
 
-    with tqdm(range(len(dataset[name])), desc="patch-perf") as pbar:
+    with tqdm(range(len(dataset[name])), desc="sld-win-perf") as pbar:
         # we avoid the multiprocessing module if nproc==1
         # so it is easier to run ipdb
         if nproc != 1:
@@ -462,7 +451,7 @@ def patch_performances(
             pool = multiprocessing.Pool(nproc)
             results = [
                 pool.apply_async(
-                    _patch_performances_for_sample,
+                    _winperf_for_sample,
                     args=(
                         use_predictions_folder,
                         threshold,
@@ -483,7 +472,7 @@ def patch_performances(
         else:
             data = []
             for k in pbar:
-                df = _patch_performances_for_sample(
+                winperf = _winperf_for_sample(
                     use_predictions_folder,
                     threshold,
                     size,
@@ -493,21 +482,22 @@ def patch_performances(
                     figure,
                     outdir,
                 )
-                data.append(df)
+                data.append(winperf)
 
     return dict(data)
 
 
 def _visual_performances_for_sample(
-    size, stride, dataset, k, df, figure, outdir
+    size, stride, dataset, k, winperf, figure, outdir
 ):
     """
-    Displays patch performances per sample
+    Displays sliding window performances per sample
 
-    This is a simplified version of :py:func:`_patch_performances_for_sample`
-    in which the patch performances are not recalculated and used as input.  It
-    can be used in case you have the patch performances stored in disk or if
-    you're evaluating differences between patches of 2 different systems.
+    This is a simplified version of :py:func:`_winperf_for_sample`
+    in which the sliding window performances are not recalculated, but used as
+    input.  It can be used in case you have the sliding window performances
+    stored on disk or if you're evaluating differences between sliding windows
+    of two different systems.
 
 
     Parameters
@@ -526,20 +516,21 @@ def _visual_performances_for_sample(
 
     k : int
         the sample number (order inside the dataset, starting from zero), to
-        calculate patch performances for
+        calculate sliding window performances for
 
-    df : pandas.DataFrame
-        the previously calculated dataframe to use for this patch performance
+    winperf : numpy.ndarray
+        the previously calculated sliding window performances to use for this
         assessment.
 
     figure : str
-        the performance figure to use for calculating patch micro performances
-        (e.g. `f1_score` or `jaccard`).  Must be available on the produced
-        performance dataframe.
+        the performance figure to use for calculating sliding window micro
+        performances (e.g. `accuracy`, `f1_score` or `jaccard`).  Must be
+        a supported performance figure as defined in
+        :py:attr:`PERFORMANCE_FIGURES`
 
     outdir : :py:class:`str`
-        path were to save a visual representation of patch performances.  If
-        set to ``None``, then do not save those to disk.
+        path where to save a visual representation of sliding window
+        performances.  If set to ``None``, then do not save those to disk.
 
 
     Returns
@@ -551,9 +542,9 @@ def _visual_performances_for_sample(
     data : dict
         A dictionary containing the following fields:
 
-        * ``df``: a :py:class:`pandas.DataFrame` with the patch performance
-          figures in raster scan order.  Notice this is just a copy of the
-          input data frame with the same name.
+        * ``winperf``: a 3D float :py:class:`numpy.ndarray` with the sliding
+          window performance figures.  Notice this is just a copy of the
+          ``winperf`` input parameter.
         * ``n``: a 2D integer :py:class:`numpy.ndarray` with the same size as
           the original image pertaining to the analyzed sample, that indicates
           how many overlapping windows are available for each pixel in the
@@ -569,23 +560,24 @@ def _visual_performances_for_sample(
 
     sample = dataset[k]
     n, avg, std = _performance_summary(
-        sample[1].shape[1:], df, size, stride, figure
+        sample[1].shape[1:], winperf, size, stride, figure
     )
     if outdir is not None:
         _visual_dataset_performance(sample[0], sample[1], n, avg, std, outdir)
-    return sample[0], dict(df=df, n=n, avg=avg, std=std)
+    return sample[0], dict(winperf=winperf, n=n, avg=avg, std=std)
 
 
 def visual_performances(
-    dataset, name, dfs, size, stride, figure, nproc=1, outdir=None,
+    dataset, name, winperfs, size, stride, figure, nproc=1, outdir=None,
 ):
     """
-    Displays the performances for multiple image patches, for a whole dataset
+    Displays the sliding window performances for a whole dataset
 
-    This is a simplified version of :py:func:`patch_performances` in which the
-    patch performances are not recalculated and used as input.  It can be used
-    in case you have the patch performances stored in disk or if you're
-    evaluating differences between patches of 2 different systems.
+    This is a simplified version of :py:func:`sliding_window_performances` in
+    which the sliding window performances are not recalculated, but used as
+    input.  It can be used in case you have the sliding window performances
+    stored on disk or if you're evaluating differences between sliding windows
+    of two different systems.
 
 
     Parameters
@@ -598,9 +590,9 @@ def visual_performances(
         the local name of this dataset (e.g. ``train``, or ``test``), to be
         used when saving measures files.
 
-    dfs : dict
-        a dictionary mapping dataset stems to dataframes containing the patch
-        performances to be evaluated
+    winperfs : dict
+        a dictionary mapping dataset stems to arrays containing the sliding
+        window performances to be evaluated
 
     size : tuple
         size (vertical, horizontal) for windows for which we will calculate
@@ -611,7 +603,8 @@ def visual_performances(
         partial performances based on the threshold and existing ground-truth
 
     figure : str
-        the performance figure to use for calculating patch micro performances
+        the performance figure to use for calculating sliding window micro
+        performances
 
     nproc : :py:class:`int`, Optional
         the number of processing cores to use for performance evaluation.
@@ -621,8 +614,8 @@ def visual_performances(
         multiprocessing.
 
     outdir : :py:class:`str`, Optional
-        path were to save a visual representation of patch performances.  If
-        set to ``None``, then do not save those to disk.
+        path where to save a visual representation of sliding window
+        performances.  If set to ``None``, then do not save those to disk.
 
 
     Returns
@@ -632,23 +625,23 @@ def visual_performances(
         A dictionary in which keys are filename stems and values are
         dictionaries with the following contents:
 
-        ``df``: :py:class:`pandas.DataFrame`
-            A dataframe with all the patch performances aggregated, for all
-            input images.
+        ``winperf``: numpy.ndarray
+            A 3D float array with all the sliding window performances for the
+            input image
 
-        ``n`` : :py:class:`numpy.ndarray`
+        ``n`` : numpy.ndarray
             A 2D numpy array containing the number of performance scores for
             every pixel in the original image
 
-        ``avg`` : :py:class:`numpy.ndarray`
+        ``avg`` : numpy.ndarray
             A 2D numpy array containing the average performances for every
-            pixel on the input image considering the patch sizes and strides
-            applied when windowing the image
+            pixel on the input image considering the sliding window sizes and
+            strides applied to the image
 
         ``std`` : :py:class:`numpy.ndarray`
             A 2D numpy array containing the (unbiased) standard deviations for
             the provided performance figure, for every pixel on the input image
-            considering the patch sizes and strides applied when windowing the
+            considering the sliding window sizes and strides applied to the
             image
 
     """
@@ -670,7 +663,7 @@ def visual_performances(
                         stride,
                         dataset[name],
                         k,
-                        dfs[stems[k]],
+                        winperfs[stems[k]],
                         figure,
                         outdir,
                     ),
@@ -684,22 +677,22 @@ def visual_performances(
         else:
             data = []
             for k in pbar:
-                df = _visual_performances_for_sample(
+                winperf = _visual_performances_for_sample(
                     size,
                     stride,
                     dataset[name],
                     k,
-                    dfs[stems[k]],
+                    winperfs[stems[k]],
                     figure,
                     outdir,
                 )
-                data.append(df)
+                data.append(winperf)
 
     return dict(data)
 
 
 def index_of_outliers(c):
-    """Finds indexes of outliers (+/- 1.5*IQR) on a pandas dataframe column
+    """Finds indexes of outliers (+/- 1.5*IQR) on an array of random values
 
     The IQR measures the midspread or where 50% of a normal distribution would
     sit, if the input data is, indeed, normal.  1.5 IQR corresponds to a
@@ -711,32 +704,32 @@ def index_of_outliers(c):
     Parameters
     ----------
 
-    c : pandas.DataFrame
-        This should be a **single** column of a pandas dataframe with the
-        ``quantile`` method
+    c : numpy.ndarray
+        A 1D float array
 
 
     Returns
     -------
 
-    indexes : typing.Sequence
+    indexes : numpy.ndarray
         Indexes of the input column that are considered outliers in the
         distribution (outside the 1.5 Interquartile Range).
 
     """
 
-    iqr = c.quantile(0.75) - c.quantile(0.25)
-    limits = (c.quantile(0.25) - 1.5 * iqr, c.quantile(0.75) + 1.5 * iqr)
+    l, h = numpy.quantile(c, (0.25, 0.75))
+    iqr = h - l
+    limits = (l - 1.5 * iqr, h + 1.5 * iqr)
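+    # the return value is a boolean mask in which ``True`` flags an outlier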
     return (c < limits[0]) | (c > limits[1])
 
 
 def write_analysis_text(names, da, db, f):
     """Writes a text file containing the most important statistics
 
-    Compares patch performances in ``da`` and ``db`` taking into consideration
-    their statistical properties.  A significance test is applied to check
-    whether observed differences in the statistics of both distributions is
-    significant.
+    Compares sliding window performances in ``da`` and ``db`` taking into
+    consideration their statistical properties.  A significance test is applied
+    to check whether observed differences in the statistics of both
+    distributions are significant.
 
 
     Parameters
@@ -747,14 +740,14 @@ def write_analysis_text(names, da, db, f):
         analyzed
 
     da : numpy.ndarray
-        A 1D numpy array containing all the performance figures per patch
-        analyzed and organized in a particular order (raster), for the first
-        system (first entry of ``names``)
+        A 1D numpy array containing all the performance figures per sliding
+        window analyzed and organized in a particular order (raster), for the
+        first system (first entry of ``names``)
 
     db : numpy.ndarray
-        A 1D numpy array containing all the performance figures per patch
-        analyzed and organized in a particular order (raster), for the second
-        system (second entry of ``names``)
+        A 1D numpy array containing all the performance figures per sliding
+        window analyzed and organized in a particular order (raster), for the
+        second system (second entry of ``names``)
 
     f : file
         An open file that will be used dump the analysis to
@@ -802,7 +795,7 @@ def write_analysis_text(names, da, db, f):
     f.write(textwrap.indent(tdata, "  "))
     f.write("\n")
 
-    # Note: dependent variable = patch performance figure in our case
+    # Note: dependent variable = sliding window performance figure in our case
     # Assumptions of a Paired T-test:
     # * The dependent variable must be continuous (interval/ratio). [OK]
     # * The observations are independent of one another. [OK]
@@ -811,10 +804,11 @@ def write_analysis_text(names, da, db, f):
 
     if (diff == 0.0).all():
         logger.error("Differences are exactly zero between both "
-                "patch distributions, for **all** samples.  Statistical "
-                "significance tests are not meaningful in this context and "
-                "will be skipped.  This typically indicates an issue with "
-                "the setup of prediction folders (duplicated?)")
+                "sliding window distributions, for **all** samples. "
+                "Statistical significance tests are not meaningful in "
+                "this context and will be skipped.  This typically "
+                "indicates an issue with the setup of prediction folders "
+                "(duplicated?)")
         return
 
     f.write("\nPaired significance tests:\n")
@@ -851,14 +845,14 @@ def write_analysis_figures(names, da, db, fname):
         analyzed
 
     da : numpy.ndarray
-        A 1D numpy array containing all the performance figures per patch
-        analyzed and organized in a particular order (raster), for the first
-        system (first entry of ``names``)
+        A 1D numpy array containing all the performance figures per sliding
+        window analyzed and organized in a particular order (raster), for the
+        first system (first entry of ``names``)
 
     db : numpy.ndarray
-        A 1D numpy array containing all the performance figures per patch
-        analyzed and organized in a particular order (raster), for the second
-        system (second entry of ``names``)
+        A 1D numpy array containing all the performance figures per sliding
+        window analyzed and organized in a particular order (raster), for the
+        second system (second entry of ``names``)
 
     fname : str
         The filename to use for storing the summarized performance figures
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 9e409b412d7a42254d133e1373e9ed90653c9371..cf387c9bccbce843f7ce329b6274dd64bd043c7c 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -19,15 +19,16 @@ logger = logging.getLogger(__name__)
 
 from .evaluate import _validate_threshold, run as run_evaluation
 from ..engine.significance import (
-    patch_performances,
+    sliding_window_performances,
     visual_performances,
     write_analysis_text,
     write_analysis_figures,
     index_of_outliers,
+    PERFORMANCE_FIGURES,
 )
 
 
-def _eval_patches(
+def _eval_sliding_windows(
     system_name,
     threshold,
     evaluate,
@@ -41,7 +42,7 @@ def _eval_patches(
     nproc,
     checkpointdir,
 ):
-    """Calculates the patch performances on a dataset
+    """Calculates the sliding window performances on a dataset
 
 
     Parameters
@@ -79,12 +80,12 @@ def _eval_patches(
         possible F1-score on train/test data.
 
     size : tuple
-        Two values indicating the size of windows to be used for patch
-        analysis.  The values represent height and width respectively
+        Two values indicating the size of windows to be used for the sliding
+        window analysis.  The values represent height and width respectively
 
     stride : tuple
-        Two values indicating the stride of windows to be used for patch
-        analysis.  The values represent height and width respectively
+        Two values indicating the stride of windows to be used for the sliding
+        window analysis.  The values represent height and width respectively
 
     outdir : str
         Path where to store visualizations.  If set to ``None``, then do not
@@ -102,7 +103,7 @@ def _eval_patches(
 
     checkpointdir : str
         If set to a string (instead of ``None``), then stores a cached version
-        of the patch performances on disk, for a particular system.
+        of the sliding window performances on disk, for a particular system.
 
 
     Returns
@@ -112,45 +113,49 @@ def _eval_patches(
         A dictionary in which keys are filename stems and values are
         dictionaries with the following contents:
 
-        ``df``: :py:class:`pandas.DataFrame`
-            A dataframe with all the patch performances aggregated, for all
-            input images.
+        ``winperf``: numpy.ndarray
+            A 3D float array with all the sliding window performances for the
+            input image
 
-        ``n`` : :py:class:`numpy.ndarray`
+        ``n`` : numpy.ndarray
             A 2D numpy array containing the number of performance scores for
             every pixel in the original image
 
-        ``avg`` : :py:class:`numpy.ndarray`
+        ``avg`` : numpy.ndarray
             A 2D numpy array containing the average performances for every
-            pixel on the input image considering the patch sizes and strides
-            applied when windowing the image
+            pixel on the input image considering the sliding window sizes and
+            strides applied to the image
 
-        ``std`` : :py:class:`numpy.ndarray`
+        ``std`` : numpy.ndarray
             A 2D numpy array containing the (unbiased) standard deviations for
             the provided performance figure, for every pixel on the input image
-            considering the patch sizes and strides applied when windowing the
+            considering the sliding window sizes and strides applied to the
             image
 
     """
 
     if checkpointdir is not None:
-        chkpt_fname = os.path.join(checkpointdir,
-                f"{system_name}-{evaluate}-{threshold}-" \
-                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz"
-                )
+        chkpt_fname = os.path.join(
+            checkpointdir,
+            f"{system_name}-{evaluate}-{threshold}-"
+            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
+        )
         os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
         if os.path.exists(chkpt_fname):
             logger.info(f"Loading checkpoint from {chkpt_fname}...")
             # loads and returns checkpoint from file
             try:
-                with __import__('gzip').GzipFile(chkpt_fname, "r") as f:
-                    return __import__('pickle').load(f)
+                with __import__("gzip").GzipFile(chkpt_fname, "r") as f:
+                    return __import__("pickle").load(f)
             except EOFError as e:
-                logger.warning(f"Could not load patch performance from " \
-                        f"{chkpt_fname}: {e}. Calculating...")
+                logger.warning(
+                    f"Could not load sliding window performance "
+                    f"from {chkpt_fname}: {e}. Calculating..."
+                )
         else:
-            logger.debug(f"Checkpoint not available at {chkpt_fname}. " \
-                    f"Calculating...")
+            logger.debug(
+                f"Checkpoint not available at {chkpt_fname}. " f"Calculating..."
+            )
     else:
         chkpt_fname = None
 
@@ -167,13 +172,13 @@ def _eval_patches(
         )
         logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")
 
-    # for a given threshold on each system, calculate patch performances
+    # for a given threshold on each system, calculate sliding window performances
     logger.info(
-        f"Evaluating patch '{figure}' on '{evaluate}' set for "
+        f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
         f"'{system_name}' using windows of size {size} and stride {stride}"
     )
 
-    retval = patch_performances(
+    retval = sliding_window_performances(
         dataset,
         evaluate,
         preddir,
@@ -185,18 +190,28 @@ def _eval_patches(
         outdir,
     )
 
-    # cache patch performance for later use, if necessary
+    # cache sliding window performance for later use, if necessary
     if chkpt_fname is not None:
         logger.debug(f"Storing checkpoint at {chkpt_fname}...")
-        with __import__('gzip').GzipFile(chkpt_fname, "w") as f:
-            __import__('pickle').dump(retval, f)
+        with __import__("gzip").GzipFile(chkpt_fname, "w") as f:
+            __import__("pickle").dump(retval, f)
 
     return retval
 
 
-def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
-        figure, nproc, checkpointdir):
-    """Evaluate differences in the performance patches between two systems
+def _eval_differences(
+    names,
+    perfs,
+    evaluate,
+    dataset,
+    size,
+    stride,
+    outdir,
+    figure,
+    nproc,
+    checkpointdir,
+):
+    """Evaluate differences in the performance sliding windows between two systems
 
     Parameters
     ----------
@@ -205,8 +220,8 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
         Names of the first and second systems
 
     perfs : :py:class:`tuple` of :py:class:`dict`
-        Dictionaries for the patch performances of each system, as returned by
-        :py:func:`_eval_patches`
+        Dictionaries for the sliding window performances of each system, as
+        returned by :py:func:`_eval_sliding_windows`
 
     evaluate : str
         Name of the dataset key to use from ``dataset`` to evaluate (typically,
@@ -217,12 +232,12 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
         :py:class:`torch.utils.data.dataset.Dataset` instances
 
     size : tuple
-        Two values indicating the size of windows to be used for patch
+        Two values indicating the size of windows to be used for sliding window
         analysis.  The values represent height and width respectively
 
     stride : tuple
-        Two values indicating the stride of windows to be used for patch
-        analysis.  The values represent height and width respectively
+        Two values indicating the stride of windows to be used for sliding
+        window analysis.  The values represent height and width respectively
 
     outdir : str
         If set to ``None``, then do not output performance visualizations.
@@ -241,80 +256,65 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
 
     checkpointdir : str
         If set to a string (instead of ``None``), then stores a cached version
-        of the patch performances on disk, for a particular difference between
-        systems.
+        of the sliding window performances on disk, for a particular difference
+        between systems.
 
 
     Returns
     -------
 
     d : dict
-        A dictionary representing patch performance differences across all
-        files and patches.  The format of this is similar to the individual
-        inputs ``perf1`` and ``perf2``.
+        A dictionary representing sliding window performance differences across
+        all files and sliding windows.  The format of this is similar to each
+        of the entries in ``perfs``.
 
     """
 
     if checkpointdir is not None:
-        chkpt_fname = os.path.join(checkpointdir,
-                f"{names[0]}-{names[1]}-{evaluate}-" \
-                f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz"
-                )
+        chkpt_fname = os.path.join(
+            checkpointdir,
+            f"{names[0]}-{names[1]}-{evaluate}-"
+            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
+        )
         os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
         if os.path.exists(chkpt_fname):
             logger.info(f"Loading checkpoint from {chkpt_fname}...")
             # loads and returns checkpoint from file
             try:
-                with __import__('gzip').GzipFile(chkpt_fname, "r") as f:
-                    return __import__('pickle').load(f)
+                with __import__("gzip").GzipFile(chkpt_fname, "r") as f:
+                    return __import__("pickle").load(f)
             except EOFError as e:
-                logger.warning(f"Could not load patch performance from " \
-                        f"{chkpt_fname}: {e}. Calculating...")
+                logger.warning(
+                    f"Could not load sliding window performance "
+                    f"from {chkpt_fname}: {e}. Calculating..."
+                )
         else:
-            logger.debug(f"Checkpoint not available at {chkpt_fname}. " \
-                    f"Calculating...")
+            logger.debug(
+                f"Checkpoint not available at {chkpt_fname}. " f"Calculating..."
+            )
     else:
         chkpt_fname = None
 
-    perf_diff = dict([(k, perfs[0][k]["df"].copy()) for k in perfs[0]])
-
-    # we can subtract these
-    to_subtract = (
-        "precision",
-        "recall",
-        "specificity",
-        "accuracy",
-        "jaccard",
-        "f1_score",
+    perf_diff = dict(
+        [(k, perfs[0][k]["winperf"] - perfs[1][k]["winperf"]) for k in perfs[0]]
     )
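+    # each per-sample difference keeps the (N, H, W) ``winperf`` layout, so it
+    # can be passed unchanged to visual_performances() below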
 
-    for k in perf_diff:
-        for col in to_subtract:
-            perf_diff[k][col] -= perfs[1][k]["df"][col]
-
-    # for a given threshold on each system, calculate patch performances
+    # for a given threshold on each system, calculate sliding window performances
     logger.info(
-        f"Evaluating patch '{figure}' differences on '{evaluate}' set on "
-        f"'{names[0]}-{names[1]}' using windows of size {size} and "
+        f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
+        f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
         f"stride {stride}"
     )
 
     retval = visual_performances(
-        dataset,
-        evaluate,
-        perf_diff,
-        size,
-        stride,
-        figure,
-        nproc,
-        outdir,
+        dataset, evaluate, perf_diff, size, stride, figure, nproc, outdir,
     )
 
-    # cache patch performance for later use, if necessary
+    # cache sliding window performance for later use, if necessary
     if chkpt_fname is not None:
         logger.debug(f"Storing checkpoint at {chkpt_fname}...")
-        with __import__('gzip').GzipFile(chkpt_fname, "w") as f:
-            __import__('pickle').dump(retval, f)
+        with __import__("gzip").GzipFile(chkpt_fname, "w") as f:
+            __import__("pickle").dump(retval, f)
 
     return retval
 
@@ -408,8 +408,8 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
     "--size",
     "-s",
     help="This is a tuple with two values indicating the size of windows to "
-    "be used for patch analysis.  The values represent height and width "
-    "respectively.",
+    "be used for sliding window analysis.  The values represent height and "
+    "width respectively.",
     default=(128, 128),
     nargs=2,
     type=int,
@@ -421,8 +421,8 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
     "--stride",
     "-t",
     help="This is a tuple with two values indicating the stride of windows to "
-    "be used for patch analysis.  The values represent height and width "
-    "respectively.",
+    "be used for sliding window analysis.  The values represent height and "
+    "width respectively.",
     default=(32, 32),
     nargs=2,
     type=int,
@@ -487,7 +487,8 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
 @click.option(
     "--checkpoint-folder",
     "-k",
-    help="Path where to store checkpointed versions of patch performances",
+    help="Path where to store checkpointed versions of sliding window "
+    "performances",
     required=False,
     type=click.Path(),
     show_default=True,
@@ -521,7 +522,7 @@ def significance(
     threshold = _validate_threshold(threshold, dataset)
     assert evaluate in dataset, f"No dataset named '{evaluate}'"
 
-    perf1 = _eval_patches(
+    perf1 = _eval_sliding_windows(
         names[0],
         threshold,
         evaluate,
@@ -530,15 +531,17 @@ def significance(
         steps,
         size,
         stride,
-        (output_folder
-        if output_folder is None
-        else os.path.join(output_folder, names[0])),
+        (
+            output_folder
+            if output_folder is None
+            else os.path.join(output_folder, names[0])
+        ),
         figure,
         parallel,
         checkpoint_folder,
     )
 
-    perf2 = _eval_patches(
+    perf2 = _eval_sliding_windows(
         names[1],
         threshold,
         evaluate,
@@ -547,33 +550,38 @@ def significance(
         steps,
         size,
         stride,
-        (output_folder
+        (
+            output_folder
             if output_folder is None
-            else os.path.join(output_folder, names[1])),
+            else os.path.join(output_folder, names[1])
+        ),
         figure,
         parallel,
         checkpoint_folder,
     )
 
     perf_diff = _eval_differences(
-            names,
-            (perf1, perf2),
-            evaluate,
-            dataset,
-            size,
-            stride,
-            (output_folder
-                if output_folder is None
-                else os.path.join(output_folder, "diff")),
-            figure,
-            parallel,
-            checkpoint_folder,
-            )
+        names,
+        (perf1, perf2),
+        evaluate,
+        dataset,
+        size,
+        stride,
+        (
+            output_folder
+            if output_folder is None
+            else os.path.join(output_folder, "diff")
+        ),
+        figure,
+        parallel,
+        checkpoint_folder,
+    )
 
     # loads all figures for the given threshold
     stems = list(perf1.keys())
-    da = numpy.array([perf1[k]["df"][figure] for k in stems]).flatten()
-    db = numpy.array([perf2[k]["df"][figure] for k in stems]).flatten()
+    figindex = PERFORMANCE_FIGURES.index(figure)
+    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
+    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
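+    # ``da`` and ``db`` hold the chosen figure for every sliding window of
+    # every sample, flattened to 1D so the difference and significance
+    # analysis below compare the two systems window by window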
     diff = da - db
 
     while remove_outliers:
diff --git a/bob/ip/binseg/test/test_significance.py b/bob/ip/binseg/test/test_significance.py
index 0e5c5dd5507ed8a8250f87fe804c63272c1dc8cd..7d00ff8baaea23f3ce881dc6507d75cebd01ef61 100755
--- a/bob/ip/binseg/test/test_significance.py
+++ b/bob/ip/binseg/test/test_significance.py
@@ -9,159 +9,220 @@ import pandas
 import nose.tools
 import torch
 
-from ..engine.significance import _patch_measures
+from ..engine.significance import (
+    _winperf_measures,
+    _performance_summary,
+    PERFORMANCE_FIGURES,
+)
 from ..utils.measure import base_measures
 
 
-def _check_patch_measures(pred, gt, threshold, size, stride, expected):
+def _check_window_measures(pred, gt, threshold, size, stride, expected):
 
     pred = torch.tensor(pred)
     gt = torch.tensor(gt)
-    actual = _patch_measures(pred, gt, threshold, size, stride)
+    actual = _winperf_measures(pred, gt, threshold, size, stride)
 
     # transforms tp,tn,fp,fn through base_measures()
-    expected = pandas.DataFrame([k[:2] + base_measures(*k[2:]) for k in expected],
-            columns=[
-                "y",
-                "x",
-                "precision",  # tp/(tp+fp)
-                "recall",  # tpr = tp/p = tp/(tp+fn)
-                "specificity",  # tnr = tn/n = tn/(tn+fp)
-                "accuracy",  # (tp+tn)/(p+n) = (tp+tn)/(tp+fn+tn+fp)
-                "jaccard",  #  f1/(2-f1) = tp/(tp+fp+fn)
-                "f1_score",  # 2*rp/(r+p) = 2*tp/(2*tp+fp+fn)
-                ])
-
-    assert (actual == expected).all().all(), f"Actual output:\n{actual}\n " \
-            f"**!=** Expected output:\n{expected}"
-
-
-def test_patch_measures_alltrue():
-
-    pred = numpy.ones((4,4), dtype=float)
-    gt = numpy.ones((4,4), dtype=bool)
+    expected_shape = numpy.array(expected).shape[:2]
+    expected = numpy.array([base_measures(*c) for r in expected for c in r]).T
+    expected = expected.reshape((len(PERFORMANCE_FIGURES),) + expected_shape)
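+    # ``expected`` now has shape (N, ylen, xlen), i.e. the same layout as the
+    # output of _winperf_measures()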
+
+    assert numpy.allclose(
+        actual, expected
+    ), f"Actual output:\n{actual}\n **!=** Expected output:\n{expected}"
+
+
+def test_winperf_measures_alltrue():
+
+    pred = numpy.ones((4, 4), dtype=float)
+    gt = numpy.ones((4, 4), dtype=bool)
     threshold = 0.5
-    size = (2,2)
-    stride = (1,1)
+    size = (2, 2)
+    stride = (1, 1)
 
     expected = [
-            #y, x, tp, fp, tn, fn
-            (0, 0,  4,  0,  0,  0),
-            (0, 1,  4,  0,  0,  0),
-            (0, 2,  4,  0,  0,  0),
-            (1, 0,  4,  0,  0,  0),
-            (1, 1,  4,  0,  0,  0),
-            (1, 2,  4,  0,  0,  0),
-            (2, 0,  4,  0,  0,  0),
-            (2, 1,  4,  0,  0,  0),
-            (2, 2,  4,  0,  0,  0),
-            ]
-    _check_patch_measures(pred, gt, threshold, size, stride, expected)
-
-
-def test_patch_measures_alltrue_with_padding():
-
-    pred = numpy.ones((3,3), dtype=float)
-    gt = numpy.ones((3,3), dtype=bool)
+        # tp, fp, tn, fn
+        [(4, 0, 0, 0), (4, 0, 0, 0), (4, 0, 0, 0)],
+        [(4, 0, 0, 0), (4, 0, 0, 0), (4, 0, 0, 0)],
+        [(4, 0, 0, 0), (4, 0, 0, 0), (4, 0, 0, 0)],
+    ]
+    _check_window_measures(pred, gt, threshold, size, stride, expected)
+
+
+def test_winperf_measures_alltrue_with_padding():
+
+    pred = numpy.ones((3, 3), dtype=float)
+    gt = numpy.ones((3, 3), dtype=bool)
     threshold = 0.5
-    size = (2,2)
-    stride = (2,2)
+    size = (2, 2)
+    stride = (2, 2)
 
     expected = [
-            #y, x, tp, fp, tn, fn
-            (0, 0,  4,  0,  0,  0),
-            (0, 1,  2,  0,  2,  0),
-            (1, 0,  2,  0,  2,  0),
-            (1, 1,  1,  0,  3,  0),
-            ]
-    _check_patch_measures(pred, gt, threshold, size, stride, expected)
+        # tp, fp, tn, fn
+        [(4, 0, 0, 0), (2, 0, 2, 0)],
+        [(2, 0, 2, 0), (1, 0, 3, 0)],
+    ]
+    _check_window_measures(pred, gt, threshold, size, stride, expected)
 
 
-def test_patch_measures_dot_with_padding():
+def test_winperf_measures_dot_with_padding():
 
-    pred = numpy.ones((3,3), dtype=float)
-    gt = numpy.zeros((3,3), dtype=bool)
-    gt[1,1] = 1.0  #white dot pattern
+    pred = numpy.ones((3, 3), dtype=float)
+    gt = numpy.zeros((3, 3), dtype=bool)
+    gt[1, 1] = 1.0  # white dot pattern
     threshold = 0.5
-    size = (2,2)
-    stride = (2,2)
+    size = (2, 2)
+    stride = (2, 2)
 
     expected = [
-            #y, x, tp, fp, tn, fn
-            (0, 0,  1,  3,  0,  0),
-            (0, 1,  0,  2,  2,  0),
-            (1, 0,  0,  2,  2,  0),
-            (1, 1,  0,  1,  3,  0),
-            ]
-    _check_patch_measures(pred, gt, threshold, size, stride, expected)
-
-
-def test_patch_measures_cross():
-
-    pred = numpy.zeros((5,5), dtype=float)
-    pred[2,:] = 1.0
-    pred[:,2] = 1.0
-    pred[2,2] = 0.0  #make one mistake at the center of the cross
-    gt = numpy.zeros((5,5), dtype=bool)
-    gt[2,:] = 1.0
-    gt[:,2] = 1.0  #white cross pattern
+        # tp, fp, tn, fn
+        [(1, 3, 0, 0), (0, 2, 2, 0)],
+        [(0, 2, 2, 0), (0, 1, 3, 0)],
+    ]
+    _check_window_measures(pred, gt, threshold, size, stride, expected)
+
+
+def test_winperf_measures_cross():
+
+    pred = numpy.zeros((5, 5), dtype=float)
+    pred[2, :] = 1.0
+    pred[:, 2] = 1.0
+    pred[2, 2] = 0.0  # make one mistake at the center of the cross
+    gt = numpy.zeros((5, 5), dtype=bool)
+    gt[2, :] = 1.0
+    gt[:, 2] = 1.0  # white cross pattern
     threshold = 0.5
-    size = (3,3)
-    stride = (1,1)
+    size = (3, 3)
+    stride = (1, 1)
 
     expected = [
-            #y, x, tp, fp, tn, fn
-            (0, 0,  4,  0,  4,  1),
-            (0, 1,  4,  0,  4,  1),
-            (0, 2,  4,  0,  4,  1),
-            (1, 0,  4,  0,  4,  1),
-            (1, 1,  4,  0,  4,  1),
-            (1, 2,  4,  0,  4,  1),
-            (2, 0,  4,  0,  4,  1),
-            (2, 1,  4,  0,  4,  1),
-            (2, 2,  4,  0,  4,  1),
-            ]
-    _check_patch_measures(pred, gt, threshold, size, stride, expected)
-
-
-def test_patch_measures_cross_with_padding():
-
-    pred = numpy.zeros((5,5), dtype=float)
-    gt = numpy.zeros((5,5), dtype=bool)
-    gt[2,:] = 1.0
-    gt[:,2] = 1.0  #white cross pattern
+        # tp, fp, tn, fn
+        [(4, 0, 4, 1), (4, 0, 4, 1), (4, 0, 4, 1)],
+        [(4, 0, 4, 1), (4, 0, 4, 1), (4, 0, 4, 1)],
+        [(4, 0, 4, 1), (4, 0, 4, 1), (4, 0, 4, 1)],
+    ]
+    _check_window_measures(pred, gt, threshold, size, stride, expected)
+
+
+def test_winperf_measures_cross_with_padding():
+
+    pred = numpy.zeros((5, 5), dtype=float)
+    gt = numpy.zeros((5, 5), dtype=bool)
+    gt[2, :] = 1.0
+    gt[:, 2] = 1.0  # white cross pattern
     threshold = 0.5
-    size = (4,4)
-    stride = (2,2)
+    size = (4, 4)
+    stride = (2, 2)
 
     expected = [
-            #y, x, tp, fp, tn, fn
-            (0, 0,  0,  0,  9,  7),
-            (0, 1,  0,  0,  10,  6),
-            (1, 0,  0,  0,  10,  6),
-            (1, 1,  0,  0,  11,  5),
-            ]
-    _check_patch_measures(pred, gt, threshold, size, stride, expected)
-
-
-def test_patch_measures_cross_with_padding_2():
-
-    pred = numpy.zeros((5,5), dtype=float)
-    pred[2,:] = 1.0
-    pred[:,2] = 1.0
-    pred[2,2] = 0.0  #make one mistake at the center of the cross
-    gt = numpy.zeros((5,5), dtype=bool)
-    gt[2,:] = 1.0
-    gt[:,2] = 1.0  #white cross pattern
+        # tp, fp, tn, fn
+        [(0, 0, 9, 7), (0, 0, 10, 6)],
+        [(0, 0, 10, 6), (0, 0, 11, 5)],
+    ]
+    _check_window_measures(pred, gt, threshold, size, stride, expected)
+
+
+def test_winperf_measures_cross_with_padding_2():
+
+    pred = numpy.zeros((5, 5), dtype=float)
+    pred[2, :] = 1.0
+    pred[:, 2] = 1.0
+    pred[2, 2] = 0.0  # make one mistake at the center of the cross
+    gt = numpy.zeros((5, 5), dtype=bool)
+    gt[2, :] = 1.0
+    gt[:, 2] = 1.0  # white cross pattern
     threshold = 0.5
-    size = (4,4)
-    stride = (2,2)
+    size = (4, 4)
+    stride = (2, 2)
 
     expected = [
-            #y, x, tp, fp, tn, fn
-            (0, 0,  6,  0,  9,  1),
-            (0, 1,  5,  0,  10,  1),
-            (1, 0,  5,  0,  10,  1),
-            (1, 1,  4,  0,  11,  1),
-            ]
-    _check_patch_measures(pred, gt, threshold, size, stride, expected)
+        # tp, fp, tn, fn
+        [(6, 0, 9, 1), (5, 0, 10, 1)],
+        [(5, 0, 10, 1), (4, 0, 11, 1)],
+    ]
+    _check_window_measures(pred, gt, threshold, size, stride, expected)
+
+
+def _check_performance_summary(pred, gt, threshold, size, stride, s, figure):
+
+    figsize = pred.shape
+    pred = torch.tensor(pred)
+    gt = torch.tensor(gt)
+
+    # notice _winperf_measures() was previously tested (above)
+    measures = _winperf_measures(pred, gt, threshold, size, stride)
+
+    n_actual, avg_actual, std_actual = _performance_summary(
+        figsize, measures, size, stride, figure
+    )
+
+    # the following code is not optimal, but easier to debug than the
+    # equivalent one-liner list comprehension versions, e.g.:
+    # n_expected = numpy.array([len(k) for j in s for k in j]).reshape(figsize)
+    n_expected = numpy.zeros_like(n_actual)
+    avg_expected = numpy.zeros_like(avg_actual)
+    std_expected = numpy.zeros_like(std_actual)
+    figindex = PERFORMANCE_FIGURES.index(figure)
+    for y, row in enumerate(s):
+        for x, cell in enumerate(row):
+            n_expected[y, x] = len(cell)
+            entries = tuple(numpy.array(cell).T)  # convert indexing to numpy
+            avg_expected[y, x] = measures[figindex][entries].mean()
+            std_expected[y, x] = measures[figindex][entries].std(ddof=1)
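+    # pixels covered by a single window have an undefined unbiased std
+    # (ddof=1); convert those NaNs to 0.0, which is what
+    # _performance_summary() is expected to report for such pixels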
+    std_expected = numpy.nan_to_num(std_expected)
+
+    assert (n_actual == n_expected).all(), (
+        f"Actual N output:\n{n_actual}\n "
+        f"**!=** Expected N output:\n{n_expected}"
+    )
+
+    assert (avg_actual == avg_expected).all(), (
+        f"Actual average output:\n{avg_actual}\n "
+        f"**!=** Expected average output:\n{avg_expected}"
+    )
+
+    assert (std_actual == std_expected).all(), (
+        f"Actual std.deviation output:\n{std_actual}\n "
+        f"**!=** Expected std.deviation output:\n{std_expected}"
+    )
+
+
+def test_performance_summary_alltrue_accuracy():
+
+    pred = numpy.ones((4, 4), dtype=float)
+    gt = numpy.ones((4, 4), dtype=bool)
+    threshold = 0.5
+    size = (2, 2)
+    stride = (1, 1)
+
+    # what we expect will happen for the accumulation of statistics:
+    # each (y, x) tuple is a sliding window index into ``measures`` whose
+    # figure needs to be accumulated for that particular pixel in the
+    # original image
+    stats = [
+        # first row of image
+        [[(0, 0)], [(0, 0), (0, 1)], [(0, 1), (0, 2)], [(0, 2)]],
+        # second row of image
+        [
+            [(0, 0), (1, 0)],
+            [(0, 0), (0, 1), (1, 0), (1, 1)],
+            [(0, 1), (0, 2), (1, 1), (1, 2)],
+            [(0, 2), (1, 2)],
+        ],
+        # third row of image
+        [
+            [(1, 0), (2, 0)],
+            [(1, 0), (1, 1), (2, 0), (2, 1)],
+            [(1, 1), (1, 2), (2, 1), (2, 2)],
+            [(1, 2), (2, 2)],
+        ],
+        # fourth row of image
+        [[(2, 0)], [(2, 0), (2, 1)], [(2, 1), (2, 2)], [(2, 2)]],
+    ]
+
+    _check_performance_summary(
+        pred, gt, threshold, size, stride, stats, "accuracy"
+    )