diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 37fb4e2b9f37ae6151fddf61bb141f498a33c4a0..2901305574a55af3afdab3fd93302ee14ee86d04 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -4,7 +4,6 @@
 """Defines functionality for the evaluation of predictions"""
 
 import os
-import itertools
 
 import PIL
 import numpy
@@ -23,9 +22,6 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-_PATCH_CONFIG = (128, 128, 32)
-"""Stock configuration for patch analysis"""
-
 
 def _posneg(pred, gt, threshold):
     """Calculates true and false positives and negatives"""
@@ -141,81 +137,6 @@ def _sample_measures(pred, gt, steps):
     )
 
 
-def _patch_measures(pred, gt, steps, size):
-    """
-    Calculates measures on patches of a single sample
-
-
-    Parameters
-    ----------
-
-    pred : torch.Tensor
-        pixel-wise predictions
-
-    gt : torch.Tensor
-        ground-truth (annotations)
-
-    steps : int
-        number of steps to use for threshold analysis.  The step size is
-        calculated from this by dividing ``1.0/steps``
-
-    size : :py:class:`tuple`
-        A tripplet with three integers indicating the height, width, and stride
-        of patches to break measure analysis into.  In this case, the input
-        image and ground-truth will be cut into blocks of the provided height
-        and width, overlapping by the total overlap size, starting on the top
-        left corner and then moving right and to the bottom.  Windows on the
-        left and bottom edge of the image may be incomplete.
-
-
-    Returns
-    -------
-
-    measures : pandas.DataFrame
-
-        A pandas dataframe with the following columns:
-
-        * patch: int
-        * threshold: float
-        * precision: float
-        * recall: float
-        * specificity: float
-        * accuracy: float
-        * jaccard: float
-        * f1_score: float
-
-    """
-
-    height, width, stride = size
-
-    # we calculate the required padding so that the last windows on the left
-    # and bottom size of predictions/ground-truth data are zero padded, and
-    # torch unfolding works exactly.
-    padding = (0, 0)
-    rem = (pred.shape[1] - width) % stride
-    if rem != 0:
-        padding = (0, (stride-rem))
-    rem = (pred.shape[0] - height) % stride
-    if rem != 0:
-        padding += (0, (stride-rem))
-
-    pred_padded = torch.nn.functional.pad(pred, padding)
-    gt_padded = torch.nn.functional.pad(gt.squeeze(0), padding)
-
-    # this will create as many views as required
-    pred_patches = pred_padded.unfold(0, height, stride).unfold(1, width, stride)
-    gt_patches = gt_padded.unfold(0, height, stride).unfold(1, width, stride)
-    assert pred_patches.shape == gt_patches.shape
-    ylen, xlen, _, _ = pred_patches.shape
-
-    dfs = []
-    for j, i in itertools.product(range(ylen), range(xlen)):
-        dfs.append(_sample_measures(pred_patches[j,i,:,:], gt_patches[j,i,:,:], steps))
-        dfs[-1]['patch'] = i+(j*xlen)
-
-    return pandas.concat(dfs, ignore_index=True)
-
-
 def _sample_analysis(
     img,
     pred,
@@ -374,12 +295,6 @@ def run(
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             data[stem].to_csv(fullpath)
 
-            # saves patch analysis
-            fullpath = os.path.join(output_folder, name, "patches", f"{stem}.csv")
-            tqdm.write(f"Saving {fullpath}...")
-            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
-            _patch_measures(pred, gt, steps, _PATCH_CONFIG).to_csv(fullpath)
-
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=threshold, overlay=True
@@ -508,13 +423,6 @@ def compare_annotators(baseline, other, name, output_folder,
             os.makedirs(os.path.dirname(fullpath), exist_ok=True)
             data[stem].to_csv(fullpath)
 
-            # saves patch analysis
-            fullpath = os.path.join(output_folder, "second-annotator", name,
-                    "patches", f"{stem}.csv")
-            tqdm.write(f"Saving {fullpath}...")
-            os.makedirs(os.path.dirname(fullpath), exist_ok=True)
-            _patch_measures(pred, gt, 2, _PATCH_CONFIG).to_csv(fullpath)
-
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
                 image, pred, gt, threshold=0.5, overlay=True
diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
new file mode 100644
index 0000000000000000000000000000000000000000..4586b9c2632f9fd5aa76e70d48b2457382313f66
--- /dev/null
+++ b/bob/ip/binseg/engine/significance.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import os
+import itertools
+
+import h5py
+from tqdm import tqdm
+import pandas
+import torch.nn
+
+from .evaluator import _sample_measures
+
+
+def _patch_measures(pred, gt, threshold, size, stride):
+    """
+    Calculates measures on patches of a single sample
+
+
+    Parameters
+    ----------
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+    threshold : float
+        threshold to use for evaluating individual patch performances
+
+    size : tuple
+        size (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+    stride : tuple
+        strides (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+
+    Returns
+    -------
+
+    measures : pandas.DataFrame
+
+        A pandas dataframe with the following columns:
+
+        * patch: int
+        * threshold: float
+        * precision: float
+        * recall: float
+        * specificity: float
+        * accuracy: float
+        * jaccard: float
+        * f1_score: float
+
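+    Examples
+    --------
+
+    A minimal usage sketch (the tensors below are hypothetical random data,
+    not part of the package):
+
+    .. code-block:: python
+
+       import torch
+       pred = torch.rand(64, 64)  # probability map in [0.0, 1.0]
+       gt = torch.randint(0, 2, (1, 64, 64)).float()  # binary ground-truth
+       df = _patch_measures(
+           pred, gt, threshold=0.5, size=(32, 32), stride=(16, 16)
+       )
+       # one row per 32x32 patch, with measures evaluated close to 0.5
+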
+    """
+
+    # we calculate the required padding so that the last windows on the right
+    # and bottom side of predictions/ground-truth data are zero padded, and
+    # torch unfolding works exactly.
+    padding = (0, 0)
+    rem = (pred.shape[1] - size[1]) % stride[1]
+    if rem != 0:
+        padding = (0, (stride[1] - rem))
+    rem = (pred.shape[0] - size[0]) % stride[0]
+    if rem != 0:
+        padding += (0, (stride[0] - rem))
+
+    pred_padded = torch.nn.functional.pad(pred, padding)
+    gt_padded = torch.nn.functional.pad(gt.squeeze(0), padding)
+
+    # this will create as many views as required
+    pred_patches = pred_padded.unfold(0, size[0], stride[0]).unfold(
+        1, size[1], stride[1]
+    )
+    gt_patches = gt_padded.unfold(0, size[0], stride[0]).unfold(
+        1, size[1], stride[1]
+    )
+    assert pred_patches.shape == gt_patches.shape
+    ylen, xlen, _, _ = pred_patches.shape
+
+    # NOTE: ``_sample_measures`` sweeps a number of equally-spaced thresholds;
+    # for each patch we keep only the row closest to the requested threshold,
+    # so every patch contributes a single measurement.  The 100-step
+    # granularity below is an assumption, not a project-wide default.
+    steps = 100
+
+    dfs = []
+    for j, i in itertools.product(range(ylen), range(xlen)):
+        measures = _sample_measures(
+            pred_patches[j, i, :, :], gt_patches[j, i, :, :], steps
+        )
+        closest = (measures.threshold - threshold).abs().idxmin()
+        df = measures.loc[[closest]].reset_index(drop=True)
+        df["patch"] = i + (j * xlen)
+        dfs.append(df)
+
+    return pandas.concat(dfs, ignore_index=True)
+
+
+def patch_performances(
+    dataset, name, predictions_folder, threshold, size, stride,
+):
+    """
+    Evaluates the performances for multiple image patches, for a whole dataset
+
+
+    Parameters
+    ----------
+
+    dataset : :py:class:`torch.utils.data.Dataset`
+        a dataset to iterate on
+
+    name : str
+        the local name of this dataset (e.g. ``train``, or ``test``), used to
+        locate the sub-folder containing the pre-calculated predictions
+
+    predictions_folder : str
+        folder where predictions for the dataset images have been previously
+        stored
+
+    threshold : :py:class:`float`
+        this should be a threshold (floating point) to apply to prediction maps
+        to decide on positives and negatives.
+
+    size : tuple
+        size (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+    stride : tuple
+        strides (vertical, horizontal) for windows for which we will calculate
+        partial performances based on the threshold and existing ground-truth
+
+
+    Returns
+    -------
+
+    df : pandas.DataFrame
+        A dataframe with all the patch performances aggregated, for all input
+        images.
+
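+    Examples
+    --------
+
+    A minimal usage sketch (the dataset object and paths below are
+    hypothetical):
+
+    .. code-block:: python
+
+       from bob.ip.binseg.engine.significance import patch_performances
+
+       # ``test_set`` yields (stem, image, ground-truth) samples and
+       # "path/to/predictions/test" holds one "<stem>.hdf5" file per sample
+       df = patch_performances(
+           test_set, "test", "path/to/predictions", threshold=0.5,
+           size=(128, 128), stride=(32, 32),
+       )
+       print(df.groupby("stem").f1_score.median())
+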
+    """
+
+    # Collect overall measures
+    data = []
+
+    use_predictions_folder = os.path.join(predictions_folder, name)
+    if not os.path.exists(use_predictions_folder):
+        use_predictions_folder = predictions_folder
+
+    for sample in tqdm(dataset):
+        stem = sample[0]
+        image = sample[1]
+        gt = sample[2]
+        pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
+        with h5py.File(pred_fullpath, "r") as f:
+            pred = f["array"][:]
+        pred = torch.from_numpy(pred)
+        data.append(_patch_measures(pred, gt, threshold, size, stride))
+        data[-1]["stem"] = stem
+
+    return pandas.concat(data, ignore_index=True)
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
old mode 100644
new mode 100755
index ff7dd440f6e88990f445416e68c8491e23ddbeeb..2207738a52ed193db78e68c288ef681638fc3835
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -2,154 +2,217 @@
 # coding=utf-8
 
 import os
-import pathlib
-
 import click
-import numpy
-import scipy.stats
 
 from bob.extension.scripts.click_helper import (
     verbosity_option,
-    AliasedGroup,
+    ConfigCommand,
+    ResourceOption,
 )
 
-import pandas
-
+import numpy
+import scipy.stats
 import logging
-logger = logging.getLogger(__name__)
-
-
-def _get_threshold(csvfile):
-    """Returns the thresholds that maximizes the F1-score from an aggreated scan
-
-    Parameters
-    ==========
-
-    csvfile : str
-        Path to a CSV file containing the aggreated performance across various
-        thresholds for a validation set.
-
-
-    Returns
-    =======
-
-    threshold : float
-        A float threshold
-
-    """
-
-    df = pandas.read_csv(csvfile)
-    threshold = df.threshold[df.f1_score.idxmax()]
-    logger.info(f"Threshold for '{csvfile}' = {threshold:.3f}'")
-    return threshold
-
-
-def _find_files(path, expression):
-    """Finds a files with names matching the glob expression recursively
-
-
-    Parameters
-    ==========
-
-    path : str
-        The base path where to search for files
-
-    expression : str
-        A glob expression to search for filenames (e.g. ``"*.csv"``)
-
-
-    Returns
-    =======
-
-    l : list
-        A sorted list (by name) of relative file paths found under ``path``
-
-    """
-
-    return sorted([k.name for k in pathlib.Path(path).rglob(expression)])
-
 
-def _load_score(path, threshold):
-    """Loads the specific score (at threshold "t") from an analysis file
-
-
-    Parameters
-    ==========
-
-    path : string
-        Path pointing to the CSV files from where to load the scores
-
-    threshold : float
-        Threshold value to use for picking the score
-
-
-    Returns
-    =======
-
-    value : float
-        Value representing the score at the given threshold inside the file
+logger = logging.getLogger(__name__)
 
-    """
 
-    df = pandas.read_csv(path)
-    bins = len(df)
-    index = int(round(bins * threshold))
-    index = min(index, len(df) - 1)  # avoids out of range indexing
-    return df.f1_score[index]
+from .evaluate import _validate_threshold, run as run_evaluation
+from ..engine.significance import patch_performances
 
 
 @click.command(
+    entry_point_group="bob.ip.binseg.config",
+    cls=ConfigCommand,
     epilog="""Examples:
 
 \b
-    1. Measures if systems A and B are significantly different for the same
-       input data, at a threshold optimized on specific validation sets.
+    1. Runs a significance test using as input the pre-calculated predictions
+       of two different systems (system 1 and system 2), on the **same**
+       dataset:
+\b
+       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2
+\b
+    2. By default, we use the "validation" set, if available, to infer the
+       a priori threshold for the comparison of the two systems.  Otherwise,
+       specify the name of the set to be used for choosing a threshold.  The
+       same goes for the set used to test the hypothesis: by default we use
+       the "test" set, if available; otherwise, specify one explicitly:
 \b
-       $ bob binseg significance -vv A path/to/A/valid.csv path/to/A/test B path/to/B/valid.csv path/to/B/test
+       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
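+\b
+    3. You may also control the window size and stride used for the patch
+       analysis (the defaults are 128x128 windows with 32x32 strides; the
+       values below are only an illustration):
+\b
+       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2 --size 64 64 --stride 16 16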
 """,
 )
-@click.argument(
-        'label_valid_path',
-        nargs=-1,
-        )
-@verbosity_option()
-def significance(label_valid_path, **kwargs):
-    """Check if systems A and B are significantly different in performance
-
-    Significance testing is done through a `Wilcoxon Test`_.
+@click.option(
+    "--predictions-1",
+    "-p",
+    help="Path where predictions of system 1 are currently stored.  You may "
+         "also input predictions from a second-annotator.  This application "
+         "will adequately handle it.",
+    required=True,
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    cls=ResourceOption,
+)
+@click.option(
+    "--predictions-2",
+    "-P",
+    help="Path where predictions of system 2 are currently stored.  You may "
+         "also input predictions from a second-annotator.  This application "
+         "will adequately handle it.",
+    required=True,
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    cls=ResourceOption,
+)
+@click.option(
+    "--dataset",
+    "-d",
+    help="A dictionary mapping string keys to "
+         "torch.utils.data.dataset.Dataset instances",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--threshold",
+    "-t",
+    help="This number is used to define positives and negatives from "
+    "probability maps, and report F1-scores (a priori). By default, we "
+    "expect a set named 'validation' to be available at the input data. "
+    "If that is not the case, we use 'train', if available.  You may provide "
+    "the name of another dataset to be used for threshold tunning otherwise. "
+    "If not set, or a string is input, threshold tunning is done per system, "
+    "individually.  Optionally, you may also provide a floating-point number "
+    "between [0.0, 1.0] as the threshold to use for both systems.",
+    default='validation',
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--evaluate",
+    "-e",
+    help="Name of the dataset to evaluate",
+    default='test',
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--steps",
+    "-S",
+    help="This number is used to define the number of threshold steps to "
+    "consider when evaluating the highest possible F1-score on train/test data.",
+    default=1000,
+    type=int,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--size",
+    "-s",
+    help="This is a tuple with two values indicating the size of windows to "
+         "be used for patch analysis.  The values represent height and width "
+         "respectively.",
+    default=(128, 128),
+    nargs=2,
+    type=int,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--stride",
+    "-t",
+    help="This is a tuple with two values indicating the stride of windows to "
+         "be used for patch analysis.  The values represent height and width "
+         "respectively.",
+    default=(32, 32),
+    nargs=2,
+    type=int,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@verbosity_option(cls=ResourceOption)
+def significance(
+    predictions_1,
+    predictions_2,
+    dataset,
+    threshold,
+    evaluate,
+    steps,
+    size,
+    stride,
+    **kwargs,
+):
+    """Evaluates how significantly different are two models on the same dataset
+
+    This application calculates the significance of results of two models
+    operating on the same dataset, and subject to a priori threshold tunning.
     """
 
-    # hack to get a dictionary from arguments passed to input
-    if len(label_valid_path) % 3 != 0:
-        raise click.ClickException("Input label-validation-paths should be "
-                " tripplets composed of name-path-path entries")
+    # minimal input validation before starting up
+    threshold = _validate_threshold(threshold, dataset)
+    assert evaluate in dataset, f"No dataset named '{evaluate}'"
 
-    # split input into 2 systems
-    sa, sb = list(zip(*(iter(label_valid_path),)*3))
+    if isinstance(threshold, float):
+        threshold1 = threshold2 = threshold
 
-    # sanity check file lists
-    fa = _find_files(sa[2], "*.csv")
-    fb = _find_files(sb[2], "*.csv")
-    assert fa == fb, f"List of files mismatched between '{sa[2]}' " \
-        f"and '{sb[2]}' - please check"
+    else:  # it is a dataset name; tune the threshold for each system individually
 
-    # now run analysis
-    ta = _get_threshold(sa[1])
-    tb = _get_threshold(sb[1])
+        assert threshold in dataset, f"No dataset named '{threshold}'"
 
-    # load all F1-scores for the given threshold
-    da = [_load_score(os.path.join(sa[2], k), ta) for k in fa]
-    db = [_load_score(os.path.join(sb[2], k), tb) for k in fb]
+        logger.info(f"Evaluating threshold on '{threshold}' set for system 1")
+        threshold1 = run_evaluation(
+            dataset[threshold], threshold, predictions_1, steps=steps
+        )
+        logger.info(f"Set --threshold={threshold:.5f} for system 1")
+
+        logger.info(f"Evaluating threshold on '{threshold}' set for system 2")
+        threshold2 = run_evaluation(
+            dataset[threshold], threshold, predictions_2, steps=steps
+        )
+        logger.info(f"Set --threshold={threshold:.5f} for system 2")
+
+    # for a given threshold on each system, calculate patch performances
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1")
+    perf1 = patch_performances(
+        dataset, evaluate, predictions_1, threshold1, size, stride
+    )
+    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2")
+    perf2 = patch_performances(
+        dataset, evaluate, predictions_2, threshold2, size, stride
+    )
 
-    click.echo("Median F1-scores:")
-    click.echo(f"* {sa[0]}: {numpy.median(da):.3f}")
-    click.echo(f"* {sb[0]}: {numpy.median(db):.3f}")
+    # load all per-patch F1-scores evaluated at the chosen thresholds and
+    # compute their differences
+    da = perf1.f1_score
+    db = perf2.f1_score
+    diff = da - db
+
+    click.echo("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:")
+    click.echo(f"* system1: {len(da)} / {numpy.median(da):.3f} / {numpy.mean(da):.3f} / {numpy.std(da, ddof=1):.3f} / {scipy.stats.normaltest(da)[1]}" )
+    click.echo(f"* system2: {len(db)} / {numpy.median(db):.3f} / {numpy.mean(db):.3f} / {numpy.std(db, ddof=1):.3f} / {scipy.stats.normaltest(db)[1]}" )
+    click.echo(f"* system1-system2: {len(diff)} / {numpy.median(diff):.3f} / {numpy.mean(diff):.3f} / {numpy.std(diff, ddof=1):.3f} / {scipy.stats.normaltest(diff)[1]}" )
+
+    w, p = scipy.stats.ttest_rel(da, db)
+    click.echo(f"Paired T-test (is the difference zero?): T = {w:g}, p = {p:.5f}")
+
+    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
+    click.echo(f"Welch's T-test (are the means equal?): T = {w:g}, p = {p:.5f}")
+
+    w, p = scipy.stats.wilcoxon(diff)
     click.echo(f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}")
 
-    w, p = scipy.stats.wilcoxon(da, db, alternative="greater")
-    click.echo(f"Wilcoxon test (md({sa[0]}) < md({sb[0]})?): W = {w:g}, p = {p:.5f}")
+    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+    click.echo(f"Wilcoxon test (md(system1) < md(system2)?): W = {w:g}, p = {p:.5f}")
 
-    w, p = scipy.stats.wilcoxon(da, db, alternative="less")
-    click.echo(f"Wilcoxon test (md({sa[0]}) > md({sb[0]})?): W = {w:g}, p = {p:.5f}")
+    w, p = scipy.stats.wilcoxon(diff, alternative="less")
+    click.echo(f"Wilcoxon test (md(system1) > md(system2)?): W = {w:g}, p = {p:.5f}")
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 2949d86fe83ecf184d21ba13316fc2dbec38dd07..97f3dc25e9685f4e78b7dcd37310edd3819de42d 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -151,11 +151,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10
         )
-        traindir = os.path.join(eval_folder, "train", "patches", "stare-images")
-        assert os.path.exists(traindir)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(traindir), "*.csv")), 10
-        )
 
         assert os.path.exists(os.path.join(eval_folder, "test.csv"))
         # checks individual performance figures are there
@@ -164,11 +159,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
         )
-        testdir = os.path.join(eval_folder, "test", "patches", "stare-images")
-        assert os.path.exists(testdir)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
-        )
 
         assert os.path.exists(
             os.path.join(eval_folder, "second-annotator", "train.csv")
@@ -180,12 +170,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10
         )
-        traindir_sa = os.path.join(eval_folder, "second-annotator", "patches",
-                "train", "stare-images")
-        assert os.path.exists(traindir_sa)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(traindir_sa), "*.csv")), 10
-        )
 
         assert os.path.exists(
             os.path.join(eval_folder, "second-annotator", "test.csv")
@@ -196,12 +180,6 @@ def _check_experiment_stare(overlay):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
         )
-        testdir_sa = os.path.join(eval_folder, "second-annotator", "patches",
-                "test", "stare-images")
-        assert os.path.exists(testdir_sa)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
-        )
 
         overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
         traindir = os.path.join(overlay_folder, "train", "stare-images")
@@ -461,11 +439,6 @@ def _check_evaluate(runner):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
         )
-        testdir = os.path.join(output_folder, "test", "patches", "stare-images")
-        assert os.path.exists(testdir)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir), "*.csv")), 10
-        )
 
         assert os.path.exists(
             os.path.join(output_folder, "second-annotator", "test.csv")
@@ -477,12 +450,6 @@ def _check_evaluate(runner):
         nose.tools.eq_(
             len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
         )
-        testdir_sa = os.path.join(output_folder, "second-annotator", "test",
-                "patches", "stare-images")
-        assert os.path.exists(testdir_sa)
-        nose.tools.eq_(
-            len(fnmatch.filter(os.listdir(testdir_sa), "*.csv")), 10
-        )
 
         # check overlayed images are there (since we requested them)
         basedir = os.path.join(overlay_folder, "test", "stare-images")