From a57676d0805e1f020ee7dccd48336a742f3786b0 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Mon, 13 Jul 2020 18:16:26 +0200
Subject: [PATCH] [script.significance] Improve analysis

---
 bob/ip/binseg/engine/significance.py | 404 +++++++++++++++++++++++++--
 bob/ip/binseg/script/significance.py | 363 +++++++++++++++++++-----
 2 files changed, 673 insertions(+), 94 deletions(-)

diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 68fae32d..484236b2 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -3,15 +3,129 @@
 
 import os
 import itertools
+import multiprocessing
 
 import h5py
 from tqdm import tqdm
+import numpy
 import pandas
 import torch.nn
 
 from .evaluator import _sample_measures_for_threshold
 
 
+def _performance_summary(size, patch_perf, patch_size, patch_stride, figure):
+    """Generates an array that represents the performance per pixel of the
+    original image
+
+    The returned array corresponds to a stacked version of performances for
+    each pixel in the original image taking into consideration the patch
+    performances, their size and stride.
+
+
+    Parameters
+    ==========
+
+    size : tuple
+        A two tuple with the original height and width of the image being
+        analyzed
+
+    patch_perf : :py:class:`pandas.DataFrame`
+        A dataframe of patch performances in raster order (within each row,
+        from left to right, then rows from top to bottom), containing the
+        patch coordinates ``x`` and ``y`` and the requested ``figure`` column
+        (typically the output of :py:func:`_patch_measures`)
+
+    patch_size : tuple
+        A two tuple that indicates the size of each patch (height, width)
+
+    patch_stride : tuple
+        A two tuple that indicates the stride of each patch (height, width)
+
+    figure : str
+        Name of the performance figure to use for the summary
+
+
+    Returns
+    =======
+
+    n : numpy.ndarray
+        A 2D numpy array containing the number of performance scores for every
+        pixel in the original image
+
+    avg : numpy.ndarray
+        A 2D numpy array containing the average performances for every pixel on
+        the input image considering the patch sizes and strides applied when
+        windowing the image
+
+    std : numpy.ndarray
+        A 2D numpy array containing the (unbiased) standard deviations for the
+        provided performance figure, for every pixel on the input image
+        considering the patch sizes and strides applied when windowing the
+        image
+
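+    Example
+    =======
+
+    An illustrative sketch only (``df`` stands for a dataframe of patch
+    performances such as the one produced by :py:func:`_patch_measures`, and
+    the sizes below are placeholders)::
+
+        n, avg, std = _performance_summary(
+            (544, 544), df, (128, 128), (32, 32), "f1_score"
+        )
+        # n, avg and std all have the shape of the (padded) input image
+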
+    """
+
+    # first, estimate how many overlaps we have per pixel in the original image
+
+    # we calculate the required padding so that the last windows on the right
+    # and bottom sides of the predictions/ground-truth data are zero padded
+    # and torch unfolding works exactly (matching ``_patch_measures``); those
+    # last windows may therefore extend beyond the original image
+    final_size = list(size)
+    rem = (size[0] - patch_size[0]) % patch_stride[0]
+    if rem != 0:
+        final_size[0] += patch_stride[0] - rem
+    rem = (size[1] - patch_size[1]) % patch_stride[1]
+    if rem != 0:
+        final_size[1] += patch_stride[1] - rem
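+    # (illustration: size=(100, 100), patch_size=(64, 64) and
+    # patch_stride=(32, 32) give rem = (100 - 64) % 32 = 4 in each direction,
+    # so final_size becomes (128, 128) and exactly 3 x 3 windows fit)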
+    n = numpy.zeros(final_size, dtype=int)
+    ylen = ((final_size[0] - patch_size[0]) // patch_stride[0]) + 1
+    xlen = ((final_size[1] - patch_size[1]) // patch_stride[1]) + 1
+    for j in range(ylen):
+        for i in range(xlen):
+            yup = slice(
+                patch_stride[0] * j, (patch_stride[0] * j) + patch_size[0], 1
+            )
+            xup = slice(
+                patch_stride[1] * i, (patch_stride[1] * i) + patch_size[1], 1
+            )
+            n[yup, xup] += 1
+
+    # calculates the stacked performance
+    layers = n.max()
+    perf = numpy.zeros(
+        [layers] + final_size, dtype=patch_perf[figure].iloc[0].dtype
+    )
+    n = -1 * numpy.ones(final_size, dtype=int)
+    for j in range(ylen):
+        for i in range(xlen):
+            yup = slice(
+                patch_stride[0] * j, (patch_stride[0] * j) + patch_size[0], 1
+            )
+            xup = slice(
+                patch_stride[1] * i, (patch_stride[1] * i) + patch_size[1], 1
+            )
+            nup = n[yup, xup]
+            nup += 1
+            yr, xr = numpy.meshgrid(
+                range(yup.start, yup.stop, yup.step),
+                range(xup.start, xup.stop, xup.step),
+                indexing="ij",
+            )
+            perf[nup.flat, yr.flat, xr.flat] = patch_perf.loc[
+                (patch_perf["y"] == j) & (patch_perf["x"] == i)
+            ][figure]
+
+    # for each pixel of the ``perf`` matrix, calculates the average over all
+    # overlapping windows
+    n += 1  # adjust for starting at -1 before
+    avg = perf.sum(axis=0) / n
+    # calculates the (unbiased) standard deviation per pixel, considering
+    # only the layers that were actually filled for that pixel (pixels
+    # covered by a single window keep a standard deviation of zero)
+    std = numpy.zeros_like(avg)
+    for k in range(2, n.max() + 1):
+        std[n == k] = perf[:k, :, :].std(axis=0, ddof=1)[n == k]
+
+    return n, avg, std
+
+
 def _patch_measures(pred, gt, threshold, size, stride):
     """
     Calculates measures on patches of a single sample
@@ -58,7 +172,8 @@ def _patch_measures(pred, gt, threshold, size, stride):
 
     # we calculate the required padding so that the last windows on the left
     # and bottom size of predictions/ground-truth data are zero padded, and
-    # torch unfolding works exactly.
+    # torch unfolding works exactly.  The last windows on the right and
+    # bottom parts of the image may be extended with zeros.
     padding = (0, 0)
     rem = (pred.shape[1] - size[1]) % stride[1]
     if rem != 0:
@@ -103,8 +218,71 @@ def _patch_measures(pred, gt, threshold, size, stride):
     )
 
 
+def _visual_dataset_performance(stem, img, n, avg, std, outdir):
+    """Runs a visual performance assessment for each entry in a given dataset"""
+
+    import matplotlib.pyplot as plt
+
+    figsize = img.shape[1:]
+
+    fig, axs = plt.subplots(2, 2)
+    axs[0, 0].imshow(img.numpy().transpose(1, 2, 0))
+    axs[0, 0].set_title("Original image")
+
+    pos01 = axs[0, 1].imshow(
+        n[: figsize[0], : figsize[1]], cmap="Greys_r", interpolation="none"
+    )
+    fig.colorbar(pos01, ax=axs[0, 1], orientation="vertical")
+    axs[0, 1].set_title("# Overlapping Windows")
+
+    pos10 = axs[1, 0].imshow(
+        avg[: figsize[0], : figsize[1]], cmap="Greys_r", interpolation="none"
+    )
+    fig.colorbar(pos10, ax=axs[1, 0], orientation="vertical")
+    axs[1, 0].set_title("Average Performance")
+
+    pos11 = axs[1, 1].imshow(
+        std[: figsize[0], : figsize[1]], cmap="Greys_r", interpolation="none"
+    )
+    fig.colorbar(pos11, ax=axs[1, 1], orientation="vertical")
+    axs[1, 1].set_title("Performance Std. Dev.")
+
+    plt.tight_layout()
+
+    fname = os.path.join(outdir, stem + ".pdf")
+    os.makedirs(os.path.dirname(fname), exist_ok=True)
+    fig.savefig(fname)
+
+
+def _patch_performances_for_sample(
+    basedir, threshold, size, stride, dataset, k, figure, outdir=None,
+):
+    """
+    Evaluates patch performances for a single dataset sample
+    """
+
+    sample = dataset[k]
+    with h5py.File(os.path.join(basedir, sample[0] + ".hdf5"), "r") as f:
+        pred = torch.from_numpy(f["array"][:])
+    df = _patch_measures(pred, sample[2], threshold, size, stride)
+    n, avg, std = _performance_summary(
+        sample[1].shape[1:], df, size, stride, figure
+    )
+    if outdir is not None:
+        _visual_dataset_performance(sample[0], sample[1], n, avg, std, outdir)
+    return sample[0], dict(df=df, n=n, avg=avg, std=std)
+
+
 def patch_performances(
-    dataset, name, predictions_folder, threshold, size, stride
+    dataset,
+    name,
+    predictions_folder,
+    threshold,
+    size,
+    stride,
+    figure,
+    nproc=1,
+    outdir=None,
 ):
     """
     Evaluates the performances for multiple image patches, for a whole dataset
@@ -113,8 +291,8 @@ def patch_performances(
     Parameters
     ---------
 
-    dataset : py:class:`torch.utils.data.Dataset`
-        a dataset to iterate on
+    dataset : :py:class:`dict` of :py:class:`torch.utils.data.Dataset`
+        datasets to iterate on
 
     name : str
         the local name of this dataset (e.g. ``train``, or ``test``), to be
@@ -136,32 +314,218 @@ def patch_performances(
         strides (vertical, horizontal) for windows for which we will calculate
         partial performances based on the threshold and existing ground-truth
 
+    figure : str
+        the performance figure to use for calculating patch micro performances
+
+    nproc : :py:class:`int`, Optional
+        the number of processing cores to use for performance evaluation.
+        Setting this number to a value ``<= 0`` will cause the function to use
+        all available computing cores.  Otherwise, limit to the number of cores
+        specified.  If set to the value of ``1``, then do not use
+        multiprocessing.
+
+    outdir : :py:class:`str`, Optional
+        path where to save a visual representation of patch performances.  If
+        set to ``None``, then do not save those to disk.
+
 
     Returns
     -------
 
-    df : pandas.DataFrame
-        A dataframe with all the patch performances aggregated, for all input
-        images.
+    d : dict
+        A dictionary in which keys are filename stems and values are
+        dictionaries with the following contents:
+
+        ``df`` : :py:class:`pandas.DataFrame`
+            A dataframe with the patch performances for that particular input
+            image (one row per patch)
+
+        ``n`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the number of performance scores for
+            every pixel in the original image
+
+        ``avg`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the average performances for every
+            pixel on the input image considering the patch sizes and strides
+            applied when windowing the image
+
+        ``std`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the (unbiased) standard deviations for
+            the provided performance figure, for every pixel on the input image
+            considering the patch sizes and strides applied when windowing the
+            image
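+
+    Example
+    -------
+
+    An illustrative sketch only (the dataset name, threshold and path below
+    are placeholders)::
+
+        d = patch_performances(dataset, "test", "/path/to/predictions", 0.5,
+                               (128, 128), (32, 32), "f1_score", nproc=0)
+        stems = sorted(d.keys())
+        # average patch f1_score for the first sample
+        print(d[stems[0]]["df"].f1_score.mean())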
 
     """
 
     # Collect overall measures
-    data = []
 
     use_predictions_folder = os.path.join(predictions_folder, name)
     if not os.path.exists(use_predictions_folder):
         use_predictions_folder = predictions_folder
 
-    for sample in tqdm(dataset[name]):
-        stem = sample[0]
-        image = sample[1]
-        gt = sample[2]
-        pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
-        with h5py.File(pred_fullpath, "r") as f:
-            pred = f["array"][:]
-        pred = torch.from_numpy(pred)
-        data.append(_patch_measures(pred, gt, threshold, size, stride))
-        data[-1]["stem"] = stem
-
-    return pandas.concat(data, ignore_index=True)
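+    # runs the per-sample analysis, either serially (if nproc == 1) or by
+    # dispatching one task per sample to a multiprocessing.Pool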
+    with tqdm(range(len(dataset[name])), desc="patch-perf") as pbar:
+        if nproc != 1:
+            if nproc <= 0:
+                nproc = multiprocessing.cpu_count()
+            pool = multiprocessing.Pool(nproc)
+            results = [
+                pool.apply_async(
+                    _patch_performances_for_sample,
+                    args=(
+                        use_predictions_folder,
+                        threshold,
+                        size,
+                        stride,
+                        dataset[name],
+                        k,
+                        figure,
+                        outdir,
+                    ),
+                    callback=lambda x: pbar.update(),
+                )
+                for k in range(len(dataset[name]))
+            ]
+            pool.close()
+            pool.join()
+            data = [k.get() for k in results]
+        else:
+            data = []
+            for k in pbar:
+                df = _patch_performances_for_sample(
+                    use_predictions_folder,
+                    threshold,
+                    size,
+                    stride,
+                    dataset[name],
+                    k,
+                    figure,
+                    outdir,
+                )
+                data.append(df)
+
+    return dict(data)
+
+
+def _visual_performances_for_sample(
+    size, stride, dataset, k, df, figure, outdir=None
+):
+    """
+    Summarizes (and optionally plots) pre-computed patch performances for a single sample
+    """
+
+    sample = dataset[k]
+    n, avg, std = _performance_summary(
+        sample[1].shape[1:], df, size, stride, figure
+    )
+    if outdir is not None:
+        _visual_dataset_performance(sample[0], sample[1], n, avg, std, outdir)
+    return sample[0], dict(df=df, n=n, avg=avg, std=std)
+
+
+def visual_performances(
+    dataset, name, dfs, size, stride, figure, nproc=1, outdir=None,
+):
+    """
+    Summarizes (and optionally plots) patch performances for a whole dataset
+
+
+    Parameters
+    ----------
+
+    dataset : :py:class:`dict` of :py:class:`torch.utils.data.Dataset`
+        datasets to iterate on
+
+    name : str
+        the local name of this dataset (e.g. ``train``, or ``test``), to be
+        used when saving measures files.
+
+    dfs : dict
+        a dictionary mapping dataset stems to dataframes containing the patch
+        performances to be evaluated
+
+    size : tuple
+        size (vertical, horizontal) of the windows for which the patch
+        performances in ``dfs`` were originally calculated
+
+    stride : tuple
+        strides (vertical, horizontal) of the windows for which the patch
+        performances in ``dfs`` were originally calculated
+
+    figure : str
+        the performance figure to use for calculating patch micro performances
+
+    nproc : :py:class:`int`, Optional
+        the number of processing cores to use for performance evaluation.
+        Setting this number to a value ``<= 0`` will cause the function to use
+        all available computing cores.  Otherwise, limit to the number of cores
+        specified.  If set to the value of ``1``, then do not use
+        multiprocessing.
+
+    outdir : :py:class:`str`, Optional
+        path where to save a visual representation of patch performances.  If
+        set to ``None``, then do not save those to disk.
+
+
+    Returns
+    -------
+
+    d : dict
+        A dictionary in which keys are filename stems and values are
+        dictionaries with the following contents:
+
+        ``df`` : :py:class:`pandas.DataFrame`
+            A dataframe with the patch performances for that particular input
+            image (one row per patch)
+
+        ``n`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the number of performance scores for
+            every pixel in the original image
+
+        ``avg`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the average performances for every
+            pixel on the input image considering the patch sizes and strides
+            applied when windowing the image
+
+        ``std`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the (unbiased) standard deviations for
+            the provided performance figure, for every pixel on the input image
+            considering the patch sizes and strides applied when windowing the
+            image
+
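+    Example
+    -------
+
+    An illustrative sketch only (``dataset`` and ``dfs`` are placeholders,
+    ``dfs`` being, e.g., the ``df`` entries returned by
+    :py:func:`patch_performances`)::
+
+        d = visual_performances(dataset, "test", dfs, (128, 128), (32, 32),
+                                "f1_score", nproc=0, outdir="path/to/plots")
+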
+    """
+
+    stems = list(dataset[name].keys())
+
+    with tqdm(range(len(dataset[name])), desc="visual-perf") as pbar:
+        if nproc != 1:
+            if nproc <= 0:
+                nproc = multiprocessing.cpu_count()
+            pool = multiprocessing.Pool(nproc)
+            results = [
+                pool.apply_async(
+                    _visual_performances_for_sample,
+                    args=(
+                        size,
+                        stride,
+                        dataset[name],
+                        k,
+                        dfs[stems[k]],
+                        figure,
+                        outdir,
+                    ),
+                    callback=lambda x: pbar.update(),
+                )
+                for k in range(len(dataset[name]))
+            ]
+            pool.close()
+            pool.join()
+            data = [k.get() for k in results]
+        else:
+            data = []
+            for k in pbar:
+                df = _visual_performances_for_sample(
+                    size, stride, dataset[name], k, dfs[stems[k]], figure, outdir,
+                )
+                data.append(df)
+
+    return dict(data)
diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 2700d0c2..077c0ab1 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -2,7 +2,9 @@
 # coding=utf-8
 
 import os
+import sys
 import click
+import typing
 
 from bob.extension.scripts.click_helper import (
     verbosity_option,
@@ -12,13 +14,141 @@ from bob.extension.scripts.click_helper import (
 
 import numpy
 import scipy.stats
+
 import logging
 
 logger = logging.getLogger(__name__)
 
 
 from .evaluate import _validate_threshold, run as run_evaluation
-from ..engine.significance import patch_performances
+from ..engine.significance import patch_performances, visual_performances
+
+
+def _index_of_outliers(c):
+    """Finds indices of outliers (+/- 1.5*IQR) on a 1D numpy array of scores"""
+
+    low, high = numpy.quantile(c, (0.25, 0.75))
+    iqr = high - low
+    limits = (low - 1.5 * iqr, high + 1.5 * iqr)
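+    # illustration: for c = numpy.array([0.1, 0.2, 0.25, 0.3, 0.9]), the
+    # quartiles are (0.2, 0.3), the IQR is 0.1, the limits are (0.05, 0.45),
+    # and only the last entry (0.9) would be flagged as an outlier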
+    return (c < limits[0]) | (c > limits[1])
+
+
+def _write_analysis_text(names, da, db, f):
+    """Writes a text file containing the most important statistics"""
+
+    diff = da - db
+    f.write("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:\n")
+    f.write(
+        f"* {names[0]}: {len(da)}" \
+        f" / {numpy.median(da):.3f}" \
+        f" / {numpy.mean(da):.3f}" \
+        f" / {numpy.std(da, ddof=1):.3f}" \
+        f" / gaussian? p={scipy.stats.normaltest(da)[1]:.3f}\n"
+    )
+    f.write(
+        f"* {names[1]}: {len(db)}" \
+        f" / {numpy.median(db):.3f}" \
+        f" / {numpy.mean(db):.3f}" \
+        f" / {numpy.std(db, ddof=1):.3f}" \
+        f" / gaussian? p={scipy.stats.normaltest(db)[1]:.3f}\n"
+    )
+    f.write(
+        f"* {names[0]}-{names[1]}: {len(diff)}" \
+        f" / {numpy.median(diff):.3f}" \
+        f" / {numpy.mean(diff):.3f}" \
+        f" / {numpy.std(diff, ddof=1):.3f}" \
+        f" / gaussian? p={scipy.stats.normaltest(diff)[1]:.3f}\n"
+    )
+
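+    # the paired tests below compare the two systems patch by patch (same
+    # patches, same images); the independent (Welch) t-test is reported as
+    # well, for reference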
+    w, p = scipy.stats.ttest_rel(da, db)
+    f.write(
+        f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n"
+    )
+
+    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
+    f.write(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n")
+
+    w, p = scipy.stats.wilcoxon(diff)
+    f.write(
+        f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}\n"
+    )
+
+    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+    f.write(
+        f"Wilcoxon test (md({names[0]}) > md({names[1]})?): " \
+        f"W = {w:g}, p = {p:.5f}\n"
+    )
+
+    w, p = scipy.stats.wilcoxon(diff, alternative="less")
+    f.write(
+        f"Wilcoxon test (md({names[0]}) < md({names[1]})?): " \
+        f"W = {w:g}, p = {p:.5f}\n"
+    )
+
+
+def _write_analysis_figures(names, da, db, folder):
+    """Writes a PDF containing most important plots for analysis"""
+
+    from matplotlib.backends.backend_pdf import PdfPages
+    import matplotlib.pyplot as plt
+
+    diff = da - db
+    bins = 50
+
+    fname = os.path.join(folder, "statistics.pdf")
+    os.makedirs(os.path.dirname(fname), exist_ok=True)
+    with PdfPages(fname) as pdf:
+
+        plt.figure()
+        plt.grid()
+        plt.hist(da, bins=bins)
+        plt.title(
+            f"{names[0]} - scores (N={len(da)}; M={numpy.median(da):.3f}; "
+            f"$\\mu$={numpy.mean(da):.3f}; $\\sigma$={numpy.std(da, ddof=1):.3f})"
+        )
+        pdf.savefig()
+        plt.close()
+
+        plt.figure()
+        plt.grid()
+        plt.hist(db, bins=bins)
+        plt.title(
+            f"{names[1]} - scores (N={len(db)}; M={numpy.median(db):.3f}; "
+            f"$\\mu$={numpy.mean(db):.3f}; $\\sigma$={numpy.std(db, ddof=1):.3f})"
+        )
+        pdf.savefig()
+        plt.close()
+
+        plt.figure()
+        plt.boxplot([da, db])
+        plt.title(f"{names[0]} and {names[1]} (N={len(da)})")
+        pdf.savefig()
+        plt.close()
+
+        plt.figure()
+        plt.boxplot(diff)
+        plt.title(f"Differences ({names[0]} - {names[1]}) (N={len(da)})")
+        pdf.savefig()
+        plt.close()
+
+        plt.figure()
+        plt.grid()
+        plt.hist(diff, bins=bins)
+        plt.title(
+            f"Systems ({names[0]} - {names[1]}) " \
+            f"(N={len(diff)}; M={numpy.median(diff):.3f}; " \
+            f"$\\mu$={numpy.mean(diff):.3f}; " \
+            f"$\\sigma$={numpy.std(diff, ddof=1):.3f})"
+        )
+        pdf.savefig()
+        plt.close()
+
+        r, _ = scipy.stats.pearsonr(da, db)
+        plt.figure()
+        plt.grid()
+        plt.scatter(da, db, marker=".", color="black")
+        plt.xlabel(f"{names[0]}")
+        plt.ylabel(f"{names[1]}")
+        plt.title(f"Scatter (Pearson r={r:.3f})")
+        pdf.savefig()
+        plt.close()
 
 
 @click.command(
@@ -28,10 +158,9 @@ from ..engine.significance import patch_performances
 
 \b
     1. Runs a significance test using as base the calculated predictions of two
-       different systems (1, the first and 2, the second), on the **same**
-       dataset:
+       different systems, on the **same** dataset:
 \b
-       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2
+       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2
 \b
     2. By default, we use a "validation" dataset if it is available, to infer
        the a priori threshold for the comparison of two systems.  Otherwise,
@@ -40,25 +169,25 @@ from ..engine.significance import patch_performances
        testing the hypothesis - by default we use the "test" dataset if it is
        available, otherwise, specify.
 \b
-       $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
+       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
 """,
 )
 @click.option(
-    "--predictions-1",
-    "-p",
-    help="Path where predictions of system 1 are currently stored.  You may "
-         "also input predictions from a second-annotator.  This application "
-         "will adequately handle it.",
+    "--names",
+    "-n",
+    help="Names of the two systems to compare",
+    nargs=2,
     required=True,
-    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    type=str,
     cls=ResourceOption,
 )
 @click.option(
-    "--predictions-2",
-    "-P",
+    "--predictions",
+    "-p",
-    help="Path where predictions of system 2 are currently stored.  You may "
-         "also input predictions from a second-annotator.  This application "
-         "will adequately handle it.",
+    help="Paths where the predictions of the two systems are currently "
+    "stored.  You may also input predictions from a second-annotator.  This "
+    "application will adequately handle it.",
+    nargs=2,
     required=True,
     type=click.Path(exists=True, file_okay=False, dir_okay=True),
     cls=ResourceOption,
@@ -67,7 +196,7 @@ from ..engine.significance import patch_performances
     "--dataset",
     "-d",
     help="A dictionary mapping string keys to "
-         "torch.utils.data.dataset.Dataset instances",
+    "torch.utils.data.dataset.Dataset instances",
     required=True,
     cls=ResourceOption,
 )
@@ -82,7 +211,7 @@ from ..engine.significance import patch_performances
     "If not set, or a string is input, threshold tunning is done per system, "
     "individually.  Optionally, you may also provide a floating-point number "
     "between [0.0, 1.0] as the threshold to use for both systems.",
-    default='validation',
+    default="validation",
     show_default=True,
     required=True,
     cls=ResourceOption,
@@ -91,7 +220,7 @@ from ..engine.significance import patch_performances
     "--evaluate",
     "-e",
     help="Name of the dataset to evaluate",
-    default='test',
+    default="test",
     show_default=True,
     required=True,
     cls=ResourceOption,
@@ -111,8 +240,8 @@ from ..engine.significance import patch_performances
     "--size",
     "-s",
     help="This is a tuple with two values indicating the size of windows to "
-         "be used for patch analysis.  The values represent height and width "
-         "respectively.",
+    "be used for patch analysis.  The values represent height and width "
+    "respectively.",
     default=(128, 128),
     nargs=2,
     type=int,
@@ -124,8 +253,8 @@ from ..engine.significance import patch_performances
     "--stride",
     "-t",
     help="This is a tuple with two values indicating the stride of windows to "
-         "be used for patch analysis.  The values represent height and width "
-         "respectively.",
+    "be used for patch analysis.  The values represent height and width "
+    "respectively.",
     default=(32, 32),
     nargs=2,
     type=int,
@@ -133,16 +262,49 @@ from ..engine.significance import patch_performances
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--figure",
+    "-f",
+    help="The name of a performance figure (e.g. f1_score) to use for "
+    "comparing performances",
+    default="f1_score",
+    type=str,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--output-folder",
+    "-o",
+    help="Path where to store visualizations",
+    required=False,
+    type=click.Path(),
+    show_default=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--remove-outliers/--no-remove-outliers",
+    "-R",
+    help="If set, removes outliers from both score distributions before " \
+         "running statistical analysis",
+    default=False,
+    required=True,
+    show_default=True,
+    cls=ResourceOption,
+)
 @verbosity_option(cls=ResourceOption)
 def significance(
-    predictions_1,
-    predictions_2,
+    names,
+    predictions,
     dataset,
     threshold,
     evaluate,
     steps,
     size,
     stride,
+    figure,
+    output_folder,
+    remove_outliers,
     **kwargs,
 ):
     """Evaluates how significantly different are two models on the same dataset
@@ -158,69 +320,122 @@ def significance(
     if isinstance(threshold, float):
         threshold1 = threshold2 = threshold
 
-    else:  #it is a string, re-calculate it for each system individually
+    else:  # it is a string, re-calculate it for each system individually
 
         assert threshold in dataset, f"No dataset named '{threshold}'"
 
-        logger.info(f"Evaluating threshold on '{threshold}' set for system 1 using {steps} steps")
+        logger.info(
+            f"Evaluating threshold on '{threshold}' set for '{names[0]}' using {steps} steps"
+        )
         threshold1 = run_evaluation(
-            dataset[threshold], threshold, predictions_1, steps=steps
+            dataset[threshold], threshold, predictions[0], steps=steps
         )
-        logger.info(f"Set --threshold={threshold1:.5f} for system 1")
+        logger.info(f"Set --threshold={threshold1:.5f} for '{names[0]}'")
 
-        logger.info(f"Evaluating threshold on '{threshold}' set for system 2 using {steps} steps")
+        logger.info(
+            f"Evaluating threshold on '{threshold}' set for '{names[1]}' using {steps} steps"
+        )
         threshold2 = run_evaluation(
-            dataset[threshold], threshold, predictions_2, steps=steps
+            dataset[threshold], threshold, predictions[1], steps=steps
         )
-        logger.info(f"Set --threshold={threshold2:.5f} for system 2")
+        logger.info(f"Set --threshold={threshold2:.5f} for '{names[1]}'")
 
     # for a given threshold on each system, calculate patch performances
-    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1 using windows of size {size} and stride {stride}")
-    perf1 = patch_performances(dataset, evaluate, predictions_1, threshold1,
-            size, stride)
-    logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2 using windows of size {size} and stride {stride}")
-    perf2 = patch_performances(dataset, evaluate, predictions_2, threshold2,
-            size, stride)
-
-    ###### MAGIC STARTS #######
-
-    # load all F1-scores for the given threshold
-    da = perf1.f1_score
-    db = perf2.f1_score
-    diff = da - db
+    logger.info(
+        f"Evaluating patch performances on '{evaluate}' set for '{names[0]}' using windows of size {size} and stride {stride}"
+    )
+    dir1 = (
+        os.path.join(output_folder, names[0])
+        if output_folder is not None
+        else None
+    )
+    perf1 = patch_performances(
+        dataset,
+        evaluate,
+        predictions[0],
+        threshold1,
+        size,
+        stride,
+        figure,
+        nproc=0,
+        outdir=dir1,
+    )
 
-    import matplotlib
-    import matplotlib.pyplot as plt
-    plt.subplot(2,2,1)
-    plt.boxplot([da, db])
-    plt.title('Systems 1 and 2')
-    plt.subplot(2,2,2)
-    plt.boxplot(diff)
-    plt.title('Differences (1 - 2)')
-    plt.subplot(2,1,2)
-    plt.hist(diff, bins=50)
-    plt.title('Histogram (1 - 2)')
-    plt.savefig('analysis.pdf')
-
-    #diff = diff[diff!=0.0]
-    #click.echo(diff)
-
-    click.echo("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:")
-    click.echo(f"* system1: {len(da)} / {numpy.median(da):.3f} / {numpy.mean(da):.3f} / {numpy.std(da, ddof=1):.3f} / {scipy.stats.normaltest(da)[1]}" )
-    click.echo(f"* system2: {len(db)} / {numpy.median(db):.3f} / {numpy.mean(db):.3f} / {numpy.std(db, ddof=1):.3f} / {scipy.stats.normaltest(db)[1]}" )
-    click.echo(f"* system1-system2: {len(diff)} / {numpy.median(diff):.3f} / {numpy.mean(diff):.3f} / {numpy.std(diff, ddof=1):.3f} / {scipy.stats.normaltest(diff)[1]}" )
+    logger.info(
+        f"Evaluating patch performances on '{evaluate}' set for '{names[1]}' using windows of size {size} and stride {stride}"
+    )
+    dir2 = (
+        os.path.join(output_folder, names[1])
+        if output_folder is not None
+        else None
+    )
+    perf2 = patch_performances(
+        dataset,
+        evaluate,
+        predictions[1],
+        threshold2,
+        size,
+        stride,
+        figure,
+        nproc=0,
+        outdir=dir2,
+    )
 
-    w, p = scipy.stats.ttest_rel(da, db)
-    click.echo(f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}")
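+    # computes the per-patch differences (first system minus second system)
+    # for the main performance figures and, if an output folder is set,
+    # saves a visual summary of those differences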
+    perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1])
+    to_subtract = (
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score",
+    )
+    for k in perf_diff:
+        for col in to_subtract:
+            perf_diff[k][col] -= perf2[k]["df"][col]
+    dirdiff = (
+        os.path.join(output_folder, "diff")
+        if output_folder is not None
+        else None
+    )
+    perf_diff = visual_performances(
+        dataset,
+        evaluate,
+        perf_diff,
+        size,
+        stride,
+        figure,
+        nproc=0,
+        outdir=dirdiff,
+    )
 
-    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
-    click.echo(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}")
+    # loads all F1-scores for the given threshold
+    stems = list(perf1.keys())
+    da = numpy.array([perf1[k]["df"].f1_score for k in stems]).flatten()
+    db = numpy.array([perf2[k]["df"].f1_score for k in stems]).flatten()
+    diff = da - db
 
-    w, p = scipy.stats.wilcoxon(diff)
-    click.echo(f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}")
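+    # iteratively strips outliers of the paired differences (and the
+    # corresponding entries of da and db) until none are left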
+    while remove_outliers:
+        outliers_diff = _index_of_outliers(diff)
+        if sum(outliers_diff) == 0:
+            break
+        diff = diff[~outliers_diff]
+        da = da[~outliers_diff]
+        db = db[~outliers_diff]
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    click.echo(f"Wilcoxon test (md(system1) < md(system2)?): W = {w:g}, p = {p:.5f}")
+    # also remove cases in which both da and db are zero
+    remove_zeros = (da == 0) & (db == 0)
+    diff = diff[~remove_zeros]
+    da = da[~remove_zeros]
+    db = db[~remove_zeros]
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    click.echo(f"Wilcoxon test (md(system1) > md(system2)?): W = {w:g}, p = {p:.5f}")
+    if output_folder is not None:
+        _write_analysis_figures(names, da, db, output_folder)
+
+    if output_folder is not None:
+        fname = os.path.join(output_folder, "analysis.txt")
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+        with open(fname, "wt") as f:
+            _write_analysis_text(names, da, db, f)
+    else:
+        _write_analysis_text(names, da, db, sys.stdout)
-- 
GitLab