From a57676d0805e1f020ee7dccd48336a742f3786b0 Mon Sep 17 00:00:00 2001 From: Andre Anjos <andre.dos.anjos@gmail.com> Date: Mon, 13 Jul 2020 18:16:26 +0200 Subject: [PATCH] [script.significance] Improve analysis --- bob/ip/binseg/engine/significance.py | 404 +++++++++++++++++++++++++-- bob/ip/binseg/script/significance.py | 363 +++++++++++++++++++----- 2 files changed, 673 insertions(+), 94 deletions(-) diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py index 68fae32d..484236b2 100644 --- a/bob/ip/binseg/engine/significance.py +++ b/bob/ip/binseg/engine/significance.py @@ -3,15 +3,129 @@ import os import itertools +import multiprocessing import h5py from tqdm import tqdm +import numpy import pandas import torch.nn from .evaluator import _sample_measures_for_threshold +def _performance_summary(size, patch_perf, patch_size, patch_stride, figure): + """Generates an array that represents the performance per pixel of the + original image + + The returned array corresponds to a stacked version of performances for + each pixel in the original image taking into consideration the patch + performances, their size and stride. + + + Parameters + ========== + + size : tuple + A two tuple with the original height and width of the image being + analyzed + + patch_perf : typing.Sequence + An ordered sequence of patch performances (in raster direction - every + row, from left to right and then rows from top to bottom). + + patch_size : tuple + A two tuple that indicates the size of each patch (height, width) + + patch_stride: tuple + A two tuple that indicates the stride of each patch (height, width) + + figure: str + Name of the performance figure to use for the summary + + + Returns + ======= + + n : numpy.ndarray + A 2D numpy array containing the number of performance scores for every + pixel in the original image + + avg : numpy.ndarray + A 2D numpy array containing the average performances for every pixel on + the input image considering the patch sizes and strides applied when + windowing the image + + std : numpy.ndarray + A 2D numpy array containing the (unbiased) standard deviations for the + provided performance figure, for every pixel on the input image + considering the patch sizes and strides applied when windowing the + image + + """ + + # first, estimate how many overlaps we have per pixel in the original image + + # we calculate the required padding so that the last windows on the left + # and bottom size of predictions/ground-truth data are zero padded, and + # torch unfolding works exactly. The last windows on the left and bottom + # parts of the image may be extended with zeros. 
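+    # as a worked (hypothetical) illustration of the padding rule below: with
+    # size=(100, 100), patch_size=(32, 32) and patch_stride=(16, 16),
+    # rem = (100 - 32) % 16 = 4, so 16 - 4 = 12 rows/columns of zeros are
+    # appended, giving final_size = [112, 112], which fits exactly
+    # ((112 - 32) // 16) + 1 = 6 windows per dimension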
+ final_size = list(size) + rem = (size[0] - patch_size[0]) % patch_stride[0] + if rem != 0: + final_size[0] += patch_stride[0] - rem + rem = (size[1] - patch_size[1]) % patch_stride[1] + if rem != 0: + final_size[1] += patch_stride[1] - rem + n = numpy.zeros(final_size, dtype=int) + ylen = ((final_size[0] - patch_size[0]) // patch_stride[0]) + 1 + xlen = ((final_size[1] - patch_size[1]) // patch_stride[1]) + 1 + for j in range(ylen): + for i in range(xlen): + yup = slice( + patch_stride[0] * j, (patch_stride[0] * j) + patch_size[0], 1 + ) + xup = slice( + patch_stride[1] * i, (patch_stride[1] * i) + patch_size[1], 1 + ) + n[yup, xup] += 1 + + # calculates the stacked performance + layers = n.max() + perf = numpy.zeros( + [layers] + final_size, dtype=patch_perf[figure].iloc[0].dtype + ) + n = -1 * numpy.ones(final_size, dtype=int) + for j in range(ylen): + for i in range(xlen): + yup = slice( + patch_stride[0] * j, (patch_stride[0] * j) + patch_size[0], 1 + ) + xup = slice( + patch_stride[1] * i, (patch_stride[1] * i) + patch_size[1], 1 + ) + nup = n[yup, xup] + nup += 1 + yr, xr = numpy.meshgrid( + range(yup.start, yup.stop, yup.step), + range(xup.start, xup.stop, xup.step), + indexing="ij", + ) + perf[nup.flat, yr.flat, xr.flat] = patch_perf.loc[ + (patch_perf["y"] == j) & (patch_perf["x"] == i) + ][figure] + + # for each element in the ``perf``matrix, calculated avg and std. + n += 1 # adjust for starting at -1 before + avg = perf.sum(axis=0) / n + # calculate variances + std = numpy.zeros_like(avg) + for k in range(2, n.max() + 1): + std[n == k] = perf[: (k + 1), :, :].std(axis=0, ddof=1)[n == k] + + return n, avg, std + + def _patch_measures(pred, gt, threshold, size, stride): """ Calculates measures on patches of a single sample @@ -58,7 +172,8 @@ def _patch_measures(pred, gt, threshold, size, stride): # we calculate the required padding so that the last windows on the left # and bottom size of predictions/ground-truth data are zero padded, and - # torch unfolding works exactly. + # torch unfolding works exactly. The last windows on the left and bottom + # parts of the image may be extended with zeros. padding = (0, 0) rem = (pred.shape[1] - size[1]) % stride[1] if rem != 0: @@ -103,8 +218,71 @@ def _patch_measures(pred, gt, threshold, size, stride): ) +def _visual_dataset_performance(stem, img, n, avg, std, outdir): + """Runs a visual performance assessment for each entry in a given dataset""" + + import matplotlib.pyplot as plt + + figsize = img.shape[1:] + + fig, axs = plt.subplots(2, 2) + axs[0, 0].imshow(img.numpy().transpose(1, 2, 0)) + axs[0, 0].set_title("Original image") + + pos01 = axs[0, 1].imshow( + n[: figsize[0], : figsize[1]], cmap="Greys_r", interpolation="none" + ) + fig.colorbar(pos01, ax=axs[0, 1], orientation="vertical") + axs[0, 1].set_title("# Overlapping Windows") + + pos10 = axs[1, 0].imshow( + avg[: figsize[0], : figsize[1]], cmap="Greys_r", interpolation="none" + ) + fig.colorbar(pos10, ax=axs[1, 0], orientation="vertical") + axs[1, 0].set_title("Average Performance") + + pos11 = axs[1, 1].imshow( + std[: figsize[0], : figsize[1]], cmap="Greys_r", interpolation="none" + ) + fig.colorbar(pos11, ax=axs[1, 1], orientation="vertical") + axs[1, 1].set_title("Performance Std. 
Dev.") + + plt.tight_layout() + + fname = os.path.join(outdir, stem + ".pdf") + os.makedirs(os.path.dirname(fname), exist_ok=True) + fig.savefig(fname) + + +def _patch_performances_for_sample( + basedir, threshold, size, stride, dataset, k, figure, outdir=None, +): + """ + Evaluates patch performances per sample + """ + + sample = dataset[k] + with h5py.File(os.path.join(basedir, sample[0] + ".hdf5"), "r") as f: + pred = torch.from_numpy(f["array"][:]) + df = _patch_measures(pred, sample[2], threshold, size, stride) + n, avg, std = _performance_summary( + sample[1].shape[1:], df, size, stride, figure + ) + if outdir is not None: + _visual_dataset_performance(sample[0], sample[1], n, avg, std, outdir) + return sample[0], dict(df=df, n=n, avg=avg, std=std) + + def patch_performances( - dataset, name, predictions_folder, threshold, size, stride + dataset, + name, + predictions_folder, + threshold, + size, + stride, + figure, + nproc=1, + outdir=None, ): """ Evaluates the performances for multiple image patches, for a whole dataset @@ -113,8 +291,8 @@ def patch_performances( Parameters --------- - dataset : py:class:`torch.utils.data.Dataset` - a dataset to iterate on + dataset : :py:class:`dict` of py:class:`torch.utils.data.Dataset` + datasets to iterate on name : str the local name of this dataset (e.g. ``train``, or ``test``), to be @@ -136,32 +314,218 @@ def patch_performances( strides (vertical, horizontal) for windows for which we will calculate partial performances based on the threshold and existing ground-truth + figure : str + the performance figure to use for calculating patch micro performances + + nproc : :py:class:`int`, Optional + the number of processing cores to use for performance evaluation. + Setting this number to a value ``<= 0`` will cause the function to use + all available computing cores. Otherwise, limit to the number of cores + specified. If set to the value of ``1``, then do not use + multiprocessing. + + outdir : :py:class:`str`, Optional + path were to save a visual representation of patch performances. If + set to ``None``, then do not save those to disk. + Returns ------- - df : pandas.DataFrame - A dataframe with all the patch performances aggregated, for all input - images. + d : dict + A dictionary in which keys are filename stems and values are + dictionaries with the following contents: + + ``df``: :py:class:`pandas.DataFrame` + A dataframe with all the patch performances aggregated, for all + input images. 
+ + ``n`` : :py:class:`numpy.ndarray` + A 2D numpy array containing the number of performance scores for + every pixel in the original image + + ``avg`` : :py:class:`numpy.ndarray` + A 2D numpy array containing the average performances for every + pixel on the input image considering the patch sizes and strides + applied when windowing the image + + ``std`` : :py:class:`numpy.ndarray` + A 2D numpy array containing the (unbiased) standard deviations for + the provided performance figure, for every pixel on the input image + considering the patch sizes and strides applied when windowing the + image """ # Collect overall measures - data = [] use_predictions_folder = os.path.join(predictions_folder, name) if not os.path.exists(use_predictions_folder): use_predictions_folder = predictions_folder - for sample in tqdm(dataset[name]): - stem = sample[0] - image = sample[1] - gt = sample[2] - pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5") - with h5py.File(pred_fullpath, "r") as f: - pred = f["array"][:] - pred = torch.from_numpy(pred) - data.append(_patch_measures(pred, gt, threshold, size, stride)) - data[-1]["stem"] = stem - - return pandas.concat(data, ignore_index=True) + with tqdm(range(len(dataset[name])), desc="patch-perf") as pbar: + if nproc != 1: + if nproc <= 0: + nproc = multiprocessing.cpu_count() + pool = multiprocessing.Pool(nproc) + results = [ + pool.apply_async( + _patch_performances_for_sample, + args=( + use_predictions_folder, + threshold, + size, + stride, + dataset[name], + k, + figure, + outdir, + ), + callback=lambda x: pbar.update(), + ) + for k in range(len(dataset[name])) + ] + pool.close() + pool.join() + data = [k.get() for k in results] + else: + data = [] + for k in pbar: + df = _patch_performances_for_sample( + use_predictions_folder, + threshold, + size, + stride, + dataset[name], + k, + figure, + outdir, + ) + data.append(df) + + return dict(data) + + +def _visual_performances_for_sample( + size, stride, dataset, k, df, figure, outdir=None +): + """ + Displays patch performances per sample + """ + + sample = dataset[k] + n, avg, std = _performance_summary( + sample[1].shape[1:], df, size, stride, figure + ) + if outdir is not None: + _visual_dataset_performance(sample[0], sample[1], n, avg, std, outdir) + return sample[0], dict(df=df, n=n, avg=avg, std=std) + + +def visual_performances( + dataset, name, dfs, size, stride, figure, nproc=1, outdir=None, +): + """ + Displays the performances for multiple image patches, for a whole dataset + + + Parameters + --------- + + dataset : :py:class:`dict` of py:class:`torch.utils.data.Dataset` + datasets to iterate on + + name : str + the local name of this dataset (e.g. ``train``, or ``test``), to be + used when saving measures files. + + dfs : dict + a dictionary mapping dataset stems to dataframes containing the patch + performances to be evaluated + + size : tuple + size (vertical, horizontal) for windows for which we will calculate + partial performances based on the threshold and existing ground-truth + + stride : tuple + strides (vertical, horizontal) for windows for which we will calculate + partial performances based on the threshold and existing ground-truth + + figure : str + the performance figure to use for calculating patch micro performances + + nproc : :py:class:`int`, Optional + the number of processing cores to use for performance evaluation. + Setting this number to a value ``<= 0`` will cause the function to use + all available computing cores. 
Otherwise, limit to the number of cores + specified. If set to the value of ``1``, then do not use + multiprocessing. + + outdir : :py:class:`str`, Optional + path were to save a visual representation of patch performances. If + set to ``None``, then do not save those to disk. + + + Returns + ------- + + d : dict + A dictionary in which keys are filename stems and values are + dictionaries with the following contents: + + ``df``: :py:class:`pandas.DataFrame` + A dataframe with all the patch performances aggregated, for all + input images. + + ``n`` : :py:class:`numpy.ndarray` + A 2D numpy array containing the number of performance scores for + every pixel in the original image + + ``avg`` : :py:class:`numpy.ndarray` + A 2D numpy array containing the average performances for every + pixel on the input image considering the patch sizes and strides + applied when windowing the image + + ``std`` : :py:class:`numpy.ndarray` + A 2D numpy array containing the (unbiased) standard deviations for + the provided performance figure, for every pixel on the input image + considering the patch sizes and strides applied when windowing the + image + + """ + + stems = list(dataset[name].keys()) + + with tqdm(range(len(dataset[name])), desc="visual-perf") as pbar: + if nproc != 1: + if nproc <= 0: + nproc = multiprocessing.cpu_count() + pool = multiprocessing.Pool(nproc) + results = [ + pool.apply_async( + _visual_performances_for_sample, + args=( + size, + stride, + dataset[name], + k, + dfs[stems[k]], + figure, + outdir, + ), + callback=lambda x: pbar.update(), + ) + for k in range(len(dataset[name])) + ] + pool.close() + pool.join() + data = [k.get() for k in results] + else: + data = [] + for k in pbar: + df = _visual_performances_for_sample( + size, stride, dataset[name], k, dfs[stems[k]], figure, outdir, + ) + data.append(df) + + return dict(data) diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py index 2700d0c2..077c0ab1 100755 --- a/bob/ip/binseg/script/significance.py +++ b/bob/ip/binseg/script/significance.py @@ -2,7 +2,9 @@ # coding=utf-8 import os +import sys import click +import typing from bob.extension.scripts.click_helper import ( verbosity_option, @@ -12,13 +14,141 @@ from bob.extension.scripts.click_helper import ( import numpy import scipy.stats + import logging logger = logging.getLogger(__name__) from .evaluate import _validate_threshold, run as run_evaluation -from ..engine.significance import patch_performances +from ..engine.significance import patch_performances, visual_performances + + +def _index_of_outliers(c): + """Finds indexes of outlines (+/- 1.5*IQR) on a pandas dataframe column""" + + iqr = c.quantile(0.75) - c.quantile(0.25) + limits = (c.quantile(0.25) - 1.5 * iqr, c.quantile(0.75) + 1.5 * iqr) + return (c < limits[0]) | (c > limits[1]) + + +def _write_analysis_text(names, da, db, f): + """Writes a text file containing the most important statistics""" + + diff = da - db + f.write("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:\n") + f.write( + f"* {names[0]}: {len(da)}" \ + f" / {numpy.median(da):.3f}" \ + f" / {numpy.mean(da):.3f}" \ + f" / {numpy.std(da, ddof=1):.3f}\n" + ) + f.write( + f"* {names[1]}: {len(db)}" \ + f" / {numpy.median(db):.3f}" \ + f" / {numpy.mean(db):.3f}" \ + f" / {numpy.std(db, ddof=1):.3f}\n" + ) + f.write( + f"* {names[0]}-{names[1]}: {len(diff)}" \ + f" / {numpy.median(diff):.3f}" \ + f" / {numpy.mean(diff):.3f}" \ + f" / {numpy.std(diff, ddof=1):.3f}" \ + f" / gaussian? 
p={scipy.stats.normaltest(diff)[1]:.3f}\n" + ) + + w, p = scipy.stats.ttest_rel(da, db) + f.write( + f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n" + ) + + w, p = scipy.stats.ttest_ind(da, db, equal_var=False) + f.write(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n") + + w, p = scipy.stats.wilcoxon(diff) + f.write( + f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}\n" + ) + + w, p = scipy.stats.wilcoxon(diff, alternative="greater") + f.write( + f"Wilcoxon test (md({names[0]}) < md({names[1]})?): " \ + f"W = {w:g}, p = {p:.5f}\n" + ) + + w, p = scipy.stats.wilcoxon(diff, alternative="less") + f.write( + f"Wilcoxon test (md({names[0]}) > md({names[1]})?): " \ + f"W = {w:g}, p = {p:.5f}\n" + ) + + +def _write_analysis_figures(names, da, db, folder): + """Writes a PDF containing most important plots for analysis""" + + from matplotlib.backends.backend_pdf import PdfPages + import matplotlib.pyplot as plt + + diff = da - db + bins = 50 + + fname = os.path.join(folder, "statistics.pdf") + os.makedirs(os.path.dirname(fname), exist_ok=True) + with PdfPages(fname) as pdf: + + plt.figure() + plt.grid() + plt.hist(da, bins=bins) + plt.title( + f"{names[0]} - scores (N={len(da)}; M={numpy.median(da):.3f}; " + f"$\mu$={numpy.mean(da):.3f}; $\sigma$={numpy.std(da, ddof=1):.3f})" + ) + pdf.savefig() + plt.close() + + plt.figure() + plt.grid() + plt.hist(db, bins=bins) + plt.title( + f"{names[1]} - scores (N={len(db)}; M={numpy.median(db):.3f}; " + f"$\mu$={numpy.mean(db):.3f}; $\sigma$={numpy.std(db, ddof=1):.3f})" + ) + pdf.savefig() + plt.close() + + plt.figure() + plt.boxplot([da, db]) + plt.title(f"{names[0]} and {names[1]} (N={len(da)})") + pdf.savefig() + plt.close() + + plt.figure() + plt.boxplot(diff) + plt.title(f"Differences ({names[0]} - {names[1]}) (N={len(da)})") + pdf.savefig() + plt.close() + + plt.figure() + plt.grid() + plt.hist(diff, bins=bins) + plt.title( + f"Systems ({names[0]} - {names[1]}) " \ + f"(N={len(diff)}; M={numpy.median(diff):.3f}; " \ + f"$\mu$={numpy.mean(diff):.3f}; " \ + f"$\sigma$={numpy.std(diff, ddof=1):.3f})" + ) + pdf.savefig() + plt.close() + + p = scipy.stats.pearsonr(da, db) + plt.figure() + plt.grid() + plt.scatter(da, db, marker=".", color="black") + plt.xlabel("{names[0]}") + plt.ylabel("{names[1]}") + plt.title(f"Scatter (p={p[0]:.3f})") + pdf.savefig() + plt.close() @click.command( @@ -28,10 +158,9 @@ from ..engine.significance import patch_performances \b 1. Runs a significance test using as base the calculated predictions of two - different systems (1, the first and 2, the second), on the **same** - dataset: + different systems, on the **same** dataset: \b - $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2 + $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 \b 2. By default, we use a "validation" dataset if it is available, to infer the a priori threshold for the comparison of two systems. Otherwise, @@ -40,25 +169,25 @@ from ..engine.significance import patch_performances testing the hypothesis - by default we use the "test" dataset if it is available, otherwise, specify. 
\b - $ bob binseg significance -vv drive --predictions-1=path/to/predictions/system-1 --predictions-2=path/to/predictions/system-2 --threshold=train --evaluate=alternate-test + $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test """, ) @click.option( - "--predictions-1", - "-p", - help="Path where predictions of system 1 are currently stored. You may " - "also input predictions from a second-annotator. This application " - "will adequately handle it.", + "--names", + "-n", + help="Names of the two systems to compare", + nargs=2, required=True, - type=click.Path(exists=True, file_okay=False, dir_okay=True), + type=str, cls=ResourceOption, ) @click.option( - "--predictions-2", - "-P", + "--predictions", + "-p", help="Path where predictions of system 2 are currently stored. You may " - "also input predictions from a second-annotator. This application " - "will adequately handle it.", + "also input predictions from a second-annotator. This application " + "will adequately handle it.", + nargs=2, required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True), cls=ResourceOption, @@ -67,7 +196,7 @@ from ..engine.significance import patch_performances "--dataset", "-d", help="A dictionary mapping string keys to " - "torch.utils.data.dataset.Dataset instances", + "torch.utils.data.dataset.Dataset instances", required=True, cls=ResourceOption, ) @@ -82,7 +211,7 @@ from ..engine.significance import patch_performances "If not set, or a string is input, threshold tunning is done per system, " "individually. Optionally, you may also provide a floating-point number " "between [0.0, 1.0] as the threshold to use for both systems.", - default='validation', + default="validation", show_default=True, required=True, cls=ResourceOption, @@ -91,7 +220,7 @@ from ..engine.significance import patch_performances "--evaluate", "-e", help="Name of the dataset to evaluate", - default='test', + default="test", show_default=True, required=True, cls=ResourceOption, @@ -111,8 +240,8 @@ from ..engine.significance import patch_performances "--size", "-s", help="This is a tuple with two values indicating the size of windows to " - "be used for patch analysis. The values represent height and width " - "respectively.", + "be used for patch analysis. The values represent height and width " + "respectively.", default=(128, 128), nargs=2, type=int, @@ -124,8 +253,8 @@ from ..engine.significance import patch_performances "--stride", "-t", help="This is a tuple with two values indicating the stride of windows to " - "be used for patch analysis. The values represent height and width " - "respectively.", + "be used for patch analysis. The values represent height and width " + "respectively.", default=(32, 32), nargs=2, type=int, @@ -133,16 +262,49 @@ from ..engine.significance import patch_performances required=True, cls=ResourceOption, ) +@click.option( + "--figure", + "-f", + help="The name of a performance figure (e.g. 
f1_score) to use for " + "for comparing performances", + default="f1_score", + type=str, + show_default=True, + required=True, + cls=ResourceOption, +) +@click.option( + "--output-folder", + "-o", + help="Path where to store visualizations", + required=False, + type=click.Path(), + show_default=True, + cls=ResourceOption, +) +@click.option( + "--remove-outliers/--no-remove-outliers", + "-R", + help="If set, removes outliers from both score distributions before " \ + "running statistical analysis", + default=False, + required=True, + show_default=True, + cls=ResourceOption, +) @verbosity_option(cls=ResourceOption) def significance( - predictions_1, - predictions_2, + names, + predictions, dataset, threshold, evaluate, steps, size, stride, + figure, + output_folder, + remove_outliers, **kwargs, ): """Evaluates how significantly different are two models on the same dataset @@ -158,69 +320,122 @@ def significance( if isinstance(threshold, float): threshold1 = threshold2 = threshold - else: #it is a string, re-calculate it for each system individually + else: # it is a string, re-calculate it for each system individually assert threshold in dataset, f"No dataset named '{threshold}'" - logger.info(f"Evaluating threshold on '{threshold}' set for system 1 using {steps} steps") + logger.info( + f"Evaluating threshold on '{threshold}' set for '{names[0]}' using {steps} steps" + ) threshold1 = run_evaluation( - dataset[threshold], threshold, predictions_1, steps=steps + dataset[threshold], threshold, predictions[0], steps=steps ) - logger.info(f"Set --threshold={threshold1:.5f} for system 1") + logger.info(f"Set --threshold={threshold1:.5f} for '{names[0]}'") - logger.info(f"Evaluating threshold on '{threshold}' set for system 2 using {steps} steps") + logger.info( + f"Evaluating threshold on '{threshold}' set for '{names[1]}' using {steps} steps" + ) threshold2 = run_evaluation( - dataset[threshold], threshold, predictions_2, steps=steps + dataset[threshold], threshold, predictions[1], steps=steps ) - logger.info(f"Set --threshold={threshold2:.5f} for system 2") + logger.info(f"Set --threshold={threshold2:.5f} for '{names[1]}'") # for a given threshold on each system, calculate patch performances - logger.info(f"Evaluating patch performances on '{evaluate}' set for system 1 using windows of size {size} and stride {stride}") - perf1 = patch_performances(dataset, evaluate, predictions_1, threshold1, - size, stride) - logger.info(f"Evaluating patch performances on '{evaluate}' set for system 2 using windows of size {size} and stride {stride}") - perf2 = patch_performances(dataset, evaluate, predictions_2, threshold2, - size, stride) - - ###### MAGIC STARTS ####### - - # load all F1-scores for the given threshold - da = perf1.f1_score - db = perf2.f1_score - diff = da - db + logger.info( + f"Evaluating patch performances on '{evaluate}' set for '{names[0]}' using windows of size {size} and stride {stride}" + ) + dir1 = ( + os.path.join(output_folder, names[0]) + if output_folder is not None + else None + ) + perf1 = patch_performances( + dataset, + evaluate, + predictions[0], + threshold1, + size, + stride, + figure, + nproc=0, + outdir=dir1, + ) - import matplotlib - import matplotlib.pyplot as plt - plt.subplot(2,2,1) - plt.boxplot([da, db]) - plt.title('Systems 1 and 2') - plt.subplot(2,2,2) - plt.boxplot(diff) - plt.title('Differences (1 - 2)') - plt.subplot(2,1,2) - plt.hist(diff, bins=50) - plt.title('Histogram (1 - 2)') - plt.savefig('analysis.pdf') - - #diff = diff[diff!=0.0] - #click.echo(diff) 
- - click.echo("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:") - click.echo(f"* system1: {len(da)} / {numpy.median(da):.3f} / {numpy.mean(da):.3f} / {numpy.std(da, ddof=1):.3f} / {scipy.stats.normaltest(da)[1]}" ) - click.echo(f"* system2: {len(db)} / {numpy.median(db):.3f} / {numpy.mean(db):.3f} / {numpy.std(db, ddof=1):.3f} / {scipy.stats.normaltest(db)[1]}" ) - click.echo(f"* system1-system2: {len(diff)} / {numpy.median(diff):.3f} / {numpy.mean(diff):.3f} / {numpy.std(diff, ddof=1):.3f} / {scipy.stats.normaltest(diff)[1]}" ) + logger.info( + f"Evaluating patch performances on '{evaluate}' set for '{names[1]}' using windows of size {size} and stride {stride}" + ) + dir2 = ( + os.path.join(output_folder, names[1]) + if output_folder is not None + else None + ) + perf2 = patch_performances( + dataset, + evaluate, + predictions[1], + threshold2, + size, + stride, + figure, + nproc=0, + outdir=dir2, + ) - w, p = scipy.stats.ttest_rel(da, db) - click.echo(f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}") + perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1]) + to_subtract = ( + "precision", + "recall", + "specificity", + "accuracy", + "jaccard", + "f1_score", + ) + for k in perf_diff: + for col in to_subtract: + perf_diff[k][col] -= perf2[k]["df"][col] + dirdiff = ( + os.path.join(output_folder, "diff") + if output_folder is not None + else None + ) + perf_diff = visual_performances( + dataset, + evaluate, + perf_diff, + size, + stride, + figure, + nproc=0, + outdir=dirdiff, + ) - w, p = scipy.stats.ttest_ind(da, db, equal_var=False) - click.echo(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}") + # loads all F1-scores for the given threshold + stems = list(perf1.keys()) + da = numpy.array([perf1[k]["df"].f1_score for k in stems]).flatten() + db = numpy.array([perf2[k]["df"].f1_score for k in stems]).flatten() + diff = da - db - w, p = scipy.stats.wilcoxon(diff) - click.echo(f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}") + while remove_outliers: + outliers_diff = _index_of_outliers(diff) + if sum(outliers_diff) == 0: + break + diff = diff[~outliers_diff] + da = da[~outliers_diff] + db = db[~outliers_diff] - w, p = scipy.stats.wilcoxon(diff, alternative="greater") - click.echo(f"Wilcoxon test (md(system1) < md(system2)?): W = {w:g}, p = {p:.5f}") + # also remove cases in which both da and db are zero + remove_zeros = (da == 0) & (db == 0) + diff = diff[~remove_zeros] + da = da[~remove_zeros] + db = db[~remove_zeros] - w, p = scipy.stats.wilcoxon(diff, alternative="less") - click.echo(f"Wilcoxon test (md(system1) > md(system2)?): W = {w:g}, p = {p:.5f}") + if output_folder is not None: + _write_analysis_figures(names, da, db, output_folder) + + if output_folder is not None: + fname = os.path.join(output_folder, "analysis.txt") + os.makedirs(os.path.dirname(fname), exist_ok=True) + with open(fname, "wt") as f: + _write_analysis_text(names, da, db, f) + else: + _write_analysis_text(names, da, db, sys.stdout) -- GitLab
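For reference, a minimal invocation sketch of the reworked command-line interface (system names and output paths below are hypothetical placeholders; prediction paths follow the examples in the command's help text):

  $ bob binseg significance -vv drive \
      --names system1 system2 \
      --predictions path/to/predictions/system-1 path/to/predictions/system-2 \
      --threshold=validation --evaluate=test \
      --size 128 128 --stride 32 32 \
      --figure=f1_score \
      --output-folder=results/significance \
      --remove-outliers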