From 4de9ef8f7837c155fee423587bfc2fa91723dd95 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Wed, 15 Jul 2020 15:26:59 +0200
Subject: [PATCH] [script.significance] Rewrite of app to improve DRY

---
 bob/ip/binseg/script/significance.py | 473 +++++++++++++++------------
 1 file changed, 266 insertions(+), 207 deletions(-)

diff --git a/bob/ip/binseg/script/significance.py b/bob/ip/binseg/script/significance.py
index 287ce0fc..1e4d5e8e 100755
--- a/bob/ip/binseg/script/significance.py
+++ b/bob/ip/binseg/script/significance.py
@@ -4,7 +4,6 @@ import os
 import sys
 
 import click
-import typing
 
 from bob.extension.scripts.click_helper import (
     verbosity_option,
@@ -13,142 +12,232 @@ from bob.extension.scripts.click_helper import (
 )
 
 import numpy
-import scipy.stats
-
 import logging
 logger = logging.getLogger(__name__)
 
 from .evaluate import _validate_threshold, run as run_evaluation
-from ..engine.significance import patch_performances, visual_performances
+from ..engine.significance import (
+    patch_performances,
+    visual_performances,
+    write_analysis_text,
+    write_analysis_figures,
+    index_of_outliers,
+)
 
 
-def _index_of_outliers(c):
-    """Finds indexes of outlines (+/- 1.5*IQR) on a pandas dataframe column"""
+def _eval_patches(
+    system_name,
+    threshold,
+    evaluate,
+    preddir,
+    dataset,
+    steps,
+    size,
+    stride,
+    outdir,
+    figure,
+    nproc,
+):
+    """Calculates the patch performances on a dataset
 
-    iqr = c.quantile(0.75) - c.quantile(0.25)
-    limits = (c.quantile(0.25) - 1.5 * iqr, c.quantile(0.75) + 1.5 * iqr)
-    return (c < limits[0]) | (c > limits[1])
 
+    Parameters
+    ==========
 
-def _write_analysis_text(names, da, db, f):
-    """Writes a text file containing the most important statistics"""
+    system_name : str
+        The name of the current system being analyzed
 
-    diff = da - db
-    f.write("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:\n")
-    f.write(
-        f"* {names[0]}: {len(da)}" \
-        f" / {numpy.median(da):.3f}" \
-        f" / {numpy.mean(da):.3f}" \
-        f" / {numpy.std(da, ddof=1):.3f}\n"
-    )
-    f.write(
-        f"* {names[1]}: {len(db)}" \
-        f" / {numpy.median(db):.3f}" \
-        f" / {numpy.mean(db):.3f}" \
-        f" / {numpy.std(db, ddof=1):.3f}\n"
-    )
-    f.write(
-        f"* {names[0]}-{names[1]}: {len(diff)}" \
-        f" / {numpy.median(diff):.3f}" \
-        f" / {numpy.mean(diff):.3f}" \
-        f" / {numpy.std(diff, ddof=1):.3f}" \
-        f" / gaussian? p={scipy.stats.normaltest(diff)[1]:.3f}\n"
-    )
+    threshold : :py:class:`float`, :py:class:`str`
+        This number is used to define positives and negatives from probability
+        maps, and report F1-scores (a priori). By default, we expect a set
+        named 'validation' to be available in the input data. If that is not
+        the case, we use 'train', if available. You may provide the name of
+        another dataset to be used for threshold tuning otherwise. If not
+        set, or a string is input, threshold tuning is done per system,
+        individually. Optionally, you may also provide a floating-point
+        number between [0.0, 1.0] as the threshold to use for both systems.
 
-    w, p = scipy.stats.ttest_rel(da, db)
-    f.write(
-        f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n"
-    )
+    evaluate : str
+        Name of the dataset key to use from ``dataset`` to evaluate
+        (typically, ``test``)
 
-    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
-    f.write(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n")
+    preddir : str
+        Root path to the predictions generated by system ``system_name``.
+        The final subpath inside ``preddir`` that will be used will have the
+        value of this variable suffixed with the value of ``evaluate``. We
+        will search for ``<preddir>/<evaluate>/<stems>.hdf5``.
 
-    w, p = scipy.stats.wilcoxon(diff)
-    f.write(
-        f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}\n"
-    )
+    dataset : dict
+        A dictionary mapping string keys to
+        :py:class:`torch.utils.data.dataset.Dataset` instances
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f"Wilcoxon test (md({names[0]}) < md({names[1]})?): " \
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+    steps : int
+        The number of threshold steps to consider when evaluating the highest
+        possible F1-score on train/test data.
 
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f"Wilcoxon test (md({names[0]}) > md({names[1]})?): " \
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+    size : tuple
+        Two values indicating the size of windows to be used for patch
+        analysis. The values represent height and width, respectively.
 
+    stride : tuple
+        Two values indicating the stride of windows to be used for patch
+        analysis. The values represent height and width, respectively.
 
-def _write_analysis_figures(names, da, db, folder):
-    """Writes a PDF containing most important plots for analysis"""
+    outdir : str
+        Path where to store visualizations. If set to ``None``, then do not
+        store performance visualizations.
 
-    from matplotlib.backends.backend_pdf import PdfPages
-    import matplotlib.pyplot as plt
+    figure : str
+        The name of a performance figure (e.g. ``f1_score`` or ``jaccard``) to
+        use when comparing performances
 
-    diff = da - db
-    bins = 50
-
-    fname = os.path.join(folder, "statistics.pdf")
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    with PdfPages(fname) as pdf:
-
-        plt.figure()
-        plt.grid()
-        plt.hist(da, bins=bins)
-        plt.title(
-            f"{names[0]} - scores (N={len(da)}; M={numpy.median(da):.3f}; "
-            f"$\mu$={numpy.mean(da):.3f}; $\sigma$={numpy.std(da, ddof=1):.3f})"
-        )
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.grid()
-        plt.hist(db, bins=bins)
-        plt.title(
-            f"{names[1]} - scores (N={len(db)}; M={numpy.median(db):.3f}; "
-            f"$\mu$={numpy.mean(db):.3f}; $\sigma$={numpy.std(db, ddof=1):.3f})"
+    nproc : int
+        Sets the number of parallel processes to use when running using
+        multiprocessing. A value of zero uses all reported cores. A value of
+        ``1`` completely avoids the use of multiprocessing and runs all tasks
+        in the current processing context.
+
+
+    Returns
+    =======
+
+    d : dict
+        A dictionary in which keys are filename stems and values are
+        dictionaries with the following contents:
+
+        ``df`` : :py:class:`pandas.DataFrame`
+            A dataframe with all the patch performances aggregated, for all
+            input images.
+
+        ``n`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the number of performance scores for
+            every pixel in the original image
+
+        ``avg`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the average performances for every
+            pixel on the input image considering the patch sizes and strides
+            applied when windowing the image
+
+        ``std`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the (unbiased) standard deviations
+            for the provided performance figure, for every pixel on the input
+            image considering the patch sizes and strides applied when
+            windowing the image
+
+    """
+
+    if not isinstance(threshold, float):
+
+        assert threshold in dataset, f"No dataset named '{threshold}'"
+
+        logger.info(
+            f"Evaluating threshold on '{threshold}' set for "
+            f"'{system_name}' using {steps} steps"
         )
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.boxplot([da, db])
-        plt.title(f"{names[0]} and {names[1]} (N={len(da)})")
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.boxplot(diff)
-        plt.title(f"Differences ({names[0]} - {names[1]}) (N={len(da)})")
-        pdf.savefig()
-        plt.close()
-
-        plt.figure()
-        plt.grid()
-        plt.hist(diff, bins=bins)
-        plt.title(
-            f"Systems ({names[0]} - {names[1]}) " \
-            f"(N={len(diff)}; M={numpy.median(diff):.3f}; " \
-            f"$\mu$={numpy.mean(diff):.3f}; " \
-            f"$\sigma$={numpy.std(diff, ddof=1):.3f})"
+        threshold = run_evaluation(
+            dataset[threshold], threshold, preddir, steps=steps
         )
-        pdf.savefig()
-        plt.close()
+        logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")
+
+    # for a given threshold on each system, calculate patch performances
+    logger.info(
+        f"Evaluating patch performances on '{evaluate}' set for "
+        f"'{system_name}' using windows of size {size} and stride {stride}"
+    )
+
+    return patch_performances(
+        dataset,
+        evaluate,
+        preddir,
+        threshold,
+        size,
+        stride,
+        figure,
+        nproc,
+        outdir,
+    )
+
+
+def _eval_differences(perf1, perf2, evaluate, dataset, size, stride, outdir,
+                      figure, nproc):
+    """Evaluate differences in the performance patches between two systems
+
+
+    Parameters
+    ----------
+
+    perf1, perf2 : dict
+        A dictionary as returned by :py:func:`_eval_patches`
+
+    evaluate : str
+        Name of the dataset key to use from ``dataset`` to evaluate
+        (typically, ``test``)
 
-        p = scipy.stats.pearsonr(da, db)
-        plt.figure()
-        plt.grid()
-        plt.scatter(da, db, marker=".", color="black")
-        plt.xlabel("{names[0]}")
-        plt.ylabel("{names[1]}")
-        plt.title(f"Scatter (p={p[0]:.3f})")
-        pdf.savefig()
-        plt.close()
+    dataset : dict
+        A dictionary mapping string keys to
+        :py:class:`torch.utils.data.dataset.Dataset` instances
+
+    size : tuple
+        Two values indicating the size of windows to be used for patch
+        analysis. The values represent height and width, respectively.
+
+    stride : tuple
+        Two values indicating the stride of windows to be used for patch
+        analysis. The values represent height and width, respectively.
+
+    outdir : str
+        If set to ``None``, then do not output performance visualizations.
+        Otherwise, in directory ``outdir``, dumps the visualizations for the
+        performance differences between both systems.
+
+    figure : str
+        The name of a performance figure (e.g. ``f1_score`` or ``jaccard``) to
+        use when comparing performances
+
+    nproc : int
+        Sets the number of parallel processes to use when running using
+        multiprocessing. A value of zero uses all reported cores. A value of
+        ``1`` completely avoids the use of multiprocessing and runs all tasks
+        in the current processing context.
+
+
+    Returns
+    -------
+
+    d : dict
+        A dictionary representing patch performance differences across all
+        files and patches. The format of this is similar to the individual
+        inputs ``perf1`` and ``perf2``.
+
+    """
+
+    perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1])
+
+    # we can subtract these
+    to_subtract = (
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score",
+    )
+
+    for k in perf_diff:
+        for col in to_subtract:
+            perf_diff[k][col] -= perf2[k]["df"][col]
+
+    return visual_performances(
+        dataset,
+        evaluate,
+        perf_diff,
+        size,
+        stride,
+        figure,
+        nproc,
+        outdir,
+    )
 
 
 @click.command(
@@ -265,8 +354,8 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--figure",
     "-f",
-    help="The name of a performance figure (e.g. f1_score) to use for "
-    "for comparing performances",
+    help="The name of a performance figure (e.g. f1_score or jaccard) to "
+    "use when comparing performances",
     default="f1_score",
     type=str,
     show_default=True,
@@ -285,8 +374,21 @@
 @click.option(
     "--remove-outliers/--no-remove-outliers",
     "-R",
-    help="If set, removes outliers from both score distributions before " \
-    "running statistical analysis",
+    help="If set, removes outliers from both score distributions before "
+    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
+    "check from the difference in figures between both systems and assumes "
+    "most of the distribution is contained within that range (like in a "
+    "normal distribution)",
     default=False,
     required=True,
     show_default=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--remove-zeros/--no-remove-zeros",
+    "-Z",
+    help="If set, removes instances from the statistical analysis in which "
+    "both systems had a performance equal to zero.",
+    default=False,
+    required=True,
+    show_default=True,
+    cls=ResourceOption,
+)
@@ -295,7 +397,7 @@
 @click.option(
     "--parallel",
     "-x",
-    help="Set the number of parallel processes to use when running using " \
+    help="Set the number of parallel processes to use when running using "
     "multiprocessing. A value of zero uses all reported cores.",
     default=1,
     type=int,
     show_default=True,
     required=True,
     cls=ResourceOption,
 )
@@ -316,6 +418,7 @@ def significance(
     figure,
     output_folder,
     remove_outliers,
+    remove_zeros,
     parallel,
     **kwargs,
 ):
@@ -329,125 +432,81 @@ def significance(
     threshold = _validate_threshold(threshold, dataset)
     assert evaluate in dataset, f"No dataset named '{evaluate}'"
 
-    if isinstance(threshold, float):
-        threshold1 = threshold2 = threshold
-
-    else:  # it is a string, re-calculate it for each system individually
-
-        assert threshold in dataset, f"No dataset named '{threshold}'"
-
-        logger.info(
-            f"Evaluating threshold on '{threshold}' set for '{names[0]}' using {steps} steps"
-        )
-        threshold1 = run_evaluation(
-            dataset[threshold], threshold, predictions[0], steps=steps
-        )
-        logger.info(f"Set --threshold={threshold1:.5f} for '{names[0]}'")
-
-        logger.info(
-            f"Evaluating threshold on '{threshold}' set for '{names[1]}' using {steps} steps"
-        )
-        threshold2 = run_evaluation(
-            dataset[threshold], threshold, predictions[1], steps=steps
-        )
-        logger.info(f"Set --threshold={threshold2:.5f} for '{names[1]}'")
-
-    # for a given threshold on each system, calculate patch performances
-    logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for '{names[0]}' using windows of size {size} and stride {stride}"
-    )
-    dir1 = (
-        os.path.join(output_folder, names[0])
-        if output_folder is not None
-        else None
-    )
-    perf1 = patch_performances(
-        dataset,
+    perf1 = _eval_patches(
+        names[0],
+        threshold,
         evaluate,
         predictions[0],
-        threshold1,
+        dataset,
+        steps,
         size,
         stride,
+        (output_folder
+         if output_folder is None
+         else os.path.join(output_folder, names[0])),
         figure,
-        nproc=parallel,
-        outdir=dir1,
+        parallel,
     )
 
-    logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for '{names[1]}' using windows of size {size} and stride {stride}"
-    )
-    dir2 = (
-        os.path.join(output_folder, names[1])
-        if output_folder is not None
-        else None
-    )
-    perf2 = patch_performances(
-        dataset,
+    perf2 = _eval_patches(
+        names[1],
+        threshold,
         evaluate,
         predictions[1],
-        threshold2,
-        size,
-        stride,
-        figure,
-        nproc=parallel,
-        outdir=dir2,
-    )
-
-    perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1])
-    to_subtract = (
-        "precision",
-        "recall",
-        "specificity",
-        "accuracy",
-        "jaccard",
-        "f1_score",
-    )
-    for k in perf_diff:
-        for col in to_subtract:
-            perf_diff[k][col] -= perf2[k]["df"][col]
-    dirdiff = (
-        os.path.join(output_folder, "diff")
-        if output_folder is not None
-        else None
-    )
-    perf_diff = visual_performances(
         dataset,
-        evaluate,
-        perf_diff,
+        steps,
         size,
         stride,
+        (output_folder
+         if output_folder is None
+         else os.path.join(output_folder, names[1])),
         figure,
-        nproc=parallel,
-        outdir=dirdiff,
+        parallel,
    )
 
-    # loads all F1-scores for the given threshold
+    perf_diff = _eval_differences(
+        perf1,
+        perf2,
+        evaluate,
+        dataset,
+        size,
+        stride,
+        (output_folder
+         if output_folder is None
+         else os.path.join(output_folder, "diff")),
+        figure,
+        parallel,
+    )
+
+    # loads all figures for the given threshold
     stems = list(perf1.keys())
-    da = numpy.array([perf1[k]["df"].f1_score for k in stems]).flatten()
-    db = numpy.array([perf2[k]["df"].f1_score for k in stems]).flatten()
+    da = numpy.array([perf1[k]["df"][figure] for k in stems]).flatten()
+    db = numpy.array([perf2[k]["df"][figure] for k in stems]).flatten()
     diff = da - db
 
     while remove_outliers:
-        outliers_diff = _index_of_outliers(diff)
+        outliers_diff = index_of_outliers(diff)
         if sum(outliers_diff) == 0:
             break
         diff = diff[~outliers_diff]
         da = da[~outliers_diff]
         db = db[~outliers_diff]
 
-    # also remove cases in which both da and db are zero
-    remove_zeros = (da == 0) & (db == 0)
-    diff = diff[~remove_zeros]
-    da = da[~remove_zeros]
-    db = db[~remove_zeros]
+    if remove_zeros:
+        zeros = (da == 0) & (db == 0)
+        diff = diff[~zeros]
+        da = da[~zeros]
+        db = db[~zeros]
 
     if output_folder is not None:
-        _write_analysis_figures(names, da, db, output_folder)
+        fname = os.path.join(output_folder, "analysis.pdf")
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+        write_analysis_figures(names, da, db, fname)
 
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
         with open(fname, "wt") as f:
-            _write_analysis_text(names, da, db, f)
+            write_analysis_text(names, da, db, f)
     else:
-        _write_analysis_text(names, da, db, sys.stdout)
-- 
GitLab
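
Note for reviewers: a minimal sketch of the iterative outlier trimming that the new
--remove-outliers help text describes, assuming the relocated ``index_of_outliers``
helper keeps the 1.5*IQR semantics of the removed ``_index_of_outliers`` but operates
on numpy arrays. The ``index_of_outliers_sketch`` name and the sample scores below
are hypothetical, for illustration only.

    import numpy


    def index_of_outliers_sketch(c):
        # boolean mask of entries outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
        # mirroring the removed pandas-based _index_of_outliers helper
        q1, q3 = numpy.quantile(c, 0.25), numpy.quantile(c, 0.75)
        iqr = q3 - q1
        return (c < (q1 - 1.5 * iqr)) | (c > (q3 + 1.5 * iqr))


    # hypothetical per-patch scores for two systems being compared
    da = numpy.array([0.91, 0.93, 0.92, 0.90, 0.94, 0.40])
    db = numpy.array([0.89, 0.92, 0.93, 0.91, 0.92, 0.95])
    diff = da - db

    # trim repeatedly, as in the rewritten significance() loop, until no
    # difference falls outside the 1.5*IQR range of the remaining data
    while True:
        outliers = index_of_outliers_sketch(diff)
        if not outliers.any():
            break
        diff, da, db = diff[~outliers], da[~outliers], db[~outliers]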