Commit 4de9ef8f authored by André Anjos

[script.significance] Rewrite of app to improve DRY

parent e48694a4
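
The gist of the rewrite: the threshold-tuning and patch-evaluation logic that was previously duplicated once per system inside the `significance` command is factored into two helpers, `_eval_patches()` and `_eval_differences()`, which the command now calls once per system. The sketch below only illustrates the resulting call pattern; the stub stands in for the real helper, and the system names, paths, window sizes and step counts are made up:

    import os

    def _eval_patches(system_name, threshold, evaluate, preddir, dataset, steps,
                      size, stride, outdir, figure, nproc):
        # stub with the same signature as the helper added by this commit
        return {"some-stem": {"df": None}}

    names = ("baseline", "candidate")                 # made-up system names
    predictions = ("/preds/baseline", "/preds/candidate")
    output_folder = "significance-output"

    perfs = [
        _eval_patches(name, "validation", "test", preddir, {}, 1000,
                      (128, 128), (32, 32),
                      None if output_folder is None
                      else os.path.join(output_folder, name),
                      "f1_score", 1)
        for name, preddir in zip(names, predictions)
    ]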
@@ -4,7 +4,6 @@
 import os
 import sys
 import click
-import typing

 from bob.extension.scripts.click_helper import (
     verbosity_option,
@@ -13,142 +12,232 @@ from bob.extension.scripts.click_helper import (
 )
 import numpy
-import scipy.stats

 import logging
 logger = logging.getLogger(__name__)

 from .evaluate import _validate_threshold, run as run_evaluation
-from ..engine.significance import patch_performances, visual_performances
+from ..engine.significance import (
+    patch_performances,
+    visual_performances,
+    write_analysis_text,
+    write_analysis_figures,
+    index_of_outliers,
+)


-def _index_of_outliers(c):
-    """Finds indexes of outlines (+/- 1.5*IQR) on a pandas dataframe column"""
-
-    iqr = c.quantile(0.75) - c.quantile(0.25)
-    limits = (c.quantile(0.25) - 1.5 * iqr, c.quantile(0.75) + 1.5 * iqr)
-    return (c < limits[0]) | (c > limits[1])
-
-
-def _write_analysis_text(names, da, db, f):
-    """Writes a text file containing the most important statistics"""
-
-    diff = da - db
-    f.write("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:\n")
-    f.write(
-        f"* {names[0]}: {len(da)}" \
-        f" / {numpy.median(da):.3f}" \
-        f" / {numpy.mean(da):.3f}" \
-        f" / {numpy.std(da, ddof=1):.3f}\n"
-    )
-    f.write(
-        f"* {names[1]}: {len(db)}" \
-        f" / {numpy.median(db):.3f}" \
-        f" / {numpy.mean(db):.3f}" \
-        f" / {numpy.std(db, ddof=1):.3f}\n"
-    )
-    f.write(
-        f"* {names[0]}-{names[1]}: {len(diff)}" \
-        f" / {numpy.median(diff):.3f}" \
-        f" / {numpy.mean(diff):.3f}" \
-        f" / {numpy.std(diff, ddof=1):.3f}" \
-        f" / gaussian? p={scipy.stats.normaltest(diff)[1]:.3f}\n"
-    )
-    w, p = scipy.stats.ttest_rel(da, db)
-    f.write(
-        f"Paired T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n"
-    )
-    w, p = scipy.stats.ttest_ind(da, db, equal_var=False)
-    f.write(f"Ind. T-test (is the difference zero?): S = {w:g}, p = {p:.5f}\n")
-    w, p = scipy.stats.wilcoxon(diff)
-    f.write(
-        f"Wilcoxon test (is the difference zero?): W = {w:g}, p = {p:.5f}\n"
-    )
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f"Wilcoxon test (md({names[0]}) < md({names[1]})?): " \
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f"Wilcoxon test (md({names[0]}) > md({names[1]})?): " \
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
-
-
-def _write_analysis_figures(names, da, db, folder):
-    """Writes a PDF containing most important plots for analysis"""
-
-    from matplotlib.backends.backend_pdf import PdfPages
-    import matplotlib.pyplot as plt
-
-    diff = da - db
-    bins = 50
-
-    fname = os.path.join(folder, "statistics.pdf")
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    with PdfPages(fname) as pdf:
-        plt.figure()
-        plt.grid()
-        plt.hist(da, bins=bins)
-        plt.title(
-            f"{names[0]} - scores (N={len(da)}; M={numpy.median(da):.3f}; "
-            f"$\mu$={numpy.mean(da):.3f}; $\sigma$={numpy.std(da, ddof=1):.3f})"
-        )
-        pdf.savefig()
-        plt.close()
-        plt.figure()
-        plt.grid()
-        plt.hist(db, bins=bins)
-        plt.title(
-            f"{names[1]} - scores (N={len(db)}; M={numpy.median(db):.3f}; "
-            f"$\mu$={numpy.mean(db):.3f}; $\sigma$={numpy.std(db, ddof=1):.3f})"
-        )
-        pdf.savefig()
-        plt.close()
-        plt.figure()
-        plt.boxplot([da, db])
-        plt.title(f"{names[0]} and {names[1]} (N={len(da)})")
-        pdf.savefig()
-        plt.close()
-        plt.figure()
-        plt.boxplot(diff)
-        plt.title(f"Differences ({names[0]} - {names[1]}) (N={len(da)})")
-        pdf.savefig()
-        plt.close()
-        plt.figure()
-        plt.grid()
-        plt.hist(diff, bins=bins)
-        plt.title(
-            f"Systems ({names[0]} - {names[1]}) " \
-            f"(N={len(diff)}; M={numpy.median(diff):.3f}; " \
-            f"$\mu$={numpy.mean(diff):.3f}; " \
-            f"$\sigma$={numpy.std(diff, ddof=1):.3f})"
-        )
-        pdf.savefig()
-        plt.close()
-        p = scipy.stats.pearsonr(da, db)
-        plt.figure()
-        plt.grid()
-        plt.scatter(da, db, marker=".", color="black")
-        plt.xlabel("{names[0]}")
-        plt.ylabel("{names[1]}")
-        plt.title(f"Scatter (p={p[0]:.3f})")
-        pdf.savefig()
-        plt.close()
+
+def _eval_patches(
+    system_name,
+    threshold,
+    evaluate,
+    preddir,
+    dataset,
+    steps,
+    size,
+    stride,
+    outdir,
+    figure,
+    nproc,
+):
+    """Calculates the patch performances on a dataset
+
+    Parameters
+    ==========
+
+    system_name : str
+        The name of the current system being analyzed
+
+    threshold : :py:class:`float`, :py:class:`str`
+        This number is used to define positives and negatives from probability
+        maps, and report F1-scores (a priori). By default, we expect a set
+        named 'validation' to be available at the input data. If that is not
+        the case, we use 'train', if available. You may provide the name of
+        another dataset to be used for threshold tuning otherwise. If not
+        set, or a string is input, threshold tuning is done per system,
+        individually. Optionally, you may also provide a floating-point number
+        between [0.0, 1.0] as the threshold to use for both systems.
+
+    evaluate : str
+        Name of the dataset key to use from ``dataset`` to evaluate (typically,
+        ``test``)
+
+    preddir : str
+        Root path to the predictions generated by system ``system_name``. The
+        final subpath inside ``preddir`` that will be used will have the value
+        of this variable suffixed with the value of ``evaluate``. We will
+        search for ``<preddir>/<evaluate>/<stems>.hdf5``.
+
+    dataset : dict
+        A dictionary mapping string keys to
+        :py:class:`torch.utils.data.dataset.Dataset` instances
+
+    steps : int
+        The number of threshold steps to consider when evaluating the highest
+        possible F1-score on train/test data.
+
+    size : tuple
+        Two values indicating the size of windows to be used for patch
+        analysis. The values represent height and width respectively
+
+    stride : tuple
+        Two values indicating the stride of windows to be used for patch
+        analysis. The values represent height and width respectively
+
+    outdir : str
+        Path where to store visualizations. If set to ``None``, then do not
+        store performance visualizations.
+
+    figure : str
+        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
+        use when comparing performances
+
+    nproc : int
+        Sets the number of parallel processes to use when running using
+        multiprocessing. A value of zero uses all reported cores. A value of
+        ``1`` avoids completely the use of multiprocessing and runs all chores
+        in the current processing context.
+
+
+    Returns
+    =======
+
+    d : dict
+        A dictionary in which keys are filename stems and values are
+        dictionaries with the following contents:
+
+        ``df``: :py:class:`pandas.DataFrame`
+            A dataframe with all the patch performances aggregated, for all
+            input images.
+
+        ``n`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the number of performance scores for
+            every pixel in the original image
+
+        ``avg`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the average performances for every
+            pixel on the input image considering the patch sizes and strides
+            applied when windowing the image
+
+        ``std`` : :py:class:`numpy.ndarray`
+            A 2D numpy array containing the (unbiased) standard deviations for
+            the provided performance figure, for every pixel on the input image
+            considering the patch sizes and strides applied when windowing the
+            image
+
+    """
+
+    if not isinstance(threshold, float):
+
+        assert threshold in dataset, f"No dataset named '{threshold}'"
+
+        logger.info(
+            f"Evaluating threshold on '{threshold}' set for "
+            f"'{system_name}' using {steps} steps"
+        )
+        threshold = run_evaluation(
+            dataset[threshold], threshold, predictions[0], steps=steps
+        )
+        logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")
+
+    # for a given threshold on each system, calculate patch performances
+    logger.info(
+        f"Evaluating patch performances on '{evaluate}' set for "
+        f"'{system_name}' using windows of size {size} and stride {stride}"
+    )
+    return patch_performances(
+        dataset,
+        evaluate,
+        preddir,
+        threshold,
+        size,
+        stride,
+        figure,
+        nproc,
+        outdir,
+    )
+
+
+def _eval_differences(perf1, perf2, evaluate, dataset, size, stride, outdir,
+        figure, nproc):
+    """Evaluate differences in the performance patches between two systems
+
+    Parameters
+    ----------
+
+    perf1, perf2 : dict
+        A dictionary as returned by :py:func:`_eval_patches`
+
+    evaluate : str
+        Name of the dataset key to use from ``dataset`` to evaluate (typically,
+        ``test``)
+
+    dataset : dict
+        A dictionary mapping string keys to
+        :py:class:`torch.utils.data.dataset.Dataset` instances
+
+    size : tuple
+        Two values indicating the size of windows to be used for patch
+        analysis. The values represent height and width respectively
+
+    stride : tuple
+        Two values indicating the stride of windows to be used for patch
+        analysis. The values represent height and width respectively
+
+    outdir : str
+        If set to ``None``, then do not output performance visualizations.
+        Otherwise, in directory ``outdir``, dumps the visualizations for the
+        performance differences between both systems.
+
+    figure : str
+        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
+        use when comparing performances
+
+    nproc : int
+        Sets the number of parallel processes to use when running using
+        multiprocessing. A value of zero uses all reported cores. A value of
+        ``1`` avoids completely the use of multiprocessing and runs all chores
+        in the current processing context.
+
+
+    Returns
+    -------
+
+    d : dict
+        A dictionary representing patch performance differences across all
+        files and patches. The format of this is similar to the individual
+        inputs ``perf1`` and ``perf2``.
+
+    """
+
+    perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1])
+
+    # we can subtract these
+    to_subtract = (
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score",
+    )
+    for k in perf_diff:
+        for col in to_subtract:
+            perf_diff[k][col] -= perf2[k]["df"][col]
+
+    return visual_performances(
+        dataset,
+        evaluate,
+        perf_diff,
+        size,
+        stride,
+        figure,
+        nproc,
+        outdir,
+    )


 @click.command(
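
For orientation, this is roughly how the per-image dataframes returned by `_eval_patches()` are consumed further down in this file: the column selected by `--figure` is pulled out of every stem's dataframe and flattened into one score vector per system. The snippet below is a hypothetical, self-contained illustration with made-up stems and scores, not part of the commit:

    import numpy
    import pandas

    figure = "f1_score"
    perf1 = {   # shaped like the dict documented above: stem -> {"df": DataFrame, ...}
        "image01": {"df": pandas.DataFrame({"f1_score": [0.71, 0.83, 0.78]})},
        "image02": {"df": pandas.DataFrame({"f1_score": [0.64, 0.90]})},
    }
    stems = list(perf1.keys())
    da = numpy.concatenate([perf1[k]["df"][figure].to_numpy() for k in stems])
    print(da)   # flat vector of per-patch scores, ready for the statistical tests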
@@ -265,8 +354,8 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--figure",
     "-f",
-    help="The name of a performance figure (e.g. f1_score) to use for "
-    "for comparing performances",
+    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
+    "use when comparing performances",
     default="f1_score",
     type=str,
     show_default=True,
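
The figure name given here must match a column of the per-patch dataframes produced by the helpers above; judging by the columns `_eval_differences()` subtracts, those are precision, recall, specificity, accuracy, jaccard and f1_score. A hypothetical guard a caller could add (the CLI itself just forwards the string):

    KNOWN_FIGURES = ("precision", "recall", "specificity",
                     "accuracy", "jaccard", "f1_score")

    def validate_figure(figure):
        # hypothetical helper, not part of the commit
        if figure not in KNOWN_FIGURES:
            raise ValueError(f"unknown figure '{figure}'; pick one of {KNOWN_FIGURES}")
        return figure

    validate_figure("jaccard")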
@@ -285,8 +374,21 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--remove-outliers/--no-remove-outliers",
     "-R",
-    help="If set, removes outliers from both score distributions before " \
-    "running statistical analysis",
+    help="If set, removes outliers from both score distributions before "
+    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
+    "check from the difference in figures between both systems and assumes "
+    "most of the distribution is contained within that range (like in a "
+    "normal distribution)",
+    default=False,
+    required=True,
+    show_default=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--remove-zeros/--no-remove-zeros",
+    "-R",
+    help="If set, removes instances from the statistical analysis in which "
+    "both systems had a performance equal to zero.",
     default=False,
     required=True,
     show_default=True,
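
The 1.5 IQR rule mentioned in the `--remove-outliers` help is the same one implemented by the now-removed `_index_of_outliers()` above (and presumably by its replacement, `index_of_outliers()`, in `..engine.significance`). A minimal numpy sketch of that rule:

    import numpy

    def iqr_outlier_mask(values):
        # flags entries outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        q1, q3 = numpy.percentile(values, [25, 75])
        iqr = q3 - q1
        return (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))

    diff = numpy.array([0.01, -0.02, 0.00, 0.03, 0.50])   # last entry is an outlier
    print(iqr_outlier_mask(diff))                          # [False False False False  True]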
@@ -295,7 +397,7 @@ def _write_analysis_figures(names, da, db, folder):
 @click.option(
     "--parallel",
     "-x",
-    help="Set the number of parallel processes to use when running using " \
+    help="Set the number of parallel processes to use when running using "
     "multiprocessing. A value of zero uses all reported cores.",
     default=1,
     type=int,
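
The `--parallel` semantics described here and in the helper docstrings (0 = all reported cores, 1 = no multiprocessing) are implemented inside `patch_performances()`/`visual_performances()`; the sketch below only illustrates that convention and is not the engine's actual code:

    import multiprocessing

    def run_chores(func, items, nproc):
        if nproc == 1:                     # run everything in the current process
            return [func(x) for x in items]
        if nproc <= 0:                     # zero: use every reported core
            nproc = multiprocessing.cpu_count()
        with multiprocessing.Pool(nproc) as pool:
            return pool.map(func, items)

    if __name__ == "__main__":
        print(run_chores(abs, [-3, 1, -2], nproc=0))   # -> [3, 1, 2]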
@@ -316,6 +418,7 @@ def significance(
     figure,
     output_folder,
     remove_outliers,
+    remove_zeros,
     parallel,
     **kwargs,
 ):
@@ -329,125 +432,81 @@ def significance(
     threshold = _validate_threshold(threshold, dataset)
     assert evaluate in dataset, f"No dataset named '{evaluate}'"

-    if isinstance(threshold, float):
-        threshold1 = threshold2 = threshold
-
-    else:  # it is a string, re-calculate it for each system individually
-
-        assert threshold in dataset, f"No dataset named '{threshold}'"
-
-        logger.info(
-            f"Evaluating threshold on '{threshold}' set for '{names[0]}' using {steps} steps"
-        )
-        threshold1 = run_evaluation(
-            dataset[threshold], threshold, predictions[0], steps=steps
-        )
-        logger.info(f"Set --threshold={threshold1:.5f} for '{names[0]}'")
-
-        logger.info(
-            f"Evaluating threshold on '{threshold}' set for '{names[1]}' using {steps} steps"
-        )
-        threshold2 = run_evaluation(
-            dataset[threshold], threshold, predictions[1], steps=steps
-        )
-        logger.info(f"Set --threshold={threshold2:.5f} for '{names[1]}'")
-
-    # for a given threshold on each system, calculate patch performances
-    logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for '{names[0]}' using windows of size {size} and stride {stride}"
-    )
-    dir1 = (
-        os.path.join(output_folder, names[0])
-        if output_folder is not None
-        else None
-    )
-    perf1 = patch_performances(
-        dataset,
-        evaluate,
-        predictions[0],
-        threshold1,
-        size,
-        stride,
-        figure,
-        nproc=parallel,
-        outdir=dir1,
-    )
-
-    logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for '{names[1]}' using windows of size {size} and stride {stride}"
-    )
-    dir2 = (
-        os.path.join(output_folder, names[1])
-        if output_folder is not None
-        else None
-    )
-    perf2 = patch_performances(
-        dataset,
-        evaluate,
-        predictions[1],
-        threshold2,
-        size,
-        stride,
-        figure,
-        nproc=parallel,
-        outdir=dir2,
-    )
-
-    perf_diff = dict([(k, perf1[k]["df"].copy()) for k in perf1])
-    to_subtract = (
-        "precision",
-        "recall",
-        "specificity",
-        "accuracy",
-        "jaccard",
-        "f1_score",
-    )
-    for k in perf_diff:
-        for col in to_subtract:
-            perf_diff[k][col] -= perf2[k]["df"][col]
-    dirdiff = (
-        os.path.join(output_folder, "diff")
-        if output_folder is not None
-        else None
-    )
-    perf_diff = visual_performances(
-        dataset,
-        evaluate,
-        perf_diff,
-        size,
-        stride,
-        figure,
-        nproc=parallel,
-        outdir=dirdiff,
-    )
-
-    # loads all F1-scores for the given threshold
+    perf1 = _eval_patches(
+        names[0],
+        threshold,
+        evaluate,
+        predictions[0],
+        dataset,
+        steps,
+        size,
+        stride,
+        (output_folder
+         if output_folder is None
+         else os.path.join(output_folder, names[0])),
+        figure,
+        parallel,
+    )
+
+    perf2 = _eval_patches(
+        names[1],
+        threshold,
+        evaluate,
+        predictions[1],
+        dataset,
+        steps,
+        size,
+        stride,
+        (output_folder
+         if output_folder is None
+         else os.path.join(output_folder, names[1])),
+        figure,
+        parallel,
+    )
+
+    perf_diff = _eval_differences(
+        perf1,
+        perf2,
+        evaluate,
+        dataset,
+        size,
+        stride,
+        (output_folder
+         if output_folder is None
+         else os.path.join(output_folder, "diff")),
+        figure,
+        parallel,
+    )
+
+    # loads all figures for the given threshold
     stems = list(perf1.keys())
-    da = numpy.array([perf1[k]["df"].f1_score for k in stems]).flatten()
-    db = numpy.array([perf2[k]["df"].f1_score for k in stems]).flatten()
+    da = numpy.array([perf1[k]["df"][figure] for k in stems]).flatten()
+    db = numpy.array([perf2[k]["df"][figure] for k in stems]).flatten()
     diff = da - db

     while remove_outliers:
-        outliers_diff = _index_of_outliers(diff)
+        outliers_diff = index_of_outliers(diff)
         if sum(outliers_diff) == 0:
             break
         diff = diff[~outliers_diff]
         da = da[~outliers_diff]
         db = db[~outliers_diff]

-    # also remove cases in which both da and db are zero
-    remove_zeros = (da == 0) & (db == 0)
-    diff = diff[~remove_zeros]
-    da = da[~remove_zeros]
-    db = db[~remove_zeros]
+    if remove_zeros:
+        remove_zeros = (da == 0) & (db == 0)
+        diff = diff[~remove_zeros]
+        da = da[~remove_zeros]
+        db = db[~remove_zeros]

     if output_folder is not None:
-        _write_analysis_figures(names, da, db, output_folder)
+        fname = os.path.join(output_folder, "analysis.pdf")
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+        write_analysis_figures(names, da, db, fname)

     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
         with open(fname, "wt") as f:
-            _write_analysis_text(names, da, db, f)
+            write_analysis_text(names, da, db, f)
     else:
-        _write_analysis_text(names, da, db, sys.stdout)
+        write_analysis_text(names, da, db, sys.stdout)
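
The statistical write-up itself now lives in `..engine.significance.write_analysis_text()`; judging from the removed inline version above, it reports paired tests on the per-patch score vectors of both systems. A self-contained sketch of that kind of comparison, with made-up scores:

    import numpy
    import scipy.stats

    # hypothetical per-patch scores for two systems, paired element by element
    da = numpy.array([0.810, 0.795, 0.840, 0.775, 0.900, 0.725])
    db = numpy.array([0.800, 0.780, 0.860, 0.750, 0.870, 0.720])
    diff = da - db

    t, p_t = scipy.stats.ttest_rel(da, db)    # paired T-test: is the mean difference zero?
    w, p_w = scipy.stats.wilcoxon(diff)       # Wilcoxon signed-rank test on the differences
    print(f"paired t-test: p={p_t:.5f}; wilcoxon: p={p_w:.5f}")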