Commit c2ad2952 authored by André Anjos
[test] Added test for significance CLI app

parent 24957595
Pipeline #41327 failed
@@ -810,21 +810,27 @@ def write_analysis_text(names, da, db, f):
     w, p = scipy.stats.ttest_rel(da, db)
     f.write(f" * Paired T (H0: same distro): S = {w:g}, p = {p:.5f}\n")

-    w, p = scipy.stats.wilcoxon(diff)
-    f.write(" * Wilcoxon:\n")
-    f.write(f" * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
-
-    w, p = scipy.stats.wilcoxon(diff, alternative="greater")
-    f.write(
-        f" * H0 = med({names[0]}) < med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
-
-    w, p = scipy.stats.wilcoxon(diff, alternative="less")
-    f.write(
-        f" * H0 = med({names[0]}) > med({names[1]}): "
-        f"W = {w:g}, p = {p:.5f}\n"
-    )
+    try:
+        f.write(" * Wilcoxon:\n")
+
+        w, p = scipy.stats.wilcoxon(diff)
+        f.write(f" * H0 = same distro: W = {w:g}, p = {p:.5f}\n")
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="greater")
+        f.write(
+            f" * H0 = med({names[0]}) < med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+
+        w, p = scipy.stats.wilcoxon(diff, alternative="less")
+        f.write(
+            f" * H0 = med({names[0]}) > med({names[1]}): "
+            f"W = {w:g}, p = {p:.5f}\n"
+        )
+    except ValueError as e:
+        f.write(f" ERROR: Differences are exactly zero between both "
+                f"patch distributions. The Wilcoxon test does not work in "
+                f"these conditions (review your prediction directories): {e}\n")


 def write_analysis_figures(names, da, db, fname):
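For reference, scipy.stats.wilcoxon refuses to run when every paired difference is exactly zero, which is the failure mode the new try/except reports. A minimal sketch of that behaviour, assuming only NumPy and SciPy (the sample size of 30 is arbitrary):

import numpy
import scipy.stats

# every paired difference is exactly zero, e.g. when both prediction
# directories being compared are identical
diff = numpy.zeros(30)

try:
    scipy.stats.wilcoxon(diff)
except ValueError as e:
    # the default zero_method ("wilcox") drops zero differences, leaving an
    # empty sample, so SciPy raises instead of computing the statistic
    print(f"Wilcoxon test refused to run: {e}")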
@@ -91,8 +91,8 @@ def _eval_patches(
         store performance visualizations.

     figure : str
-        The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
-        use when comparing performances
+        The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
+        ``accuracy``) to use when comparing performances

     nproc : int
         Sets the number of parallel processes to use when running using
@@ -169,7 +169,7 @@ def _eval_patches(

     # for a given threshold on each system, calculate patch performances
     logger.info(
-        f"Evaluating patch performances on '{evaluate}' set for "
+        f"Evaluating patch '{figure}' on '{evaluate}' set for "
         f"'{system_name}' using windows of size {size} and stride {stride}"
     )

@@ -292,6 +292,13 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
         for col in to_subtract:
             perf_diff[k][col] -= perfs[1][k]["df"][col]

+    # for a given threshold on each system, calculate patch performances
+    logger.info(
+        f"Evaluating patch '{figure}' differences on '{evaluate}' set on "
+        f"'{names[0]}-{names[1]}' using windows of size {size} and "
+        f"stride {stride}"
+    )
+
     retval = visual_performances(
         dataset,
         evaluate,
@@ -428,7 +435,7 @@ def _eval_differences(names, perfs, evaluate, dataset, size, stride, outdir,
     "-f",
     help="The name of a performance figure (e.g. f1_score, or jaccard) to "
     "use when comparing performances",
-    default="f1_score",
+    default="accuracy",
     type=str,
     show_default=True,
     required=True,
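An aside on the option above: it keeps required=True next to a (now different) default. A self-contained sketch of that pattern, using a hypothetical demo command rather than the actual significance CLI, shows why the flag still behaves as optional in practice, since the non-None default satisfies the requirement:

import click

@click.command()
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, jaccard, or "
    "accuracy) to use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
)
def demo(figure):
    # the default is applied before click checks `required`, so running
    # `demo` with no arguments still prints "accuracy"
    click.echo(f"selected figure: {figure}")

if __name__ == "__main__":
    demo()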
@@ -549,8 +556,8 @@ def significance(
     )

     perf_diff = _eval_differences(
-        perf1,
-        perf2,
+        names,
+        (perf1, perf2),
         evaluate,
         dataset,
         size,
@@ -586,12 +593,13 @@ def significance(
     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.pdf")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
         write_analysis_figures(names, da, db, fname)

     if output_folder is not None:
         fname = os.path.join(output_folder, "analysis.txt")
         os.makedirs(os.path.dirname(fname), exist_ok=True)
+        logger.info(f"Writing analysis summary to {fname}...")
         with open(fname, "wt") as f:
             write_analysis_text(names, da, db, f)
-    else:
-        write_analysis_text(names, da, db, sys.stdout)
+    write_analysis_text(names, da, db, sys.stdout)
@@ -524,6 +524,69 @@ def _check_compare(runner):
     )


+def _check_significance(runner):
+
+    from ..script.significance import significance
+
+    with tempfile.NamedTemporaryFile(
+        mode="wt"
+    ) as config, stdout_logging() as buf:
+
+        config.write("from bob.ip.binseg.data.stare import _make_dataset\n")
+        config.write(f"_raw = _make_dataset('{stare_datadir}')\n")
+        config.write(
+            "from bob.ip.binseg.configs.datasets.stare import _maker\n"
+        )
+        config.write("dataset = _maker('ah', _raw)\n")
+        config.flush()
+
+        ofolder = "significance"
+        cfolder = os.path.join(ofolder, "caches")
+
+        result = runner.invoke(
+            significance,
+            [
+                "-vv",
+                config.name,
+                "--names=v1", "v2",
+                "--predictions=predictions", "predictions",
+                "--threshold=0.5",
+                "--size=64", "64",
+                "--stride=32", "32",
+                "--figure=accuracy",
+                f"--output-folder={ofolder}",
+                f"--checkpoint-folder={cfolder}",
+            ],
+        )
+        _assert_exit_0(result)
+
+        assert os.path.exists(ofolder)
+        assert os.path.exists(cfolder)
+        assert os.path.exists(os.path.join(ofolder, "analysis.pdf"))
+        assert os.path.exists(os.path.join(ofolder, "analysis.txt"))
+
+        keywords = {
+            r"^Evaluating patch 'accuracy' on": 2,
+            r"^Evaluating patch 'accuracy' differences on": 1,
+            r"^#Samples/Median/Avg/Std.Dev.": 1,
+            r"^Paired T-test": 1,
+            r"^Wilcoxon test": 3,
+        }
+        buf.seek(0)
+        logging_output = buf.read()
+
+        for k, v in keywords.items():
+            # if _str_counter(k, logging_output) != v:
+            #    print(f"Count for string '{k}' appeared " \
+            #        f"({_str_counter(k, result.output)}) " \
+            #        f"instead of the expected {v}")
+            assert _str_counter(k, logging_output) == v, (
+                f"Count for string '{k}' appeared "
+                f"({_str_counter(k, logging_output)}) "
+                f"instead of the expected {v}:\nOutput:\n{logging_output}"
+            )


 @rc_variable_set("bob.ip.binseg.stare.datadir")
 def test_discrete_experiment_stare():
@@ -533,6 +596,7 @@ def test_discrete_experiment_stare():
         _check_predict(runner)
         _check_evaluate(runner)
         _check_compare(runner)
+        _check_significance(runner)


 def test_train_help():
@@ -559,6 +623,12 @@ def test_compare_help():
     _check_help(compare)


+def test_significance_help():
+    from ..script.significance import significance
+
+    _check_help(significance)
+
+
 def test_config_help():
     from ..script.config import config
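The new test leans on helpers defined elsewhere in the test module and not part of this diff (_assert_exit_0, stdout_logging, stare_datadir, _str_counter). For orientation only, _str_counter presumably counts line-anchored regular-expression matches in the captured logging output; a minimal stand-in under that assumption could be:

import re

def _str_counter(pattern, text):
    # count how many times the regex matches in the multi-line log capture;
    # the "^" anchors in the test's keywords rely on re.MULTILINE
    return sum(1 for _ in re.finditer(pattern, text, re.MULTILINE))

# e.g. the test expects exactly two "Evaluating patch 'accuracy' on ..." lines
log = (
    "Evaluating patch 'accuracy' on 'test' set for 'v1' using windows...\n"
    "Evaluating patch 'accuracy' on 'test' set for 'v2' using windows...\n"
)
assert _str_counter(r"^Evaluating patch 'accuracy' on", log) == 2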