Commit f6aaad9b authored by André Anjos's avatar André Anjos :speech_balloon:


[script.*] Implement F1-score a priori where possible; Improve plotting of precision-recall curves with better legends, more consistency; Remove redundant plotting code; Update tests
parent b28ff7d1
1 merge request: !12 Streamlining
Pipeline #39138 failed
@@ -16,7 +16,6 @@ import torchvision.transforms.functional as VF
import h5py
from ..utils.metric import base_metrics
from ..utils.plot import precision_recall_f1iso_confintval
import logging
@@ -50,7 +49,7 @@ def _posneg(pred, gt, threshold):
return tp_tensor, fp_tensor, tn_tensor, fn_tensor
def _sample_metrics(pred, gt):
def _sample_metrics(pred, gt, bins):
"""
Calculates metrics on a single sample and saves them to disk
@@ -64,6 +63,10 @@ def _sample_metrics(pred, gt):
gt : torch.Tensor
ground-truth (annotations)
bins : int
number of bins to use for threshold analysis. The step size is
calculated from this as ``1.0/bins``.
Returns
-------
@@ -82,10 +85,10 @@
"""
step_size = 0.01
step_size = 1.0/bins
data = []
for threshold in numpy.arange(0.0, 1.0, step_size):
for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)):
tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
pred, gt, threshold
@@ -107,6 +110,7 @@
data.append(
[
index,
threshold,
precision,
recall,
@@ -120,6 +124,7 @@
return pandas.DataFrame(
data,
columns=(
"index",
"threshold",
"precision",
"recall",
@@ -254,6 +259,7 @@ def run(
"""
# Collect overall metrics
bins = 100  # number of thresholds to analyse
data = {}
for sample in tqdm(dataset):
@@ -268,7 +274,7 @@
raise RuntimeError(
f"{stem} entry already exists in data. Cannot overwrite."
)
data[stem] = _sample_metrics(pred, gt)
data[stem] = _sample_metrics(pred, gt, bins)
if overlayed_folder is not None:
overlay_image = _sample_analysis(
@@ -286,8 +292,8 @@
df_metrics = pandas.concat(data.values())
# Report and Averages
avg_metrics = df_metrics.groupby("threshold").mean()
std_metrics = df_metrics.groupby("threshold").std()
avg_metrics = df_metrics.groupby("index").mean()
std_metrics = df_metrics.groupby("index").std()
# Uncomment below for F1-score calculation based on average precision and
# metrics instead of F1-scores of individual images. This method is in line
@@ -306,20 +312,24 @@
avg_metrics["std_f1"] = std_metrics["f1_score"]
maxf1 = avg_metrics["f1_score"].max()
optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
maxf1_index = avg_metrics["f1_score"].idxmax()
maxf1_threshold = avg_metrics["threshold"][maxf1_index]
logger.info(
f"Maximum F1-score of {maxf1:.5f}, achieved at "
f"threshold {optimal_f1_threshold:.2f} (chosen *a posteriori*)"
f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
)
if threshold is not None:
f1_a_priori = avg_metrics["f1_score"][threshold]
# get the closest possible threshold we have
index = int(round(bins*threshold))
f1_a_priori = avg_metrics["f1_score"][index]
actual_threshold = avg_metrics["threshold"][index]
logger.info(
f"F1-score of {f1_a_priori:.5f}, at threshold {threshold:.5f} "
f"(chosen *a priori*)"
f"F1-score of {f1_a_priori:.5f}, at threshold "
f"{actual_threshold:.3f} (chosen *a priori*)"
)
if output_folder is not None:
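
To make the rounding above concrete, a small sketch with made-up numbers (not from the commit): with 100 bins, a user-supplied threshold of 0.333 is mapped to the closest analysed bin.

bins = 100
threshold = 0.333  # hypothetical *a priori* choice
index = int(round(bins * threshold))  # -> 33
actual_threshold = index / bins  # -> 0.33, the closest analysed threshold
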
@@ -335,22 +345,7 @@
)
avg_metrics.to_csv(metrics_path)
# Plotting
np_avg_metrics = avg_metrics.to_numpy().T
figure_path = os.path.join(output_folder, "precision-recall.pdf")
logger.info(f"Saving overall precision-recall plot at {figure_path}...")
fig = precision_recall_f1iso_confintval(
[np_avg_metrics[0]],
[np_avg_metrics[1]],
[np_avg_metrics[7]],
[np_avg_metrics[8]],
[np_avg_metrics[10]],
[np_avg_metrics[11]],
["data"],
)
fig.savefig(figure_path)
return optimal_f1_threshold
return maxf1_threshold
def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
@@ -395,7 +390,7 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
raise RuntimeError(
f"{stem} entry already exists in data. " f"Cannot overwrite."
)
data[stem] = _sample_metrics(pred, gt)
data[stem] = _sample_metrics(pred, gt, 2)
if overlayed_folder is not None:
overlay_image = _sample_analysis(
@@ -413,8 +408,8 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
df_metrics = pandas.concat(data.values())
# Report and Averages
avg_metrics = df_metrics.groupby("threshold").mean()
std_metrics = df_metrics.groupby("threshold").std()
avg_metrics = df_metrics.groupby("index").mean()
std_metrics = df_metrics.groupby("index").std()
# Uncomment below for F1-score calculation based on average precision and
# metrics instead of F1-scores of individual images. This method is in line
@@ -432,9 +427,13 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
avg_metrics["std_f1"] = std_metrics["f1_score"]
metrics_path = os.path.join(output_folder, "metrics.csv")
# we actually only need to keep the second row of the pandas dataframe
# with threshold == 0.5 - the first row is redundant
avg_metrics.drop(0, inplace=True)
metrics_path = os.path.join(output_folder, "metrics-second-annotator.csv")
logger.info(f"Saving averages over all input images at {metrics_path}...")
avg_metrics.to_csv(metrics_path)
maxf1 = avg_metrics["f1_score"].max()
logger.info(f"Maximum F1-score of {maxf1:.5f} (second annotator)")
logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
@@ -8,12 +8,105 @@ from bob.extension.scripts.click_helper import (
AliasedGroup,
)
from ..utils.plot import combined_precision_recall_f1iso_confintval
import pandas
from ..utils.plot import precision_recall_f1iso
import logging
logger = logging.getLogger(__name__)
def _validate_threshold(t, dataset):
    """Validates the user threshold selection. Returns the parsed threshold."""

    if t is None:
        return t

    try:
        # we try to convert it to float first
        t = float(t)
    except ValueError:
        # it is a bit of text - assert a dataset with that name is available
        if not isinstance(dataset, dict):
            raise ValueError(
                "Threshold should be a floating-point number "
                "if you provide only a single dataset for evaluation"
            )
        if t not in dataset:
            raise ValueError(
                f"Text thresholds should match dataset names, "
                f"but {t} is not available among the datasets provided "
                f"({', '.join(dataset.keys())})"
            )
    else:
        # n.b.: raising ValueError inside the ``try`` above would be caught
        # by the ``except`` handler, so the range check lives in this branch
        if t < 0.0 or t > 1.0:
            raise ValueError(
                "Float thresholds must be within range [0.0, 1.0]"
            )

    return t
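
A usage sketch for ``_validate_threshold`` (the dataset names below are hypothetical):

datasets = {"train": ..., "test": ...}  # hypothetical multi-set input

assert _validate_threshold(None, datasets) is None  # a posteriori mode
assert _validate_threshold("0.5", datasets) == 0.5  # parsed as float
assert _validate_threshold("train", datasets) == "train"  # named set
# _validate_threshold("valid", datasets) raises ValueError, as "valid"
# is neither a float nor one of the provided dataset names
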
def _load_and_plot(data, threshold=None):
"""Plots comparison chart of all evaluated models
Parameters
----------
data : dict
A dict in which keys are the names of the systems and the values are
paths to ``metrics.csv`` style files.
threshold : :py:class:`float`, :py:class:`str`, Optional
A value indicating which threshold to choose for plotting a "F1-score"
(black) dot on the various curves. If set to ``None``, then plot the
maximum F1-score on that curve. If set to a floating-point value, then
plot the F1-score that is obtained on that particular threshold. If
set to a string, it should match one of the keys in ``data``. It then
first calculates the threshold reaching the maximum F1-score on that
particular dataset and applies it to all other sets.
Returns
-------
figure : matplotlib.figure.Figure
A figure, with all systems combined into a single plot.
"""
if isinstance(threshold, str):
logger.info(f"Calculating threshold from maximum F1-score at "
f"'{threshold}' dataset...")
metrics_path = data[threshold]
df = pandas.read_csv(metrics_path)
maxf1 = df["f1_score"].max()
use_threshold = df["threshold"][df["f1_score"].idxmax()]
logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'")
elif isinstance(threshold, float):
use_threshold = threshold
logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'")
names = []
dfs = []
thresholds = []
# loads all data
for name, metrics_path in data.items():
logger.info(f"Loading metrics from {metrics_path}...")
df = pandas.read_csv(metrics_path)
if threshold is None:
use_threshold = df["threshold"][df["f1_score"].idxmax()]
logger.info(f"Dataset '{name}': threshold = {use_threshold:.3f}'")
names.append(name)
dfs.append(df)
thresholds.append(use_threshold)
return precision_recall_f1iso(names, dfs, thresholds, confidence=True)
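
For instance, a comparison figure can now be produced directly from two metrics files (the paths below are hypothetical):

data = {
    "test": "analysis/test/metrics.csv",
    "test (2nd. annot.)": "analysis/test/metrics-second-annotator.csv",
}

# threshold=None marks each curve at its own maximum F1-score (a
# posteriori); threshold="test" would instead fix the threshold where
# the "test" system peaks and apply it to every curve
fig = _load_and_plot(data, threshold=None)
fig.savefig("comparison.pdf")
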
@click.command(
epilog="""Examples:
@@ -36,8 +129,23 @@ logger = logging.getLogger(__name__)
default="comparison.pdf",
type=click.Path(),
)
@click.option(
"--threshold",
"-t",
help="This number is used to select which F1-score to use for "
"representing a system performance. If not set, we report the maximum "
"F1-score in the set, which is equivalent to threshold selection a "
"posteriori (biased estimator). You can either set this value to a "
"floating-point number in the range [0.0, 1.0], or to a string, naming "
"one of the systems which will be used to calculate the threshold "
"leading to the maximum F1-score and then applied to all other sets.",
default=None,
show_default=False,
required=False,
)
@verbosity_option()
def compare(label_path, output, **kwargs):
def compare(label_path, output, threshold, **kwargs):
"""Compares multiple systems together"""
# hack to get a dictionary from arguments passed to input
@@ -46,6 +154,11 @@ def compare(label_path, output, **kwargs):
" composed of name-path entries")
data = dict(zip(label_path[::2], label_path[1::2]))
fig = combined_precision_recall_f1iso_confintval(data)
threshold = _validate_threshold(threshold, data)
fig = _load_and_plot(data, threshold=threshold)
logger.info(f"Saving plot at {output}")
fig.savefig(output)
# TODO: print table with all results
pass
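
A test-style invocation of the new option, in the spirit of the CliRunner checks this commit updates (system names and paths are hypothetical):

from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(
    compare,
    [
        "model-a", "evaluations/a/metrics.csv",
        "model-b", "evaluations/b/metrics.csv",
        "--output=comparison.pdf",
        "--threshold=model-a",  # fix threshold where model-a peaks
    ],
)
assert result.exit_code == 0
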
@@ -107,16 +107,6 @@ def _validate_threshold(t, dataset):
cls=ResourceOption,
show_default=True,
)
@click.option(
"--second-annotator-folder",
"-O",
help="Path where to store the analysis result for second annotator "
"comparisons (only used if --second-annotator is also passed)",
required=True,
default="second-annotator",
type=click.Path(),
cls=ResourceOption,
)
@click.option(
"--overlayed",
"-O",
@@ -133,11 +123,9 @@ def _validate_threshold(t, dataset):
)
@click.option(
"--threshold",
"-T",
help="If you set --overlayed, then you can provide a value to be used as "
"threshold to be applied on probability maps and decide for positives and "
"negatives. This binary output will be used to define true and false "
"positives, and false negatives for the overlay analysis. This number "
"-t",
help="This number is used to define positives and negatives from "
"probability maps, and report F1-scores (a priori). It "
"should either come from the training set or a separate validation set "
"to avoid biasing the analysis. Optionally, if you provide a multi-set "
"dataset as input, this may also be the name of an existing set from "
@@ -155,7 +143,6 @@ def evaluate(
predictions_folder,
dataset,
second_annotator,
second_annotator_folder,
overlayed,
threshold,
**kwargs,
@@ -173,7 +160,6 @@
"dataset": dataset,
"output_folder": output_folder,
"second_annotator": second_annotator,
"second_annotator_folder": second_annotator_folder,
}
else:
for k, v in dataset.items():
@@ -184,9 +170,6 @@
"dataset": v,
"output_folder": os.path.join(output_folder, k),
"second_annotator": second_annotator.get(k),
"second_annotator_folder": os.path.join(
second_annotator_folder, k
),
}
if isinstance(threshold, str):
@@ -209,7 +192,7 @@
compare_annotators(
v["dataset"],
v["second_annotator"],
v["second_annotator_folder"],
v["output_folder"],
os.path.join(overlayed, "second-annotator")
if overlayed
else None,
...
@@ -380,14 +380,12 @@ def experiment(
logger.info(f"Setting --threshold={threshold}...")
analysis_folder = os.path.join(output_folder, "analysis")
second_annotator_folder = os.path.join(analysis_folder, "second-annotator")
ctx.invoke(
evaluate,
output_folder=analysis_folder,
predictions_folder=predictions_folder,
dataset=dataset,
second_annotator=second_annotator,
second_annotator_folder=second_annotator_folder,
overlayed=overlayed_folder,
threshold=threshold,
verbose=verbose,
@@ -412,8 +410,8 @@ def experiment(
if k.startswith("_"):
logger.info(f"Skipping dataset '{k}' (not to be compared)")
continue
systems += [f"{k} (2nd. annot.)",
os.path.join(second_annotator_folder, k, "metrics.csv")]
systems += [f"{k} (2nd. annot.)", os.path.join(analysis_folder, k,
"metrics-second-annotator.csv")]
output_pdf = os.path.join(output_folder, "comparison.pdf")
ctx.invoke(compare, label_path=systems, output=output_pdf, verbose=verbose)
...
@@ -127,14 +127,13 @@ def _check_experiment_stare(overlay):
# check evaluation outputs
eval_folder = os.path.join(output_folder, "analysis")
second_folder = os.path.join(eval_folder, "second-annotator")
assert os.path.exists(os.path.join(eval_folder, "train", "metrics.csv"))
assert os.path.exists(os.path.join(eval_folder, "test", "metrics.csv"))
assert os.path.exists(
os.path.join(second_folder, "train", "metrics.csv")
os.path.join(eval_folder, "train", "metrics-second-annotator.csv")
)
assert os.path.exists(
os.path.join(second_folder, "test", "metrics.csv")
os.path.join(eval_folder, "test", "metrics-second-annotator.csv")
)
overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
@@ -172,8 +171,7 @@ def _check_experiment_stare(overlay):
r"^Started evaluation$": 1,
r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 3,
r"^F1-score of.*\(chosen \*a priori\*\)$": 2,
r"^Maximum F1-score of .* \(second annotator\)$": 2,
r"^Saving overall precision-recall plot at .*$": 2,
r"^F1-score of.*\(second annotator; threshold=0.5\)$": 2,
r"^Ended evaluation$": 1,
r"^Started comparison$": 1,
r"^Loading metrics from": 4,
@@ -341,7 +339,6 @@ def _check_evaluate(runner):
config.flush()
output_folder = "evaluations"
second_folder = "evaluations-2nd"
overlay_folder = os.path.join("overlayed", "analysis")
result = runner.invoke(
evaluate,
@@ -351,13 +348,13 @@ def _check_evaluate(runner):
f"--output-folder={output_folder}",
"--predictions-folder=predictions",
f"--overlayed={overlay_folder}",
f"--second-annotator-folder={second_folder}",
],
)
_assert_exit_0(result)
assert os.path.exists(os.path.join(output_folder, "metrics.csv"))
assert os.path.exists(os.path.join(second_folder, "metrics.csv"))
assert os.path.exists(os.path.join(output_folder,
"metrics-second-annotator.csv"))
# check overlayed images are there (since we requested them)
basedir = os.path.join(overlay_folder, "stare-images")
@@ -369,7 +366,7 @@ def _check_evaluate(runner):
r"^Saving averages over all input images.*$": 2,
r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 1,
r"^F1-score of.*\(chosen \*a priori\*\)$": 1,
r"^Maximum F1-score of .* \(second annotator\)$": 1,
r"^F1-score of.*\(second annotator; threshold=0.5\)$": 1,
}
buf.seek(0)
logging_output = buf.read()
@@ -393,7 +390,6 @@ def _check_compare(runner):
with stdout_logging() as buf:
output_folder = "evaluations"
second_folder = "evaluations-2nd"
result = runner.invoke(
compare,
[
@@ -402,7 +398,7 @@ def _check_compare(runner):
"test",
os.path.join(output_folder, "metrics.csv"),
"test (2nd. human)",
os.path.join(second_folder, "metrics.csv"),
os.path.join(output_folder, "metrics-second-annotator.csv"),
],
)
_assert_exit_0(result)
...
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import contextlib
from itertools import cycle
import numpy
@@ -15,72 +16,49 @@ import logging
logger = logging.getLogger(__name__)
def precision_recall_f1iso(precision, recall, names):
"""Creates a precision-recall plot of the given data.
@contextlib.contextmanager
def _precision_recall_canvas(title=None):
"""Generates a canvas to draw precision-recall curves
Works like a context manager, yielding a figure and an axes set to
which the precision-recall curves should be added. The figure already
contains F1-ISO lines and is preset to a 0-1 square region. Once the
context exits, ``plt.tight_layout()`` is called.
The plot will be annotated with F1-score iso-lines (in which the F1-score
maintains the same value)
Parameters
----------
precision : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the Y coordinates of the plot, or the
precision, or a 2D np array in which the rows correspond to each of the
system's precision coordinates.
recall : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the X coordinates of the plot, or the
recall, or a 2D np array in which the rows correspond to each of the
system's recall coordinates.
title : :py:class:`str`, Optional
Optional title to add to this plot
names : :py:class:`list`
An iterable over the names of each of the systems along the rows of
``precision`` and ``recall``
Returns
-------
Yields
------
figure : matplotlib.figure.Figure
A matplotlib figure you can save or display
The figure that should be finally returned to the user
axes : matplotlib.axes.Axes
An axes set to which the precision-recall plots should be added
"""
fig, ax1 = plt.subplots(1)
lines = ["-", "--", "-.", ":"]
linecycler = cycle(lines)
for p, r, n in zip(precision, recall, names):
# Plots only from the point where recall reaches its maximum, otherwise, we
# don't see a curve...
i = r.argmax()
pi = p[i:]
ri = r[i:]
valid = (pi + ri) > 0
f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid])
# optimal point along the curve
argmax = f1.argmax()
opi = pi[argmax]
ori = ri[argmax]
# Plot Recall/Precision as threshold changes
ax1.plot(
ri[pi > 0],
pi[pi > 0],
next(linecycler),
label="[F={:.4f}] {}".format(f1.max(), n),
)
ax1.plot(
ori, opi, marker="o", linestyle=None, markersize=3, color="black"
)
ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
if len(names) > 1:
plt.legend(loc="lower left", framealpha=0.5)
ax1.set_xlabel("Recall")
ax1.set_ylabel("Precision")
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.0])
fig, axes1 = plt.subplots(1)
# Names and bounds
axes1.set_xlabel("Recall")
axes1.set_ylabel("Precision")
axes1.set_xlim([0.0, 1.0])
axes1.set_ylim([0.0, 1.0])
if title is not None:
axes1.set_title(title)
axes1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
axes2 = axes1.twinx()
# Annotates plot with F1-score iso-lines
ax2 = ax1.twinx()
f_scores = numpy.linspace(0.1, 0.9, num=9)
tick_locs = []
tick_labels = []
@@ -90,197 +68,168 @@ def precision_recall_f1iso(precision, recall, names):
(l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
tick_locs.append(y[-1])
tick_labels.append("%.1f" % f_score)
ax2.tick_params(axis="y", which="both", pad=0, right=False, left=False)
ax2.set_ylabel("iso-F", color="green", alpha=0.3)
ax2.set_ylim([0.0, 1.0])
ax2.yaxis.set_label_coords(1.015, 0.97)
ax2.set_yticks(tick_locs) # notice these are invisible
for k in ax2.set_yticklabels(tick_labels):
axes2.tick_params(axis="y", which="both", pad=0, right=False, left=False)
axes2.set_ylabel("iso-F", color="green", alpha=0.3)
axes2.set_ylim([0.0, 1.0])
axes2.yaxis.set_label_coords(1.015, 0.97)
axes2.set_yticks(tick_locs) # notice these are invisible
for k in axes2.set_yticklabels(tick_labels):
k.set_color("green")
k.set_alpha(0.3)
k.set_size(8)
# we should see some of axes 1 axes
ax1.spines["right"].set_visible(False)
ax1.spines["top"].set_visible(False)
ax1.spines["left"].set_position(("data", -0.015))
ax1.spines["bottom"].set_position(("data", -0.015))
axes1.spines["right"].set_visible(False)
axes1.spines["top"].set_visible(False)
axes1.spines["left"].set_position(("data", -0.015))
axes1.spines["bottom"].set_position(("data", -0.015))
# we shouldn't see any of axes 2 axes
ax2.spines["right"].set_visible(False)
ax2.spines["top"].set_visible(False)
ax2.spines["left"].set_visible(False)
ax2.spines["bottom"].set_visible(False)
axes2.spines["right"].set_visible(False)
axes2.spines["top"].set_visible(False)
axes2.spines["left"].set_visible(False)
axes2.spines["bottom"].set_visible(False)
# yield execution, lets user draw precision-recall plots, and the legend
# before tightening the layout
yield fig, axes1
plt.tight_layout()
return fig
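
A minimal usage sketch for the new context manager (the curve below is fabricated):

import numpy

recall = numpy.linspace(1.0, 0.0, 100)
precision = numpy.linspace(0.5, 1.0, 100)  # made-up, monotone curve

with _precision_recall_canvas(title="example") as (fig, axes):
    axes.plot(recall, precision, label="example system")
    axes.legend(loc="lower left")

fig.savefig("example.pdf")  # layout was tightened on context exit
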
def precision_recall_f1iso_confintval(
precision, recall, pr_upper, pr_lower, re_upper, re_lower, names
):
"""Creates a precision-recall plot of the given data, with confidence
intervals
def precision_recall_f1iso(label, df, threshold, confidence=True):
"""Creates a precision-recall plot with confidence intervals
This function creates and returns a Matplotlib figure with a
precision-recall plot containing shaded confidence intervals. The plot
will be annotated with F1-score iso-lines (in which the F1-score maintains
the same value).
The plot will be annotated with F1-score iso-lines (in which the F1-score
maintains the same value)
Parameters
----------
precision : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the Y coordinates of the plot, or the
precision, or a 2D array in which the rows correspond to each
of the system's precision coordinates.
recall : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the X coordinates of the plot, or
the recall, or a 2D array in which the rows correspond to each
of the system's recall coordinates.
pr_upper : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the upper bound of the confidence
interval for the Y coordinates of the plot, or the precision upper
bound, or a 2D array in which the rows correspond to each of the
system's precision upper-bound coordinates.
pr_lower : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the lower bound of the confidence
interval for the Y coordinates of the plot, or the precision lower
bound, or a 2D array in which the rows correspond to each of the
system's precision lower-bound coordinates.
re_upper : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the upper bound of the confidence
interval for the Y coordinates of the plot, or the recall upper bound,
or a 2D array in which the rows correspond to each of the system's
recall upper-bound coordinates.
re_lower : :py:class:`numpy.ndarray` or :py:class:`list`
A list of 1D arrays containing the lower bound of the confidence
interval for the Y coordinates of the plot, or the recall lower bound,
or a 2D array in which the rows correspond to each of the system's
recall lower-bound coordinates.
names : :py:class:`list`
An iterable over the names of each of the systems along the rows of
``precision`` and ``recall``
label : :py:class:`list`
A list of names to be associated with each line
df : :py:class:`list`
A list of :py:class:`pandas.DataFrame` instances (one per system), each
produced by our evaluator engine, indexed by integer "thresholds", and
containing the following columns: ``threshold`` (sorted ascending),
``precision``, ``recall``, ``f1_score``, ``pr_upper`` (upper precision
bounds), ``pr_lower`` (lower precision bounds), ``re_upper`` (upper
recall bounds), ``re_lower`` (lower recall bounds).
Dataframes with a single entry are treated specially as these are
considered "second-annotator" performances. A single dot and a line
showing the variability is drawn in these cases.
threshold : :py:class:`list`
A list of thresholds to graph with a dot for each set. Specific
threshold values do not affect "second-annotator" dataframes.
confidence : :py:class:`bool`, Optional
If set, draw confidence intervals for each line, using ``*_upper`` and
``*_lower`` entries.
Returns
-------
figure : matplotlib.figure.Figure
A matplotlib figure you can save or display
A matplotlib figure you can save or display (uses an ``agg`` backend)
"""
fig, ax1 = plt.subplots(1)
lines = ["-", "--", "-.", ":"]
colors = [
"#1f77b4",
"#ff7f0e",
"#2ca02c",
"#d62728",
"#9467bd",
"#8c564b",
"#e377c2",
"#7f7f7f",
"#bcbd22",
"#17becf",
]
"#1f77b4",
"#ff7f0e",
"#2ca02c",
"#d62728",
"#9467bd",
"#8c564b",
"#e377c2",
"#7f7f7f",
"#bcbd22",
"#17becf",
]
colorcycler = cycle(colors)
linecycler = cycle(lines)
for p, r, pu, pl, ru, rl, n in zip(
precision, recall, pr_upper, pr_lower, re_upper, re_lower, names
):
# Plots only from the point where recall reaches its maximum, otherwise, we
# don't see a curve...
i = r.argmax()
pi = p[i:]
ri = r[i:]
pui = pu[i:]
pli = pl[i:]
rui = ru[i:]
rli = rl[i:]
valid = (pi + ri) > 0
f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid])
# optimal point along the curve
argmax = f1.argmax()
opi = pi[argmax]
ori = ri[argmax]
# Plot Recall/Precision as threshold changes
ax1.plot(
ri[pi > 0],
pi[pi > 0],
next(linecycler),
label="[F={:.4f}] {}".format(f1.max(), n),
)
ax1.plot(
ori, opi, marker="o", linestyle=None, markersize=3, color="black"
)
# Plot confidence
# Upper bound
# ax1.plot(r95ui[p95ui>0], p95ui[p95ui>0])
# Lower bound
# ax1.plot(r95li[p95li>0], p95li[p95li>0])
# create the limiting polygon
vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1]))
vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1]))
# hacky workaround to plot 2nd human
if numpy.isclose(numpy.mean(rui), rui[1], rtol=1e-05):
logger.warning("Found 2nd human annotator in metrics - patching...")
p = plt.Polygon(
numpy.column_stack((vert_x, vert_y)),
facecolor="none",
alpha=0.2,
edgecolor=next(colorcycler),
lw=2,
)
else:
p = plt.Polygon(
numpy.column_stack((vert_x, vert_y)),
facecolor=next(colorcycler),
alpha=0.2,
edgecolor="none",
lw=0.2,
)
ax1.add_artist(p)
ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
if len(names) > 1:
plt.legend(loc="lower left", framealpha=0.5)
ax1.set_xlabel("Recall")
ax1.set_ylabel("Precision")
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.0])
# Annotates plot with F1-score iso-lines
ax2 = ax1.twinx()
f_scores = numpy.linspace(0.1, 0.9, num=9)
tick_locs = []
tick_labels = []
for f_score in f_scores:
x = numpy.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
tick_locs.append(y[-1])
tick_labels.append("%.1f" % f_score)
ax2.tick_params(axis="y", which="both", pad=0, right=False, left=False)
ax2.set_ylabel("iso-F", color="green", alpha=0.3)
ax2.set_ylim([0.0, 1.0])
ax2.yaxis.set_label_coords(1.015, 0.97)
ax2.set_yticks(tick_locs) # notice these are invisible
for k in ax2.set_yticklabels(tick_labels):
k.set_color("green")
k.set_alpha(0.3)
k.set_size(8)
# we should see some of axes 1 axes
ax1.spines["right"].set_visible(False)
ax1.spines["top"].set_visible(False)
ax1.spines["left"].set_position(("data", -0.015))
ax1.spines["bottom"].set_position(("data", -0.015))
# we shouldn't see any of axes 2 axes
ax2.spines["right"].set_visible(False)
ax2.spines["top"].set_visible(False)
ax2.spines["left"].set_visible(False)
ax2.spines["bottom"].set_visible(False)
plt.tight_layout()
with _precision_recall_canvas(title=None) as (fig, axes):
legend = []
for kn, kdf, kt in zip(label, df, threshold):
# plots only from the point where recall reaches its maximum,
# otherwise, we don't see a curve...
max_recall = kdf["recall"].idxmax()
pi = kdf.precision[max_recall:]
ri = kdf.recall[max_recall:]
valid = (pi + ri) > 0
f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid])
# optimal point along the curve
bins = len(kdf)
index = int(round(bins*kt))
index = min(index, len(kdf) - 1)  # avoids out-of-range indexing
# plots Recall/Precision as threshold changes
label = f"{kn} (F1={kdf.f1_score[index]:.4f})"
color = next(colorcycler)
if len(kdf) == 1:
# plot black dot for F1-score at select threshold
marker, = axes.plot(kdf.recall[index], kdf.precision[index],
marker="*", markersize=6, color=color, alpha=0.8,
linestyle="None")
line, = axes.plot(kdf.recall[index], kdf.precision[index],
linestyle="None", color=color, alpha=0.2)
legend.append(([marker, line], curve_label))
else:
# line first, so marker gets on top
style = next(linecycler)
line, = axes.plot(ri[pi > 0], pi[pi > 0], color=color,
linestyle=style)
marker, = axes.plot(kdf.recall[index], kdf.precision[index],
marker="o", linestyle=style, markersize=4,
color=color, alpha=0.8)
legend.append(([marker, line], curve_label))
if confidence:
pui = kdf.pr_upper[max_recall:]
pli = kdf.pr_lower[max_recall:]
rui = kdf.re_upper[max_recall:]
rli = kdf.re_lower[max_recall:]
# Plot confidence
# Upper bound
# create the limiting polygon
vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1]))
vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1]))
# hacky workaround to plot 2nd human
if len(kdf) == 1:  # binary system, very likely
logger.warning("Found 2nd human annotator - patching...")
p, = axes.plot(vert_x, vert_y, color=color, alpha=0.1, lw=3)
else:
p = plt.Polygon(
numpy.column_stack((vert_x, vert_y)),
facecolor=color,
alpha=0.2,
edgecolor="none",
lw=0.2,
)
legend[-1][0].append(p)
axes.add_artist(p)
if len(label) > 1:
axes.legend([tuple(k[0]) for k in legend], [k[1] for k in legend],
loc="lower left", fancybox=True, framealpha=0.7)
return fig
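
A self-contained sketch of calling the new interface (the dataframe is fabricated, but carries the columns the docstring requires):

import numpy
import pandas

n = 100
recall = numpy.linspace(1.0, 0.0, n)  # made-up example curve
precision = numpy.linspace(0.5, 1.0, n)
df = pandas.DataFrame({
    "threshold": numpy.arange(0.0, 1.0, 1.0 / n),
    "precision": precision,
    "recall": recall,
    "f1_score": 2 * precision * recall / (precision + recall),
    "pr_upper": precision + 0.02,
    "pr_lower": precision - 0.02,
    "re_upper": recall + 0.02,
    "re_lower": recall - 0.02,
})

fig = precision_recall_f1iso(["example"], [df], [0.5], confidence=True)
fig.savefig("example.pdf")
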
@@ -311,48 +260,3 @@ def loss_curve(df):
plt.tight_layout()
fig = ax1.get_figure()
return fig
def combined_precision_recall_f1iso_confintval(data):
"""Plots comparison chart of all evaluated models
Parameters
----------
data : dict
A dict in which keys are the names of the systems and the values are
paths to ``metrics.csv`` style files.
Returns
-------
figure : matplotlib.figure.Figure
A figure, with all systems combined into a single plot.
"""
precisions = []
recalls = []
pr_ups = []
pr_lows = []
re_ups = []
re_lows = []
names = []
for name, metrics_path in data.items():
logger.info(f"Loading metrics from {metrics_path}...")
df = pandas.read_csv(metrics_path)
precisions.append(df.precision.to_numpy())
recalls.append(df.recall.to_numpy())
pr_ups.append(df.pr_upper.to_numpy())
pr_lows.append(df.pr_lower.to_numpy())
re_ups.append(df.re_upper.to_numpy())
re_lows.append(df.re_lower.to_numpy())
names.append(name)
fig = precision_recall_f1iso_confintval(
precisions, recalls, pr_ups, pr_lows, re_ups, re_lows, names
)
return fig
@@ -65,7 +65,7 @@ Evaluation
----------
In evaluation, we input an **annotated** dataset and predictions to generate
performance figures that can help analysis of a trained model. Evaluation is
performance summaries that help analysis of a trained model. Evaluation is
done using the :ref:`evaluate command <bob.ip.binseg.cli.evaluate>` followed
by the model and the annotated dataset configuration, and the path to the
pretrained weights via the ``--weight`` argument.