Commit 139af88c authored by André Anjos

[scripts.evaluate] Implement automatic threshold and a priori statistics

parent 4977aa0d
1 merge request: !12 Streamlining
Pipeline #39117 failed
@@ -19,6 +19,7 @@ from ..utils.metric import base_metrics
from ..utils.plot import precision_recall_f1iso_confintval
import logging
logger = logging.getLogger(__name__)
@@ -86,40 +87,60 @@ def _sample_metrics(pred, gt):
for threshold in numpy.arange(0.0, 1.0, step_size):
tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
pred, gt, threshold
)
# calc metrics from scalars
tp_count = torch.sum(tp_tensor).item()
fp_count = torch.sum(fp_tensor).item()
tn_count = torch.sum(tn_tensor).item()
fn_count = torch.sum(fn_tensor).item()
precision, recall, specificity, accuracy, jaccard, f1_score = \
base_metrics(tp_count, fp_count, tn_count, fn_count)
data.append([threshold, precision, recall, specificity,
accuracy, jaccard, f1_score])
return pandas.DataFrame(data, columns=(
"threshold",
"precision",
"recall",
"specificity",
"accuracy",
"jaccard",
"f1_score",
))
(
precision,
recall,
specificity,
accuracy,
jaccard,
f1_score,
) = base_metrics(tp_count, fp_count, tn_count, fn_count)
data.append(
[
threshold,
precision,
recall,
specificity,
accuracy,
jaccard,
f1_score,
]
)
return pandas.DataFrame(
data,
columns=(
"threshold",
"precision",
"recall",
"specificity",
"accuracy",
"jaccard",
"f1_score",
),
)
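For reference, ``base_metrics`` (imported from ``..utils.metric`` above) reduces the four confusion counts to the six figures collected per threshold. Below is a minimal sketch of the standard definitions it presumably implements; the ``safe_div`` guard and the function body are assumptions, not the library's actual code:

def base_metrics(tp, fp, tn, fn):
    """Hypothetical sketch of the standard confusion-matrix metrics."""

    def safe_div(a, b):
        # avoids division by zero on degenerate samples
        return a / b if b else 0.0

    precision = safe_div(tp, tp + fp)
    recall = safe_div(tp, tp + fn)  # a.k.a. sensitivity
    specificity = safe_div(tn, tn + fp)
    accuracy = safe_div(tp + tn, tp + fp + tn + fn)
    jaccard = safe_div(tp, tp + fp + fn)  # intersection over union
    f1_score = safe_div(2 * precision * recall, precision + recall)
    return precision, recall, specificity, accuracy, jaccard, f1_score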
def _sample_analysis(
img,
pred,
gt,
threshold,
tp_color=(0, 255, 0), # (128,128,128) Gray
fp_color=(0, 0, 255), # (70, 240, 240) Cyan
fn_color=(255, 0, 0), # (245, 130, 48) Orange
overlay=True,
):
img,
pred,
gt,
threshold,
tp_color=(0, 255, 0),  # green (alternative: (128, 128, 128) gray)
fp_color=(0, 0, 255),  # blue (alternative: (70, 240, 240) cyan)
fn_color=(255, 0, 0),  # red (alternative: (245, 130, 48) orange)
overlay=True,
):
"""Visualizes true positives, false positives and false negatives
@@ -186,8 +207,13 @@ def _sample_analysis(
return tp_pil_colored
def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
overlay_threshold=None):
def run(
dataset,
predictions_folder,
output_folder=None,
overlayed_folder=None,
threshold=None,
):
"""
Runs inference and calculates metrics
@@ -202,19 +228,21 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
folder where predictions for the dataset images have been previously
stored
output_folder : str
folder where to store results
output_folder : :py:class:`str`, Optional
folder where to store results. If not provided, then do not store any
analysis (useful for quickly calculating overlay thresholds)
overlayed_folder : :py:class:`str`, Optional
if not ``None``, then it should be the name of a folder where to store
overlayed versions of the images and ground-truths
overlay_threshold : :py:class:`float`, Optional
threshold : :py:class:`float`, Optional
if ``overlayed_folder`` is set, then this should be the threshold (floating point)
to apply to prediction maps to decide on positives and negatives for
overlaying analysis (graphical output). This number should come from
the training set or a separate validation set. Using a test set value
may bias your analysis.
may bias your analysis. This number is also used to print the a priori
F1-score on the evaluated set.
Returns
@@ -225,12 +253,6 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
"""
logger.info(f"Output folder: {output_folder}")
if not os.path.exists(output_folder):
logger.info(f"Creating {output_folder}...")
os.makedirs(output_folder, exist_ok=True)
# Collect overall metrics
data = {}
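Because ``output_folder`` is now optional, ``run`` supports a cheap first pass that only estimates a threshold, followed by a full pass that writes the analysis. A hedged usage sketch, in which ``validation_set``, ``test_set`` and the folder names are placeholders:

# first pass: estimate the F1-optimal threshold; nothing is written to disk
best = run(validation_set, "predictions")

# second pass: evaluate the test set, saving metrics, plots and overlays
run(
    test_set,
    "predictions",
    output_folder="analysis/test",
    overlayed_folder="overlayed/test",
    threshold=best,
)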
@@ -243,13 +265,15 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
pred = f["array"][:]
pred = torch.from_numpy(pred)
if stem in data:
raise RuntimeError(f"{stem} entry already exists in data. "
f"Cannot overwrite.")
raise RuntimeError(
f"{stem} entry already exists in data. Cannot overwrite."
)
data[stem] = _sample_metrics(pred, gt)
if overlayed_folder is not None:
overlay_image = _sample_analysis(image, pred, gt,
threshold=overlay_threshold, overlay=True)
overlay_image = _sample_analysis(
image, pred, gt, threshold=threshold, overlay=True
)
fullpath = os.path.join(overlayed_folder, f"{stem}.png")
tqdm.write(f"Saving {fullpath}...")
fulldir = os.path.dirname(fullpath)
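The aggregation that produces ``avg_metrics`` and ``std_metrics`` is elided from this view. Presumably it stacks the per-sample tables and reduces them per threshold, along these lines (a sketch under that assumption, not the committed code):

import pandas

df = pandas.concat(data.values())
avg_metrics = df.groupby("threshold").mean()  # one row per swept threshold
std_metrics = df.groupby("threshold").std()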
@@ -281,30 +305,49 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
avg_metrics["std_f1"] = std_metrics["f1_score"]
metrics_path = os.path.join(output_folder, "metrics.csv")
logger.info(f"Saving averages over all input images at {metrics_path}...")
avg_metrics.to_csv(metrics_path)
maxf1 = avg_metrics["f1_score"].max()
optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
logger.info(f"Highest F1-score of {maxf1:.5f}, achieved at "
f"threshold {optimal_f1_threshold:.2f}")
# Plotting
np_avg_metrics = avg_metrics.to_numpy().T
figure_path = os.path.join(output_folder, "precision-recall.pdf")
logger.info(f"Saving overall precision-recall plot at {figure_path}...")
fig = precision_recall_f1iso_confintval(
[np_avg_metrics[0]],
[np_avg_metrics[1]],
[np_avg_metrics[7]],
[np_avg_metrics[8]],
[np_avg_metrics[10]],
[np_avg_metrics[11]],
["data"],
logger.info(
f"Highest (a posteriori) F1-score of {maxf1:.5f}, achieved at "
f"threshold {optimal_f1_threshold:.2f}"
)
fig.savefig(figure_path)
if threshold is not None:
f1_apriori = avg_metrics["f1_score"][threshold]
logger.info(
f"F1-score (a priori) is {f1_apriori:.5f}, at "
f"threshold={threshold:.5f}"
)
if output_folder is not None:
logger.info(f"Output folder: {output_folder}")
if not os.path.exists(output_folder):
logger.info(f"Creating {output_folder}...")
os.makedirs(output_folder, exist_ok=True)
metrics_path = os.path.join(output_folder, "metrics.csv")
logger.info(
f"Saving averages over all input images at {metrics_path}..."
)
avg_metrics.to_csv(metrics_path)
# Plotting
np_avg_metrics = avg_metrics.to_numpy().T
figure_path = os.path.join(output_folder, "precision-recall.pdf")
logger.info(f"Saving overall precision-recall plot at {figure_path}...")
fig = precision_recall_f1iso_confintval(
[np_avg_metrics[0]],
[np_avg_metrics[1]],
[np_avg_metrics[7]],
[np_avg_metrics[8]],
[np_avg_metrics[10]],
[np_avg_metrics[11]],
["data"],
)
fig.savefig(figure_path)
return optimal_f1_threshold
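One caveat: ``avg_metrics["f1_score"][threshold]`` is a label lookup on the float-valued threshold index, so an a priori threshold only resolves if it coincides with a point of the sweep grid from ``_sample_metrics``. A hypothetical guard, assuming a ``step_size`` of 0.01 (the actual value is defined outside this excerpt):

import numpy

def snap_to_grid(threshold, step_size=0.01):
    """Returns the sweep-grid value nearest to ``threshold``."""
    grid = numpy.arange(0.0, 1.0, step_size)
    return float(grid[numpy.argmin(numpy.abs(grid - threshold))])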
@@ -331,13 +374,6 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
if not ``None``, then it should be the name of a folder where to store
overlayed versions of the images and ground-truths
overlay_threshold : :py:class:`float`, Optional
if ``overlayed_folder``, then this should be threshold (floating point)
to apply to prediction maps to decide on positives and negatives for
overlaying analysis (graphical output). This number should come from
the training set or a separate validation set. Using a test set value
may bias your analysis.
"""
logger.info(f"Output folder: {output_folder}")
@@ -349,19 +385,21 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
# Collect overall metrics
data = {}
for baseline_sample, other_sample in tqdm(zip(baseline, other)):
for baseline_sample, other_sample in tqdm(list(zip(baseline, other))):
stem = baseline_sample[0]
image = baseline_sample[1]
gt = baseline_sample[2]
pred = other_sample[2] #works as a prediction
pred = other_sample[2] # works as a prediction
if stem in data:
raise RuntimeError(f"{stem} entry already exists in data. "
f"Cannot overwrite.")
raise RuntimeError(
f"{stem} entry already exists in data. Cannot overwrite."
)
data[stem] = _sample_metrics(pred, gt)
if overlayed_folder is not None:
overlay_image = _sample_analysis(image, pred, gt, threshold=0.5,
overlay=True)
overlay_image = _sample_analysis(
image, pred, gt, threshold=0.5, overlay=True
)
fullpath = os.path.join(overlayed_folder, f"{stem}.png")
tqdm.write(f"Saving {fullpath}...")
fulldir = os.path.dirname(fullpath)
......
@@ -17,6 +17,34 @@ import logging
logger = logging.getLogger(__name__)
def _validate_threshold(t, dataset):
    """Validates the user threshold selection.  Returns the parsed threshold."""

    if t is None:
        return 0.5

    try:
        # we try to convert it to float first
        t = float(t)
    except ValueError:
        # it is a bit of text - assert a dataset with that name is available
        if not isinstance(dataset, dict):
            raise ValueError(
                "Threshold should be a floating-point number "
                "if you provide only a single dataset for evaluation"
            )
        if t not in dataset:
            raise ValueError(
                f"Text thresholds should match dataset names, "
                f"but {t} is not available among the datasets provided "
                f"({', '.join(dataset.keys())})"
            )
    else:
        # the range check lives in the ``else`` clause: raising ValueError
        # inside the ``try`` would be swallowed by the handler above
        if t < 0.0 or t > 1.0:
            raise ValueError(
                "Float thresholds must be within range [0.0, 1.0]"
            )

    return t
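The three accepted forms, illustrated with placeholder dataset objects (a sketch, not part of the commit):

single = object()  # stands in for a single dataset
multi = {"train": object(), "validation": object()}

assert _validate_threshold(None, single) == 0.5  # default
assert _validate_threshold("0.25", multi) == 0.25  # parsed as a float
assert _validate_threshold("validation", multi) == "validation"
# _validate_threshold("validation", single) raises ValueError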
@click.command(
entry_point_group="bob.ip.binseg.config",
cls=ConfigCommand,
@@ -104,17 +132,20 @@ logger = logging.getLogger(__name__)
cls=ResourceOption,
)
@click.option(
"--overlay-threshold",
"--threshold",
"-T",
help="If you set --overlayed, then you can provide a value to be used as "
"threshold to be applied on probability maps and decide for positives and "
"negatives. This binary output will be used to define true and false "
"positives, and false negatives for the overlay analysis. This number "
"should either come from the training set or a separate validation set "
"to avoid biasing the analysis",
default=0.5,
type=click.FloatRange(min=0.0, max=1.0),
show_default=True,
"to avoid biasing the analysis. Optionally, if you provide a multi-set "
"dataset as input, this may also be the name of an existing set from "
"which the threshold will be estimated (highest F1-score) and then "
"applied to the subsequent sets. This number is also used to print "
"the test set F1-score a priori performance (default: 0.5)",
default=None,
show_default=False,
required=False,
cls=ResourceOption,
)
@@ -126,12 +157,14 @@ def evaluate(
second_annotator,
second_annotator_folder,
overlayed,
overlay_threshold,
threshold,
**kwargs,
):
"""Evaluates an FCN on a binary segmentation task.
"""
threshold = _validate_threshold(threshold, dataset)
# if we work with dictionaries of datasets, then output evaluation
# information into sub-directories of the output_folder
config = {}
@@ -156,18 +189,28 @@
),
}
if isinstance(threshold, str):
# first run evaluation for reference dataset, do not save overlays
logger.info(f"Evaluating threshold on '{threshold}' set")
threshold = run(dataset[threshold], predictions_folder)
logger.info(f"Set --threshold={threshold:.5f}")
# now run the analysis on every configured set with that threshold
for k, v in config.items():
logger.info(f"Analyzing '{k}' set...")
run(
v["dataset"],
predictions_folder,
v["output_folder"],
overlayed,
overlay_threshold,
threshold,
)
if v["second_annotator"] is not None:
compare_annotators(
v["dataset"],
v["second_annotator"],
v["second_annotator_folder"],
os.path.join(overlayed, "second-annotator"),
os.path.join(overlayed, "second-annotator")
if overlayed
else None,
)
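A hedged sketch of the input that exercises the new branch: a dictionary of datasets in which one entry doubles as the threshold-estimation set (all names and objects below are placeholders):

train_set, valid_set, test_set = object(), object(), object()

dataset = {
    "train": train_set,  # evaluated like any other set
    "validation": valid_set,  # consumed first when --threshold=validation
    "test": test_set,  # then analyzed with the estimated float
}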
@@ -270,6 +270,12 @@ def experiment(
├── model/  #the generated model will be here
├── predictions/  #the prediction outputs for the train/test set
├── overlayed/  #the overlayed outputs for the train/test set
│   ├── predictions/  #predictions overlayed on the input images
│   ├── analysis/  #predictions overlayed on the input images,
│   │              #including analysis of false positives, negatives
│   │              #and true positives
│   └── second-annotator/  #if set, store overlayed images for the
│                          #second annotator here
└── analysis/  #the outputs of the analysis of both train/test sets
Training is performed for a configurable number of epochs, and generates at
@@ -278,6 +284,23 @@
during the training and useful to resume the procedure in case it stops
abruptly.
N.B.: The tool is designed to prevent analysis bias and allows one to
provide separate subsets for training and evaluation. Instead of using
simple datasets, datasets for full experiment running should be
dictionaries with specific subset names:
* ``__train__``: dataset used preferentially for training. It is typically
the dataset containing data augmentation pipelines.
* ``train`` (optional): a copy of the ``__train__`` dataset, without data
augmentation, that will be evaluated alongside other sets available
* ``*``: any other name, not starting with an underscore character (``_``),
will be considered a test set for evaluation.
N.B.2: The threshold used for calculating the F1-score on the test set, and
for the overlay analysis (false positives, negatives and true positives
overprinted on the original image), is automatically estimated from a
``validation`` set if one is provided, otherwise from the ``train`` set. If
neither is provided, a fixed threshold of 0.5 is used.
"""
_save_sh_command(os.path.join(output_folder, "command.sh"))
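A minimal sketch of a dataset dictionary following the naming scheme described in the notes above; the ``make_subset`` factory is hypothetical:

def make_subset(augment):
    # a real factory would wire data-augmentation transforms here
    return {"augmented": augment}

dataset = {
    "__train__": make_subset(augment=True),  # training only (augmented)
    "train": make_subset(augment=False),  # same data, evaluated
    "validation": make_subset(augment=False),  # drives threshold choice
    "test": make_subset(augment=False),  # held-out evaluation
}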
@@ -347,6 +370,15 @@ def experiment(
else None
)
# choosing the overlayed_threshold
if "validation" in dataset:
threshold = "validation"
elif "train" in dataset:
threshold = "train"
else:
threshold = 0.5
logger.info(f"Setting --threshold={threshold}...")
analysis_folder = os.path.join(output_folder, "analysis")
second_annotator_folder = os.path.join(analysis_folder, "second-annotator")
ctx.invoke(
@@ -357,7 +389,7 @@
second_annotator=second_annotator,
second_annotator_folder=second_annotator_folder,
overlayed=overlayed_folder,
overlay_threshold=0.5,
threshold=threshold,
verbose=verbose,
)
......
@@ -8,9 +8,9 @@
We provide an :ref:`aggregator command called "experiment"
<bob.ip.binseg.cli.experiment>` that runs training, followed by prediction,
evaluation and comparison. After running, you will be able to find results
from model fitting, prediction, evaluation and comparison under a single output
directory.
evaluation and comparison. After running, you
will be able to find results from model fitting, prediction, evaluation and
comparison under a single output directory.
For example, to train a Mobile V2 U-Net architecture on the STARE dataset,
evaluate both train and test set performances, output prediction maps and
......