Commit 139af88c authored by André Anjos

[scripts.evaluate] Implement automatic threshold and a priori statistics

parent 4977aa0d
1 merge request: !12 Streamlining
Pipeline #39117 failed
@@ -19,6 +19,7 @@ from ..utils.metric import base_metrics
from ..utils.plot import precision_recall_f1iso_confintval
import logging
logger = logging.getLogger(__name__)
@@ -86,40 +87,60 @@ def _sample_metrics(pred, gt):
for threshold in numpy.arange(0.0, 1.0, step_size):
tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
pred, gt, threshold
)
# calc metrics from scalars
tp_count = torch.sum(tp_tensor).item()
fp_count = torch.sum(fp_tensor).item()
tn_count = torch.sum(tn_tensor).item()
fn_count = torch.sum(fn_tensor).item()
precision, recall, specificity, accuracy, jaccard, f1_score = \
base_metrics(tp_count, fp_count, tn_count, fn_count)
data.append([threshold, precision, recall, specificity,
accuracy, jaccard, f1_score])
return pandas.DataFrame(data, columns=(
"threshold",
"precision",
"recall",
"specificity",
"accuracy",
"jaccard",
"f1_score",
))
(
precision,
recall,
specificity,
accuracy,
jaccard,
f1_score,
) = base_metrics(tp_count, fp_count, tn_count, fn_count)
data.append(
[
threshold,
precision,
recall,
specificity,
accuracy,
jaccard,
f1_score,
]
)
return pandas.DataFrame(
data,
columns=(
"threshold",
"precision",
"recall",
"specificity",
"accuracy",
"jaccard",
"f1_score",
),
)
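For reference, ``base_metrics`` (imported from ``..utils.metric`` above) reduces the four confusion counts to the six figures collected per threshold. Below is a minimal sketch of the standard definitions it presumably implements; the ``safe_div`` guard and the function body are assumptions, not the library's actual code:

def base_metrics(tp, fp, tn, fn):
    """Hypothetical sketch of the standard confusion-matrix metrics."""

    def safe_div(a, b):
        # avoids division by zero on degenerate samples
        return a / b if b else 0.0

    precision = safe_div(tp, tp + fp)
    recall = safe_div(tp, tp + fn)  # a.k.a. sensitivity
    specificity = safe_div(tn, tn + fp)
    accuracy = safe_div(tp + tn, tp + fp + tn + fn)
    jaccard = safe_div(tp, tp + fp + fn)  # intersection over union
    f1_score = safe_div(2 * precision * recall, precision + recall)
    return precision, recall, specificity, accuracy, jaccard, f1_score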
def _sample_analysis(
img,
pred,
gt,
threshold,
tp_color=(0, 255, 0), # (128,128,128) Gray
fp_color=(0, 0, 255), # (70, 240, 240) Cyan
fn_color=(255, 0, 0), # (245, 130, 48) Orange
overlay=True,
):
img,
pred,
gt,
threshold,
tp_color=(0, 255, 0),  # green (alternative: (128, 128, 128) gray)
fp_color=(0, 0, 255),  # blue (alternative: (70, 240, 240) cyan)
fn_color=(255, 0, 0),  # red (alternative: (245, 130, 48) orange)
overlay=True,
):
"""Visualizes true positives, false positives and false negatives
@@ -186,8 +207,13 @@ def _sample_analysis(
return tp_pil_colored
def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
overlay_threshold=None):
def run(
dataset,
predictions_folder,
output_folder=None,
overlayed_folder=None,
threshold=None,
):
"""
Runs inference and calculates metrics
@@ -202,19 +228,21 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
folder where predictions for the dataset images have been previously
stored
output_folder : str
folder where to store results
output_folder : :py:class:`str`, Optional
folder where to store results. If not provided, then do not store any
analysis (useful for quickly calculating overlay thresholds)
overlayed_folder : :py:class:`str`, Optional
if not ``None``, then it should be the name of a folder where to store
overlayed versions of the images and ground-truths
overlay_threshold : :py:class:`float`, Optional
threshold : :py:class:`float`, Optional
if ``overlayed_folder`` is set, then this should be the threshold (floating point)
to apply to prediction maps to decide on positives and negatives for
overlaying analysis (graphical output). This number should come from
the training set or a separate validation set. Using a test set value
may bias your analysis.
may bias your analysis. This number is also used to print the a priori
F1-score on the evaluated set.
Returns
@@ -225,12 +253,6 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
"""
logger.info(f"Output folder: {output_folder}")
if not os.path.exists(output_folder):
logger.info(f"Creating {output_folder}...")
os.makedirs(output_folder, exist_ok=True)
# Collect overall metrics
data = {}
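Because ``output_folder`` is now optional, ``run`` supports a cheap first pass that only estimates a threshold, followed by a full pass that writes the analysis. A hedged usage sketch, in which ``validation_set``, ``test_set`` and the folder names are placeholders:

# first pass: estimate the F1-optimal threshold; nothing is written to disk
best = run(validation_set, "predictions")

# second pass: evaluate the test set, saving metrics, plots and overlays
run(
    test_set,
    "predictions",
    output_folder="analysis/test",
    overlayed_folder="overlayed/test",
    threshold=best,
)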
@@ -243,13 +265,15 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
pred = f["array"][:]
pred = torch.from_numpy(pred)
if stem in data:
raise RuntimeError(f"{stem} entry already exists in data. "
f"Cannot overwrite.")
raise RuntimeError(
f"{stem} entry already exists in data. Cannot overwrite."
)
data[stem] = _sample_metrics(pred, gt)
if overlayed_folder is not None:
overlay_image = _sample_analysis(image, pred, gt,
threshold=overlay_threshold, overlay=True)
overlay_image = _sample_analysis(
image, pred, gt, threshold=threshold, overlay=True
)
fullpath = os.path.join(overlayed_folder, f"{stem}.png")
tqdm.write(f"Saving {fullpath}...")
fulldir = os.path.dirname(fullpath)
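The aggregation that produces ``avg_metrics`` and ``std_metrics`` is elided from this view. Presumably it stacks the per-sample tables and reduces them per threshold, along these lines (a sketch under that assumption, not the committed code):

import pandas

df = pandas.concat(data.values())
avg_metrics = df.groupby("threshold").mean()  # one row per swept threshold
std_metrics = df.groupby("threshold").std()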
@@ -281,30 +305,49 @@ def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
avg_metrics["std_f1"] = std_metrics["f1_score"]
metrics_path = os.path.join(output_folder, "metrics.csv")
logger.info(f"Saving averages over all input images at {metrics_path}...")
avg_metrics.to_csv(metrics_path)
maxf1 = avg_metrics["f1_score"].max()
optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
logger.info(f"Highest F1-score of {maxf1:.5f}, achieved at "
f"threshold {optimal_f1_threshold:.2f}")
# Plotting
np_avg_metrics = avg_metrics.to_numpy().T
figure_path = os.path.join(output_folder, "precision-recall.pdf")
logger.info(f"Saving overall precision-recall plot at {figure_path}...")
fig = precision_recall_f1iso_confintval(
[np_avg_metrics[0]],
[np_avg_metrics[1]],
[np_avg_metrics[7]],
[np_avg_metrics[8]],
[np_avg_metrics[10]],
[np_avg_metrics[11]],
["data"],
logger.info(
f"Highest (a posteriori) F1-score of {maxf1:.5f}, achieved at "
f"threshold {optimal_f1_threshold:.2f}"
)
fig.savefig(figure_path)
if threshold is not None:
f1_apriori = avg_metrics["f1_score"][threshold]
logger.info(
f"F1-score (a priori) is {f1_apriori:.5f}, at "
f"threshold={threshold:.5f}"
)
if output_folder is not None:
logger.info(f"Output folder: {output_folder}")
if not os.path.exists(output_folder):
logger.info(f"Creating {output_folder}...")
os.makedirs(output_folder, exist_ok=True)
metrics_path = os.path.join(output_folder, "metrics.csv")
logger.info(
f"Saving averages over all input images at {metrics_path}..."
)
avg_metrics.to_csv(metrics_path)
# Plotting
np_avg_metrics = avg_metrics.to_numpy().T
figure_path = os.path.join(output_folder, "precision-recall.pdf")
logger.info(f"Saving overall precision-recall plot at {figure_path}...")
fig = precision_recall_f1iso_confintval(
[np_avg_metrics[0]],
[np_avg_metrics[1]],
[np_avg_metrics[7]],
[np_avg_metrics[8]],
[np_avg_metrics[10]],
[np_avg_metrics[11]],
["data"],
)
fig.savefig(figure_path)
return optimal_f1_threshold
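One caveat: ``avg_metrics["f1_score"][threshold]`` is a label lookup on the float-valued threshold index, so an a priori threshold only resolves if it coincides with a point of the sweep grid from ``_sample_metrics``. A hypothetical guard, assuming a ``step_size`` of 0.01 (the actual value is defined outside this excerpt):

import numpy

def snap_to_grid(threshold, step_size=0.01):
    """Returns the sweep-grid value nearest to ``threshold``."""
    grid = numpy.arange(0.0, 1.0, step_size)
    return float(grid[numpy.argmin(numpy.abs(grid - threshold))])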
@@ -331,13 +374,6 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
if not ``None``, then it should be the name of a folder where to store
overlayed versions of the images and ground-truths
overlay_threshold : :py:class:`float`, Optional
if ``overlayed_folder``, then this should be threshold (floating point)
to apply to prediction maps to decide on positives and negatives for
overlaying analysis (graphical output). This number should come from
the training set or a separate validation set. Using a test set value
may bias your analysis.
"""
logger.info(f"Output folder: {output_folder}")
@@ -349,19 +385,21 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
# Collect overall metrics
data = {}
for baseline_sample, other_sample in tqdm(zip(baseline, other)):
for baseline_sample, other_sample in tqdm(list(zip(baseline, other))):
stem = baseline_sample[0]
image = baseline_sample[1]
gt = baseline_sample[2]
pred = other_sample[2] #works as a prediction
pred = other_sample[2] # works as a prediction
if stem in data:
raise RuntimeError(f"{stem} entry already exists in data. "
f"Cannot overwrite.")
raise RuntimeError(
f"{stem} entry already exists in data. Cannot overwrite."
)
data[stem] = _sample_metrics(pred, gt)
if overlayed_folder is not None:
overlay_image = _sample_analysis(image, pred, gt, threshold=0.5,
overlay=True)
overlay_image = _sample_analysis(
image, pred, gt, threshold=0.5, overlay=True
)
fullpath = os.path.join(overlayed_folder, f"{stem}.png")
tqdm.write(f"Saving {fullpath}...")
fulldir = os.path.dirname(fullpath)
......
@@ -17,6 +17,34 @@ import logging
logger = logging.getLogger(__name__)
def _validate_threshold(t, dataset):
    """Validates the user threshold selection.  Returns the parsed threshold."""

    if t is None:
        return 0.5

    try:
        # we try to convert it to float first
        t = float(t)
    except ValueError:
        # it is a bit of text - assert a dataset with that name is available
        if not isinstance(dataset, dict):
            raise ValueError(
                "Threshold should be a floating-point number "
                "if you provide only a single dataset for evaluation"
            )
        if t not in dataset:
            raise ValueError(
                f"Text thresholds should match dataset names, "
                f"but {t} is not available among the datasets provided "
                f"({', '.join(dataset.keys())})"
            )
    else:
        # the range check lives in the ``else`` clause: raising ValueError
        # inside the ``try`` would be swallowed by the handler above
        if t < 0.0 or t > 1.0:
            raise ValueError(
                "Float thresholds must be within range [0.0, 1.0]"
            )

    return t
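The three accepted forms, illustrated with placeholder dataset objects (a sketch, not part of the commit):

single = object()  # stands in for a single dataset
multi = {"train": object(), "validation": object()}

assert _validate_threshold(None, single) == 0.5  # default
assert _validate_threshold("0.25", multi) == 0.25  # parsed as a float
assert _validate_threshold("validation", multi) == "validation"
# _validate_threshold("validation", single) raises ValueError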
@click.command(
entry_point_group="bob.ip.binseg.config",
cls=ConfigCommand,
@@ -104,17 +132,20 @@ logger = logging.getLogger(__name__)
cls=ResourceOption,
)
@click.option(
"--overlay-threshold",
"--threshold",
"-T",
help="If you set --overlayed, then you can provide a value to be used as "
"threshold to be applied on probability maps and decide for positives and "
"negatives. This binary output will be used to define true and false "
"positives, and false negatives for the overlay analysis. This number "
"should either come from the training set or a separate validation set "
"to avoid biasing the analysis",
default=0.5,
type=click.FloatRange(min=0.0, max=1.0),
show_default=True,
"to avoid biasing the analysis. Optionally, if you provide a multi-set "
"dataset as input, this may also be the name of an existing set from "
"which the threshold will be estimated (highest F1-score) and then "
"applied to the subsequent sets. This number is also used to print "
"the test set F1-score a priori performance (default: 0.5)",
default=None,
show_default=False,
required=False,
cls=ResourceOption,
)
@@ -126,12 +157,14 @@ def evaluate(
second_annotator,
second_annotator_folder,
overlayed,
overlay_threshold,
threshold,
**kwargs,
):
"""Evaluates an FCN on a binary segmentation task.
"""
threshold = _validate_threshold(threshold, dataset)
# if we work with dictionaries of datasets, then output evaluation
# information into sub-directories of the output_folder
config = {}
@@ -156,18 +189,28 @@
),
}
if isinstance(threshold, str):
# first run evaluation for reference dataset, do not save overlays
logger.info(f"Evaluating threshold on '{threshold}' set")
threshold = run(dataset[threshold], predictions_folder)
logger.info(f"Set --threshold={threshold:.5f}")
# now run the analysis on every configured set with that threshold
for k, v in config.items():
logger.info(f"Analyzing '{k}' set...")
run(
v["dataset"],
predictions_folder,
v["output_folder"],
overlayed,
overlay_threshold,
threshold,
)
if v["second_annotator"] is not None:
compare_annotators(
v["dataset"],
v["second_annotator"],
v["second_annotator_folder"],
os.path.join(overlayed, "second-annotator"),
os.path.join(overlayed, "second-annotator")
if overlayed
else None,
)
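A hedged sketch of the input that exercises the new branch: a dictionary of datasets in which one entry doubles as the threshold-estimation set (all names and objects below are placeholders):

train_set, valid_set, test_set = object(), object(), object()

dataset = {
    "train": train_set,  # evaluated like any other set
    "validation": valid_set,  # consumed first when --threshold=validation
    "test": test_set,  # then analyzed with the estimated float
}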
@@ -270,6 +270,12 @@ def experiment(
├── model/  #the generated model will be here
├── predictions/  #the prediction outputs for the train/test set
├── overlayed/  #the overlayed outputs for the train/test set
│   ├── predictions/  #predictions overlayed on the input images
│   ├── analysis/  #predictions overlayed on the input images,
│   │              #including analysis of false positives, negatives
│   │              #and true positives
│   └── second-annotator/  #if set, store overlayed images for the
│                          #second annotator here
└── analysis/  #the outputs of the analysis of both train/test sets
Training is performed for a configurable number of epochs, and generates at
@@ -278,6 +284,23 @@
during the training and useful to resume the procedure in case it stops
abruptly.
N.B.: The tool is designed to prevent analysis bias and allows one to
provide separate subsets for training and evaluation. Instead of using
simple datasets, datasets for full experiment running should be
dictionaries with specific subset names:
* ``__train__``: dataset used preferentially for training. It is typically
the dataset containing data augmentation pipelines.
* ``train`` (optional): a copy of the ``__train__`` dataset, without data
augmentation, that will be evaluated alongside other sets available
* ``*``: any other name, not starting with an underscore character (``_``),
will be considered a test set for evaluation.
N.B.2: The threshold used for calculating the F1-score on the test set, and
for the overlay analysis (false positives, negatives and true positives
overprinted on the original image), is automatically estimated from a
``validation`` set if one is provided, otherwise from the ``train`` set. If
neither is provided, a fixed threshold of 0.5 is used.
"""
_save_sh_command(os.path.join(output_folder, "command.sh"))
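A minimal sketch of a dataset dictionary following the naming scheme described in the notes above; the ``make_subset`` factory is hypothetical:

def make_subset(augment):
    # a real factory would wire data-augmentation transforms here
    return {"augmented": augment}

dataset = {
    "__train__": make_subset(augment=True),  # training only (augmented)
    "train": make_subset(augment=False),  # same data, evaluated
    "validation": make_subset(augment=False),  # drives threshold choice
    "test": make_subset(augment=False),  # held-out evaluation
}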
@@ -347,6 +370,15 @@ def experiment(
else None
)
# choosing the overlayed_threshold
if "validation" in dataset:
threshold = "validation"
elif "train" in dataset:
threshold = "train"
else:
threshold = 0.5
logger.info(f"Setting --threshold={threshold}...")
analysis_folder = os.path.join(output_folder, "analysis")
second_annotator_folder = os.path.join(analysis_folder, "second-annotator")
ctx.invoke(
@@ -357,7 +389,7 @@
second_annotator=second_annotator,
second_annotator_folder=second_annotator_folder,
overlayed=overlayed_folder,
overlay_threshold=0.5,
threshold=threshold,
verbose=verbose,
)
......
@@ -8,9 +8,9 @@
We provide an :ref:`aggregator command called "experiment"
<bob.ip.binseg.cli.experiment>` that runs training, followed by prediction,
evaluation and comparison. After running, you will be able to find results
from model fitting, prediction, evaluation and comparison under a single output
directory.
evaluation and comparison. After running, you
will be able to find results from model fitting, prediction, evaluation and
comparison under a single output directory.
For example, to train a Mobile V2 U-Net architecture on the STARE dataset,
evaluate both train and test set performances, output prediction maps and
......