From cd689273d43b6af9e514f31b87779823a041e00c Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Tue, 21 Apr 2020 20:16:24 +0200
Subject: [PATCH] [*] Implement automatic second-annotator evaluation

---
 .../datasets/chasedb1/first_annotator.py      |   2 +
 .../datasets/chasedb1/second_annotator.py     |   2 +
 .../binseg/configs/datasets/drive/default.py  |   2 +
 bob/ip/binseg/configs/datasets/stare/ah.py    |   2 +
 bob/ip/binseg/configs/datasets/stare/vk.py    |   2 +
 bob/ip/binseg/engine/evaluator.py             | 118 ++++++++++++++++--
 bob/ip/binseg/engine/predictor.py             |   3 +-
 bob/ip/binseg/script/evaluate.py              |  83 ++++++++++--
 bob/ip/binseg/script/experiment.py            |  21 +++-
 bob/ip/binseg/script/predict.py               |   2 +-
 bob/ip/binseg/test/test_cli.py                |   7 +-
 11 files changed, 216 insertions(+), 28 deletions(-)

diff --git a/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py b/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py
index 06b01170..07c56842 100644
--- a/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py
+++ b/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py
@@ -6,7 +6,9 @@
 * Split reference: [CHASEDB1-2012]_
 * Configuration resolution: 960 x 960 (after hand-specified crop)
 * See :py:mod:`bob.ip.binseg.data.chasedb1` for dataset details
+* This dataset offers a second-annotator comparison
 """
 from bob.ip.binseg.configs.datasets.chasedb1 import _maker
 dataset = _maker("first-annotator")
+second_annotator = _maker("second-annotator")
diff --git a/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py b/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py
index 04b5d5a0..87a98310 100644
--- a/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py
+++ b/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py
@@ -6,7 +6,9 @@
 * Split reference: [CHASEDB1-2012]_
 * Configuration resolution: 960 x 960 (after hand-specified crop)
 * See :py:mod:`bob.ip.binseg.data.chasedb1` for dataset details
+* This dataset offers a second-annotator comparison (using "first-annotator")
 """
 from bob.ip.binseg.configs.datasets.chasedb1 import _maker
 dataset = _maker("second-annotator")
+second_annotator = _maker("first-annotator")
diff --git a/bob/ip/binseg/configs/datasets/drive/default.py b/bob/ip/binseg/configs/datasets/drive/default.py
index 29f255d1..23317349 100644
--- a/bob/ip/binseg/configs/datasets/drive/default.py
+++ b/bob/ip/binseg/configs/datasets/drive/default.py
@@ -6,7 +6,9 @@
 * Split reference: [DRIVE-2004]_
 * This configuration resolution: 544 x 544 (center-crop)
 * See :py:mod:`bob.ip.binseg.data.drive` for dataset details
+* This dataset offers a second-annotator comparison for the test set only
 """
 from bob.ip.binseg.configs.datasets.drive import _maker
 dataset = _maker("default")
+second_annotator = _maker("second-annotator")
diff --git a/bob/ip/binseg/configs/datasets/stare/ah.py b/bob/ip/binseg/configs/datasets/stare/ah.py
index d6c4147c..6b7cec73 100644
--- a/bob/ip/binseg/configs/datasets/stare/ah.py
+++ b/bob/ip/binseg/configs/datasets/stare/ah.py
@@ -6,7 +6,9 @@
 * Configuration resolution: 704 x 608 (after padding)
 * Split reference: [MANINIS-2016]_
 * See :py:mod:`bob.ip.binseg.data.stare` for dataset details
+* This dataset offers a second-annotator comparison (using protocol "vk")
 """
 from bob.ip.binseg.configs.datasets.stare import _maker
 dataset = _maker("ah")
+second_annotator = _maker("vk")
diff --git a/bob/ip/binseg/configs/datasets/stare/vk.py b/bob/ip/binseg/configs/datasets/stare/vk.py
index b0468f67..0ae41c87 100644
--- a/bob/ip/binseg/configs/datasets/stare/vk.py
+++ b/bob/ip/binseg/configs/datasets/stare/vk.py
@@ -6,7 +6,9 @@
 * Configuration resolution: 704 x 608 (after padding)
 * Split reference: [MANINIS-2016]_
 * See :py:mod:`bob.ip.binseg.data.stare` for dataset details
+* This dataset offers a second-annotator comparison (using protocol "ah")
 """
 from bob.ip.binseg.configs.datasets.stare import _maker
 dataset = _maker("vk")
+second_annotator = _maker("ah")
diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index d927c84c..96870093 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -186,7 +186,7 @@ def _sample_analysis(
     return tp_pil_colored


-def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
+def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
         overlay_threshold=None):
     """
     Runs inference and calculates metrics
@@ -195,8 +195,8 @@
     Parameters
     ---------

-    data_loader : py:class:`torch.torch.utils.data.DataLoader`
-        an iterable over the transformed input dataset, containing ground-truth
+    dataset : py:class:`torch.utils.data.Dataset`
+        a dataset to iterate on

     predictions_folder : str
         folder where predictions for the dataset images has been previously
@@ -216,6 +216,13 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
         the training set or a separate validation set. Using a test set value
         may bias your analysis.

+
+    Returns
+    -------
+
+    threshold : float
+        Threshold to achieve the highest possible F1-score for this dataset
+
     """

     logger.info(f"Output folder: {output_folder}")
@@ -227,11 +234,10 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
     # Collect overall metrics
     data = {}

-    for sample in tqdm(data_loader):
-        name = sample[0]
-        stem = os.path.splitext(name)[0]
-        image = sample[1].to("cpu")
-        gt = sample[2].to("cpu")
+    for sample in tqdm(dataset):
+        stem = sample[0]
+        image = sample[1]
+        gt = sample[2]
         pred_fullpath = os.path.join(predictions_folder, stem + ".hdf5")
         with h5py.File(pred_fullpath, "r") as f:
             pred = f["array"][:]
@@ -299,3 +305,99 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
         ["data"],
     )
     fig.savefig(figure_path)
+
+    return optimal_f1_threshold
+
+
+def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
+    """
+    Compares annotations on the **same** dataset
+
+
+    Parameters
+    ---------
+
+    baseline : py:class:`torch.utils.data.Dataset`
+        a dataset to iterate on, containing the baseline annotations
+
+    other : py:class:`torch.utils.data.Dataset`
+        a second dataset, with the same samples as ``baseline``, but annotated
+        by a different annotator than in the first dataset.
+
+    output_folder : str
+        folder where to store results
+
+    overlayed_folder : :py:class:`str`, Optional
+        if not ``None``, then it should be the name of a folder where to store
+        overlayed versions of the images and ground-truths
+
+    overlay_threshold : :py:class:`float`, Optional
+        if ``overlayed_folder``, then this should be threshold (floating point)
+        to apply to prediction maps to decide on positives and negatives for
+        overlaying analysis (graphical output). This number should come from
+        the training set or a separate validation set. Using a test set value
+        may bias your analysis.
+
+    """
+
+    logger.info(f"Output folder: {output_folder}")
+
+    if not os.path.exists(output_folder):
+        logger.info(f"Creating {output_folder}...")
+        os.makedirs(output_folder, exist_ok=True)
+
+    # Collect overall metrics
+    data = {}
+
+    for baseline_sample, other_sample in tqdm(zip(baseline, other)):
+        stem = baseline_sample[0]
+        image = baseline_sample[1]
+        gt = baseline_sample[2]
+        pred = other_sample[2]  #works as a prediction
+        if stem in data:
+            raise RuntimeError(f"{stem} entry already exists in data. "
+                f"Cannot overwrite.")
+        data[stem] = _sample_metrics(pred, gt)
+
+        if overlayed_folder is not None:
+            overlay_image = _sample_analysis(image, pred, gt, threshold=0.5,
+                    overlay=True)
+            fullpath = os.path.join(overlayed_folder, f"{stem}.png")
+            tqdm.write(f"Saving {fullpath}...")
+            fulldir = os.path.dirname(fullpath)
+            if not os.path.exists(fulldir):
+                tqdm.write(f"Creating directory {fulldir}...")
+                os.makedirs(fulldir, exist_ok=True)
+            overlay_image.save(fullpath)
+
+    # Merges all dataframes together
+    df_metrics = pandas.concat(data.values())
+
+    # Report and Averages
+    avg_metrics = df_metrics.groupby("threshold").mean()
+    std_metrics = df_metrics.groupby("threshold").std()
+
+    # Uncomment below for F1-score calculation based on average precision and
+    # metrics instead of F1-scores of individual images. This method is in line
+    # with Maninis et. al. (2016)
+    #
+    # avg_metrics["f1_score"] = \
+    #     (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
+    #     (avg_metrics["precision"]+avg_metrics["recall"])
+
+    avg_metrics["std_pr"] = std_metrics["precision"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["std_re"] = std_metrics["recall"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["std_f1"] = std_metrics["f1_score"]
+
+    metrics_path = os.path.join(output_folder, "metrics.csv")
+    logger.info(f"Saving averages over all input images at {metrics_path}...")
+    avg_metrics.to_csv(metrics_path)
+
+    maxf1 = avg_metrics["f1_score"].max()
+    optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
+
+    logger.info(f"Highest F1-score of {maxf1:.5f} (second annotator)")
diff --git a/bob/ip/binseg/engine/predictor.py b/bob/ip/binseg/engine/predictor.py
index 704899e6..a7ab078d 100644
--- a/bob/ip/binseg/engine/predictor.py
+++ b/bob/ip/binseg/engine/predictor.py
@@ -168,8 +168,7 @@ def run(model, data_loader, device, output_folder, overlayed_folder):
         times.append(batch_time)
         len_samples.append(len(images))

-        for name, img, prob in zip(names, images, predictions):
-            stem = os.path.splitext(name)[0]
+        for stem, img, prob in zip(names, images, predictions):
             _save_hdf5(stem, prob, output_folder)
             if overlayed_folder is not None:
                 _save_overlayed_png(stem, img, prob, overlayed_folder)
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index f4a3f9fc..8c8e575f 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -3,7 +3,6 @@
 import os

 import click
-from torch.utils.data import DataLoader

 from bob.extension.scripts.click_helper import (
     verbosity_option,
@@ -11,9 +10,10 @@ from bob.extension.scripts.click_helper import (
     ResourceOption,
 )

-from ..engine.evaluator import run
+from ..engine.evaluator import run, compare_annotators

 import logging
+
 logger = logging.getLogger(__name__)


@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
     \b
     1. Runs evaluation on an existing dataset configuration:
     \b
-       $ bob binseg evaluate -vv m2unet drive-test --predictions-folder=path/to/predictions --output-folder=path/to/results
+       $ bob binseg evaluate -vv m2unet drive --predictions-folder=path/to/predictions --output-folder=path/to/results
     \b
     2. To run evaluation on a folder with your own images and annotations, you
        must first specify resizing, cropping, etc, so that the image can be
@@ -69,6 +69,27 @@ logger = logging.getLogger(__name__)
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--second-annotator",
+    "-S",
+    help="A dataset or dictionary, like in --dataset, with the same "
+    "sample keys, but with annotations from a different annotator that is "
+    "going to be compared to the one in --dataset",
+    required=False,
+    default=None,
+    cls=ResourceOption,
+    show_default=True,
+)
+@click.option(
+    "--second-annotator-folder",
+    "-O",
+    help="Path where to store the analysis result for second annotator "
+    "comparisons (only used if --second-annotator is also passed)",
+    required=True,
+    default="second-annotator",
+    type=click.Path(),
+    cls=ResourceOption,
+)
 @click.option(
     "--overlayed",
     "-O",
@@ -99,15 +120,51 @@ logger = logging.getLogger(__name__)
     cls=ResourceOption,
 )
 @verbosity_option(cls=ResourceOption)
-def evaluate(output_folder, predictions_folder, dataset, overlayed,
-        overlay_threshold, **kwargs):
+def evaluate(
+    output_folder,
+    predictions_folder,
+    dataset,
+    second_annotator,
+    second_annotator_folder,
+    overlayed,
+    overlay_threshold,
+    **kwargs,
+):
     """Evaluates an FCN on a binary segmentation task.
     """
-    if isinstance(dataset, dict):
-        for k,v in dataset.items():
-            analysis_folder = os.path.join(output_folder, k)
-            with v.not_augmented() as d:
-                data_loader = DataLoader(dataset=d, batch_size=1,
-                    shuffle=False, pin_memory=False)
-                run(d, predictions_folder, analysis_folder, overlayed,
-                    overlay_threshold)
+
+    # if we work with dictionaries of datasets, then output evaluation
+    # information into sub-directories of the output_folder
+    config = {}
+    if not isinstance(dataset, dict):
+        config["test"] = {
+            "dataset": dataset,
+            "output_folder": output_folder,
+            "second_annotator": second_annotator,
+            "second_annotator_folder": second_annotator_folder,
+        }
+    else:
+        for k, v in dataset.items():
+            config[k] = {
+                "dataset": v,
+                "output_folder": os.path.join(output_folder, k),
+                "second_annotator": second_annotator.get(k),
+                "second_annotator_folder": os.path.join(
+                    second_annotator_folder, k
+                ),
+            }
+
+    for k, v in config.items():
+        with v["dataset"].not_augmented() as d:
+            run(
+                d,
+                predictions_folder,
+                v["output_folder"],
+                overlayed,
+                overlay_threshold,
+            )
+            if v["second_annotator"] is not None:
+                with v["second_annotator"].not_augmented() as d2:
+                    compare_annotators(
+                        d, d2, v["second_annotator_folder"], overlayed
+                    )
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index e4027229..6882d5b9 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -59,6 +59,17 @@ logger = logging.getLogger(__name__)
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--second-annotator",
+    "-S",
+    help="A dataset or dictionary, like in --dataset, with the same "
+    "sample keys, but with annotations from a different annotator that is "
+    "going to be compared to the one in --dataset",
+    required=False,
+    default=None,
+    cls=ResourceOption,
+    show_default=True,
+)
 @click.option(
     "--optimizer",
     help="A torch.optim.Optimizer that will be used to train the network",
@@ -205,6 +216,7 @@ def experiment(
     drop_incomplete_batch,
     criterion,
     dataset,
+    second_annotator,
     checkpoint_period,
     device,
     seed,
@@ -301,11 +313,14 @@ def experiment(
     )

     analysis_folder = os.path.join(output_folder, "analysis")
+    second_annotator_folder = os.path.join(analysis_folder, "second-annotator")
     ctx.invoke(
         evaluate,
         output_folder=analysis_folder,
         predictions_folder=predictions_folder,
         dataset=dataset,
+        second_annotator=second_annotator,
+        second_annotator_folder=second_annotator_folder,
         overlayed=overlayed_folder,
         overlay_threshold=0.5,
         verbose=verbose,
@@ -321,7 +336,11 @@ def experiment(

     systems = []
     for k, v in dataset.items():
-        systems += [k, os.path.join(output_folder, "analysis", k, "metrics.csv")]
+        systems += [k, os.path.join(analysis_folder, k, "metrics.csv")]
+    if second_annotator is not None:
+        for k, v in second_annotator.items():
+            systems += [f"{k} (2nd. annot.)",
+                    os.path.join(second_annotator_folder, k, "metrics.csv")]

     output_pdf = os.path.join(output_folder, "comparison.pdf")
     ctx.invoke(compare, label_path=systems, output=output_pdf, verbose=verbose)
diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index d8af8c01..41419ece 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
     \b
     1. Runs prediction on an existing dataset configuration:
     \b
-       $ bob binseg predict -vv m2unet drive-test --weight=path/to/model_final.pth --output-folder=path/to/predictions
+       $ bob binseg predict -vv m2unet drive --weight=path/to/model_final.pth --output-folder=path/to/predictions
     \b
     2. To run prediction on a folder with your own images, you must first
        specify resizing, cropping, etc, so that the image can be correctly
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index b244bf8b..cbaeea28 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -79,6 +79,7 @@ def test_experiment_stare():
             "from bob.ip.binseg.configs.datasets.stare import _maker\n"
         )
         config.write("dataset = _maker('ah', _raw)\n")
+        config.write("second_annotator = _maker('vk', _raw)\n")
         config.flush()

         result = runner.invoke(
@@ -97,12 +98,12 @@ def test_experiment_stare():
             # "Saving results/overlayed/probabilities": 1, #tqdm.write
             "Ended prediction": 1,  # logging
             "Started evaluation": 1,  # logging
-            "Highest F1-score of": 2,  # logging
+            "Highest F1-score of": 4,  # logging
             "Saving overall precision-recall plot": 2,  # logging
             # "Saving results/overlayed/analysis": 1, #tqdm.write
             "Ended evaluation": 1,  # logging
             "Started comparison": 1,  # logging
-            "Loading metrics from results/analysis": 2,  # logging
+            "Loading metrics from results/analysis": 4,  # logging
             "Ended comparison": 1,  # logging
         }
         buf.seek(0)
@@ -114,7 +115,7 @@ def test_experiment_stare():
            # f"instead of the expected {v}")
            assert _str_counter(k, logging_output) == v, (
                f"Count for string '{k}' appeared "
-               f"({_str_counter(k, result.output)}) "
+               f"({_str_counter(k, logging_output)}) "
                f"instead of the expected {v}"
            )

--
GitLab
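
The following is not part of the patch: it is a minimal usage sketch of the pieces introduced above, mirroring the calls the new ``evaluate`` command performs. It assumes a configuration module such as ``bob.ip.binseg.configs.datasets.stare.ah``, which after this patch exposes both ``dataset`` and ``second_annotator`` with matching sample keys (dictionaries of subsets, as the command expects); the output folder name is an arbitrary example.

# Minimal sketch, assuming the config exposes dictionaries of subsets with
# matching keys (not part of the patch itself).
from bob.ip.binseg.configs.datasets.stare.ah import dataset, second_annotator
from bob.ip.binseg.engine.evaluator import compare_annotators

for key, baseline in dataset.items():
    other = second_annotator.get(key)
    if other is None:
        # some subsets may have no second annotator (e.g. the DRIVE train set)
        continue
    # the second annotator's ground-truth plays the role of a prediction
    with baseline.not_augmented() as d, other.not_augmented() as d2:
        compare_annotators(d, d2, f"second-annotator/{key}")  # example output path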
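
Also for orientation only: ``compare_annotators()`` scores the second annotator's binary mask against the baseline annotation exactly as if it were a thresholded prediction, and the reported number is the resulting F1-score. The snippet below is a self-contained illustration of that idea; it is not the package's ``_sample_metrics()`` implementation, and ``f1_between_annotations`` is a made-up name.

import numpy

def f1_between_annotations(baseline, other):
    """Scores ``other`` against ``baseline`` (both boolean masks) like a prediction."""
    tp = numpy.logical_and(other, baseline).sum()
    fp = numpy.logical_and(other, numpy.logical_not(baseline)).sum()
    fn = numpy.logical_and(numpy.logical_not(other), baseline).sum()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if (precision + recall) == 0.0:
        return 0.0
    # same F1 definition the evaluator applies to model predictions
    return 2 * precision * recall / (precision + recall)

# tiny example: two annotators, each marking 3 pixels, agreeing on 2 of them
a = numpy.array([[1, 1, 0], [0, 1, 0]], dtype=bool)
b = numpy.array([[1, 0, 0], [0, 1, 1]], dtype=bool)
print(f1_between_annotations(a, b))  # ~0.667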