diff --git a/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py b/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py
index 06b01170e936dc5ea8f67f5faaa5de395fe9f200..07c5684212db6a510a32b246c73a0d7bbe0f8c70 100644
--- a/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py
+++ b/bob/ip/binseg/configs/datasets/chasedb1/first_annotator.py
@@ -6,7 +6,9 @@
 * Split reference: [CHASEDB1-2012]_
 * Configuration resolution: 960 x 960 (after hand-specified crop)
 * See :py:mod:`bob.ip.binseg.data.chasedb1` for dataset details
+* This dataset offers a second-annotator comparison
 """
 
 from bob.ip.binseg.configs.datasets.chasedb1 import _maker
 dataset = _maker("first-annotator")
+second_annotator = _maker("second-annotator")
diff --git a/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py b/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py
index 04b5d5a0733d61f9304879753e03b5e9cf5846a3..87a98310bbbe2c6784a0d4d525d58a6126543e2c 100644
--- a/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py
+++ b/bob/ip/binseg/configs/datasets/chasedb1/second_annotator.py
@@ -6,7 +6,9 @@
 * Split reference: [CHASEDB1-2012]_
 * Configuration resolution: 960 x 960 (after hand-specified crop)
 * See :py:mod:`bob.ip.binseg.data.chasedb1` for dataset details
+* This dataset offers a second-annotator comparison (using "first-annotator")
 """
 
 from bob.ip.binseg.configs.datasets.chasedb1 import _maker
 dataset = _maker("second-annotator")
+second_annotator = _maker("first-annotator")
diff --git a/bob/ip/binseg/configs/datasets/drive/default.py b/bob/ip/binseg/configs/datasets/drive/default.py
index 29f255d1f247b25563911d0fe3b3dc143871275a..2331734969206068bdfb32926c495662cb5d08e7 100644
--- a/bob/ip/binseg/configs/datasets/drive/default.py
+++ b/bob/ip/binseg/configs/datasets/drive/default.py
@@ -6,7 +6,9 @@
 * Split reference: [DRIVE-2004]_
 * This configuration resolution: 544 x 544 (center-crop)
 * See :py:mod:`bob.ip.binseg.data.drive` for dataset details
+* This dataset offers a second-annotator comparison for the test set only
 """
 
 from bob.ip.binseg.configs.datasets.drive import _maker
 dataset = _maker("default")
+second_annotator = _maker("second-annotator")
diff --git a/bob/ip/binseg/configs/datasets/stare/ah.py b/bob/ip/binseg/configs/datasets/stare/ah.py
index d6c4147ca3a581f3c02f07dd38bd77cb8fd45294..6b7cec73b37fb19f15f3ea6b94dc61a1a58b06fb 100644
--- a/bob/ip/binseg/configs/datasets/stare/ah.py
+++ b/bob/ip/binseg/configs/datasets/stare/ah.py
@@ -6,7 +6,9 @@
 * Configuration resolution: 704 x 608 (after padding)
 * Split reference: [MANINIS-2016]_
 * See :py:mod:`bob.ip.binseg.data.stare` for dataset details
+* This dataset offers a second-annotator comparison (using protocol "vk")
 """
 
 from bob.ip.binseg.configs.datasets.stare import _maker
 dataset = _maker("ah")
+second_annotator = _maker("vk")
diff --git a/bob/ip/binseg/configs/datasets/stare/vk.py b/bob/ip/binseg/configs/datasets/stare/vk.py
index b0468f672d204533f383131ffe664eed3c609c16..0ae41c87553b013e0de8ad4d8601c80a80be96e2 100644
--- a/bob/ip/binseg/configs/datasets/stare/vk.py
+++ b/bob/ip/binseg/configs/datasets/stare/vk.py
@@ -6,7 +6,9 @@
 * Configuration resolution: 704 x 608 (after padding)
 * Split reference: [MANINIS-2016]_
 * See :py:mod:`bob.ip.binseg.data.stare` for dataset details
+* This dataset offers a second-annotator comparison (using protocol "ah")
 """
 
 from bob.ip.binseg.configs.datasets.stare import _maker
 dataset = _maker("vk")
+second_annotator = _maker("ah")
diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index d927c84cf758034eab3f20740208af77a1ed72b9..96870093a79892e8dfe69fc35cfc022e905c7eb7 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -186,7 +186,7 @@ def _sample_analysis(
     return tp_pil_colored


-def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
+def run(dataset, predictions_folder, output_folder, overlayed_folder=None,
         overlay_threshold=None):
     """
     Runs inference and calculates metrics
@@ -195,8 +195,8 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
     Parameters
     ---------

-    data_loader : py:class:`torch.torch.utils.data.DataLoader`
-        an iterable over the transformed input dataset, containing ground-truth
+    dataset : py:class:`torch.utils.data.Dataset`
+        a dataset to iterate on

     predictions_folder : str
         folder where predictions for the dataset images has been previously
@@ -216,6 +216,13 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
         the training set or a separate validation set.  Using a test set value
         may bias your analysis.

+
+    Returns
+    -------
+
+    threshold : float
+        Threshold to achieve the highest possible F1-score for this dataset
+
     """

     logger.info(f"Output folder: {output_folder}")
@@ -227,11 +234,10 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
     # Collect overall metrics
     data = {}

-    for sample in tqdm(data_loader):
-        name = sample[0]
-        stem = os.path.splitext(name)[0]
-        image = sample[1].to("cpu")
-        gt = sample[2].to("cpu")
+    for sample in tqdm(dataset):
+        stem = sample[0]
+        image = sample[1]
+        gt = sample[2]
         pred_fullpath = os.path.join(predictions_folder, stem + ".hdf5")
         with h5py.File(pred_fullpath, "r") as f:
             pred = f["array"][:]
@@ -299,3 +305,99 @@ def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
         ["data"],
     )
     fig.savefig(figure_path)
+
+    return optimal_f1_threshold
+
+
+def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
+    """
+    Compares annotations on the **same** dataset
+
+
+    Parameters
+    ---------
+
+    baseline : py:class:`torch.utils.data.Dataset`
+        a dataset to iterate on, containing the baseline annotations
+
+    other : py:class:`torch.utils.data.Dataset`
+        a second dataset, with the same samples as ``baseline``, but annotated
+        by a different annotator than in the first dataset.
+
+    output_folder : str
+        folder where to store results
+
+    overlayed_folder : :py:class:`str`, Optional
+        if not ``None``, then it should be the name of a folder where to store
+        overlayed versions of the images and ground-truths
+
+    overlay_threshold : :py:class:`float`, Optional
+        if ``overlayed_folder``, then this should be threshold (floating point)
+        to apply to prediction maps to decide on positives and negatives for
+        overlaying analysis (graphical output).  This number should come from
+        the training set or a separate validation set.  Using a test set value
+        may bias your analysis.
+
+    """
+
+    logger.info(f"Output folder: {output_folder}")
+
+    if not os.path.exists(output_folder):
+        logger.info(f"Creating {output_folder}...")
+        os.makedirs(output_folder, exist_ok=True)
+
+    # Collect overall metrics
+    data = {}
+
+    for baseline_sample, other_sample in tqdm(zip(baseline, other)):
+        stem = baseline_sample[0]
+        image = baseline_sample[1]
+        gt = baseline_sample[2]
+        pred = other_sample[2]  #works as a prediction
+        if stem in data:
+            raise RuntimeError(f"{stem} entry already exists in data. "
+                    f"Cannot overwrite.")
+        data[stem] = _sample_metrics(pred, gt)
+
+        if overlayed_folder is not None:
+            overlay_image = _sample_analysis(image, pred, gt, threshold=0.5,
+                    overlay=True)
+            fullpath = os.path.join(overlayed_folder, f"{stem}.png")
+            tqdm.write(f"Saving {fullpath}...")
+            fulldir = os.path.dirname(fullpath)
+            if not os.path.exists(fulldir):
+                tqdm.write(f"Creating directory {fulldir}...")
+                os.makedirs(fulldir, exist_ok=True)
+            overlay_image.save(fullpath)
+
+    # Merges all dataframes together
+    df_metrics = pandas.concat(data.values())
+
+    # Report and Averages
+    avg_metrics = df_metrics.groupby("threshold").mean()
+    std_metrics = df_metrics.groupby("threshold").std()
+
+    # Uncomment below for F1-score calculation based on average precision and
+    # metrics instead of F1-scores of individual images.  This method is in line
+    # with Maninis et. al. (2016)
+    #
+    # avg_metrics["f1_score"] = \
+    #         (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
+    #         (avg_metrics["precision"]+avg_metrics["recall"])
+
+    avg_metrics["std_pr"] = std_metrics["precision"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["std_re"] = std_metrics["recall"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["std_f1"] = std_metrics["f1_score"]
+
+    metrics_path = os.path.join(output_folder, "metrics.csv")
+    logger.info(f"Saving averages over all input images at {metrics_path}...")
+    avg_metrics.to_csv(metrics_path)
+
+    maxf1 = avg_metrics["f1_score"].max()
+    optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
+
+    logger.info(f"Highest F1-score of {maxf1:.5f} (second annotator)")
diff --git a/bob/ip/binseg/engine/predictor.py b/bob/ip/binseg/engine/predictor.py
index 704899e6666718cd963ec15b2623e44c31530690..a7ab078db93964e664360128d9437f6cb90aa944 100644
--- a/bob/ip/binseg/engine/predictor.py
+++ b/bob/ip/binseg/engine/predictor.py
@@ -168,8 +168,7 @@ def run(model, data_loader, device, output_folder, overlayed_folder):
         times.append(batch_time)
         len_samples.append(len(images))

-        for name, img, prob in zip(names, images, predictions):
-            stem = os.path.splitext(name)[0]
+        for stem, img, prob in zip(names, images, predictions):
             _save_hdf5(stem, prob, output_folder)
             if overlayed_folder is not None:
                 _save_overlayed_png(stem, img, prob, overlayed_folder)
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index f4a3f9fc522991b34f6cc25c26ac061e1d3afa9f..8c8e575fcb65e85ab588c7a622bf7f03d13ec360 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -3,7 +3,6 @@

 import os
 import click
-from torch.utils.data import DataLoader

 from bob.extension.scripts.click_helper import (
     verbosity_option,
@@ -11,9 +10,10 @@ from bob.extension.scripts.click_helper import (
     ResourceOption,
 )

-from ..engine.evaluator import run
+from ..engine.evaluator import run, compare_annotators

 import logging
+
 logger = logging.getLogger(__name__)


@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 \b
     1. Runs evaluation on an existing dataset configuration:
 \b
-       $ bob binseg evaluate -vv m2unet drive-test --predictions-folder=path/to/predictions --output-folder=path/to/results
+       $ bob binseg evaluate -vv m2unet drive --predictions-folder=path/to/predictions --output-folder=path/to/results
 \b
     2. To run evaluation on a folder with your own images and annotations, you
        must first specify resizing, cropping, etc, so that the image can be
@@ -69,6 +69,27 @@ logger = logging.getLogger(__name__)
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--second-annotator",
+    "-S",
+    help="A dataset or dictionary, like in --dataset, with the same "
+    "sample keys, but with annotations from a different annotator that is "
+    "going to be compared to the one in --dataset",
+    required=False,
+    default=None,
+    cls=ResourceOption,
+    show_default=True,
+)
+@click.option(
+    "--second-annotator-folder",
+    "-O",
+    help="Path where to store the analysis result for second annotator "
+    "comparisons (only used if --second-annotator is also passed)",
+    required=True,
+    default="second-annotator",
+    type=click.Path(),
+    cls=ResourceOption,
+)
 @click.option(
     "--overlayed",
     "-O",
@@ -99,15 +120,51 @@ logger = logging.getLogger(__name__)
     cls=ResourceOption,
 )
 @verbosity_option(cls=ResourceOption)
-def evaluate(output_folder, predictions_folder, dataset, overlayed,
-        overlay_threshold, **kwargs):
+def evaluate(
+    output_folder,
+    predictions_folder,
+    dataset,
+    second_annotator,
+    second_annotator_folder,
+    overlayed,
+    overlay_threshold,
+    **kwargs,
+):
     """Evaluates an FCN on a binary segmentation task.
     """
-    if isinstance(dataset, dict):
-        for k,v in dataset.items():
-            analysis_folder = os.path.join(output_folder, k)
-            with v.not_augmented() as d:
-                data_loader = DataLoader(dataset=d, batch_size=1,
-                        shuffle=False, pin_memory=False)
-                run(d, predictions_folder, analysis_folder, overlayed,
-                        overlay_threshold)
+
+    # if we work with dictionaries of datasets, then output evaluation
+    # information into sub-directories of the output_folder
+    config = {}
+    if not isinstance(dataset, dict):
+        config["test"] = {
+            "dataset": dataset,
+            "output_folder": output_folder,
+            "second_annotator": second_annotator,
+            "second_annotator_folder": second_annotator_folder,
+        }
+    else:
+        for k, v in dataset.items():
+            config[k] = {
+                "dataset": v,
+                "output_folder": os.path.join(output_folder, k),
+                "second_annotator": second_annotator.get(k),
+                "second_annotator_folder": os.path.join(
+                    second_annotator_folder, k
+                ),
+            }
+
+    for k, v in config.items():
+        with v["dataset"].not_augmented() as d:
+            run(
+                d,
+                predictions_folder,
+                v["output_folder"],
+                overlayed,
+                overlay_threshold,
+            )
+            if v["second_annotator"] is not None:
+                with v["second_annotator"].not_augmented() as d2:
+                    compare_annotators(
+                        d, d2, v["second_annotator_folder"], overlayed
+                    )
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index e4027229a7dd057065379dc534f6b17ae20083c9..6882d5b931bf69399d48bc83956ce78b459c4de0 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -59,6 +59,17 @@ logger = logging.getLogger(__name__)
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--second-annotator",
+    "-S",
+    help="A dataset or dictionary, like in --dataset, with the same "
+    "sample keys, but with annotations from a different annotator that is "
+    "going to be compared to the one in --dataset",
+    required=False,
+    default=None,
+    cls=ResourceOption,
+    show_default=True,
+)
 @click.option(
     "--optimizer",
     help="A torch.optim.Optimizer that will be used to train the network",
@@ -205,6 +216,7 @@ def experiment(
     drop_incomplete_batch,
     criterion,
     dataset,
+    second_annotator,
     checkpoint_period,
     device,
     seed,
@@ -301,11 +313,14 @@ def experiment(
     )

     analysis_folder = os.path.join(output_folder, "analysis")
+    second_annotator_folder = os.path.join(analysis_folder, "second-annotator")
     ctx.invoke(
         evaluate,
         output_folder=analysis_folder,
         predictions_folder=predictions_folder,
         dataset=dataset,
+        second_annotator=second_annotator,
+        second_annotator_folder=second_annotator_folder,
         overlayed=overlayed_folder,
         overlay_threshold=0.5,
         verbose=verbose,
@@ -321,7 +336,11 @@ def experiment(

     systems = []
     for k, v in dataset.items():
-        systems += [k, os.path.join(output_folder, "analysis", k, "metrics.csv")]
+        systems += [k, os.path.join(analysis_folder, k, "metrics.csv")]
+    if second_annotator is not None:
+        for k, v in second_annotator.items():
+            systems += [f"{k} (2nd. annot.)",
+                    os.path.join(second_annotator_folder, k, "metrics.csv")]
     output_pdf = os.path.join(output_folder, "comparison.pdf")
     ctx.invoke(compare, label_path=systems, output=output_pdf, verbose=verbose)

diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index d8af8c011c9b5ea2ede8d1056a5c0a9ee1d19bc4..41419ece1c3a32f41b46310e65d31b3f58ad8f5e 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 \b
     1. Runs prediction on an existing dataset configuration:
 \b
-       $ bob binseg predict -vv m2unet drive-test --weight=path/to/model_final.pth --output-folder=path/to/predictions
+       $ bob binseg predict -vv m2unet drive --weight=path/to/model_final.pth --output-folder=path/to/predictions
 \b
     2. To run prediction on a folder with your own images, you must first
        specify resizing, cropping, etc, so that the image can be correctly
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index b244bf8b8e78d3fedac17d2897a09aea21646e9d..cbaeea2849a67dcf7160d071b0d965c2e2da1155 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -79,6 +79,7 @@ def test_experiment_stare():
             "from bob.ip.binseg.configs.datasets.stare import _maker\n"
         )
         config.write("dataset = _maker('ah', _raw)\n")
+        config.write("second_annotator = _maker('vk', _raw)\n")
         config.flush()

         result = runner.invoke(
@@ -97,12 +98,12 @@ def test_experiment_stare():
            # "Saving results/overlayed/probabilities": 1,  #tqdm.write
            "Ended prediction": 1,  # logging
            "Started evaluation": 1,  # logging
-           "Highest F1-score of": 2,  # logging
+           "Highest F1-score of": 4,  # logging
            "Saving overall precision-recall plot": 2,  # logging
            # "Saving results/overlayed/analysis": 1,  #tqdm.write
            "Ended evaluation": 1,  # logging
            "Started comparison": 1,  # logging
-           "Loading metrics from results/analysis": 2,  # logging
+           "Loading metrics from results/analysis": 4,  # logging
            "Ended comparison": 1,  # logging
        }
        buf.seek(0)
@@ -114,7 +115,7 @@ def test_experiment_stare():
            #        f"instead of the expected {v}")
            assert _str_counter(k, logging_output) == v, (
                f"Count for string '{k}' appeared "
-               f"({_str_counter(k, result.output)}) "
+               f"({_str_counter(k, logging_output)}) "
                f"instead of the expected {v}"
            )
