From 2bc38491748c3db42b1e76a455698aafc6ad1780 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 28 Jun 2024 20:52:39 +0200
Subject: [PATCH] [libs.segmentation.scripts.evaluate] Implement 2nd. annotator analysis

---
 .../libs/segmentation/engine/evaluator.py | 260 +++++++-----------
 .../libs/segmentation/scripts/evaluate.py |  26 +-
 tests/segmentation/test_cli.py            |   4 +-
 3 files changed, 120 insertions(+), 170 deletions(-)

diff --git a/src/mednet/libs/segmentation/engine/evaluator.py b/src/mednet/libs/segmentation/engine/evaluator.py
index 3552aa1d..c467312d 100644
--- a/src/mednet/libs/segmentation/engine/evaluator.py
+++ b/src/mednet/libs/segmentation/engine/evaluator.py
@@ -496,6 +496,63 @@ def validate_threshold(threshold: float | str, splits: list[str]):
     return threshold
 
 
+def compare_annotators(
+    a1: pathlib.Path, a2: pathlib.Path
+) -> dict[str, dict[str, float]]:
+    """Compare annotators and output all supported metrics.
+
+    Parameters
+    ----------
+    a1
+        Annotator 1 annotations in the form of a JSON file mapping split-names
+        to lists of lists, each containing the sample name and the (relative)
+        location of an HDF5 file containing at least one boolean dataset named
+        ``target``. This dataset is considered as the annotations from the
+        first annotator. If a boolean ``mask`` is available, it is also
+        loaded. All elements outside the mask are not considered during the
+        metrics calculations.
+    a2
+        Annotator 2 annotations in the form of a JSON file mapping split-names
+        to lists of lists, each containing the sample name and the (relative)
+        location of an HDF5 file containing at least one boolean dataset named
+        ``target``. This dataset is considered as the annotations from the
+        second annotator.
+
+    Returns
+    -------
+    A dictionary that maps split-names to another dictionary with metric
+    names and values computed by comparing the targets in ``a1`` and
+    ``a2``.
+    """
+
+    retval: dict[str, dict[str, float]] = {}
+
+    with a1.open("r") as f:
+        a1_data = json.load(f)
+
+    with a2.open("r") as f:
+        a2_data = json.load(f)
+
+    metrics_available = list(typing.get_args(SUPPORTED_METRIC_TYPE))
+
+    for split_name, samples in a1_data.items():
+        if split_name not in a2_data:
+            continue
+
+        tp = fp = tn = fn = 0  # accumulated over all samples of this split
+        for _, hdf5_path in samples:
+            with h5py.File(a1.parent / hdf5_path, "r") as f:
+                t1 = numpy.array(f["target"])
+                mask = numpy.array(f["mask"]) if "mask" in f else numpy.ones_like(t1)
+            with h5py.File(a2.parent / hdf5_path, "r") as f:
+                t2 = numpy.array(f["target"])
+            s_tp, s_fp, s_tn, s_fn = get_counts_for_threshold(t2, t1, mask, 0.5)
+            tp, fp, tn, fn = tp + s_tp, fp + s_fp, tn + s_tn, fn + s_fn
+
+        retval[split_name] = dict(zip(metrics_available, all_metrics(tp, fp, tn, fn)))
+
+    return retval
+
+
 def run(
     predictions: pathlib.Path,
     steps: int,
@@ -685,17 +742,21 @@ def make_table(
     table_headers = ["Dataset", "threshold"] + metrics_available + ["auroc", "avgprec"]
     table_data = []
-    for split_name in eval_data.keys():
-        base_metrics = [eval_data[split_name][k] for k in metrics_available]
+    for split_name, data in eval_data.items():
+        base_metrics = [data[k] for k in metrics_available]
         table_data.append(
             [split_name, threshold]
             + base_metrics
-            + [
-                eval_data[split_name]["auc_score"],
-                eval_data[split_name]["average_precision_score"],
-            ]
+            + [data["auc_score"], data["average_precision_score"]]
         )
 
+        if "second_annotator" in data:
+            table_data.append(
+                [split_name, "annotator/2"]
+                + list(data["second_annotator"].values())
+                + [None, None]
+            )
+
     return tabulate.tabulate(
         table_data,
         table_headers,
@@ -728,6 +789,19 @@ def make_plots(eval_data):
             data["curves"]["roc"]["tpr"],
             label=f"{split_name} (AUC: {data['auc_score']:.2f})",
         )
+
+        if "second_annotator" in data:
+            fpr = 1 - data["second_annotator"]["specificity"]
+            tpr = data["second_annotator"]["recall"]
+            ax.plot(
+                fpr,
+                tpr,
+                linestyle="none",
+                marker="*",
+                markersize=8,
+                label=f"{split_name} (annotator/2)",
+            )
+
     ax.legend(loc="best", fancybox=True, framealpha=0.7)
     retval.append(fig)
 
@@ -740,168 +814,20 @@ def make_plots(eval_data):
             data["curves"]["precision_recall"]["recall"],
             label=f"{split_name} (AP: {data['average_precision_score']:.2f})",
        )
+
+        if "second_annotator" in data:
+            precision = data["second_annotator"]["precision"]
+            recall = data["second_annotator"]["recall"]
+            ax.plot(
+                precision,
+                recall,
+                linestyle="none",
+                marker="*",
+                markersize=8,
+                label=f"{split_name} (annotator/2)",
+            )
+
     ax.legend(loc="best", fancybox=True, framealpha=0.7)
     retval.append(fig)
 
     return retval
-
-
-# def _compare_annotators_worker(
-#     baseline_sample: tuple[str, str],
-#     other_sample: tuple[str, str],
-#     name: str,
-#     output_folder: str | None,
-#     overlayed_folder: str | None,
-# ) -> tuple[str, pandas.DataFrame]:
-#     """Run all of the comparison steps on a single sample pair.
-
-#     Parameters
-#     ----------
-#     baseline_sample
-#         Sample to be processed, containing the stem of the filepath
-#         relative to the database root and a path the hdf5 file in which
-#         the image, target and mask are stored.
-#     other_sample
-#         Another sample that is identical to the first, but has a different
-#         mask (drawn by a different annotator).
-#     name
-#         The local name of the dataset (e.g. ``train``, or ``test``), to be
-#         used when saving measures files.
-#     output_folder
-#         If not ``None``, then outputs a copy of the evaluation for this
-#         sample in CSV format at this directory, but respecting the sample
-#         ``stem``.
-#     overlayed_folder
-#         If not ``None``, then outputs a version of the input image with
-#         predictions overlayed, in PNG format, but respecting the sample
-#         ``stem``.
-
-#     Returns
-#     -------
-#     The unique sample stem and a Dataframe containing the evaluation
-#     performance on this sample.
-#     """
-
-#     assert baseline_sample[0] == other_sample[0], (
-#         f"Mismatch between "
-#         f"datasets for second-annotator analysis "
-#         f"({baseline_sample[0]} != {other_sample[0]}). This "
-#         f"typically occurs when the second annotator (`other`) "
-#         f"comes from a different dataset than the `baseline` dataset"
-#     )
-
-#     stem = baseline_sample[0]
-#     image = baseline_sample[1]
-#     gt = baseline_sample[2]
-#     pred = other_sample[2]  # works as a prediction
-#     mask = None if len(baseline_sample) < 4 else baseline_sample[3]
-#     retval = _sample_measures(pred, gt, mask, 2)
-
-#     if output_folder is not None:
-#         fullpath = output_folder / "second-annotator" / name / f"{stem}.csv"
-#         tqdm.write(f"Saving {fullpath}...")
-#         fullpath.parent.mkdir(parents=True, exist_ok=True)
-#         retval.to_csv(fullpath)
-
-#     if overlayed_folder is not None:
-#         overlay_image = _sample_analysis(
-#             image, pred, gt, mask, threshold=0.5, overlay=True
-#         )
-#         fullpath = overlayed_folder / "second-annotator" / name / f"{stem}.png"
-#         tqdm.write(f"Saving {fullpath}...")
-#         fullpath.parent.mkdir(parents=True, exist_ok=True)
-#         overlay_image.save(fullpath)
-
-#     return stem, retval
-
-
-# def compare_annotators(
-#     baseline: torch.utils.data.Dataset,
-#     other: torch.utils.data.Dataset,
-#     name: str,
-#     output_folder: str,
-#     overlayed_folder: str | None | None = None,
-#     parallel: str | None = -1,
-# ):
-#     """Compare annotations on the **same** dataset.
-
-#     Parameters
-#     ----------
-#     baseline
-#         A dataset to iterate on, containing the baseline annotations.
-#     other
-#         A second dataset, with the same samples as ``baseline``, but annotated
-#         by a different annotator than in the first dataset. The key values
-#         must much between ``baseline`` and this dataset.
-#     name
-#         The local name of this dataset (e.g. ``train-second-annotator``, or
-#         ``test-second-annotator``), to be used when saving measures files.
-#     output_folder
-#         Folder where to store results.
-#     overlayed_folder
-#         If not ``None``, then it should be the name of a folder where to store
-#         overlayed versions of the images and ground-truths.
-#     parallel
-#         If set to a value different >= 0, uses multiprocessing for estimating
-#         thresholds for each sample through a processing pool. A value of zero
-#         will create as many processes in the pool as cores in the machine. A
-#         negative value disables multiprocessing altogether. A value greater
-#         than zero will spawn as many processes as requested.
-#     """
-
-#     logger.info(f"Output folder: {output_folder}")
-#     output_folder.mkdir(parents=True, exist_ok=True)
-
-#     # Collect overall measures
-#     data = {}
-
-#     if parallel < 0:  # turns off multiprocessing
-#         for baseline_sample, other_sample in tqdm(
-#             list(zip(baseline, other)),
-#             desc="samples",
-#             leave=False,
-#             disable=None,
-#         ):
-#             k, v = _compare_annotators_worker(
-#                 (
-#                     baseline_sample,
-#                     other_sample,
-#                     name,
-#                     output_folder,
-#                     overlayed_folder,
-#                 )
-#             )
-#             data[k] = v
-#     else:
-#         parallel = parallel or multiprocessing.cpu_count()
-#         with (
-#             multiprocessing.Pool(processes=parallel) as pool,
-#             tqdm(
-#                 total=len(baseline),
-#                 desc="sample",
-#             ) as pbar,
-#         ):
-#             for k, v in pool.imap_unordered(
-#                 _compare_annotators_worker,
-#                 zip(
-#                     baseline,
-#                     other,
-#                     itertools.repeat(name),
-#                     itertools.repeat(output_folder),
-#                     itertools.repeat(overlayed_folder),
-#                 ),
-#             ):
-#                 pbar.update()
-#                 data[k] = v
-
-#     measures = _summarize(data)
-#     measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only
-
-#     measures_path = output_folder / "second-annotator" / f"{name}.csv"
-
-#     measures_path.parent.mkdir(parents=True, exist_ok=True)
-#     logger.info(f"Saving summaries over all input images at {measures_path}...")
-#     measures.to_csv(measures_path)
-
-#     maxf1 = measures["mean_f1_score"].max()
-#     logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
diff --git a/src/mednet/libs/segmentation/scripts/evaluate.py b/src/mednet/libs/segmentation/scripts/evaluate.py
index 1ffc233c..704f932b 100644
--- a/src/mednet/libs/segmentation/scripts/evaluate.py
+++ b/src/mednet/libs/segmentation/scripts/evaluate.py
@@ -23,11 +23,18 @@ __import__("matplotlib").use("agg")
     epilog="""Examples:
 \b
-  1. Runs evaluation on an existing dataset configuration:
+  1. Runs evaluation from existing predictions:
 
      .. code:: sh
 
        $ mednet segmentation evaluate -vv --predictions=path/to/predictions.json --output-folder=path/to/results
+
+  2. Runs evaluation from existing predictions, incorporating a second set of
+     annotations into the evaluation:
+
+     .. code:: sh
+
+       $ mednet segmentation evaluate -vv --predictions=path/to/predictions.json --compare-annotator=path/to/annotations.json --output-folder=path/to/results
 """,
 )
 @click.option(
@@ -142,7 +149,12 @@ def evaluate(
         save_json_metadata,
         save_json_with_backup,
     )
-    from mednet.libs.segmentation.engine.evaluator import make_plots, make_table, run
+    from mednet.libs.segmentation.engine.evaluator import (
+        compare_annotators,
+        make_plots,
+        make_table,
+        run,
+    )
 
     evaluation_file = output_folder / "evaluation.json"
 
@@ -159,6 +171,16 @@ def evaluate(
 
     eval_json_data, threshold_value = run(predictions, steps, threshold, metric)
 
+    if compare_annotator is not None:
+        logger.info(f"Comparing 2nd. annotator using `{str(compare_annotator)}`...")
+        comparison = compare_annotators(predictions, compare_annotator)
+        assert comparison, (
+            f"No matching split-names found in `{str(compare_annotator)}`. "
+            "Are these the right annotations?"
+        )
+        for split_name, values in comparison.items():
+            eval_json_data.setdefault(split_name, {})["second_annotator"] = values
+
     # Records full result analysis to a JSON file
     logger.info(f"Saving evaluation results at `{str(evaluation_file)}`...")
     save_json_with_backup(evaluation_file, eval_json_data)
diff --git a/tests/segmentation/test_cli.py b/tests/segmentation/test_cli.py
index 6e11b628..a2e35d2e 100644
--- a/tests/segmentation/test_cli.py
+++ b/tests/segmentation/test_cli.py
@@ -314,6 +314,7 @@ def test_evaluate_lwnet_drive(session_tmp_path):
             f"--predictions={str(output_folder / 'predictions.json')}",
             f"--output-folder={str(output_folder)}",
             "--threshold=test",
+            f"--compare-annotator={str(output_folder / 'second-annotator' / 'annotations.json')}",
         ],
     )
     _assert_exit_0(result)
@@ -327,7 +328,8 @@ def test_evaluate_lwnet_drive(session_tmp_path):
         r"^Writing run metadata at .*$": 1,
         r"^Counting true/false positive/negatives at split.*$": 2,
         r"^Evaluating threshold on split .*$": 1,
-        r"^Computing metrics on split .*...$": 2,
+        r"^Computing metrics on split .*$": 2,
+        r"^Comparing 2nd. annotator using .*$": 1,
         r"^Saving evaluation results at .*$": 1,
         r"^Saving tabulated performance summary at .*$": 1,
         r"^Saving evaluation figures at .*$": 1,
-- 
GitLab
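
To exercise the new --compare-annotator flow by hand, the sketch below creates a pair of annotation files in the layout that compare_annotators() documents above: a JSON file mapping split-names to [sample-name, relative-HDF5-path] entries, with each HDF5 file holding a boolean ``target`` and, optionally, a boolean ``mask``. It then computes the pixel-wise agreement counts at the fixed 0.5 threshold, mirroring what the new function does per split. This is an illustration only: the directory names, the ``test`` split name, the sample name and the synthetic 16x16 targets are invented for the example, and the snippet does not import mednet itself.

.. code:: python

    import json
    import pathlib
    import tempfile

    import h5py
    import numpy

    workdir = pathlib.Path(tempfile.mkdtemp())
    rng = numpy.random.default_rng(42)

    # two boolean "targets" for the same (hypothetical) sample, one per annotator
    target_1 = rng.random((16, 16)) > 0.5
    target_2 = target_1.copy()
    target_2[:4, :4] = ~target_2[:4, :4]  # annotator 2 disagrees on one corner

    for name, target in (("annotator1", target_1), ("annotator2", target_2)):
        subdir = workdir / name
        subdir.mkdir()
        with h5py.File(subdir / "sample-001.hdf5", "w") as f:
            f.create_dataset("target", data=target)
            f.create_dataset("mask", data=numpy.ones_like(target))
        # JSON index: split-name -> list of [sample-name, relative HDF5 location]
        with (subdir / "annotations.json").open("w") as f:
            json.dump({"test": [["sample-001", "sample-001.hdf5"]]}, f)

    # pixel-wise agreement at threshold 0.5 (a no-op on boolean data), inside the mask
    with h5py.File(workdir / "annotator1" / "sample-001.hdf5", "r") as f:
        t1 = numpy.array(f["target"]).astype(bool)
        mask = numpy.array(f["mask"]).astype(bool)
    with h5py.File(workdir / "annotator2" / "sample-001.hdf5", "r") as f:
        t2 = numpy.array(f["target"]).astype(bool)

    pred, gt = t2 & mask, t1 & mask
    tp = int(numpy.sum(pred & gt))
    fp = int(numpy.sum(pred & ~gt))
    tn = int(numpy.sum(~pred & ~gt & mask))
    fn = int(numpy.sum(~pred & gt))
    print(f"tp={tp} fp={fp} tn={tn} fn={fn}")  # tp+tn == 240, fp+fn == 16

With two files laid out like this, the second CLI example in the epilog applies directly: point --compare-annotator at the annotator-2 annotations.json while --predictions points at the evaluation's own predictions file.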