From 2bc38491748c3db42b1e76a455698aafc6ad1780 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 28 Jun 2024 20:52:39 +0200
Subject: [PATCH] [libs.segmentation.scripts.evaluate] Implement 2nd. annotator
 analysis

---
 .../libs/segmentation/engine/evaluator.py     | 260 +++++++-----------
 .../libs/segmentation/scripts/evaluate.py     |  26 +-
 tests/segmentation/test_cli.py                |   4 +-
 3 files changed, 120 insertions(+), 170 deletions(-)

diff --git a/src/mednet/libs/segmentation/engine/evaluator.py b/src/mednet/libs/segmentation/engine/evaluator.py
index 3552aa1d..c467312d 100644
--- a/src/mednet/libs/segmentation/engine/evaluator.py
+++ b/src/mednet/libs/segmentation/engine/evaluator.py
@@ -496,6 +496,63 @@ def validate_threshold(threshold: float | str, splits: list[str]):
     return threshold
 
 
+def compare_annotators(
+    a1: pathlib.Path, a2: pathlib.Path
+) -> dict[str, dict[str, float]]:
+    """Compare annotators and outputs all supported metrics.
+
+    Parameters
+    ----------
+    a1
+        Annotator 1 annotations in the form of a JSON file mapping split-names
+        to lists of lists, each containing the sample name and the (relative)
+        location of an HDF5 file containing at least one boolean dataset named
+        ``target``.  This dataset is taken as the annotations from the first
+        annotator.  If a boolean ``mask`` dataset is available, it is also
+        loaded.  Elements outside the mask are not considered during the
+        metric calculations.
+    a2
+        Annotator 2 annotations in the form of a JSON file mapping split-names
+        to lists of lists, each containing the sample name and the (relative)
+        location of an HDF5 file containing at least one boolean dataset named
+        ``target``.  This dataset is taken as the annotations from the second
+        annotator.
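+
+        For illustration only, such a (hypothetical) file could look like::
+
+            {"test": [["<sample-name>", "<sample-name>.hdf5"], ...]}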
+
+    Returns
+    -------
+        A dictionary that maps split-names to another dictionary with metric
+        names and values computed by comparing the targets in ``a1`` and
+        ``a2``.
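+
+        For illustration only, the returned value could resemble::
+
+            {"test": {"precision": 0.95, "recall": 0.87, ...}}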
+    """
+
+    retval: dict[str, dict[str, float]] = {}
+
+    with a1.open("r") as f:
+        a1_data = json.load(f)
+
+    with a2.open("r") as f:
+        a2_data = json.load(f)
+
+    metrics_available = list(typing.get_args(SUPPORTED_METRIC_TYPE))
+
+    for split_name, samples in a1_data.items():
+        if split_name not in a2_data:
+            continue
+
+        # accumulate counts over all samples of the split so that the metrics
+        # reflect the whole split and not only the last sample processed
+        tp = fp = tn = fn = 0
+        for _, hdf5_path in samples:
+            with h5py.File(a1.parent / hdf5_path, "r") as f:
+                t1 = numpy.array(f["target"])
+                # the mask is optional; default to an all-true mask
+                mask = numpy.array(f["mask"]) if "mask" in f else numpy.ones_like(t1)
+            with h5py.File(a2.parent / hdf5_path, "r") as f:
+                t2 = numpy.array(f["target"])
+
+            # annotator 2 targets are scored against annotator 1 targets as
+            # binary predictions at a fixed threshold of 0.5
+            s_tp, s_fp, s_tn, s_fn = get_counts_for_threshold(t2, t1, mask, 0.5)
+            tp, fp, tn, fn = tp + s_tp, fp + s_fp, tn + s_tn, fn + s_fn
+
+        base_metrics = all_metrics(tp, fp, tn, fn)
+        retval[split_name] = dict(zip(metrics_available, base_metrics))
+
+    return retval
+
+
 def run(
     predictions: pathlib.Path,
     steps: int,
@@ -685,17 +742,21 @@ def make_table(
     table_headers = ["Dataset", "threshold"] + metrics_available + ["auroc", "avgprec"]
 
     table_data = []
-    for split_name in eval_data.keys():
-        base_metrics = [eval_data[split_name][k] for k in metrics_available]
+    for split_name, data in eval_data.items():
+        base_metrics = [data[k] for k in metrics_available]
         table_data.append(
             [split_name, threshold]
             + base_metrics
-            + [
-                eval_data[split_name]["auc_score"],
-                eval_data[split_name]["average_precision_score"],
-            ]
+            + [data["auc_score"], data["average_precision_score"]]
         )
 
+        if "second_annotator" in data:
+            table_data.append(
+                [split_name, "annotator/2"]
+                + list(data["second_annotator"].values())
+                + [None, None]
+            )
+
     return tabulate.tabulate(
         table_data,
         table_headers,
@@ -728,6 +789,19 @@ def make_plots(eval_data):
                 data["curves"]["roc"]["tpr"],
                 label=f"{split_name} (AUC: {data['auc_score']:.2f})",
             )
+
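+            # the second annotator corresponds to a single operating point on
+            # the ROC plane (FPR = 1 - specificity, TPR = recall)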
+            if "second_annotator" in data:
+                fpr = 1 - data["second_annotator"]["specificity"]
+                tpr = data["second_annotator"]["recall"]
+                ax.plot(
+                    fpr,
+                    tpr,
+                    linestyle="none",
+                    marker="*",
+                    markersize=8,
+                    label=f"{split_name} (annotator/2)",
+                )
+
             ax.legend(loc="best", fancybox=True, framealpha=0.7)
         retval.append(fig)
 
@@ -740,168 +814,20 @@ def make_plots(eval_data):
                 data["curves"]["precision_recall"]["recall"],
                 label=f"{split_name} (AP: {data['average_precision_score']:.2f})",
             )
+
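+            # the second annotator corresponds to a single operating point on
+            # the precision-recall plane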
+            if "second_annotator" in data:
+                precision = data["second_annotator"]["precision"]
+                recall = data["second_annotator"]["recall"]
+                ax.plot(
+                    precision,
+                    recall,
+                    linestyle="none",
+                    marker="*",
+                    markersize=8,
+                    label=f"{split_name} (annotator/2)",
+                )
+
             ax.legend(loc="best", fancybox=True, framealpha=0.7)
         retval.append(fig)
 
     return retval
-
-
-# def _compare_annotators_worker(
-#     baseline_sample: tuple[str, str],
-#     other_sample: tuple[str, str],
-#     name: str,
-#     output_folder: str | None,
-#     overlayed_folder: str | None,
-# ) -> tuple[str, pandas.DataFrame]:
-#     """Run all of the comparison steps on a single sample pair.
-
-#     Parameters
-#     ----------
-#     baseline_sample
-#         Sample to be processed, containing the stem of the filepath
-#         relative to the database root and a path the hdf5 file in which
-#         the image, target and mask are stored.
-#     other_sample
-#         Another sample that is identical to the first, but has a different
-#         mask (drawn by a different annotator).
-#     name
-#         The local name of the dataset (e.g. ``train``, or ``test``), to be
-#         used when saving measures files.
-#     output_folder
-#         If not ``None``, then outputs a copy of the evaluation for this
-#         sample in CSV format at this directory, but respecting the sample
-#         ``stem``.
-#     overlayed_folder
-#         If not ``None``, then outputs a version of the input image with
-#         predictions overlayed, in PNG format, but respecting the sample
-#         ``stem``.
-
-#     Returns
-#     -------
-#         The unique sample stem and a Dataframe containing the evaluation
-#         performance on this sample.
-#     """
-
-#     assert baseline_sample[0] == other_sample[0], (
-#         f"Mismatch between "
-#         f"datasets for second-annotator analysis "
-#         f"({baseline_sample[0]} != {other_sample[0]}).  This "
-#         f"typically occurs when the second annotator (`other`) "
-#         f"comes from a different dataset than the `baseline` dataset"
-#     )
-
-#     stem = baseline_sample[0]
-#     image = baseline_sample[1]
-#     gt = baseline_sample[2]
-#     pred = other_sample[2]  # works as a prediction
-#     mask = None if len(baseline_sample) < 4 else baseline_sample[3]
-#     retval = _sample_measures(pred, gt, mask, 2)
-
-#     if output_folder is not None:
-#         fullpath = output_folder / "second-annotator" / name / f"{stem}.csv"
-#         tqdm.write(f"Saving {fullpath}...")
-#         fullpath.parent.mkdir(parents=True, exist_ok=True)
-#         retval.to_csv(fullpath)
-
-#     if overlayed_folder is not None:
-#         overlay_image = _sample_analysis(
-#             image, pred, gt, mask, threshold=0.5, overlay=True
-#         )
-#         fullpath = overlayed_folder / "second-annotator" / name / f"{stem}.png"
-#         tqdm.write(f"Saving {fullpath}...")
-#         fullpath.parent.mkdir(parents=True, exist_ok=True)
-#         overlay_image.save(fullpath)
-
-#     return stem, retval
-
-
-# def compare_annotators(
-#     baseline: torch.utils.data.Dataset,
-#     other: torch.utils.data.Dataset,
-#     name: str,
-#     output_folder: str,
-#     overlayed_folder: str | None | None = None,
-#     parallel: str | None = -1,
-# ):
-#     """Compare annotations on the **same** dataset.
-
-#     Parameters
-#     ----------
-#     baseline
-#         A dataset to iterate on, containing the baseline annotations.
-#     other
-#         A second dataset, with the same samples as ``baseline``, but annotated
-#         by a different annotator than in the first dataset.  The key values
-#         must much between ``baseline`` and this dataset.
-#     name
-#         The local name of this dataset (e.g. ``train-second-annotator``, or
-#         ``test-second-annotator``), to be used when saving measures files.
-#     output_folder
-#         Folder where to store results.
-#     overlayed_folder
-#         If not ``None``, then it should be the name of a folder where to store
-#         overlayed versions of the images and ground-truths.
-#     parallel
-#         If set to a value different >= 0, uses multiprocessing for estimating
-#         thresholds for each sample through a processing pool.  A value of zero
-#         will create as many processes in the pool as cores in the machine.  A
-#         negative value disables multiprocessing altogether.  A value greater
-#         than zero will spawn as many processes as requested.
-#     """
-
-#     logger.info(f"Output folder: {output_folder}")
-#     output_folder.mkdir(parents=True, exist_ok=True)
-
-#     # Collect overall measures
-#     data = {}
-
-#     if parallel < 0:  # turns off multiprocessing
-#         for baseline_sample, other_sample in tqdm(
-#             list(zip(baseline, other)),
-#             desc="samples",
-#             leave=False,
-#             disable=None,
-#         ):
-#             k, v = _compare_annotators_worker(
-#                 (
-#                     baseline_sample,
-#                     other_sample,
-#                     name,
-#                     output_folder,
-#                     overlayed_folder,
-#                 )
-#             )
-#             data[k] = v
-#     else:
-#         parallel = parallel or multiprocessing.cpu_count()
-#         with (
-#             multiprocessing.Pool(processes=parallel) as pool,
-#             tqdm(
-#                 total=len(baseline),
-#                 desc="sample",
-#             ) as pbar,
-#         ):
-#             for k, v in pool.imap_unordered(
-#                 _compare_annotators_worker,
-#                 zip(
-#                     baseline,
-#                     other,
-#                     itertools.repeat(name),
-#                     itertools.repeat(output_folder),
-#                     itertools.repeat(overlayed_folder),
-#                 ),
-#             ):
-#                 pbar.update()
-#                 data[k] = v
-
-#     measures = _summarize(data)
-#     measures.drop(0, inplace=True)  # removes threshold == 0.0, keeps 0.5 only
-
-#     measures_path = output_folder / "second-annotator" / f"{name}.csv"
-
-#     measures_path.parent.mkdir(parents=True, exist_ok=True)
-#     logger.info(f"Saving summaries over all input images at {measures_path}...")
-#     measures.to_csv(measures_path)
-
-#     maxf1 = measures["mean_f1_score"].max()
-#     logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
diff --git a/src/mednet/libs/segmentation/scripts/evaluate.py b/src/mednet/libs/segmentation/scripts/evaluate.py
index 1ffc233c..704f932b 100644
--- a/src/mednet/libs/segmentation/scripts/evaluate.py
+++ b/src/mednet/libs/segmentation/scripts/evaluate.py
@@ -23,11 +23,18 @@ __import__("matplotlib").use("agg")
     epilog="""Examples:
 
 \b
-  1. Runs evaluation on an existing dataset configuration:
+  1. Runs evaluation from existing predictions:
 
      .. code:: sh
 
         $ mednet segmentation evaluate -vv --predictions=path/to/predictions.json --output-folder=path/to/results
+
+  2. Runs evaluation from existing predictions, incorporating a second set of
+     annotations into the evaluation:
+
+     .. code:: sh
+
+        $ mednet segmentation evaluate -vv --predictions=path/to/predictions.json --compare-annotator=path/to/annotations.json --output-folder=path/to/results
 """,
 )
 @click.option(
@@ -142,7 +149,12 @@ def evaluate(
         save_json_metadata,
         save_json_with_backup,
     )
-    from mednet.libs.segmentation.engine.evaluator import make_plots, make_table, run
+    from mednet.libs.segmentation.engine.evaluator import (
+        compare_annotators,
+        make_plots,
+        make_table,
+        run,
+    )
 
     evaluation_file = output_folder / "evaluation.json"
 
@@ -159,6 +171,16 @@ def evaluate(
 
     eval_json_data, threshold_value = run(predictions, steps, threshold, metric)
 
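+    # optionally scores the second annotator's targets against the reference
+    # targets and attaches the result to each matching split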
+    if compare_annotator is not None:
+        logger.info(f"Comparing 2nd. annotator using `{str(compare_annotator)}`...")
+        comparison = compare_annotators(predictions, compare_annotator)
+        assert comparison, (
+            f"No matching split-names found on `{str(compare_annotator)}`. "
+            f"Are these the right annotations?"
+        )
+        for split_name, values in comparison.items():
+            eval_json_data.setdefault(split_name, {})["second_annotator"] = values
+
     # Records full result analysis to a JSON file
     logger.info(f"Saving evaluation results at `{str(evaluation_file)}`...")
     save_json_with_backup(evaluation_file, eval_json_data)
diff --git a/tests/segmentation/test_cli.py b/tests/segmentation/test_cli.py
index 6e11b628..a2e35d2e 100644
--- a/tests/segmentation/test_cli.py
+++ b/tests/segmentation/test_cli.py
@@ -314,6 +314,7 @@ def test_evaluate_lwnet_drive(session_tmp_path):
                 f"--predictions={str(output_folder / 'predictions.json')}",
                 f"--output-folder={str(output_folder)}",
                 "--threshold=test",
+                f"--compare-annotator={str(output_folder / 'second-annotator' / 'annotations.json')}",
             ],
         )
         _assert_exit_0(result)
@@ -327,7 +328,8 @@ def test_evaluate_lwnet_drive(session_tmp_path):
             r"^Writing run metadata at .*$": 1,
             r"^Counting true/false positive/negatives at split.*$": 2,
             r"^Evaluating threshold on split .*$": 1,
-            r"^Computing metrics on split .*...$": 2,
+            r"^Computing metrics on split .*$": 2,
+            r"^Comparing 2nd. annotator using .*$": 1,
             r"^Saving evaluation results at .*$": 1,
             r"^Saving tabulated performance summary at .*$": 1,
             r"^Saving evaluation figures at .*$": 1,
-- 
GitLab