diff --git a/helpers/extract_hdf5_images.py b/helpers/extract_hdf5_images.py
index 61c79c912ac47aa2d3b035ced0938f114d12d7c6..bf1d6b300c79a8ae80423d1da2bdfa440908dcb2 100644
--- a/helpers/extract_hdf5_images.py
+++ b/helpers/extract_hdf5_images.py
@@ -16,9 +16,15 @@ def save_images(tensors_dict, output_dir):
 def extract_images_from_hdf5(hdf5_file):
     tensors_dict = {}
     with h5py.File(hdf5_file, "r") as f:
-        tensors_dict["image"] = torch.from_numpy(f.get("img")[:])
-        tensors_dict["target"] = torch.from_numpy(f.get("target")[:])
-        tensors_dict["mask"] = torch.from_numpy(f.get("mask")[:])
+        img = f["img"]
+        assert isinstance(img, h5py.Dataset)
+        tensors_dict["image"] = torch.from_numpy(img[:])
+        target = f["target"]
+        assert isinstance(target, h5py.Dataset)
+        tensors_dict["target"] = torch.from_numpy(target[:])
+        mask = f["mask"]
+        assert isinstance(mask, h5py.Dataset)
+        tensors_dict["mask"] = torch.from_numpy(mask[:])
 
     return tensors_dict
 
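For context on the pattern adopted above: `h5py`'s `Group.get()` returns `None` for a missing key (and its return type is too loose for static checkers), whereas item access raises `KeyError`, and the `isinstance` assertion narrows the result to `h5py.Dataset` before slicing. A minimal, self-contained sketch of the same pattern, using a hypothetical throw-away file:

```python
import h5py
import numpy
import torch

# write a small throw-away file (hypothetical path and shape)
with h5py.File("example.h5", "w") as f:
    f.create_dataset("img", data=numpy.random.rand(2, 3, 8, 8).astype("float32"))

# read it back the same way the helper now does
with h5py.File("example.h5", "r") as f:
    img = f["img"]                        # raises KeyError if "img" is absent
    assert isinstance(img, h5py.Dataset)  # narrows Dataset | Group | Datatype
    tensor = torch.from_numpy(img[:])     # img[:] materializes a numpy array

print(tensor.shape)  # torch.Size([2, 3, 8, 8])
```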
diff --git a/src/mednet/libs/classification/scripts/evaluate.py b/src/mednet/libs/classification/scripts/evaluate.py
index 418600ee95954ddb8bd6d57b0013b4b0411021ba..31c2c60a2246e9e1b84b236757a27d6a9c61f755 100644
--- a/src/mednet/libs/classification/scripts/evaluate.py
+++ b/src/mednet/libs/classification/scripts/evaluate.py
@@ -128,12 +128,11 @@ def evaluate(
         tabulate_results,
     )
 
-    with predictions.open("r") as f:
-        predict_data = json.load(f)
+    evaluation_file = output_folder / "evaluation.json"
 
     # register metadata
     save_json_metadata(
-        output_file=output_folder / "evaluation.meta.json",
+        output_file=evaluation_file.with_suffix(".meta.json"),
         predictions=str(predictions),
         output_folder=str(output_folder),
         threshold=threshold,
@@ -141,6 +140,9 @@ def evaluate(
         plot=plot,
     )
 
+    with predictions.open("r") as f:
+        predict_data = json.load(f)
+
     if threshold in predict_data:
         # it is the name of a split
         # first run evaluation for reference dataset
@@ -171,7 +173,6 @@ def evaluate(
         )
 
     # records full result analysis to a JSON file
-    evaluation_file = output_folder / "evaluation.json"
     logger.info(f"Saving evaluation results at `{str(evaluation_file)}`...")
     save_json_with_backup(evaluation_file, results)
 
diff --git a/src/mednet/libs/segmentation/engine/evaluator.py b/src/mednet/libs/segmentation/engine/evaluator.py
index 46b2d9ff21b0fdf9807d171bbc632b98503d0f84..289071a5e8ec044c1dad98c813f4993a24e51888 100644
--- a/src/mednet/libs/segmentation/engine/evaluator.py
+++ b/src/mednet/libs/segmentation/engine/evaluator.py
@@ -4,13 +4,17 @@
 
 """Defines functionality for the evaluation of predictions."""
 
+import json
 import logging
 import pathlib
 import typing
 
+import credible.curves
+import credible.plot
 import h5py
 import numpy
 import numpy.typing
+import tabulate
 from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
@@ -370,9 +374,9 @@ def load_count(
     data = numpy.zeros((len(thresholds), 4), dtype=numpy.uint64)
     for sample in tqdm(predictions, desc="sample"):
         with h5py.File(prediction_path / sample[1], "r") as f:
-            pred = numpy.array(f.get("prediction"))  # float32
-            gt = numpy.array(f.get("target"))  # boolean
-            mask = numpy.array(f.get("mask"))  # boolean
+            pred = numpy.array(f["prediction"])  # float32
+            gt = numpy.array(f["target"])  # boolean
+            mask = numpy.array(f["mask"])  # boolean
         data += numpy.array(
             [get_counts_for_threshold(pred, gt, mask, k) for k in thresholds],
             dtype=numpy.uint64,
@@ -411,7 +415,8 @@ def load_predictions(
 
     # peak prediction size and number of samples
     with h5py.File(prediction_path / predictions[0][1], "r") as f:
-        elements = numpy.array(f.get("prediction").shape).prod()
+        data: h5py.Dataset = typing.cast(h5py.Dataset, f["prediction"])
+        elements = numpy.array(data.shape).prod()
     size = len(predictions) * elements
     logger.info(
         f"Data loading will require ({elements} x {len(predictions)} x 5 =) "
@@ -423,10 +428,10 @@ def load_predictions(
     gt_array = numpy.empty((size,), dtype=numpy.bool_)
     for i, sample in enumerate(tqdm(predictions, desc="sample")):
         with h5py.File(prediction_path / sample[1], "r") as f:
-            mask = numpy.array(f.get("mask"))  # boolean
-            pred = numpy.array(f.get("prediction"))  # float32
+            mask = numpy.array(f["mask"])  # boolean
+            pred = numpy.array(f["prediction"])  # float32
             pred *= mask.astype(numpy.float32)
-            gt = numpy.array(f.get("target"))  # boolean
+            gt = numpy.array(f["target"])  # boolean
             gt &= mask
             pred_array[i * elements : (i + 1) * elements] = pred.flatten()
             gt_array[i * elements : (i + 1) * elements] = gt.flatten()
@@ -461,6 +466,278 @@ def compute_metric(
     return numpy.array([metric(*k) for k in counts], dtype=numpy.float64)
 
 
+def validate_threshold(threshold: float | str, splits: list[str]):
+    """Validate the user threshold selection and returns parsed threshold.
+
+    Parameters
+    ----------
+    threshold
+        The threshold to validate.
+    splits
+        List of available splits.
+
+    Returns
+    -------
+        The validated threshold.
+    """
+    try:
+        # we try to convert it to float first
+        threshold = float(threshold)
+    except ValueError:
+        # not a number: it must then name one of the available splits
+        if threshold not in splits:
+            raise ValueError(
+                f"Text thresholds should match dataset names, "
+                f"but {threshold} is not available among the datasets provided "
+                f"({', '.join(splits)})"
+            )
+    else:
+        if threshold < 0.0 or threshold > 1.0:
+            raise ValueError("Float thresholds must be within range [0.0, 1.0]")
+
+    return threshold
+
+
+def run(
+    predictions: pathlib.Path,
+    steps: int,
+    threshold: str | float,
+    metric: SUPPORTED_METRIC_TYPE,
+) -> tuple[dict[str, dict[str, typing.Any]], float]:
+    """Evaluate a segmentation model.
+
+    Parameters
+    ----------
+    predictions
+        Path to the file ``predictions.json``, containing the list of
+        predictions to be evaluated.
+    steps
+        The number of steps between ``[0, 1]`` to build a threshold list
+        from.  This list will be applied to the probability outputs and
+        true/false positive/negative counts generated from those.
+    threshold
+        Which threshold to apply when generating unary summaries of the
+        performance.  This can be a value between ``[0, 1]``, or the name
+        of a split in ``predictions`` where a threshold will be calculated
+        at.
+    metric
+        The name of a supported metric that will be used to evaluate the
+        best threshold from a threshold list uniformly split in ``steps``,
+        and for which unary summaries are generated.
+
+    Returns
+    -------
+        A JSON-able summary with all figures of merit pre-calculated, for
+        all splits.  This is a dictionary where keys are split-names contained
+        in ``predictions``, and values are dictionaries with the following
+        keys:
+
+            * ``counts``: dictionary where keys are thresholds, and values are
+              sequences of integers containing the TP, FP, TN, FN (in this order).
+            * ``auc_score``: a float indicating the area under the ROC curve
+              for the split.  It is calculated using a trapezoidal rule.
+            * ``average_precision_score``: a float indicating the area under the
+              precision-recall curve, calculated using a rectangle rule.
+            * ``curves``: dictionary with 2 keys:
+              * ``roc``: dictionary with 3 keys:
+                * ``fpr``: a list of floats with the false-positive rate
+                * ``tpr``: a list of floats with the true-positive rate
+                * ``thresholds``: a list of thresholds uniformly separated by
+                  ``steps``, at which both ``fpr`` and ``tpr`` are evaluated.
+              * ``precision_recall``: a dictionary with 3 keys:
+                * ``precision``: a list of floats with the precision
+                * ``recall``: a list of floats with the recall
+                * ``thresholds``: a list of thresholds uniformly separated by
+                  ``steps``, at which both ``precision`` and ``recall`` are
+                  evaluated.
+            * ``threshold_a_priori``: boolean indicating whether the threshold
+              used for unary metrics was chosen a priori, or computed a
+              posteriori on this split.
+            * ``<metric-name>``: a float representing the supported metric at the
+              threshold that maximizes ``metric``.  There will be one entry of this
+              type for each of the :py:obj:`SUPPORTED_METRIC_TYPE`'s.
+
+        Also returns the threshold considered for all splits.
+    """
+
+    with predictions.open("r") as f:
+        predict_data = json.load(f)
+
+    threshold = validate_threshold(threshold, predict_data)
+    threshold_list = numpy.arange(
+        0.0, (1.0 + 1 / steps), 1 / steps, dtype=numpy.float64
+    )
+
+    # Holds all computed data.  Format <split-name: str> -> <split-data: dict>
+    eval_json_data: dict[str, dict[str, typing.Any]] = {}
+
+    # Compute counts for various splits.
+    for split_name, samples in predict_data.items():
+        logger.info(
+            f"Counting true/false positive/negatives at split `{split_name}`..."
+        )
+        counts = load_count(predictions.parent, samples, threshold_list)
+
+        logger.info(f"Evaluating performance curves/metrics at split `{split_name}`...")
+        fpr_curve = 1.0 - numpy.array([specificity(*k) for k in counts])
+        recall_curve = tpr_curve = numpy.array([recall(*k) for k in counts])
+        precision_curve = numpy.array([precision(*k) for k in counts])
+
+        # populates data to be recorded in JSON format
+        eval_json_data.setdefault(split_name, {})["counts"] = {
+            k: v for k, v in zip(threshold_list, counts)
+        }
+        eval_json_data.setdefault(split_name, {})["auc_score"] = (
+            credible.curves.area_under_the_curve((fpr_curve, tpr_curve))
+        )
+        eval_json_data.setdefault(split_name, {})["average_precision_score"] = (
+            credible.curves.average_metric((precision_curve, recall_curve))
+        )
+        eval_json_data.setdefault(split_name, {})["curves"] = dict(
+            roc=dict(fpr=fpr_curve, tpr=tpr_curve, thresholds=threshold_list),
+            precision_recall=dict(
+                precision=precision_curve,
+                recall=recall_curve,
+                thresholds=threshold_list,
+            ),
+        )
+
+    # If the threshold is a split name (typically "validation"), find the index
+    # in the threshold list at which the chosen metric reaches its **maximum**
+    # on that split's counts.
+    if isinstance(threshold, str):
+        # Compute threshold on specified split, if required
+        logger.info(f"Evaluating threshold on split `{threshold}` using " f"`{metric}`")
+        metric_list = compute_metric(
+            eval_json_data[threshold]["counts"].values(),
+            name2metric(typing.cast(SUPPORTED_METRIC_TYPE, metric)),
+        )
+        threshold_index = metric_list.argmax()
+
+        # Record how the threshold was obtained: a posteriori on the split it is
+        # computed from, a priori on every other split.
+        for split_name in predict_data.keys():
+            if split_name == threshold:
+                eval_json_data[split_name]["threshold_a_priori"] = False
+            else:
+                eval_json_data[split_name]["threshold_a_priori"] = True
+
+    else:
+        # must figure out the closest threshold from the list we are using
+        threshold_index = (numpy.abs(threshold_list - threshold)).argmin()
+
+        # Record how the threshold was obtained: a fixed value was given, so it
+        # is a priori for every split.
+        for split_name in predict_data.keys():
+            eval_json_data[split_name]["threshold_a_priori"] = True
+
+    logger.info(f"Set --threshold={threshold_list[threshold_index]:.4f}")
+
+    # Computes all available metrics on the designated threshold, across all
+    # splits
+    # Populates <split-name: str> -> <metric-name: SUPPORTED_METRIC_TYPE> ->
+    # float
+    metrics_available = list(typing.get_args(SUPPORTED_METRIC_TYPE))
+    for split_name in predict_data.keys():
+        logger.info(
+            f"Computing metrics on split `{split_name}` at "
+            f"threshold={threshold_list[threshold_index]:.4f}..."
+        )
+        base_metrics = all_metrics(
+            *(list(eval_json_data[split_name]["counts"].values())[threshold_index])
+        )
+        eval_json_data[split_name].update(
+            {k: v for k, v in zip(metrics_available, base_metrics)}
+        )
+
+    return eval_json_data, threshold_list[threshold_index]
+
+
+def make_table(
+    eval_data: dict[str, dict[str, typing.Any]], threshold: float, format_: str
+) -> str:
+    """Extract and format table from pre-computed evaluation data.
+
+    Extracts the elements of ``eval_data`` that can be displayed in a
+    terminal-style table, formats them, and returns the result as a string.
+
+    Parameters
+    ----------
+    eval_data
+        Evaluation data as returned by :py:func:`run`.
+    threshold
+        The threshold value used to compute unary metrics on all splits.
+    format_
+        A supported tabulate format.
+
+    Returns
+    -------
+        A string representation of a table.
+    """
+
+    # Builds a header plus one row per split with the unary metrics at the
+    # chosen threshold, followed by the ROC AUC and average-precision scores.
+    metrics_available = list(typing.get_args(SUPPORTED_METRIC_TYPE))
+    table_headers = ["Dataset", "threshold"] + metrics_available + ["auroc", "avgprec"]
+
+    table_data = []
+    for split_name in eval_data.keys():
+        base_metrics = [eval_data[split_name][k] for k in metrics_available]
+        table_data.append(
+            [split_name, threshold]
+            + base_metrics
+            + [
+                eval_data[split_name]["auc_score"],
+                eval_data[split_name]["average_precision_score"],
+            ]
+        )
+
+    return tabulate.tabulate(
+        table_data,
+        table_headers,
+        tablefmt=format_,
+        floatfmt=".3f",
+        stralign="right",
+    )
+
+
+def make_plots(eval_data: dict[str, dict[str, typing.Any]]) -> list:
+    """Create plots for all curves in ``eval_data``.
+
+    Parameters
+    ----------
+    eval_data
+        Evaluation data as returned by :py:func:`run`.
+
+    Returns
+    -------
+        A list of figures to record to file.
+    """
+    retval = []
+
+    with credible.plot.tight_layout(
+        ("False Positive Rate", "True Positive Rate"), "ROC"
+    ) as (fig, ax):
+        for split_name, data in eval_data.items():
+            ax.plot(
+                data["curves"]["roc"]["fpr"],
+                data["curves"]["roc"]["tpr"],
+                label=f"{split_name} (AUC: {data['auc_score']:.2f})",
+            )
+            ax.legend(loc="best", fancybox=True, framealpha=0.7)
+        retval.append(fig)
+
+    with credible.plot.tight_layout_f1iso(
+        ("Recall", "Precision"), "Precison-Recall"
+    ) as (fig, ax):
+        for split_name, data in eval_data.items():
+            ax.plot(
+                data["curves"]["precision_recall"]["recall"],
+                data["curves"]["precision_recall"]["precision"],
+                label=f"{split_name} (AP: {data['average_precision_score']:.2f})",
+            )
+            ax.legend(loc="best", fancybox=True, framealpha=0.7)
+        retval.append(fig)
+
+    return retval
+
+
 # def _compare_annotators_worker(
 #     baseline_sample: tuple[str, str],
 #     other_sample: tuple[str, str],
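Usage sketch for the helpers that now live in this module (the import path is taken from this patch; the split names and values are illustrative). `validate_threshold` either returns a float in ``[0, 1]`` or the name of a known split, and `run()` later snaps a numeric threshold to the closest point of its uniformly spaced threshold grid:

```python
import numpy

from mednet.libs.segmentation.engine.evaluator import validate_threshold

splits = ["train", "validation", "test"]
validate_threshold(0.5, splits)           # -> 0.5 (float within [0, 1])
validate_threshold("validation", splits)  # -> "validation" (known split name)
# validate_threshold("bogus", splits)     # would raise ValueError

# inside run(): a numeric threshold is mapped to the closest grid point
steps = 10                                 # hypothetical value for illustration
threshold_list = numpy.arange(0.0, 1.0 + 1 / steps, 1 / steps, dtype=numpy.float64)
threshold_index = numpy.abs(threshold_list - 0.62).argmin()
print(threshold_list[threshold_index])     # ~0.6
```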
diff --git a/src/mednet/libs/segmentation/engine/viewer.py b/src/mednet/libs/segmentation/engine/viewer.py
index 6699f68382b90c059c15a53e14010868c7bd872c..15663167adbad96e284f1c7bba8d30711dea1812 100644
--- a/src/mednet/libs/segmentation/engine/viewer.py
+++ b/src/mednet/libs/segmentation/engine/viewer.py
@@ -59,10 +59,10 @@ def view(
         return torchvision.transforms.functional.to_pil_image(torch.Tensor(arr))
 
     with h5py.File(basedir / stem, "r") as f:
-        image: numpy.typing.NDArray[numpy.float32] = numpy.array(f.get("image"))
-        pred: numpy.typing.NDArray[numpy.float32] = numpy.array(f.get("prediction"))
-        target: numpy.typing.NDArray[numpy.bool_] = numpy.array(f.get("target"))
-        mask: numpy.typing.NDArray[numpy.bool_] = numpy.array(f.get("mask"))
+        image: numpy.typing.NDArray[numpy.float32] = numpy.array(f["image"])
+        pred: numpy.typing.NDArray[numpy.float32] = numpy.array(f["prediction"])
+        target: numpy.typing.NDArray[numpy.bool_] = numpy.array(f["target"])
+        mask: numpy.typing.NDArray[numpy.bool_] = numpy.array(f["mask"])
 
     image *= mask
     pred *= mask
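As a side note on the masking applied right after these reads: multiplying a float image by a boolean mask zeroes everything outside the mask through NumPy's type promotion. A tiny illustration with made-up arrays:

```python
import numpy

image = numpy.arange(6, dtype=numpy.float32).reshape(2, 3)  # made-up "image"
mask = numpy.array([[True, False, True], [False, True, True]])

image *= mask  # bool promotes to float32: True -> 1.0, False -> 0.0
print(image)
# [[0. 0. 2.]
#  [0. 4. 5.]]
```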
diff --git a/src/mednet/libs/segmentation/scripts/cli.py b/src/mednet/libs/segmentation/scripts/cli.py
index b83721701810280ee155318ae8fb443428d72a1b..4f534e5896a7b8d9e82dc04e2627847edb0a0151 100644
--- a/src/mednet/libs/segmentation/scripts/cli.py
+++ b/src/mednet/libs/segmentation/scripts/cli.py
@@ -8,7 +8,6 @@ import click
 from clapper.click import AliasedGroup
 
 from . import (
-    # analyze,
     config,
     database,
     dump_annotations,
diff --git a/src/mednet/libs/segmentation/scripts/evaluate.py b/src/mednet/libs/segmentation/scripts/evaluate.py
index 4719321d8f993a1b5b6678311b9befe552565fb2..1ffc233c1f54974a83b4ed2ba012a7d7a4493f00 100644
--- a/src/mednet/libs/segmentation/scripts/evaluate.py
+++ b/src/mednet/libs/segmentation/scripts/evaluate.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-import json
 import pathlib
 import typing
 
@@ -18,36 +17,6 @@ logger = setup("mednet")
 __import__("matplotlib").use("agg")
 
 
-def validate_threshold(threshold: float | str, splits: list[str]):
-    """Validate the user threshold selection and returns parsed threshold.
-
-    Parameters
-    ----------
-    threshold
-        The threshold to validate.
-    splits
-        List of available splits.
-
-    Returns
-    -------
-        The validated threshold.
-    """
-    try:
-        # we try to convert it to float first
-        threshold = float(threshold)
-        if threshold < 0.0 or threshold > 1.0:
-            raise ValueError("Float thresholds must be within range [0.0, 1.0]")
-    except ValueError:
-        if threshold not in splits:
-            raise ValueError(
-                f"Text thresholds should match dataset names, "
-                f"but {threshold} is not available among the datasets provided ("
-                f"({', '.join(splits)})"
-            )
-
-    return threshold
-
-
 @click.command(
     entry_point_group="mednet.libs.segmentation.config",
     cls=ConfigCommand,
@@ -160,7 +129,7 @@ def evaluate(
     predictions: pathlib.Path,
     output_folder: pathlib.Path,
     threshold: str | float,
-    metric: str,
+    metric: SUPPORTED_METRIC_TYPE,
     steps: int,
     compare_annotator: pathlib.Path,
     plot: bool,
@@ -168,31 +137,17 @@ def evaluate(
 ):  # numpydoc ignore=PR01
     """Evaluate predictions (from a model) on a segmentation task."""
 
-    import credible.curves
-    import credible.plot
     import matplotlib.backends.backend_pdf
-    import numpy
-    import tabulate
     from mednet.libs.common.scripts.utils import (
         save_json_metadata,
         save_json_with_backup,
     )
-    from mednet.libs.segmentation.engine.evaluator import (
-        all_metrics,
-        compute_metric,
-        load_count,
-        name2metric,
-        precision,
-        recall,
-        specificity,
-    )
+    from mednet.libs.segmentation.engine.evaluator import make_plots, make_table, run
 
-    with predictions.open("r") as f:
-        predict_data = json.load(f)
+    evaluation_file = output_folder / "evaluation.json"
 
-    # register metadata
     save_json_metadata(
-        output_file=output_folder / "evaluation.meta.json",
+        output_file=evaluation_file.with_suffix(".meta.json"),
         predictions=str(predictions),
         output_folder=str(output_folder),
         threshold=threshold,
@@ -202,130 +157,26 @@ def evaluate(
         plot=plot,
     )
 
-    threshold = validate_threshold(threshold, predict_data)
-    threshold_list = numpy.arange(
-        0.0, (1.0 + 1 / steps), 1 / steps, dtype=numpy.float64
-    )
-
-    # Compute counts for various splits
-    eval_json_data: dict[str, dict[str, typing.Any]] = {}
-    for split_name, samples in predict_data.items():
-        logger.info(
-            f"Counting true/false positive/negatives at split `{split_name}`..."
-        )
-        eval_json_data.setdefault(split_name, {})["counts"] = {
-            k: v
-            for k, v in zip(
-                threshold_list, load_count(predictions.parent, samples, threshold_list)
-            )
-        }
-        eval_json_data[split_name]["threshold_a_posteriori"] = True
-
-    if isinstance(threshold, str):
-        # Compute threshold on specified split, if required
-        logger.info(f"Evaluating threshold on split `{threshold}` using " f"`{metric}`")
-        metric_list = compute_metric(
-            eval_json_data[threshold]["counts"].values(),
-            name2metric(typing.cast(SUPPORTED_METRIC_TYPE, metric)),
-        )
-        threshold_index = metric_list.argmax()
-        logger.info(f"Set --threshold={threshold_list[threshold_index]:.4f}")
+    eval_json_data, threshold_value = run(predictions, steps, threshold, metric)
 
-        # Reset list of how thresholds are calculated on the recorded split
-        for split_name in predict_data.keys():
-            if split_name == threshold:
-                continue
-            eval_json_data[split_name]["threshold_a_posteriori"] = False
-
-    else:
-        # must figure out the closest threshold from the list we are using
-        threshold_index = (numpy.abs(threshold_list - threshold)).argmin()
-        logger.info(f"Set --threshold={threshold_list[threshold_index]:.4f}")
-
-    metrics_available = list(typing.get_args(SUPPORTED_METRIC_TYPE))
-    table_headers = ["Dataset", "threshold"] + metrics_available + ["auroc", "avgprec"]
-
-    table_data = []
-    for split_name in predict_data.keys():
-        logger.info("Computing performance on split `{split_name}`...")
-        counts = list(eval_json_data[split_name]["counts"].values())
-        base_metrics = all_metrics(*counts[threshold_index])
-        table_data.append([split_name, threshold_list[threshold_index]] + base_metrics)
-        eval_json_data[split_name].update(
-            {k: v for k, v in zip(metrics_available, base_metrics)}
-        )
-        fpr_curve = 1.0 - numpy.array([specificity(*k) for k in counts])
-        recall_curve = tpr_curve = numpy.array([recall(*k) for k in counts])
-        precision_curve = numpy.array([precision(*k) for k in counts])
-        table_data[-1] += [
-            credible.curves.area_under_the_curve((fpr_curve, tpr_curve)),  # auc-roc
-            credible.curves.average_metric(
-                (precision_curve, recall_curve)
-            ),  # average precision
-        ]
-        eval_json_data[split_name]["auc_score"] = table_data[-1][-2]
-        eval_json_data[split_name]["average_precision_score"] = table_data[-1][-1]
-        eval_json_data[split_name]["curves"] = dict(
-            roc=dict(fpr=fpr_curve, tpr=tpr_curve, thresholds=threshold_list),
-            precision_recall=dict(
-                precision=precision_curve,
-                recall=recall_curve,
-                thresholds=threshold_list,
-            ),
-        )
-
-    # records full result analysis to a JSON file
-    evaluation_file = output_folder / "evaluation.json"
+    # Records full result analysis to a JSON file
     logger.info(f"Saving evaluation results at `{str(evaluation_file)}`...")
     save_json_with_backup(evaluation_file, eval_json_data)
 
-    table_format = "rst"
-    table = tabulate.tabulate(
-        table_data,
-        table_headers,
-        tablefmt=table_format,
-        floatfmt=".3f",
-        stralign="right",
-    )
+    # Produces and records the summary table
+    table = make_table(eval_json_data, threshold_value, "rst")
     click.echo(table)
 
-    output_table = output_folder / "evaluation.rst"
+    output_table = evaluation_file.with_suffix(".rst")
     logger.info(f"Saving tabulated performance summary at `{str(output_table)}`...")
     output_table.parent.mkdir(parents=True, exist_ok=True)
     with output_table.open("w") as f:
         f.write(table)
 
+    # Plots pre-calculated curves, if the user asked to do so.
     if plot:
         figure_path = evaluation_file.with_suffix(".pdf")
         logger.info(f"Saving evaluation figures at `{str(figure_path)}`...")
-
         with matplotlib.backends.backend_pdf.PdfPages(figure_path) as pdf:
-            with credible.plot.tight_layout(
-                ("False Positive Rate", "True Positive Rate"), "ROC"
-            ) as (
-                fig,
-                ax,
-            ):
-                for split_name, data in eval_json_data.items():
-                    ax.plot(
-                        data["curves"]["roc"]["fpr"],
-                        data["curves"]["roc"]["tpr"],
-                        label=f"{split_name} (AUC: {data['auc_score']:.2f})",
-                    )
-                    ax.legend(loc="best", fancybox=True, framealpha=0.7)
-                pdf.savefig(fig)
-
-            with credible.plot.tight_layout_f1iso(
-                ("Recall", "Precision"), "Precison-Recall"
-            ) as (
-                fig,
-                ax,
-            ):
-                for split_name, data in eval_json_data.items():
-                    ax.plot(
-                        data["curves"]["precision_recall"]["precision"],
-                        data["curves"]["precision_recall"]["recall"],
-                        label=f"{split_name} (AP: {data['average_precision_score']:.2f})",
-                    )
-                    ax.legend(loc="best", fancybox=True, framealpha=0.7)
+            for fig in make_plots(eval_json_data):
                 pdf.savefig(fig)
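With the figure construction moved into `make_plots()`, the command is left with the standard `PdfPages` pattern: one saved figure per PDF page. A condensed, standalone sketch of that composition (the figure and path below are stand-ins, not the real evaluation output):

```python
import pathlib

import matplotlib

matplotlib.use("agg")  # headless backend, as the script enforces
import matplotlib.backends.backend_pdf
from matplotlib import pyplot

figure_path = pathlib.Path("results/evaluation.pdf")  # hypothetical path
figure_path.parent.mkdir(parents=True, exist_ok=True)

# stand-in for make_plots(eval_json_data): any list of figures works here
fig, ax = pyplot.subplots()
ax.plot([0.0, 0.5, 1.0], [0.0, 0.8, 1.0], label="validation (AUC: 0.90)")
ax.legend(loc="best", fancybox=True, framealpha=0.7)
figures = [fig]

with matplotlib.backends.backend_pdf.PdfPages(figure_path) as pdf:
    for figure in figures:  # one page per figure
        pdf.savefig(figure)
```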
diff --git a/src/mednet/libs/segmentation/scripts/view.py b/src/mednet/libs/segmentation/scripts/view.py
index 79fcb468abde40e8253bc1209663f62524630889..70398b3b115078003095a53b88e6f3249c89337b 100644
--- a/src/mednet/libs/segmentation/scripts/view.py
+++ b/src/mednet/libs/segmentation/scripts/view.py
@@ -13,8 +13,6 @@ from clapper.logging import setup
 from mednet.libs.common.scripts.click import ConfigCommand
 from mednet.libs.segmentation.engine.evaluator import SUPPORTED_METRIC_TYPE
 
-from .evaluate import validate_threshold
-
 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
 
@@ -146,6 +144,8 @@ def view(
     )
     from mednet.libs.segmentation.engine.viewer import view
 
+    from ..engine.evaluator import validate_threshold
+
     view_filename = "view.json"
     view_file = output_folder / view_filename
 
diff --git a/tests/segmentation/test_cli.py b/tests/segmentation/test_cli.py
index a66daa24b5ae15b2f55d3a315c80f190dc816249..6e11b628b62e4c569bdbf252626356e04f52afde 100644
--- a/tests/segmentation/test_cli.py
+++ b/tests/segmentation/test_cli.py
@@ -327,7 +327,7 @@ def test_evaluate_lwnet_drive(session_tmp_path):
             r"^Writing run metadata at .*$": 1,
             r"^Counting true/false positive/negatives at split.*$": 2,
             r"^Evaluating threshold on split .*$": 1,
-            r"^Computing performance on split .*...$": 2,
+            r"^Computing metrics on split .*...$": 2,
             r"^Saving evaluation results at .*$": 1,
             r"^Saving tabulated performance summary at .*$": 1,
             r"^Saving evaluation figures at .*$": 1,