diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..af2c0a2d1e2cabe5b1eb6a226d87aebd89dca7cb
--- /dev/null
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Defines functionality for the evaluation of predictions"""
+
+import os
+
+import numpy
+import pandas
+from tqdm import tqdm
+
+import torch
+
+import bob.io.base
+
+from ..utils.metric import base_metrics
+from ..utils.plot import precision_recall_f1iso_confintval
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def _sample_metrics(stem, pred, gt):
+    """
+    Calculates metrics on a single sample, at multiple thresholds
+
+
+    Parameters
+    ----------
+
+    stem : str
+        original filename without extension and relative to its root-path
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+
+    Returns
+    -------
+
+    metrics : pandas.DataFrame
+
+        A pandas dataframe with the following columns:
+
+        * threshold: float
+        * precision: float
+        * recall: float
+        * specificity: float
+        * accuracy: float
+        * jaccard: float
+        * f1_score: float
+
+    """
+
+    step_size = 0.01
+    gts = gt.byte()
+
+    data = []
+
+    for threshold in numpy.arange(0.0, 1.0, step_size):
+
+        # threshold
+        binary_pred = torch.gt(pred, threshold).byte()
+
+        # equals and not-equals
+        equals = torch.eq(binary_pred, gts).type(torch.uint8)  # tensor
+        notequals = torch.ne(binary_pred, gts).type(torch.uint8)  # tensor
+
+        # true positives
+        tp_tensor = gts * binary_pred  # tensor
+        tp_count = torch.sum(tp_tensor).item()  # scalar
+
+        # false positives
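+        # (binary_pred + tp_tensor) equals 1 exactly where the prediction is
+        # positive but the pixel is not a true positive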
+        fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
+        fp_count = torch.sum(fp_tensor).item()
+
+        # true negatives
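+        # "equals" marks pixels where prediction and ground-truth agree;
+        # removing the true positives leaves the agreeing negatives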
+        tn_tensor = equals - tp_tensor
+        tn_count = torch.sum(tn_tensor).item()
+
+        # false negatives
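+        # disagreements that are not false positives are, by exclusion,
+        # false negatives (ground-truth positive, prediction negative)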
+        fn_tensor = notequals - fp_tensor.type(torch.uint8)
+        fn_count = torch.sum(fn_tensor).item()
+
+        # calc metrics
+        precision, recall, specificity, accuracy, jaccard, f1_score = \
+                base_metrics(tp_count, fp_count, tn_count, fn_count)
+
+        data.append([threshold, precision, recall, specificity,
+            accuracy, jaccard, f1_score])
+
+    return pandas.DataFrame(data, columns=(
+        "threshold",
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score",
+        ))
+
+
+def run(dataset, predictions_folder, output_folder):
+    """
+    Evaluates stored predictions against ground-truth, calculating metrics
+
+
+    Parameters
+    ----------
+
+    dataset : :py:class:`torch.utils.data.Dataset`
+        an iterable over the transformed input dataset, yielding tuples of
+        ``(name, image, ground-truth)``
+
+    predictions_folder : str
+        folder where predictions for the dataset images have been previously
+        stored
+
+    output_folder : str
+        folder where to store results
+
+    """
+
+    logger.info("Start evaluation")
+    logger.info(f"Output folder: {output_folder}")
+
+    if not os.path.exists(output_folder):
+        logger.info(f"Creating {output_folder}...")
+        os.makedirs(output_folder, exist_ok=True)
+
+    # Collect overall metrics
+    data = {}
+
+    for sample in tqdm(dataset):
+        name = sample[0]
+        stem = os.path.splitext(name)[0]
+        image = sample[1].to("cpu")
+        gt = sample[2].to("cpu")
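+        # predictions are stored by the prediction step as one HDF5 file per
+        # input image, containing a 2D (height x width) probability map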
+        pred_fullpath = os.path.join(predictions_folder, stem + ".hdf5")
+        pred = bob.io.base.load(pred_fullpath).astype("float32")
+        pred = torch.from_numpy(pred)
+        if stem in data:
+            raise RuntimeError(f"{stem} entry already exists in data. "
+                    f"Cannot overwrite.")
+        data[stem] = _sample_metrics(stem, pred, gt)
+
+    # Merges all dataframes together
+    df_metrics = pandas.concat(data.values())
+
+    # Report and Averages
+    avg_metrics = df_metrics.groupby("threshold").mean()
+    std_metrics = df_metrics.groupby("threshold").std()
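+    # averages and standard deviations are computed per threshold, across all
+    # input images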
+
+    # Uncomment below for F1-score calculation based on average precision and
+    # metrics instead of F1-scores of individual images. This method is in line
+    # with Maninis et. al. (2016)
+    #
+    # avg_metrics["f1_score"] = \
+    #         (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
+    #         (avg_metrics["precision"]+avg_metrics["recall"])
+
+    avg_metrics["std_pr"] = std_metrics["precision"]
+    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
+    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
+    avg_metrics["std_re"] = std_metrics["recall"]
+    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
+    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
+    avg_metrics["std_f1"] = std_metrics["f1_score"]
+
+    metrics_path = os.path.join(output_folder, "metrics.csv")
+    logger.info(f"Saving averages over all input images at {metrics_path}...")
+    avg_metrics.to_csv(metrics_path)
+
+    maxf1 = avg_metrics["f1_score"].max()
+    optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
+
+    logger.info(f"Highest F1-score of {maxf1:.5f}, achieved at "
+            f"threshold {optimal_f1_threshold:.2f}")
+
+    # Plotting
+    np_avg_metrics = avg_metrics.to_numpy().T
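+    # rows of the transposed array follow the column order of avg_metrics:
+    # 0=precision, 1=recall, 7=pr_upper, 8=pr_lower, 10=re_upper, 11=re_lower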
+    figure_path = os.path.join(output_folder, "precision-recall.pdf")
+    logger.info(f"Saving overall precision-recall plot at {figure_path}...")
+    fig = precision_recall_f1iso_confintval(
+        [np_avg_metrics[0]],
+        [np_avg_metrics[1]],
+        [np_avg_metrics[7]],
+        [np_avg_metrics[8]],
+        [np_avg_metrics[10]],
+        [np_avg_metrics[11]],
+        ["data"],
+    )
+    fig.savefig(figure_path)
diff --git a/bob/ip/binseg/engine/inferencer.py b/bob/ip/binseg/engine/inferencer.py
deleted file mode 100644
index feb78b3ec79550b91519a938c3e4dd96002f5442..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/engine/inferencer.py
+++ /dev/null
@@ -1,313 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import os
-import time
-import datetime
-import numpy as np
-import torch
-import pandas as pd
-import torchvision.transforms.functional as VF
-from tqdm import tqdm
-
-import bob.io.base
-
-from bob.ip.binseg.utils.metric import base_metrics
-from bob.ip.binseg.utils.plot import precision_recall_f1iso_confintval
-from bob.ip.binseg.utils.summary import summary
-
-import logging
-logger = logging.getLogger(__name__)
-
-
-def batch_metrics(predictions, ground_truths, names, output_folder):
-    """
-    Calculates metrics on the batch and saves it to disc
-
-
-    Parameters
-    ----------
-
-    predictions : :py:class:`torch.Tensor`
-        tensor with pixel-wise probabilities
-
-    ground_truths : :py:class:`torch.Tensor`
-        tensor with binary ground-truth
-
-    names : list
-        list of file names
-
-    output_folder : str
-        output path
-
-
-    Returns
-    -------
-    metrics : tuple
-        A tuple containing batch metrics: ``(name, threshold, precision, recall, specificity, accuracy, jaccard, f1_score)``
-    """
-
-    step_size = 0.01
-    batch_metrics = []
-
-    for j in range(predictions.size()[0]):
-        # ground truth byte
-        gts = ground_truths[j].byte()
-
-        file_name = "{}.csv".format(names[j])
-        logger.info("saving {}".format(file_name))
-
-        with open(os.path.join(output_folder, file_name), "w+") as outfile:
-
-            outfile.write(
-                "threshold, precision, recall, specificity, accuracy, jaccard, f1_score\n"
-            )
-
-            for threshold in np.arange(0.0, 1.0, step_size):
-                # threshold
-                binary_pred = torch.gt(predictions[j], threshold).byte()
-
-                # equals and not-equals
-                equals = torch.eq(binary_pred, gts).type(torch.uint8)  # tensor
-                notequals = torch.ne(binary_pred, gts).type(torch.uint8)  # tensor
-
-                # true positives
-                tp_tensor = gts * binary_pred  # tensor
-                tp_count = torch.sum(tp_tensor).item()  # scalar
-
-                # false positives
-                fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
-                fp_count = torch.sum(fp_tensor).item()
-
-                # true negatives
-                tn_tensor = equals - tp_tensor
-                tn_count = torch.sum(tn_tensor).item()
-
-                # false negatives
-                fn_tensor = notequals - fp_tensor.type(torch.uint8)
-                fn_count = torch.sum(fn_tensor).item()
-
-                # calc metrics
-                metrics = base_metrics(tp_count, fp_count, tn_count, fn_count)
-
-                # write to disk
-                outfile.write(
-                    "{:.2f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f} \n".format(
-                        threshold, *metrics
-                    )
-                )
-
-                batch_metrics.append([names[j], threshold, *metrics])
-
-    return batch_metrics
-
-
-def save_probability_images(predictions, names, output_folder):
-    """
-    Saves probability maps as image in the same format as the test image
-
-
-    Parameters
-    ----------
-
-    predictions : :py:class:`torch.Tensor`
-        tensor with pixel-wise probabilities
-
-    names : list
-        list of file names
-
-    output_folder : str
-        output path
-    """
-
-    images_subfolder = os.path.join(output_folder, "images")
-    for j in range(predictions.size()[0]):
-        img = VF.to_pil_image(predictions.cpu().data[j])
-        filename = "{}.png".format(names[j].split(".")[0])
-        fullpath = os.path.join(images_subfolder, filename)
-        logger.info("saving {}".format(fullpath))
-        fulldir = os.path.dirname(fullpath)
-        if not os.path.exists(fulldir):
-            os.makedirs(fulldir)
-        img.save(fullpath)
-
-
-def save_hdf(predictions, names, output_folder):
-    """
-    Saves probability maps as image in the same format as the test image
-
-    Parameters
-    ----------
-    predictions : :py:class:`torch.Tensor`
-        tensor with pixel-wise probabilities
-    names : list
-        list of file names
-    output_folder : str
-        output path
-    """
-    hdf5_subfolder = os.path.join(output_folder, "hdf5")
-    if not os.path.exists(hdf5_subfolder):
-        os.makedirs(hdf5_subfolder)
-    for j in range(predictions.size()[0]):
-        img = predictions.cpu().data[j].squeeze(0).numpy()
-        filename = "{}.hdf5".format(names[j].split(".")[0])
-        fullpath = os.path.join(hdf5_subfolder, filename)
-        logger.info("saving {}".format(filename))
-        fulldir = os.path.dirname(fullpath)
-        if not os.path.exists(fulldir):
-            os.makedirs(fulldir)
-        bob.io.base.save(img, fullpath)
-
-
-def do_inference(model, data_loader, device, output_folder=None):
-    """
-    Runs inference and calculate metrics
-
-    Parameters
-    ---------
-    model : :py:class:`torch.nn.Module`
-        neural network model (e.g. DRIU, HED, UNet)
-    data_loader : py:class:`torch.torch.utils.data.DataLoader`
-    device : str
-        device to use ``'cpu'`` or ``'cuda'``
-    output_folder : str
-    """
-
-    logger.info("Start evaluation")
-    logger.info("Output folder: {}, Device: {}".format(output_folder, device))
-    results_subfolder = os.path.join(output_folder, "results")
-    os.makedirs(results_subfolder, exist_ok=True)
-
-    model.eval().to(device)
-    # Sigmoid for probabilities
-    sigmoid = torch.nn.Sigmoid()
-
-    # Setup timers
-    start_total_time = time.time()
-    times = []
-
-    # Collect overall metrics
-    metrics = []
-
-    for samples in tqdm(data_loader):
-        names = samples[0]
-        images = samples[1].to(device)
-        ground_truths = samples[2].to(device)
-        with torch.no_grad():
-            start_time = time.perf_counter()
-
-            outputs = model(images)
-
-            # necessary check for hed architecture that uses several outputs
-            # for loss calculation instead of just the last concatfuse block
-            if isinstance(outputs, list):
-                outputs = outputs[-1]
-
-            probabilities = sigmoid(outputs)
-
-            batch_time = time.perf_counter() - start_time
-            times.append(batch_time)
-            logger.info("Batch time: {:.5f} s".format(batch_time))
-
-            b_metrics = batch_metrics(
-                probabilities, ground_truths, names, results_subfolder
-            )
-            metrics.extend(b_metrics)
-
-            # Create probability images
-            save_probability_images(probabilities, names, output_folder)
-            # save hdf5
-            save_hdf(probabilities, names, output_folder)
-
-    # DataFrame
-    df_metrics = pd.DataFrame(
-        metrics,
-        columns=[
-            "name",
-            "threshold",
-            "precision",
-            "recall",
-            "specificity",
-            "accuracy",
-            "jaccard",
-            "f1_score",
-        ],
-    )
-
-    # Report and Averages
-    metrics_file = "Metrics.csv".format(model.name)
-    metrics_path = os.path.join(results_subfolder, metrics_file)
-    logger.info("Saving average over all input images: {}".format(metrics_file))
-
-    avg_metrics = df_metrics.groupby("threshold").mean()
-    std_metrics = df_metrics.groupby("threshold").std()
-
-    # Uncomment below for F1-score calculation based on average precision and metrics instead of
-    # F1-scores of individual images. This method is in line with Maninis et. al. (2016)
-    # avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-    #    (avg_metrics["precision"]+avg_metrics["recall"])
-
-    avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
-    avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
-    avg_metrics["std_f1"] = std_metrics["f1_score"]
-
-    avg_metrics.to_csv(metrics_path)
-    maxf1 = avg_metrics["f1_score"].max()
-    optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
-
-    logger.info(
-        "Highest F1-score of {:.5f}, achieved at threshold {}".format(
-            maxf1, optimal_f1_threshold
-        )
-    )
-
-    # Plotting
-    np_avg_metrics = avg_metrics.to_numpy().T
-    fig_name = "precision_recall.pdf"
-    logger.info("saving {}".format(fig_name))
-    fig = precision_recall_f1iso_confintval(
-        [np_avg_metrics[0]],
-        [np_avg_metrics[1]],
-        [np_avg_metrics[7]],
-        [np_avg_metrics[8]],
-        [np_avg_metrics[10]],
-        [np_avg_metrics[11]],
-        [model.name, None],
-        title=output_folder,
-    )
-    fig_filename = os.path.join(results_subfolder, fig_name)
-    fig.savefig(fig_filename)
-
-    # Report times
-    total_inference_time = str(datetime.timedelta(seconds=int(sum(times))))
-    average_batch_inference_time = np.mean(times)
-    total_evalution_time = str(
-        datetime.timedelta(seconds=int(time.time() - start_total_time))
-    )
-
-    logger.info(
-        "Average batch inference time: {:.5f}s".format(average_batch_inference_time)
-    )
-
-    times_file = "Times.txt"
-    logger.info("saving {}".format(times_file))
-
-    with open(os.path.join(results_subfolder, times_file), "w+") as outfile:
-        date = datetime.datetime.now()
-        outfile.write("Date: {} \n".format(date.strftime("%Y-%m-%d %H:%M:%S")))
-        outfile.write("Total evaluation run-time: {} \n".format(total_evalution_time))
-        outfile.write(
-            "Average batch inference time: {} \n".format(average_batch_inference_time)
-        )
-        outfile.write("Total inference time: {} \n".format(total_inference_time))
-
-    # Save model summary
-    summary_file = "ModelSummary.txt"
-    logger.info("saving {}".format(summary_file))
-
-    with open(os.path.join(results_subfolder, summary_file), "w+") as outfile:
-        summary(model, outfile)
diff --git a/bob/ip/binseg/engine/predictor.py b/bob/ip/binseg/engine/predictor.py
index 093c6c676fb03c38c6ce50cadf692eccb111d54d..0a63d74a4a9501a2e4a54118491c52d918d8d7a6 100644
--- a/bob/ip/binseg/engine/predictor.py
+++ b/bob/ip/binseg/engine/predictor.py
@@ -5,49 +5,128 @@ import os
 import time
 import datetime
 
+import PIL.Image
+import PIL.ImageOps
 import numpy
-import torch
 from tqdm import tqdm
 
+import torch
+import torchvision.transforms.functional as VF
+
 import bob.io.base
+
+from ..utils.summary import summary
 
 import logging
 logger = logging.getLogger(__name__)
 
 
-def save_hdf5(predictions, names, output_folder):
+def _save_hdf5(stem, prob, output_folder):
     """
-    Saves probability maps as image in the same format as the test image
+    Saves a pixel-wise prediction map (probabilities) to an HDF5 file
 
 
     Parameters
     ----------
-    predictions : :py:class:`torch.Tensor`
-        tensor with pixel-wise probabilities
+    stem : str
+        the name of the file without extension on the original dataset
 
-    names : list
-        list of file names
+    prob : torch.Tensor
+        tensor with pixel-wise prediction maps (probabilities)
 
     output_folder : str
-        output path
+        path where to store the HDF5 predictions
 
     """
 
-    for j in range(predictions.size()[0]):
+    fullpath = os.path.join(output_folder, f"{stem}.hdf5")
+    tqdm.write(f"Saving {fullpath}...")
+    fulldir = os.path.dirname(fullpath)
+    if not os.path.exists(fulldir):
+        tqdm.write(f"Creating directory {fulldir}...")
+        os.makedirs(fulldir, exist_ok=True)
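+    # drop the leading channel dimension (1 x height x width) before saving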
+    bob.io.base.save(prob.cpu().squeeze(0).numpy(), fullpath)
+
+
+def _save_image(stem, extension, data, output_folder):
+    """Saves a PIL image into a file
+
+    Parameters
+    ----------
+
+    stem : str
+        the name of the file without extension on the original dataset
 
-        img = predictions.cpu().data[j].squeeze(0).numpy()
-        filename = "{}.hdf5".format(names[j].split(".")[0])
-        fullpath = os.path.join(output_folder, filename)
-        tqdm.write(f"Saving {fullpath}...")
-        fulldir = os.path.dirname(fullpath)
-        if not os.path.exists(fulldir):
-            tqdm.write(f"Creating directory {fulldir}...")
-            # protect against concurrent access - exist_ok=True
-            os.makedirs(fulldir, exist_ok=True)
-        bob.io.base.save(img, fullpath)
+    extension : str
+        an extension for the file to be saved (e.g. ``.png``)
 
+    data : PIL.Image.Image
+        image to be saved, preloaded in memory
+
+    output_folder : str
+        path where to store results
 
-def run(model, data_loader, device, output_folder):
+    """
+
+    fullpath = os.path.join(output_folder, stem + extension)
+    tqdm.write(f"Saving {fullpath}...")
+    fulldir = os.path.dirname(fullpath)
+    if not os.path.exists(fulldir):
+        tqdm.write(f"Creating directory {fulldir}...")
+        os.makedirs(fulldir, exist_ok=True)
+    data.save(fullpath)
+
+
+def _save_overlayed_png(stem, image, prob, output_folder):
+    """Overlays prediction predictions vessel tree with original test image
+
+
+    Parameters
+    ----------
+
+    stem : str
+        the name of the file without extension on the original dataset
+
+    image : torch.Tensor
+        Tensor with RGB input image
+
+    prob : torch.Tensor
+        tensor with a single-channel, pixel-wise prediction map (probabilities)
+
+    output_folder : str
+        path where to store results
+
+    """
+
+    image = VF.to_pil_image(image.cpu())
+    prob = VF.to_pil_image(prob.cpu().squeeze(0))
+
+    # color and overlay
+    prob_green = PIL.ImageOps.colorize(prob, (0, 0, 0), (0, 255, 0))
+    overlayed = PIL.Image.blend(image, prob_green, 0.4)
+    _save_image(stem, '.png', overlayed, output_folder)
+
+
+def _save_transformed_png(stem, image, output_folder):
+    """Saves a PNG copy of the transformed input image to a folder
+
+
+    Parameters
+    ----------
+
+    stem : str
+        the name of the file without extension on the original dataset
+
+    image : torch.Tensor
+        Tensor with RGB input image
+
+    output_folder : str
+        path where to store the transformed images
+
+    """
+
+    _save_image(stem, '.png', VF.to_pil_image(image.cpu()), output_folder)
+
+
+def run(model, data_loader, device, output_folder, overlayed_folder,
+        transformed_input_folder):
     """
     Runs inference on input data, outputs HDF5 files with predictions
 
@@ -62,7 +141,15 @@ def run(model, data_loader, device, output_folder):
         device to use ``cpu`` or ``cuda:0``
 
     output_folder : str
-        folder where to store output images (HDF5 files)
+        folder where to store output prediction maps (HDF5 files) and model
+        summary
+
+    overlayed_folder : str
+        folder where to store output images (PNG files)
+
+    transformed_input_folder : str
+        folder where to store input images, transformed through the input
+        pipeline (PNG files)
 
     """
 
@@ -76,7 +163,7 @@ def run(model, data_loader, device, output_folder):
         os.makedirs(output_folder, exist_ok=True)
 
     model.eval().to(device)
-    # Sigmoid for probabilities
+    # Sigmoid for predictions
     sigmoid = torch.nn.Sigmoid()
 
     # Setup timers
@@ -101,13 +188,19 @@ def run(model, data_loader, device, output_folder):
             if isinstance(outputs, list):
                 outputs = outputs[-1]
 
-            probabilities = sigmoid(outputs)
+            predictions = sigmoid(outputs)
 
             batch_time = time.perf_counter() - start_time
             times.append(batch_time)
             len_samples.append(len(images))
 
-            save_hdf5(probabilities, names, output_folder)
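+            # save each prediction in the batch individually: an HDF5 file
+            # with probabilities and, optionally, PNG overlays and copies of
+            # the transformed inputs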
+            for name, img, prob in zip(names, images, predictions):
+                stem = os.path.splitext(name)[0]
+                _save_hdf5(stem, prob, output_folder)
+                if overlayed_folder is not None:
+                    _save_overlayed_png(stem, img, prob, overlayed_folder)
+                if transformed_input_folder is not None:
+                    _save_transformed_png(stem, img, transformed_input_folder)
 
     logger.info("End prediction")
 
@@ -116,7 +209,13 @@ def run(model, data_loader, device, output_folder):
     logger.info(f"Total time: {total_time}")
 
     average_batch_time = numpy.mean(times)
-    logger.info(f"Average batch time: {average_batch_time:g}s\n")
+    logger.info(f"Average batch time: {average_batch_time:g}s")
 
     average_image_time = numpy.sum(numpy.array(times) * len_samples) / float(sum(len_samples))
-    logger.info(f"Average image time: {average_image_time:g}s\n")
+    logger.info(f"Average image time: {average_image_time:g}s")
+
+    # Save model summary
+    summary_path = os.path.join(output_folder, "model-info.txt")
+    logger.info(f"Saving model summary at {summary_path}...")
+
+    with open(summary_path, "w") as f: summary(model, f)
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index 0d575bc99759cc52c479e58da4485857604778bb..da49327b605f9f3362ccc98724dc81dfc129a935 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -5,8 +5,9 @@ import os
 import csv
 import time
 import datetime
+
 import torch
-import pandas as pd
+import pandas
 from tqdm import tqdm
 
 from bob.ip.binseg.utils.metric import SmoothedValue
@@ -178,7 +179,7 @@ def run(
         )
 
     # plots a version of the CSV trainlog into a PDF
-    logdf = pd.read_csv(logfile_name, header=0, names=logfile_fields)
+    logdf = pandas.read_csv(logfile_name, header=0, names=logfile_fields)
     fig = loss_curve(logdf, title="Loss Evolution")
     figurefile_name = os.path.join(output_folder, "trainlog.pdf")
     logger.info(f"Saving {figurefile_name}")
diff --git a/bob/ip/binseg/script/binseg.py b/bob/ip/binseg/script/binseg.py
index 8f63c83c4ba283f19e5ed353ee861cff601398d4..65a9b1664dd9dce2a70b215e4414c44cb1040844 100644
--- a/bob/ip/binseg/script/binseg.py
+++ b/bob/ip/binseg/script/binseg.py
@@ -25,9 +25,8 @@ from torch.utils.data import DataLoader
 from bob.ip.binseg.utils.plot import plot_overview
 from bob.ip.binseg.utils.click import OptionEatAll
 from bob.ip.binseg.utils.rsttable import create_overview_grid
-from bob.ip.binseg.utils.plot import metricsviz, savetransformedtest
+from bob.ip.binseg.utils.plot import metricsviz
 from bob.ip.binseg.utils.transformfolder import transformfolder as transfld
-from bob.ip.binseg.utils.evaluate import do_eval
 
 logger = logging.getLogger(__name__)
 
@@ -104,10 +103,6 @@ def visualize(dataset, output_path, **kwargs):
     """
     logger.info("Creating TP, FP, FN visualizations for {}".format(output_path))
     metricsviz(dataset=dataset, output_path=output_path)
-    logger.info("Creating overlay visualizations for {}".format(output_path))
-    overlay(dataset=dataset, output_path=output_path)
-    logger.info("Saving transformed test images {}".format(output_path))
-    savetransformedtest(dataset=dataset, output_path=output_path)
 
 # Apply image transforms to a folder containing images
 @binseg.command(entry_point_group="bob.ip.binseg.config", cls=ConfigCommand)
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index a036084c6c3d82fe8cefafa7a70be7d6810c1486..11ca9719aa98cb122f142e7ff73ca0d317b84abe 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -4,7 +4,6 @@
 import click
 from click_plugins import with_plugins
 
-import torch
-from torch.utils.data import DataLoader
 
 from bob.extension.scripts.click_helper import (
@@ -14,8 +13,7 @@ from bob.extension.scripts.click_helper import (
     AliasedGroup,
 )
 
-from ..utils.checkpointer import DetectronCheckpointer
-from ..engine.inferencer import do_inference
+from ..engine.evaluator import run
 
 import logging
 logger = logging.getLogger(__name__)
@@ -27,75 +25,62 @@ logger = logging.getLogger(__name__)
     epilog="""Examples:
 
 \b
-    1. Evaluates a M2U-Net model on the DRIVE test set:
-
-       $ bob binseg evaluate -vv m2unet drive-test --weight=results/model_final.pth
-
+    1. Runs evaluation on an existing dataset configuration:
+\b
+       $ bob binseg evaluate -vv drive-test --predictions-folder=path/to/predictions --output-folder=path/to/results
+\b
+    2. To run evaluation on a folder with your own images and annotations, you
+       must first specify resizing, cropping, etc, so that the annotations
+       remain aligned with the predictions that were previously generated.
+       Failing to do so will likely result in poor performance.  To figure out
+       such specifications, you must consult the dataset configuration used
+       for **training** the provided model.  Once you have figured this out,
+       do the following:
+\b
+       $ bob binseg config copy csv-dataset-example mydataset.py
+       # modify "mydataset.py" to your liking
+       $ bob binseg evaluate -vv mydataset.py --predictions-folder=path/to/predictions --output-folder=path/to/results
 """,
 )
 @click.option(
-    "--output-path",
+    "--output-folder",
     "-o",
-    help="Path where to store the generated model (created if does not exist)",
+    help="Path where to store the analysis result (created if does not exist)",
     required=True,
     default="results",
     cls=ResourceOption,
 )
 @click.option(
-    "--model",
-    "-m",
-    help="A torch.nn.Module instance implementing the network to be evaluated",
+    "--predictions-folder",
+    "-p",
+    help="Path where predictions are currently stored",
     required=True,
     cls=ResourceOption,
 )
 @click.option(
     "--dataset",
     "-d",
-    help="A torch.utils.data.dataset.Dataset instance implementing a dataset to be used for evaluating the model, possibly including all pre-processing pipelines required",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-size",
-    "-b",
-    help="Number of samples in every batch (this parameter affects memory requirements for the network)",
+    help="A torch.utils.data.dataset.Dataset instance implementing a dataset to be used for evaluating predictions, possibly including all pre-processing pipelines required",
     required=True,
-    show_default=True,
-    default=1,
     cls=ResourceOption,
 )
 @click.option(
-    "--device",
-    "-d",
-    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
+    "--overlayed",
+    "-A",
+    help="Creates overlayed representations of the output probability maps, "
+    "similar to --overlayed in prediction-mode, except it includes "
+    "distinctive colours for true and false positives and false negatives.  "
+    "If not set, or empty then do **NOT** output overlayed images.  "
+    "Otherwise, the parameter represents the name of a folder where to "
+    "store those",
     show_default=True,
-    required=True,
-    default="cpu",
-    cls=ResourceOption,
-)
-@click.option(
-    "--weight",
-    "-w",
-    help="Path or URL to pretrained model file (.pth extension)",
-    required=True,
+    default=None,
+    required=False,
     cls=ResourceOption,
 )
 @verbosity_option(cls=ResourceOption)
-def evaluate(output_path, model, dataset, batch_size, device, weight, **kwargs):
+def evaluate(output_folder, predictions_folder, dataset, overlayed, **kwargs):
     """Evaluates an FCN on a binary segmentation task.
     """
-
-    # PyTorch dataloader
-    data_loader = DataLoader(
-        dataset=dataset,
-        batch_size=batch_size,
-        shuffle=False,
-        pin_memory=torch.cuda.is_available(),
-    )
-
-    # checkpointer, load last model in dir
-    checkpointer = DetectronCheckpointer(
-        model, save_dir=output_path, save_to_disk=False
-    )
-    checkpointer.load(weight)
-    do_inference(model, data_loader, device, output_path)
+    run(dataset, predictions_folder, output_folder)
diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index 560a8a197fad525f94ce290afebcd342d1509583..2c7929ec2127f76f3afaeaae34287fcfe8b6e862 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 \b
     1. Runs prediction on an existing dataset configuration:
 \b
-       $ bob binseg predict -vv m2unet drive-test --weight=path/to/model_final.pth --output-path=path/to/predictions
+       $ bob binseg predict -vv m2unet drive-test --weight=path/to/model_final.pth --output-folder=path/to/predictions
 \b
     2. To run prediction on a folder with your own images, you must first
        specify resizing, cropping, etc, so that the image can be correctly
@@ -42,13 +42,13 @@ logger = logging.getLogger(__name__)
 \b
        $ bob binseg config copy folder-dataset-example mydataset.py
        # modify "mydataset.py" to include the base path and required transforms
-       $ bob binseg predict -vv m2unet mydataset.py --weight=path/to/model_final.pth --output-path=path/to/predictions
+       $ bob binseg predict -vv m2unet mydataset.py --weight=path/to/model_final.pth --output-folder=path/to/predictions
 """,
 )
 @click.option(
-    "--output-path",
+    "--output-folder",
     "-o",
-    help="Path where to store the generated model (created if does not exist)",
+    help="Path where to store the predictions (created if does not exist)",
     required=True,
     default="results",
     cls=ResourceOption,
@@ -92,8 +92,33 @@ logger = logging.getLogger(__name__)
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--overlayed",
+    "-O",
+    help="Creates overlayed representations of the output probability maps on "
+    "top of input images (store results as PNG files).   If not set, or empty "
+    "then do **NOT** output overlayed images.  Otherwise, the parameter "
+    "represents the name of a folder where to store those",
+    show_default=True,
+    default=None,
+    required=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--transformed",
+    "-T",
+    help="Creates a version of the input dataset with transformations applied, "
+    "but before feeding to FCN.   If not set, or empty then do **NOT** output "
+    "transformed images.  Otherwise, the parameter represents the name of a "
+    "folder where to store those",
+    show_default=True,
+    default=None,
+    required=False,
+    cls=ResourceOption,
+)
 @verbosity_option(cls=ResourceOption)
-def predict(output_path, model, dataset, batch_size, device, weight, **kwargs):
+def predict(output_folder, model, dataset, batch_size, device, weight,
+        overlayed, transformed, **kwargs):
     """Predicts vessel map (probabilities) on input images"""
 
     # PyTorch dataloader
@@ -112,4 +137,8 @@ def predict(output_path, model, dataset, batch_size, device, weight, **kwargs):
             save_to_disk=False)
     checkpointer.load(weight_name)
 
-    run(model, data_loader, device, output_path)
+    # clean-up the overlayed path
+    if overlayed is not None:
+        overlayed = overlayed.strip()
+
+    run(model, data_loader, device, output_folder, overlayed, transformed)
diff --git a/bob/ip/binseg/test/test_batchmetrics.py b/bob/ip/binseg/test/test_batchmetrics.py
index 045b36385cb1a5edff0e734fc1656414ff6560ef..76b1313d4ed819cecfc8b2dd7e0f5125fd9723d9 100644
--- a/bob/ip/binseg/test/test_batchmetrics.py
+++ b/bob/ip/binseg/test/test_batchmetrics.py
@@ -2,11 +2,17 @@
 # -*- coding: utf-8 -*-
 
 import unittest
-from bob.ip.binseg.engine.inferencer import batch_metrics
 import random
-import shutil, tempfile
-import logging
+import shutil
+
 import torch
+import pandas
+import numpy
+
+from ..engine.evaluator import _sample_metrics
+
+import logging
+logger = logging.getLogger(__name__)
 
 
 class Tester(unittest.TestCase):
@@ -22,26 +28,19 @@ class Tester(unittest.TestCase):
         self.predictions = torch.rand(size=(2, 1, 420, 420))
         self.ground_truths = torch.randint(low=0, high=2, size=(2, 1, 420, 420))
         self.names = ["Bob", "Tim"]
-        self.output_folder = tempfile.mkdtemp()
-        self.logger = logging.getLogger(__name__)
-
-    def tearDown(self):
-        # Remove the temporary folder after the test
-        shutil.rmtree(self.output_folder)
 
     def test_batch_metrics(self):
-        bm = batch_metrics(
-            self.predictions,
-            self.ground_truths,
-            self.names,
-            self.output_folder,
-        )
+        dfs = []
+        for stem, pred, gt in zip(self.names, self.predictions,
+                self.ground_truths):
+            dfs.append(_sample_metrics(stem, pred, gt))
+        bm = pandas.concat(dfs)
+
         self.assertEqual(len(bm), 2 * 100)
-        for metric in bm:
-            # check whether f1 score agree
-            self.assertAlmostEqual(
-                metric[-1], 2 * (metric[-6] * metric[-5]) / (metric[-6] + metric[-5])
-            )
+        # check whether f1 score agree
+        calculated = bm.f1_score.to_numpy()
+        ours = (2*(bm.precision*bm.recall)/(bm.precision+bm.recall)).to_numpy()
+        assert numpy.isclose(calculated, ours).all()
 
 
 if __name__ == "__main__":
diff --git a/bob/ip/binseg/utils/evaluate.py b/bob/ip/binseg/utils/evaluate.py
deleted file mode 100644
index a921f61f96b63d03964dcfe7c06a4c06ad567304..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/utils/evaluate.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# only used to evaluate 2nd human annotator
-
-import os
-import numpy as np
-import torch
-import pandas as pd
-from tqdm import tqdm
-
-from bob.ip.binseg.utils.metric import base_metrics
-from bob.ip.binseg.utils.plot import (
-    precision_recall_f1iso,
-    precision_recall_f1iso_confintval,
-)
-from PIL import Image
-from torchvision.transforms.functional import to_tensor
-
-import logging
-logger = logging.getLogger(__name__)
-
-
-def batch_metrics(predictions, ground_truths, names, output_folder):
-    """
-    Calculates metrics on the batch and saves it to disc
-
-    Parameters
-    ----------
-    predictions : :py:class:`torch.Tensor`
-        tensor with pixel-wise probabilities
-    ground_truths : :py:class:`torch.Tensor`
-        tensor with binary ground-truth
-    names : list
-        list of file names
-    output_folder : str
-        output path
-
-    Returns
-    -------
-    list
-        list containing batch metrics: ``[name, threshold, precision, recall, specificity, accuracy, jaccard, f1_score]``
-    """
-    step_size = 0.01
-    batch_metrics = []
-
-    for j in range(predictions.size()[0]):
-        # ground truth byte
-        gts = ground_truths[j].byte()
-
-        file_name = "{}.csv".format(names[j])
-        logger.info("saving {}".format(file_name))
-
-        with open(os.path.join(output_folder, file_name), "w+") as outfile:
-
-            outfile.write(
-                "threshold, precision, recall, specificity, accuracy, jaccard, f1_score\n"
-            )
-
-            for threshold in np.arange(0.0, 1.0, step_size):
-                # threshold
-                binary_pred = torch.gt(predictions[j], threshold).byte()
-
-                # equals and not-equals
-                equals = torch.eq(binary_pred, gts)  # tensor
-                notequals = torch.ne(binary_pred, gts)  # tensor
-
-                # true positives
-                tp_tensor = gts * binary_pred  # tensor
-                tp_count = torch.sum(tp_tensor).item()  # scalar
-
-                # false positives
-                fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
-                fp_count = torch.sum(fp_tensor).item()
-
-                # true negatives
-                tn_tensor = equals - tp_tensor
-                tn_count = torch.sum(tn_tensor).item()
-
-                # false negatives
-                fn_tensor = notequals - fp_tensor
-                fn_count = torch.sum(fn_tensor).item()
-
-                # calc metrics
-                metrics = base_metrics(tp_count, fp_count, tn_count, fn_count)
-
-                # write to disk
-                outfile.write(
-                    "{:.2f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f} \n".format(
-                        threshold, *metrics
-                    )
-                )
-
-                batch_metrics.append([names[j], threshold, *metrics])
-
-    return batch_metrics
-
-
-def do_eval(
-    prediction_folder,
-    data_loader,
-    output_folder=None,
-    title="2nd human",
-    legend="2nd human",
-    prediction_extension=None,
-):
-
-    """
-    Calculate metrics on saved prediction images (needs batch_size = 1 !)
-
-    Parameters
-    ---------
-    model : :py:class:`torch.nn.Module`
-        neural network model (e.g. DRIU, HED, UNet)
-    data_loader : py:class:`torch.torch.utils.data.DataLoader`
-    device : str
-        device to use ``'cpu'`` or ``'cuda'``
-    output_folder : str
-    """
-    logger.info("Start evaluation")
-    logger.info("Prediction folder {}".format(prediction_folder))
-    results_subfolder = os.path.join(output_folder, "results")
-    os.makedirs(results_subfolder, exist_ok=True)
-
-    # Collect overall metrics
-    metrics = []
-    for samples in tqdm(data_loader):
-        names = samples[0]
-        ground_truths = samples[2]
-
-        if prediction_extension is None:
-            pred_file = os.path.join(prediction_folder, names[0])
-        else:
-            pred_file = os.path.join(
-                prediction_folder, os.path.splitext(names[0])[0] + ".png"
-            )
-        probabilities = Image.open(pred_file)
-        probabilities = probabilities.convert(mode="L")
-        probabilities = to_tensor(probabilities)
-
-        b_metrics = batch_metrics(
-            probabilities, ground_truths, names, results_subfolder
-        )
-        metrics.extend(b_metrics)
-
-    # DataFrame
-    df_metrics = pd.DataFrame(
-        metrics,
-        columns=[
-            "name",
-            "threshold",
-            "precision",
-            "recall",
-            "specificity",
-            "accuracy",
-            "jaccard",
-            "f1_score",
-        ],
-    )
-
-    # Report and Averages
-    metrics_file = "Metrics.csv"
-    metrics_path = os.path.join(results_subfolder, metrics_file)
-    logger.info("Saving average over all input images: {}".format(metrics_file))
-
-    avg_metrics = df_metrics.groupby("threshold").mean()
-    std_metrics = df_metrics.groupby("threshold").std()
-
-    # Uncomment below for F1-score calculation based on average precision and metrics instead of
-    # F1-scores of individual images. This method is in line with Maninis et. al. (2016)
-    # avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-    #    (avg_metrics["precision"]+avg_metrics["recall"])
-
-    avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + avg_metrics["std_pr"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - avg_metrics["std_pr"]
-    avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + avg_metrics["std_re"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
-    avg_metrics["std_f1"] = std_metrics["f1_score"]
-
-    avg_metrics.to_csv(metrics_path)
-    maxf1 = avg_metrics["f1_score"].max()
-    optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
-
-    logger.info(
-        "Highest F1-score of {:.5f}, achieved at threshold {}".format(
-            maxf1, optimal_f1_threshold
-        )
-    )
-
-    # Plotting
-    # print(avg_metrics)
-    np_avg_metrics = avg_metrics.to_numpy().T
-    fig_name = "precision_recall.pdf"
-    logger.info("saving {}".format(fig_name))
-    fig = precision_recall_f1iso_confintval(
-        [np_avg_metrics[0]],
-        [np_avg_metrics[1]],
-        [np_avg_metrics[7]],
-        [np_avg_metrics[8]],
-        [np_avg_metrics[10]],
-        [np_avg_metrics[11]],
-        [legend, None],
-        title=title,
-    )
-    fig_filename = os.path.join(results_subfolder, fig_name)
-    fig.savefig(fig_filename)
diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py
index 2873b71565d3c667331e6fb0162ffc52a418ea0e..ecfbe92bbcb9b438f2d9a7cb8d06fa777f7a6584 100644
--- a/bob/ip/binseg/utils/plot.py
+++ b/bob/ip/binseg/utils/plot.py
@@ -120,8 +120,6 @@ def precision_recall_f1iso_confintval(
     precision, recall, pr_upper, pr_lower, re_upper, re_lower, names, title=None
 ):
     """
-    Author: Andre Anjos (andre.anjos@idiap.ch).
-
     Creates a precision-recall plot of the given data.
     The plot will be annotated with F1-score iso-lines (in which the F1-score
     maintains the same value)
@@ -132,13 +130,16 @@ def precision_recall_f1iso_confintval(
         A list of 1D np arrays containing the Y coordinates of the plot, or
         the precision, or a 2D np array in which the rows correspond to each
         of the system's precision coordinates.
+
     recall : :py:class:`numpy.ndarray` or :py:class:`list`
         A list of 1D np arrays containing the X coordinates of the plot, or
         the recall, or a 2D np array in which the rows correspond to each
         of the system's recall coordinates.
+
     names : :py:class:`list`
         An iterable over the names of each of the systems along the rows of
         ``precision`` and ``recall``
+
     title : :py:class:`str`, optional
         A title for the plot. If not set, omits the title
 
@@ -462,63 +463,3 @@ def metricsviz(
         if not os.path.exists(fulldir):
             os.makedirs(fulldir)
         tp_pil_colored.save(fullpath)
-
-
-def overlay(dataset, output_path):
-    """Overlays prediction probabilities vessel tree with original test image.
-
-    Parameters
-    ----------
-    dataset : :py:class:`torch.utils.data.Dataset`
-    output_path : str
-        path where results and probability output images are stored. E.g. ``'DRIVE/MODEL'``
-    """
-
-    for sample in dataset:
-        # get sample
-        name = sample[0]
-        img = VF.to_pil_image(sample[1])  # PIL Image
-
-        # read probability output
-        pred = PIL.Image.open(os.path.join(output_path, "images", name)).convert(mode="L")
-        # color and overlay
-        pred_green = PIL.ImageOps.colorize(pred, (0, 0, 0), (0, 255, 0))
-        overlayed = PIL.Image.blend(img, pred_green, 0.4)
-
-        # add f1-score
-        # fnt_size = overlayed.size[1]//25
-        # draw = PIL.ImageDraw.Draw(overlayed)
-        # fnt = PIL.ImageFont.truetype('FreeMono.ttf', fnt_size)
-        # draw.text((0, 0),"F1: {:.4f}".format(f1),(255,255,255),font=fnt)
-        # save to disk
-        overlayed_path = os.path.join(output_path, "overlayed")
-        fullpath = os.path.join(overlayed_path, name)
-        fulldir = os.path.dirname(fullpath)
-        if not os.path.exists(fulldir):
-            os.makedirs(fulldir)
-        overlayed.save(fullpath)
-
-
-def savetransformedtest(dataset, output_path):
-    """Save the test images as they are fed into the neural network.
-    Makes it easier to create overlay animations (e.g. slide)
-
-    Parameters
-    ----------
-    dataset : :py:class:`torch.utils.data.Dataset`
-    output_path : str
-        path where results and probability output images are stored. E.g. ``'DRIVE/MODEL'``
-    """
-
-    for sample in dataset:
-        # get sample
-        name = sample[0]
-        img = VF.to_pil_image(sample[1])  # PIL Image
-
-        # save to disk
-        testimg_path = os.path.join(output_path, "transformedtestimages")
-        fullpath = os.path.join(testimg_path, name)
-        fulldir = os.path.dirname(fullpath)
-        if not os.path.exists(fulldir):
-            os.makedirs(fulldir)
-        img.save(fullpath)
diff --git a/doc/api.rst b/doc/api.rst
index 3643295bd9f07e23bc874a23cbff45fa370c5b9e..f90f6af99d64cce705fc2cac4ba7121e94f80f3e 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -29,11 +29,11 @@ Engines
    :toctree: api/engine
 
    bob.ip.binseg.engine
-   bob.ip.binseg.engine.adabound
-   bob.ip.binseg.engine.inferencer
-   bob.ip.binseg.engine.predictor
-   bob.ip.binseg.engine.ssltrainer
    bob.ip.binseg.engine.trainer
+   bob.ip.binseg.engine.ssltrainer
+   bob.ip.binseg.engine.predictor
+   bob.ip.binseg.engine.evaluator
+   bob.ip.binseg.engine.adabound
 
 
 Neural Network Models
@@ -68,7 +68,6 @@ Toolbox
    bob.ip.binseg.utils
    bob.ip.binseg.utils.checkpointer
    bob.ip.binseg.utils.click
-   bob.ip.binseg.utils.evaluate
    bob.ip.binseg.utils.metric
    bob.ip.binseg.utils.model_serialization
    bob.ip.binseg.utils.model_zoo
diff --git a/doc/evaluation.rst b/doc/evaluation.rst
index 0329dabdac4b888cc6faa9fb1822ad01e902735a..95ab253844b6f21c13449eb0b1fb98054947ac25 100644
--- a/doc/evaluation.rst
+++ b/doc/evaluation.rst
@@ -68,54 +68,20 @@ things up using ``--device='cuda:0'`` in case you have a GPU.
 Evaluation
 ----------
 
-In evaluation we input an **annotated** dataset and a pre-trained model to
-output a complete set of performance figures that can help analysis of model
-performance.  Evaluation is done using ``bob binseg evaluate`` followed by the
-model and the dataset configuration, and the path to the pretrained model via
-the ``--weight`` argument.
+In evaluation, we input an **annotated** dataset and predictions to generate
+performance figures that can help the analysis of a trained model.
+Evaluation is done using ``bob binseg evaluate`` followed by the annotated
+dataset configuration, the path to the predictions generated by ``bob binseg
+predict`` (via ``--predictions-folder``), and the path where to store the
+results (via ``--output-folder``).
 
 Use ``bob binseg evaluate --help`` for more information.
 
-E.g. run inference on model M2U-Net on the DRIVE test set:
+E.g., to run evaluation on predictions for the DRIVE test set, do the following:
 
 .. code-block:: bash
 
-    # Point directly to saved model via -w argument:
+    # Point to the folder with previously generated predictions via -p:
-    bob binseg evaluate m2unet drive-test -o /outputfolder/for/results -w /direct/path/to/weight/model_final.pth
-
-    # Use training output path (requries last_checkpoint file to be present)
-    # The evaluation results will be stored in the same folder
-    bob binseg test m2unet drive-test -o /outputfolder/for/results
-
-
-Outputs
-========
-The inference run generates the following output files:
-
-.. code-block:: text
-
-    .
-    ├── images  # the predicted probabilities as grayscale images in .png format
-    ├── hdf5    # the predicted probabilties in hdf5 format
-    ├── last_checkpoint  # text file that keeps track of the last checkpoint
-    ├── trainlog.csv # training log
-    ├── trainlog.pdf # training log plot
-    ├── model_*.pth # model checkpoints
-    └── results
-        ├── image*.jpg.csv # evaluation metrics for each image
-        ├── Metrics.csv # average evaluation metrics
-        ├── ModelSummary.txt # model summary and parameter count
-        ├── precision_recall.pdf # precision vs recall plot
-        └── Times.txt # inference times
-
-
-To run evaluation of pretrained models pass url as ``-w`` argument. E.g.:
-
-.. code-block:: bash
-
-    bob binseg test DRIU DRIVETEST -o Evaluation_DRIU_DRIVE -w https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/DRIU_DRIVE.pth
-    bob binseg test M2UNet DRIVETEST -o Evaluation_M2UNet_DRIVE -w https://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/M2UNet_DRIVE.pth
-
+    bob binseg evaluate -vv drive-test -p /predictions/folder -o /eval/results/folder
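+
+The evaluation step stores the following files in the output folder (as
+currently produced by the evaluation engine):
+
+.. code-block:: text
+
+    .
+    ├── metrics.csv          # metrics averaged over all images, per threshold
+    └── precision-recall.pdf # precision-recall plot with confidence intervals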
 
 
 .. include:: links.rst