diff --git a/doc/api.rst b/doc/api.rst
index e166ddbe5fbedce84c4f2c3876dfbdfaf0239469..54457da74cf1a82fa0c28f554b6422482b61129f 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -65,6 +65,20 @@ Functions to actuate on the data.
    ptbench.engine.evaluator
 
 
+.. _ptbench.api.saliency:
+
+Saliency Map Generation and Analysis
+------------------------------------
+
+Engines to generate saliency maps and to analyze saliency mapping techniques.
+
+.. autosummary::
+   :toctree: api/saliency
+
+   ptbench.engine.saliency.generator
+   ptbench.engine.saliency.completeness
+   ptbench.engine.saliency.interpretability
+
 
 .. _ptbench.api.utils:
 
diff --git a/doc/references.rst b/doc/references.rst
index 5692dfb2743593e386a4cab6af63f518ed2e2307..c2712bc6c6c578079422f916f372a8801a7afdad 100644
--- a/doc/references.rst
+++ b/doc/references.rst
@@ -70,3 +70,8 @@
    Explanations for Convolutional Neural Networks** 2020 IEEE/CVF Conference on
    Computer Vision and Pattern Recognition Workshops (CVPRW), Seattle, WA, USA,
    2020 pp. 111-119. doi: https://doi.org/10.1109/CVPRW50498.2020.00020
+
+.. [ROAD-2022] *Y. Rong, T. Leemann, V. Borisov, G. Kasneci, and E. Kasneci*,
+   **A Consistent and Efficient Evaluation Strategy for Attribution Methods**
+   in Proceedings of the 39th International Conference on Machine Learning,
+   PMLR, Jun. 2022, pp. 18770–18795. https://proceedings.mlr.press/v162/rong22a.html
diff --git a/src/ptbench/data/datamodule.py b/src/ptbench/data/datamodule.py
index e1ba75a3da01179e48b274d4f6d488c67a4f6bf7..3d7e1a64fa3898f282822ef536e0b054311a5ff0 100644
--- a/src/ptbench/data/datamodule.py
+++ b/src/ptbench/data/datamodule.py
@@ -717,7 +717,7 @@ class ConcatDataModule(lightning.LightningDataModule):
         if self.cache_samples:
             logger.info(
                 f"Loading dataset:`{name}` into memory (caching)."
-                f" Trade-off: CPU RAM: more | Disk: less"
+                f" Trade-off: CPU RAM usage: more | Disk I/O: less"
             )
             for split, loader in self.splits[name]:
                 datasets.append(
@@ -731,7 +731,7 @@ class ConcatDataModule(lightning.LightningDataModule):
         else:
             logger.info(
                 f"Loading dataset:`{name}` without caching."
-                f" Trade-off: CPU RAM: less | Disk: more"
+                f" Trade-off: CPU RAM usage: less | Disk I/O: more"
             )
             for split, loader in self.splits[name]:
                 datasets.append(
diff --git a/src/ptbench/engine/road_calculator.py b/src/ptbench/engine/road_calculator.py
deleted file mode 100644
index d4ebd329cd9ced34239b0a1d9a670dbba599dfd6..0000000000000000000000000000000000000000
--- a/src/ptbench/engine/road_calculator.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-import logging
-import os
-
-import numpy as np
-import torch
-
-from pytorch_grad_cam.metrics.road import (
-    ROADCombined,
-    ROADLeastRelevantFirstAverage,
-    ROADMostRelevantFirstAverage,
-)
-from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
-from tqdm import tqdm
-
-logger = logging.getLogger(__name__)
-
-
-class SigmoidClassifierOutputTarget:
-    def __init__(self, category):
-        self.category = category
-
-    def __call__(self, model_output):
-        sigmoid_output = torch.sigmoid(model_output)
-        if len(sigmoid_output.shape) == 1:
-            return sigmoid_output[self.category]
-        return sigmoid_output[:, self.category]
-
-
-rs_maps = {
-    0: "cardiomegaly",
-    1: "emphysema",
-    2: "effusion",
-    3: "hernia",
-    4: "infiltration",
-    5: "mass",
-    6: "nodule",
-    7: "atelectasis",
-    8: "pneumothorax",
-    9: "pleural thickening",
-    10: "pneumonia",
-    11: "fibrosis",
-    12: "edema",
-    13: "consolidation",
-}
-
-
-def calculate_road_metrics(
-    input_image,
-    grayscale_cams,
-    model,
-    targets=None,
-    percentiles=[20, 40, 60, 80],
-):
-    """Calculates ROAD scores by averaging the scores for different percentiles
-    for a single input image for a given visualization method and a given
-    target class."""
-    cam_metric_ROADMoRF_avg = ROADMostRelevantFirstAverage(
-        percentiles=percentiles
-    )
-    cam_metric_ROADLeRF_avg = ROADLeastRelevantFirstAverage(
-        percentiles=percentiles
-    )
-    cam_metric_ROADCombined_avg = ROADCombined(percentiles=percentiles)
-
-    # Calculate ROAD scores for each percentile
-    MoRF_scores = cam_metric_ROADMoRF_avg(
-        input_tensor=input_image,
-        cams=grayscale_cams,
-        model=model,
-        targets=targets,
-    )
-    LeRF_scores = cam_metric_ROADLeRF_avg(
-        input_tensor=input_image,
-        cams=grayscale_cams,
-        model=model,
-        targets=targets,
-    )
-    combined_scores = cam_metric_ROADCombined_avg(
-        input_tensor=input_image,
-        cams=grayscale_cams,
-        model=model,
-        targets=targets,
-    )
-
-    return MoRF_scores, LeRF_scores, combined_scores
-
-
-# Helper function to calculate the ROAD scores for a single target class
-# of a single input image.
-def process_target_class(
-    model,
-    names,
-    images,
-    targets,
-    metric_targets,
-    cam,
-    csv_writer,
-    percentiles,
-):
-    grayscale_cams = cam(input_tensor=images, targets=targets)
-
-    MoRF_scores, LeRF_scores, combined_scores = calculate_road_metrics(
-        input_image=images,
-        grayscale_cams=grayscale_cams,
-        model=model,
-        targets=metric_targets,
-        percentiles=percentiles,
-    )
-
-    MoRF_score = MoRF_scores[0]
-    LeRF_score = LeRF_scores[0]
-    combined_score = combined_scores[0]
-
-    # Write metrics to csv file
-    csv_writer.writerow(
-        [
-            names[0],
-            MoRF_score,
-            LeRF_score,
-            combined_score,
-            str(percentiles),
-        ]
-    )
-
-
-def run(
-    model,
-    data_loader,
-    output_folder,
-    device,
-    cam,
-    csv_writers,
-    target_class="highest",
-    tb_positive_only=True,
-):
-    """Applies visualization techniques on input CXR, and perturbs them to
-    calculate ROAD scores.
-
-    Parameters
-    ---------
-    model
-        Neural network model (e.g. pasa).
-
-    data_loader
-        The pytorch lightning Dataloader used to iterate over batches.
-
-    output_folder : str
-        Directory in which the results will be saved.
-
-    dataset_split_name : str
-        Name of the dataset split (e.g. "train", "validation", "test").
-
-    device : str
-        A string indicating the device to use (e.g. "cpu" or "cuda"). The device can also be specified (cuda:0)
-
-    cam : py:class: `pytorch_grad_cam.GradCAM`, `pytorch_grad_cam.ScoreCAM`,
-    `pytorch_grad_cam.FullGrad`, `pytorch_grad_cam.RandomCAM`,
-    `pytorch_grad_cam.EigenCAM`, `pytorch_grad_cam.EigenGradCAM`,
-    `pytorch_grad_cam.LayerCAM`, `pytorch_grad_cam.XGradCAM`,
-    `pytorch_grad_cam.AblationCAM`, `pytorch_grad_cam.HiResCAM`,
-    `pytorch_grad_cam.GradCAMElementWise`, `pytorch_grad_cam.GradCAMplusplus`,
-        The CAM object to use for visualization.
-
-    visualization_types : list
-        Type of visualization techniques to be applied. Possible values are:
-        "GradCAM", "ScoreCAM", "FullGrad", "RandomCAM", "HiResCAM", "GradCAMElementWise", "GradCAMPlusPlus", "XGradCAM", "AblationCAM",
-        "EigenCAM", "EigenGradCAM", "LayerCAM".
-
-    csv_writers : dict
-        Dictionary containing csv writer objects for each target class.
-
-    target_class : str
-        (Use only with multi-label models) Which class to target for CAM calculation. Can be either set to "all" or "highest". "highest" is default, which means only visualizations for the class with the highest activation will be generated.
-
-    tb_positive_only : bool
-        If set, only TB positive samples will be visualized.
-
-    Returns
-    -------
-
-    all_road_scores : list
-        All the ROAD scores associated with filename, saved as .csv.
-    """
-    output_folder = os.path.abspath(output_folder)
-
-    logger.info(f"Output folder: {output_folder}")
-    os.makedirs(output_folder, exist_ok=True)
-
-    model_name = model.__class__.__name__
-
-    percentiles = [20, 40, 60, 80]
-
-    for samples in tqdm(data_loader, desc="batches", leave=False, disable=None):
-        # TB negative labels are skipped
-        if samples[1]["label"].item() == 0:
-            if tb_positive_only:
-                continue
-
-        names = samples[1]["name"]
-        images = samples[0].to(
-            device=device, non_blocking=torch.cuda.is_available()
-        )
-
-        if model_name == "DensenetRS" and target_class.lower() == "all":
-            for target in range(14):
-                targets = [ClassifierOutputTarget(target)]
-                metric_targets = [SigmoidClassifierOutputTarget(target)]
-
-                csv_writer = csv_writers[rs_maps[target]]
-
-                process_target_class(
-                    model,
-                    names,
-                    images,
-                    targets,
-                    metric_targets,
-                    cam,
-                    csv_writer,
-                    percentiles,
-                )
-
-        if model_name == "DensenetRS":
-            # Get the class with highest activation manually
-            outputs = cam.activations_and_grads(images)
-            target_categories = np.argmax(outputs.cpu().data.numpy(), axis=-1)
-            targets = [
-                ClassifierOutputTarget(category)
-                for category in target_categories
-            ]
-            metric_targets = [
-                SigmoidClassifierOutputTarget(category)
-                for category in target_categories
-            ]
-        else:
-            targets = [ClassifierOutputTarget(0)]
-            metric_targets = [SigmoidClassifierOutputTarget(0)]
-
-        csv_writer = csv_writers["targeted_class"]
-
-        process_target_class(
-            model,
-            names,
-            images,
-            targets,
-            metric_targets,
-            cam,
-            csv_writer,
-            percentiles,
-        )
diff --git a/src/ptbench/engine/saliency/__init__.py b/src/ptbench/engine/saliency/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/ptbench/engine/saliency/completeness.py b/src/ptbench/engine/saliency/completeness.py
new file mode 100644
index 0000000000000000000000000000000000000000..87778d74955fe40d3f54882afd677949b08a6d7b
--- /dev/null
+++ b/src/ptbench/engine/saliency/completeness.py
@@ -0,0 +1,274 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import logging
+import typing
+
+import lightning.pytorch
+import numpy as np
+import torch
+import tqdm
+
+from pytorch_grad_cam.metrics.road import (
+    ROADLeastRelevantFirstAverage,
+    ROADMostRelevantFirstAverage,
+)
+from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
+
+from ...models.typing import SaliencyMapAlgorithm
+from ..device import DeviceManager
+
+logger = logging.getLogger(__name__)
+
+
+class SigmoidClassifierOutputTarget(torch.nn.Module):
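+    """Output target that applies a sigmoid to a selected model output.
+
+    Unlike the plain ``ClassifierOutputTarget`` from grad-cam, this target
+    first converts logits into probabilities, which suits models whose
+    outputs are per-class logits to be passed through a sigmoid (binary or
+    multi-label settings).
+    """
+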
+    def __init__(self, category):
+        super().__init__()
+        self.category = category
+
+    def __call__(self, model_output):
+        sigmoid_output = torch.sigmoid(model_output)
+        if len(sigmoid_output.shape) == 1:
+            return sigmoid_output[self.category]
+        return sigmoid_output[:, self.category]
+
+
+def _calculate_road_scores(
+    model: lightning.pytorch.LightningModule,
+    images: torch.Tensor,
+    output_num: int,
+    saliency_map_callable: typing.Callable,
+    percentiles: typing.Sequence[int],
+) -> tuple[float, float, float]:
+    """Calculates average ROAD scores for different removal percentiles.
+
+    This function calculates ROAD scores by averaging the scores obtained for
+    the different removal percentiles, for a single input image, a given
+    visualization method, and a given target class.
+
+
+    Parameters
+    ----------
+    model
+        Neural network model (e.g. pasa).
+    images
+        A batch of input images to use for evaluating the ROAD scores.
+        Currently, we only support batches with a single image.
+    output_num
+        Target output neuron to take into consideration when evaluating the
+        saliency maps and calculating ROAD scores.
+    saliency_map_callable
+        A callable saliency-map generator from grad-cam.
+    percentiles
+        A sequence of integer values, expressed as percentages (0-100),
+        indicating the proportion of pixels to perturb in the original image
+        to calculate both MoRF and LeRF scores.
+
+
+    Returns
+    -------
+        A 3-tuple containing floating point numbers representing the
+        most-relevant-first score (``morf``), least-relevant-first score
+        (``lerf``), and the combined score (``(lerf-morf)/2``).
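+
+        For example (illustrative numbers only): ``morf=0.2`` and ``lerf=0.8``
+        yield a combined score of ``(0.8-0.2)/2 = 0.3``.  A larger gap between
+        ``lerf`` and ``morf`` indicates a saliency map that better ranks pixel
+        relevance.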
+    """
+    saliency_map = saliency_map_callable(
+        input_tensor=images, targets=[ClassifierOutputTarget(output_num)]
+    )
+
+    cam_metric_ROADMoRF_avg = ROADMostRelevantFirstAverage(
+        percentiles=percentiles
+    )
+    cam_metric_ROADLeRF_avg = ROADLeastRelevantFirstAverage(
+        percentiles=percentiles
+    )
+
+    # Calculate ROAD scores for all percentiles and average - this is NOT the
+    # current processing bottleneck.  If you want to optimise anything, look at
+    # the evaluation of the perturbation using scipy.sparse in the
+    # NoisyLinearImputer, part of the grad-cam package (submodule
+    # ``metrics.road``).
+    metric_target = [SigmoidClassifierOutputTarget(output_num)]
+
+    MoRF_scores = cam_metric_ROADMoRF_avg(
+        input_tensor=images,
+        cams=saliency_map,
+        model=model,
+        targets=metric_target,
+    )
+
+    LeRF_scores = cam_metric_ROADLeRF_avg(
+        input_tensor=images,
+        cams=saliency_map,
+        model=model,
+        targets=metric_target,
+    )
+
+    return (
+        float(MoRF_scores.item()),
+        float(LeRF_scores.item()),
+        float(LeRF_scores.item() - MoRF_scores.item()) / 2.0,
+    )
+
+
+def run(
+    model: lightning.pytorch.LightningModule,
+    datamodule: lightning.pytorch.LightningDataModule,
+    device_manager: DeviceManager,
+    saliency_map_algorithm: SaliencyMapAlgorithm,
+    target_class: typing.Literal["highest", "all"],
+    positive_only: bool,
+    percentiles: typing.Sequence[int],
+) -> dict[str, list[typing.Any]]:
+    """Evaluates ROAD scores for all samples in a datamodule.
+
+    The ROAD algorithm was first described in [ROAD-2022]_. It estimates
+    explainability (in the completeness sense) of saliency maps by substituting
+    relevant pixels in the input image with a local average, re-running
+    prediction on the altered image, and measuring changes in the output
+    classification score when said perturbations are in place.  By substituting
+    most or least relevant pixels with surrounding averages, the ROAD algorithm
+    estimates the importance of such elements in the produced saliency map.  As
+    of 2023, this measurement technique is considered to be one of the
+    state-of-the-art metrics of explainability.
+
+    This function returns a dictionary containing most-relevant-first (remove a
+    percentile of the most relevant pixels), least-relevant-first (remove a
+    percentile of the least relevant pixels), and combined ROAD evaluations per
+    sample for a particular saliency mapping algorithm.
+
+
+    Parameters
+    ----------
+    model
+        Neural network model (e.g. pasa).
+    datamodule
+        The lightning datamodule to iterate on.
+    device_manager
+        An internal device representation, to be used for training and
+        validation.  This representation can be converted into a pytorch device
+        or a torch lightning accelerator setup.
+    saliency_map_algorithm
+        The algorithm for saliency map estimation to use.
+    target_class
+        (Use only with multi-label models) Which class to target for CAM
+        calculation. Can be either set to "all" or "highest". "highest" is
+        default, which means only saliency maps for the class with the highest
+        activation will be generated.
+    positive_only
+        If set, saliency maps will only be generated for positive samples (ie.
+        label == 1 in a binary classification task).  This option is ignored on
+        a multi-class output model.
+    percentiles
+        A sequence of integer values, expressed as percentages (0-100),
+        indicating the proportion of pixels to perturb in the original image
+        to calculate both MoRF and LeRF scores.
+
+
+    Returns
+    -------
+
+        A dictionary where keys are dataset names in the provided datamodule,
+        and values are lists containing sample information alongside metrics
+        calculated:
+
+        * Sample name
+        * Sample target class
+        * The model output number used for the ROAD analysis (0, for binary
+          classifiers, as there is typically only one output).
+        * ``morf``: ROAD most-relevant-first score, averaged over the requested
+          percentiles.
+        * ``lerf``: ROAD least-relevant-first score, averaged over the requested
+          percentiles.
+        * ``combined``: ROAD combined score, evaluated as ``(lerf-morf)/2``.
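+
+        For example, for a binary model evaluated on a split named ``test``
+        (all values are illustrative only):
+
+        .. code:: python
+
+           {"test": [["sample-1.png", 1, 0, 0.12, 0.48, 0.18], ...]}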
+    """
+
+    from ...models.densenet import Densenet
+    from ...models.pasa import Pasa
+    from .generator import _create_saliency_map_callable
+
+    if isinstance(model, Pasa):
+        if saliency_map_algorithm == "fullgrad":
+            raise ValueError(
+                "Fullgrad saliency map algorithm is not supported for the "
+                "Pasa model."
+            )
+        target_layers = [model.fc14]  # Last non-1x1 Conv2d layer
+    elif isinstance(model, Densenet):
+        target_layers = [
+            model.model_ft.features.denseblock4.denselayer16.conv2,  # type: ignore
+        ]
+    else:
+        raise TypeError(f"Model of type `{type(model)}` is not yet supported.")
+
+    use_cuda = device_manager.device_type == "cuda"
+
+    # prepares model for evaluation, cast to target device
+    device = device_manager.torch_device()
+    model = model.to(device)
+    model.eval()
+
+    saliency_map_callable = _create_saliency_map_callable(
+        saliency_map_algorithm,
+        model,
+        target_layers,  # type: ignore
+        use_cuda,
+    )
+
+    retval: dict[str, list[typing.Any]] = {}
+
+    for k, v in datamodule.predict_dataloader().items():
+        logger.info(f"Computing ROAD scores for dataset `{k}`...")
+        retval[k] = []
+
+        for sample in tqdm.tqdm(v, desc="samples", leave=False, disable=None):
+            name = sample[1]["name"][0]
+            label = int(sample[1]["label"].item())
+            image = sample[0].to(
+                device=device, non_blocking=torch.cuda.is_available()
+            )
+
+            # in binary classification systems, negative labels may be skipped
+            if positive_only and (model.num_classes == 1) and (label == 0):
+                retval[k].append([name, label])
+                continue
+
+            # chooses target outputs to generate saliency maps for
+            if model.num_classes > 1:
+                if target_class == "all":
+                    # test all outputs
+                    for output_num in range(model.num_classes):
+                        results = _calculate_road_scores(
+                            model,
+                            image,
+                            output_num,
+                            saliency_map_callable,
+                            percentiles,
+                        )
+                        retval[k].append([name, label, output_num, *results])
+
+                else:
+                    # we will figure out the output with the highest value and
+                    # evaluate the saliency mapping technique over it.
+                    outputs = saliency_map_callable.activations_and_grads(image)
+                    output_nums = np.argmax(outputs.cpu().data.numpy(), axis=-1)
+                    assert len(output_nums) == 1
+                    results = _calculate_road_scores(
+                        model,
+                        image,
+                        output_nums[0],
+                        saliency_map_callable,
+                        percentiles,
+                    )
+                    retval[k].append([name, label, output_nums[0], *results])
+
+            else:
+                results = _calculate_road_scores(
+                    model,
+                    image,
+                    0,
+                    saliency_map_callable,
+                    percentiles,
+                )
+                retval[k].append([name, label, 0, *results])
+
+    return retval
diff --git a/src/ptbench/engine/saliencymap_generator.py b/src/ptbench/engine/saliency/generator.py
similarity index 99%
rename from src/ptbench/engine/saliencymap_generator.py
rename to src/ptbench/engine/saliency/generator.py
index d3233b6aa3b68f8e68a6c54d8545ffc160911353..eda78ed49937db7d718e048f93c20bd9eeabfd04 100644
--- a/src/ptbench/engine/saliencymap_generator.py
+++ b/src/ptbench/engine/saliency/generator.py
@@ -12,8 +12,8 @@ import torch
 import torch.nn
 import tqdm
 
-from ..models.typing import SaliencyMapAlgorithm
-from .device import DeviceManager
+from ...models.typing import SaliencyMapAlgorithm
+from ..device import DeviceManager
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/ptbench/engine/saliencymap_evaluator.py b/src/ptbench/engine/saliency/interpretability.py
similarity index 96%
rename from src/ptbench/engine/saliencymap_evaluator.py
rename to src/ptbench/engine/saliency/interpretability.py
index 9a5fc15680fb433a68e669422f894b56b044ba8a..ae9cca77a9bd14bd26ed69e34d1564f7d3eb2abf 100644
--- a/src/ptbench/engine/saliencymap_evaluator.py
+++ b/src/ptbench/engine/saliency/interpretability.py
@@ -253,7 +253,7 @@ def _compute_proportional_energy(
 def _process_sample(
     gt_bboxes: typing.Sequence[tuple[int, int, int, int]],
     saliency_map: numpy.typing.NDArray[numpy.double],
-) -> tuple[float, float, float, float, float, float, float, float]:
+) -> tuple[float, float, float, float, int, int, int, int]:
     """Calculates the metrics for a single sample.
 
     Parameters
@@ -268,6 +268,18 @@ def _process_sample(
           pixels
         * width: width of the bounding box, in pixels
         * height: height of the bounding box, in pixels
+
+
+    Returns
+    -------
+
+        A tuple containing the following values:
+
+        * IoU
+        * IoDA
+        * Proportional energy
+        * Average saliency focus
+        * Largest detected bounding box (x, y, width, height)
     """
 
     masks = _ordered_connected_components(saliency_map)
@@ -317,13 +329,13 @@ def run(
         and values are lists containing sample information alongside metrics
         calculated:
 
-        * Sample name
-        * Sample target class
-        * IoU
-        * IoDA
-        * Proportional energy
-        * Average saliency focus
-        * Largest detected bounding box
+        * Sample name (str)
+        * Sample target class (int)
+        * IoU (float)
+        * IoDA (float)
+        * Proportional energy (float)
+        * Average saliency focus (float)
+        * Largest detected bounding box (x, y, width, height) (4 x int)
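+
+        For example, for a split named ``test`` (values are illustrative
+        only):
+
+        .. code:: python
+
+           {"test": [["sample-1.png", 1, 0.25, 0.33, 0.47, 0.51, 10, 12, 100, 80]]}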
     """
 
     retval: dict[str, list[typing.Any]] = {}
diff --git a/src/ptbench/scripts/calculate_road.py b/src/ptbench/scripts/calculate_road.py
deleted file mode 100644
index 86bfcecec5058f1c98973beda70cf9ee45e5f123..0000000000000000000000000000000000000000
--- a/src/ptbench/scripts/calculate_road.py
+++ /dev/null
@@ -1,367 +0,0 @@
-# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-import csv
-import os
-import pathlib
-
-import click
-
-from clapper.click import ConfigCommand, ResourceOption, verbosity_option
-from clapper.logging import setup
-from pytorch_grad_cam import (
-    AblationCAM,
-    EigenCAM,
-    EigenGradCAM,
-    FullGrad,
-    GradCAM,
-    GradCAMElementWise,
-    GradCAMPlusPlus,
-    HiResCAM,
-    LayerCAM,
-    RandomCAM,
-    ScoreCAM,
-    XGradCAM,
-)
-
-logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
-
-allowed_visualization_types = {
-    "gradcam",
-    "scorecam",
-    "fullgrad",
-    "randomcam",
-    "hirescam",
-    "gradcamelementwise",
-    "gradcam++",
-    "gradcamplusplus",
-    "xgradcam",
-    "ablationcam",
-    "eigencam",
-    "eigengradcam",
-    "layercam",
-}
-
-rs_maps = {
-    0: "cardiomegaly",
-    1: "emphysema",
-    2: "effusion",
-    3: "hernia",
-    4: "infiltration",
-    5: "mass",
-    6: "nodule",
-    7: "atelectasis",
-    8: "pneumothorax",
-    9: "pleural thickening",
-    10: "pneumonia",
-    11: "fibrosis",
-    12: "edema",
-    13: "consolidation",
-}
-
-
-# To ensure that the user has selected a supported visualization type
-def check_vis_types(vis_types):
-    if isinstance(vis_types, str):
-        vis_types = [vis_types.lower()]
-    else:
-        vis_types = [s.lower() for s in vis_types]
-
-    for s in vis_types:
-        if not isinstance(s, str):
-            raise click.BadParameter(
-                "Visualization type must be a string or a list of strings"
-            )
-        if s not in allowed_visualization_types:
-            raise click.BadParameter(
-                "Visualization type must be one of: {}".format(
-                    ", ".join(allowed_visualization_types)
-                )
-            )
-    return vis_types
-
-
-# CAM factory
-def create_cam(vis_type, model, target_layers, use_cuda):
-    if vis_type == "gradcam":
-        return GradCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "scorecam":
-        return ScoreCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "fullgrad":
-        return FullGrad(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "randomcam":
-        return RandomCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "hirescam":
-        return HiResCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "gradcamelementwise":
-        return GradCAMElementWise(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "gradcam++" or vis_type == "gradcamplusplus":
-        return GradCAMPlusPlus(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "xgradcam":
-        return XGradCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "ablationcam":
-        return AblationCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "eigencam":
-        return EigenCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "eigengradcam":
-        return EigenGradCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    elif vis_type == "layercam":
-        return LayerCAM(
-            model=model, target_layers=target_layers, use_cuda=use_cuda
-        )
-    else:
-        raise ValueError(f"Unsupported visualization type: {vis_type}")
-
-
-def prepare_csv_writers(
-    output_folder, visualization_type, dataset_split, num_classes=14
-):
-    # Create a CSV file to store the performance metrics for each image, for each target class
-
-    csv_files = {}
-    csv_writers = {}
-
-    for target in range(num_classes):
-        directory_path = (
-            f"{output_folder}/{visualization_type}/{rs_maps[target]}"
-        )
-        os.makedirs(directory_path, exist_ok=True)
-        csv_files[rs_maps[target]] = open(
-            f"{directory_path}/{dataset_split}_road_metrics.csv",
-            "w",
-            newline="",
-        )
-        csv_writers[rs_maps[target]] = csv.writer(csv_files[rs_maps[target]])
-        csv_writers[rs_maps[target]].writerow(
-            [
-                "Image",
-                "MoRF",
-                "LeRF",
-                "Combined Score ((LeRF-MoRF) / 2)",
-                "Percentiles",
-            ]
-        )
-
-    directory_path = f"{output_folder}/{visualization_type}/targeted_class"
-    os.makedirs(directory_path, exist_ok=True)
-    csv_files["targeted_class"] = open(
-        f"{directory_path}/{dataset_split}_road_metrics.csv", "w", newline=""
-    )
-    csv_writers["targeted_class"] = csv.writer(csv_files["targeted_class"])
-    csv_writers["targeted_class"].writerow(
-        [
-            "Image",
-            "MoRF",
-            "LeRF",
-            "Combined Score ((LeRF-MoRF) / 2)",
-            "Percentiles",
-        ]
-    )
-
-    return csv_files, csv_writers
-
-
-@click.command(
-    entry_point_group="ptbench.config",
-    cls=ConfigCommand,
-    epilog="""Examples:
-
-\b
-    1. Calculates the ROAD scores for an existing dataset configuration and stores them in .csv files:
-
-       .. code:: sh
-
-          ptbench calculate-road -vv pasa tbx11k-v1-healthy-vs-atb --device="cuda" --weight=path/to/model_final.pth --output-folder=path/to/output_folder
-
-""",
-)
-@click.option(
-    "--model",
-    "-m",
-    help="A lightining module instance implementing the network to be used for inference.",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--datamodule",
-    "-d",
-    help="A lighting data module containing the training, validation and test sets.",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--output-folder",
-    "-o",
-    help="Path where to store the ROAD scores (created if does not exist)",
-    required=True,
-    type=click.Path(
-        file_okay=False,
-        dir_okay=True,
-        writable=True,
-        path_type=pathlib.Path,
-    ),
-    default="visualizations",
-    cls=ResourceOption,
-)
-@click.option(
-    "--batch-size",
-    "-b",
-    help="Number of samples in every batch (this parameter affects memory requirements for the network)",
-    required=True,
-    show_default=True,
-    default=1,
-    type=click.IntRange(min=1),
-    cls=ResourceOption,
-)
-@click.option(
-    "--device",
-    "-x",
-    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
-    show_default=True,
-    required=True,
-    default="cpu",
-    cls=ResourceOption,
-)
-@click.option(
-    "--weight",
-    "-w",
-    help="""Path or URL to pretrained model file (`.ckpt` extension),
-    corresponding to the architecture set with `--model`.""",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--visualization-types",
-    "-vt",
-    help="Visualization techniques to be used. Can be called multiple times with different techniques. Currently supported ones are: "
-    '"GradCAM", "ScoreCAM", "FullGrad", "RandomCAM", "HiResCAM", "GradCAMElementWise", "GradCAMPlusPlus", "XGradCAM", "AblationCAM", '
-    '"EigenCAM", "EigenGradCAM", "LayerCAM"',
-    multiple=True,
-    default=["GradCAM"],
-    cls=ResourceOption,
-)
-@click.option(
-    "--target-class",
-    "-tc",
-    help='(Use only with multi-label models) Which class to target for ROAD calculation. Can be either set to "all" or "highest". "highest" is default, which means only scores for the class with the highest activation will be calculated.',
-    type=str,
-    required=False,
-    default="highest",
-    cls=ResourceOption,
-)
-@click.option(
-    "--tb-positive-only",
-    "-tb",
-    help="If set, ROAD scores will only be calculate for TB positive samples.",
-    is_flag=True,
-    default=False,
-    cls=ResourceOption,
-)
-@verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
-def calculate_road(
-    model,
-    datamodule,
-    output_folder,
-    batch_size,
-    device,
-    weight,
-    visualization_types,
-    target_class,
-    tb_positive_only,
-    **_,
-) -> None:
-    """Creates .csv files with the AOPC_ROAD scores calculated with LeRF, MoRF
-    and their combination.
-
-    Calculates them for each target class and split of the dataset.
-    """
-
-    from ..engine.device import DeviceManager
-    from ..engine.road_calculator import run
-    from .utils import save_sh_command
-
-    save_sh_command(output_folder / "command.sh")
-
-    device_manager = DeviceManager(device)
-    device = device_manager.torch_device()
-    use_cuda = device_manager.device_type == "cuda"
-
-    datamodule.set_chunk_size(batch_size, 1)
-    datamodule.drop_incomplete_batch = False
-    # datamodule.cache_samples = cache_samples
-    # datamodule.parallel = parallel
-    datamodule.model_transforms = model.model_transforms
-
-    datamodule.prepare_data()
-    datamodule.setup(stage="predict")
-
-    dataloaders = datamodule.predict_dataloader()
-
-    logger.info(f"Loading checkpoint from {weight}")
-
-    model = model.load_from_checkpoint(weight, strict=False)
-
-    visualization_types = check_vis_types(visualization_types)
-
-    model_name = model.__class__.__name__
-
-    if model_name == "Pasa":
-        if "fullgrad" in visualization_types:
-            raise ValueError(
-                "Fullgrad visualization is not supported for the Pasa model."
-            )
-        target_layers = [model.fc14]  # Last non-1x1 Conv2d layer
-    else:
-        target_layers = [model.model_ft.features.denseblock4.denselayer16.conv2]
-
-    for vis_type in visualization_types:
-        cam = create_cam(vis_type, model, target_layers, use_cuda)
-
-        for k, v in dataloaders.items():
-            if model_name == "DensenetRS" and target_class.lower() == "all":
-                csv_files, csv_writers = prepare_csv_writers(
-                    output_folder, vis_type, k, num_classes=14
-                )
-            else:
-                csv_files, csv_writers = prepare_csv_writers(
-                    output_folder, vis_type, k, num_classes=0
-                )
-
-            logger.info(f"Calculating ROAD scores for '{k}' set...")
-
-            run(
-                model,
-                data_loader=v,
-                output_folder=output_folder,
-                device=device,
-                cam=cam,
-                csv_writers=csv_writers,
-                target_class=target_class,
-                tb_positive_only=tb_positive_only,
-            )
-
-        for csv_file in csv_files.values():
-            csv_file.close()
diff --git a/src/ptbench/scripts/cli.py b/src/ptbench/scripts/cli.py
index d57a974d5528fe7aec81926c4d6d9e8e88822630..ff3611beb8410a240e38f1a08bceb8301950b8e3 100644
--- a/src/ptbench/scripts/cli.py
+++ b/src/ptbench/scripts/cli.py
@@ -7,16 +7,16 @@ import click
 from clapper.click import AliasedGroup
 
 from . import (
-    calculate_road,
     compare_vis,
     config,
     database,
     evaluate,
-    evaluate_saliencymaps,
     evaluatevis,
     experiment,
     generate_saliencymaps,
     predict,
+    saliency_completeness,
+    saliency_interpretability,
     train,
     train_analysis,
     visualize,
@@ -32,12 +32,12 @@ def cli():
     pass
 
 
-cli.add_command(calculate_road.calculate_road)
 cli.add_command(compare_vis.compare_vis)
 cli.add_command(config.config)
 cli.add_command(database.database)
 cli.add_command(evaluate.evaluate)
-cli.add_command(evaluate_saliencymaps.evaluate_saliencymaps)
+cli.add_command(saliency_completeness.saliency_completeness)
+cli.add_command(saliency_interpretability.saliency_interpretability)
 cli.add_command(evaluatevis.evaluatevis)
 cli.add_command(experiment.experiment)
 cli.add_command(generate_saliencymaps.generate_saliencymaps)
diff --git a/src/ptbench/scripts/generate_saliencymaps.py b/src/ptbench/scripts/generate_saliencymaps.py
index c330f5deff56761679f3d8995370b89aa7d58641..5dce34192f01f27796346cd79f2323de44da2f6c 100644
--- a/src/ptbench/scripts/generate_saliencymaps.py
+++ b/src/ptbench/scripts/generate_saliencymaps.py
@@ -120,7 +120,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 )
 @click.option(
     "--target-class",
-    "-tc",
+    "-C",
     help="""This option should only be used with multiclass models.  It
     defines the class to target for saliency estimation. Can be either set to
     "all" or "highest". "highest" (the default), means only saliency maps for
@@ -135,7 +135,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 )
 @click.option(
     "--positive-only/--no-positive-only",
-    "-p/-P",
+    "-z/-Z",
     help="""If set, and the model chosen has a single output (binary), then
     saliency maps will only be generated for samples of the positive class.
     This option has no effect for multiclass models.""",
@@ -164,14 +164,11 @@ def generate_saliencymaps(
     """
 
     from ..engine.device import DeviceManager
-    from ..engine.saliencymap_generator import run
-    from .utils import save_sh_command
+    from ..engine.saliency.generator import run
 
     logger.info(f"Output folder: {output_folder}")
     output_folder.mkdir(parents=True, exist_ok=True)
 
-    save_sh_command(output_folder / "command.sh")
-
     device_manager = DeviceManager(device)
 
     # batch_size must be == 1 for now (underlying code is NOT prepared to
diff --git a/src/ptbench/scripts/saliency_completeness.py b/src/ptbench/scripts/saliency_completeness.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a3d3a91bb9cf6dec635e75d906470676627d0c
--- /dev/null
+++ b/src/ptbench/scripts/saliency_completeness.py
@@ -0,0 +1,240 @@
+# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import pathlib
+import typing
+
+import click
+
+from clapper.click import ResourceOption, verbosity_option
+from clapper.logging import setup
+
+from ..models.typing import SaliencyMapAlgorithm
+from .click import ConfigCommand
+
+logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
+
+
+@click.command(
+    entry_point_group="ptbench.config",
+    cls=ConfigCommand,
+    epilog="""Examples:
+
+1. Calculates the ROAD scores for an existing dataset configuration and stores them in JSON files:
+
+   .. code:: sh
+
+      ptbench saliency-completeness -vv pasa tbx11k-v1-healthy-vs-atb --device="cuda" --weight=path/to/model_final.ckpt --output-folder=path/to/completeness-scores/
+
+""",
+)
+@click.option(
+    "--model",
+    "-m",
+    help="""A lightining module instance implementing the network architecture
+    (not the weights, necessarily) to be used for inference.  Currently, only
+    supports pasa and densenet models.""",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--datamodule",
+    "-d",
+    help="""A lighting data module that will be asked for prediction data
+    loaders. Typically, this includes all configured splits in a datamodule,
+    however this is not a requirement.  A datamodule that returns a single
+    dataloader for prediction (wrapped in a dictionary) is acceptable.""",
+    required=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--output-folder",
+    "-o",
+    help="Path where to store saliency maps (created if does not exist)",
+    required=True,
+    type=click.Path(
+        exists=False,
+        file_okay=False,
+        dir_okay=True,
+        writable=True,
+        path_type=pathlib.Path,
+    ),
+    default="saliency-maps",
+    cls=ResourceOption,
+)
+@click.option(
+    "--device",
+    "-x",
+    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
+    show_default=True,
+    required=True,
+    default="cpu",
+    cls=ResourceOption,
+)
+@click.option(
+    "--cache-samples/--no-cache-samples",
+    help="If set to True, loads the sample into memory, "
+    "otherwise loads them at runtime.",
+    required=True,
+    show_default=True,
+    default=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--weight",
+    "-w",
+    help="""Path or URL to pretrained model file (`.ckpt` extension),
+    corresponding to the architecture set with `--model`.""",
+    required=True,
+    cls=ResourceOption,
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True),
+)
+@click.option(
+    "--parallel",
+    "-P",
+    help="""Use multiprocessing for data loading: if set to -1 (default),
+    disables multiprocessing data loading.  Set to 0 to enable as many data
+    loading instances as processing cores as available in the system.  Set to
+    >= 1 to enable that many multiprocessing instances for data loading.""",
+    type=click.IntRange(min=-1),
+    show_default=True,
+    required=True,
+    default=-1,
+    cls=ResourceOption,
+)
+@click.option(
+    "--saliency-map-algorithm",
+    "-s",
+    help="""Saliency map algorithm(s) to be used. Can be called multiple times
+    with different techniques.""",
+    type=click.Choice(
+        typing.get_args(SaliencyMapAlgorithm), case_sensitive=False
+    ),
+    multiple=True,
+    default=["gradcam"],
+    show_default=True,
+    cls=ResourceOption,
+)
+@click.option(
+    "--target-class",
+    "-C",
+    help="""This option should only be used with multiclass models.  It
+    defines the class to target for saliency estimation. Can be either set to
+    "all" or "highest". "highest" (the default), means only saliency maps for
+    the class with the highest activation will be generated.""",
+    required=False,
+    type=click.Choice(
+        ["highest", "all"],
+        case_sensitive=False,
+    ),
+    default="highest",
+    cls=ResourceOption,
+)
+@click.option(
+    "--positive-only/--no-positive-only",
+    "-z/-Z",
+    help="""If set, and the model chosen has a single output (binary), then
+    saliency maps will only be generated for samples of the positive class.
+    This option has no effect for multiclass models.""",
+    default=False,
+    cls=ResourceOption,
+)
+@click.option(
+    "--percentile",
+    "-e",
+    help="""One or more percentiles (percent x100) integer values indicating
+    the proportion of pixels to perturb in the original image to calculate both
+    MoRF and LeRF scores.""",
+    multiple=True,
+    default=[20, 40, 60, 80],
+    show_default=True,
+    cls=ResourceOption,
+)
+@verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
+def saliency_completeness(
+    model,
+    datamodule,
+    output_folder,
+    device,
+    cache_samples,
+    weight,
+    parallel,
+    saliency_map_algorithm,
+    target_class,
+    positive_only,
+    percentile,
+    **_,
+) -> None:
+    """Evaluates saliency map algorithm completeness using RemOve And Debias
+    (ROAD).
+
+    For each selected saliency map algorithm, evaluates the completeness of
+    explanations using the RemOve And Debias (ROAD) algorithm. The ROAD
+    algorithm was first described in [ROAD-2022]_. It estimates explainability
+    (in the completeness sense) of saliency mapping algorithms by substituting
+    relevant pixels in the input image with a local average, re-running
+    prediction on the altered image, and measuring changes in the output
+    classification score when said perturbations are in place.  By substituting
+    most or least relevant pixels with surrounding averages, the ROAD algorithm
+    estimates the importance of such elements in the produced saliency map.  As
+    of 2023, this measurement technique is considered to be one of the
+    state-of-the-art metrics of explainability.
+
+    This program outputs a JSON file containing the ROAD evaluations (using
+    most-relevant-first, or MoRF, and least-relevant-first, or LeRF) for each
+    sample in the datamodule, and per saliency-mapping algorithm.  Each
+    saliency-mapping algorithm yields a single JSON file, named after the
+    algorithm, in the ``output-folder``. Values for MoRF and LeRF represent
+    averages obtained by removing the configured percentiles (by default 20,
+    40, 60 and 80%) of the most or least relevant pixels from the image, and
+    averaging the results over all these percentiles.
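+
+    A typical output file, e.g. ``gradcam.json``, looks like this (all values
+    are illustrative only):
+
+    .. code:: python
+
+       {"test": [["sample-1.png", 1, 0, 0.12, 0.48, 0.18], ...]}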
+
+    .. note::
+
+       This application is relatively slow when processing a large datamodule
+       with many (positive) samples.
+    """
+    import json
+
+    from ..engine.device import DeviceManager
+    from ..engine.saliency.completeness import run
+
+    logger.info(f"Output folder: {output_folder}")
+    output_folder.mkdir(parents=True, exist_ok=True)
+
+    device_manager = DeviceManager(device)
+
+    # batch_size must be == 1 for now (underlying code is NOT prepared to
+    # treat multiple samples at once).
+    datamodule.set_chunk_size(1, 1)
+    datamodule.cache_samples = cache_samples
+    datamodule.parallel = parallel
+    datamodule.model_transforms = model.model_transforms
+
+    datamodule.prepare_data()
+    datamodule.setup(stage="predict")
+
+    logger.info(f"Loading checkpoint from `{weight}`...")
+    model = model.load_from_checkpoint(weight, strict=False)
+
+    for algo in saliency_map_algorithm:
+        logger.info(
+            f"Evaluating RemOve And Debias (ROAD) average scores for "
+            f"algorithm `{algo}` with percentiles "
+            f"`{', '.join([str(k) for k in percentile])}`..."
+        )
+        results = run(
+            model=model,
+            datamodule=datamodule,
+            device_manager=device_manager,
+            saliency_map_algorithm=algo,
+            target_class=target_class,
+            positive_only=positive_only,
+            percentiles=percentile,
+        )
+
+        output_json = output_folder / (algo + ".json")
+        with output_json.open("w") as f:
+            logger.info(f"Saving output file to `{str(output_json)}`...")
+            json.dump(results, f, indent=2)
diff --git a/src/ptbench/scripts/evaluate_saliencymaps.py b/src/ptbench/scripts/saliency_interpretability.py
similarity index 84%
rename from src/ptbench/scripts/evaluate_saliencymaps.py
rename to src/ptbench/scripts/saliency_interpretability.py
index 0e9fdc49c624355bb114bd86aa3efcb69e9e1322..9281ed82ee19c3f0effdda69494c5f95d825c705 100644
--- a/src/ptbench/scripts/evaluate_saliencymaps.py
+++ b/src/ptbench/scripts/saliency_interpretability.py
@@ -23,7 +23,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 
    .. code:: sh
 
-      ptbench evaluate-saliencymaps -vv tbx11k-v1-healthy-vs-atb --input-folder=parent_folder/gradcam/ --output-json=parent_folder/gradcam/tbx11k-v1-interp.json
+      ptbench saliency-interpretability -vv tbx11k-v1-healthy-vs-atb --input-folder=parent_folder/gradcam/ --output-json=parent_folder/gradcam/tbx11k-v1-interp.json
 
 """,
 )
@@ -62,11 +62,11 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
         dir_okay=False,
         path_type=pathlib.Path,
     ),
-    default="saliencymap-interpretability.json",
+    default="saliency-interpretability.json",
     cls=ResourceOption,
 )
 @verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
-def evaluate_saliencymaps(
+def saliency_interpretability(
     datamodule,
     input_folder,
     output_json,
@@ -95,12 +95,13 @@ def evaluate_saliencymaps(
       the annotation that most overlaps, over area of (thresholded) saliency
       maps.
    * Proportional Energy: A measure that compares (UNthresholded) saliency maps
-      with annotations originated from "Score-CAM: Score-Weighted Visual
-      Explanations for Convolutional Neural Networks" by Wang et al. (2020),
-      https://arxiv.org/abs/1910.01279.  It estimates how much activation lies
-      within the ground truth boxes compared to the total sum of the activations.
+      with annotations (based on [SCORECAM-2020]_). It estimates how much
+      activation lies within the ground truth boxes compared to the total sum
+      of the activations.
     * Average Saliency Focus: estimates how much of the ground truth bounding
-      boxes area is covered by the activations.
+      boxes area is covered by the activations.  It is similar to the
+      proportional energy measure in the sense that it does not require
+      explicit thresholding.
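+
+    As a numerical illustration of proportional energy (values are made up):
+    if the saliency map activations sum to 10.0 over the whole image, and 4.2
+    of that total falls within the ground-truth bounding boxes, the
+    proportional energy is 4.2/10.0 = 0.42.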
 
     .. important::
 
@@ -121,10 +122,7 @@ def evaluate_saliencymaps(
 
     import json
 
-    from ..engine.saliencymap_evaluator import run
-    from .utils import save_sh_command
-
-    save_sh_command(input_folder / "command.sh")
+    from ..engine.saliency.interpretability import run
 
     datamodule.model_transforms = []
     datamodule.prepare_data()