diff --git a/doc/references.rst b/doc/references.rst
index c2712bc6c6c578079422f916f372a8801a7afdad..e40d47e710b5c023da1abc451dbe4b8088566202 100644
--- a/doc/references.rst
+++ b/doc/references.rst
@@ -62,10 +62,15 @@
    Recognition, pages 2646–2655.
 
 .. [TBX11K-SIMPLIFIED-2020] *Liu, Y., Wu, Y.-H., Ban, Y., Wang, H., and Cheng, M.-*,
-   **Rethinking computer-aided tuberculosis diagnosis.**,
+   **Rethinking computer-aided tuberculosis diagnosis**,
    In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern
    Recognition, pages 2646–2655.
 
+.. [GRADCAM-2015] *B. Zhou, A. Khosla, A. Lapedriza, A. Oliva, and A.
+   Torralba*, **Learning Deep Features for Discriminative Localization**, In
+   2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). doi:
+   https://doi.org/10.1109/CVPR.2016.319.
+
 .. [SCORECAM-2020] *H. Wang et al.*, **Score-CAM: Score-Weighted Visual
    Explanations for Convolutional Neural Networks** 2020 IEEE/CVF Conference on
    Computer Vision and Pattern Recognition Workshops (CVPRW), Seattle, WA, USA,
diff --git a/src/ptbench/engine/saliency/completeness.py b/src/ptbench/engine/saliency/completeness.py
index f26e577f355216e18513258860aa5691caff0773..2d42f21a7fbe84cc60faddf6a4923db8f7218292 100644
--- a/src/ptbench/engine/saliency/completeness.py
+++ b/src/ptbench/engine/saliency/completeness.py
@@ -71,8 +71,8 @@ def _calculate_road_scores(
     Returns
     -------
         A 3-tuple containing floating point numbers representing the
-        most-relevant-first score (``morf``), least-relevant-first score
-        (``lerf``) and the value (``(lerf-morf)/2``).
+        most-relevant-first average score (``morf``), least-relevant-first
+        average score (``lerf``) and the combined value (``(lerf-morf)/2``).
     """
     saliency_map = saliency_map_callable(
         input_tensor=images, targets=[ClassifierOutputTarget(output_num)]
@@ -245,10 +245,11 @@ def run(
         * The model output number used for the ROAD analysis (0, for binary
           classifers as there is typically only one output).
         * ``morf``: ROAD most-relevant-first average of percentiles 20, 40, 60 and
-          80.
+          80 (a.k.a. AOPC-MoRF).
         * ``lerf``: ROAD least-relevant-first average of percentiles 20, 40, 60 and
-          80.
-        * combined: ROAD combined score by evaluating ``(lerf-morf)/2``.
+          80 (a.k.a. AOPC-LeRF).
+        * ``combined``: Average ROAD combined score, evaluated as
+          ``(lerf-morf)/2`` (a.k.a. AOPC-Combined).
     """
 
     from ...models.densenet import Densenet
diff --git a/src/ptbench/scripts/evaluate.py b/src/ptbench/scripts/evaluate.py
index 2e4fe626fbba296b030c82effc53390924096dbc..72f6b3061feaa012788ad10bd2fc12b18837c74d 100644
--- a/src/ptbench/scripts/evaluate.py
+++ b/src/ptbench/scripts/evaluate.py
@@ -49,7 +49,7 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
     "--output-folder",
     "-o",
     help="Path where to store the analysis result (created if does not exist)",
-    required=True,
+    required=False,
     default="results",
     type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
     cls=ResourceOption,
diff --git a/src/ptbench/scripts/saliency_interpretability.py b/src/ptbench/scripts/saliency_interpretability.py
index 9281ed82ee19c3f0effdda69494c5f95d825c705..4d7599a87200075948e759341e3178b328670e8f 100644
--- a/src/ptbench/scripts/saliency_interpretability.py
+++ b/src/ptbench/scripts/saliency_interpretability.py
@@ -40,7 +40,8 @@ logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
 @click.option(
     "--input-folder",
     "-i",
-    help="""Path where to load saliency maps from.""",
+    help="""Path where to load saliency maps from.  You can generate saliency
+    maps with ``ptbench generate-saliencymaps``.""",
     required=True,
     type=click.Path(
         exists=True,
@@ -106,10 +107,9 @@ def saliency_interpretability(
     .. important::
 
        The thresholding algorithm used to evaluate IoU and IoDA measures is
-       based on the process done by the original CAM paper: "Learning Deep
-       Features for Discriminative Localization" by Zhou et al. (2015),
-       https://arxiv.org/abs/1512.04150.  It keeps all points from the saliency
-       map that are above the 20% of its maximum value.
+       based on the process used in the original CAM paper [GRADCAM-2015]_.  It
+       keeps all points of the saliency map whose value is above 20% of the
+       map's maximum value.
 
        It then calculates a **single** bounding box for largest connected
        component.  This bounding box represents detected elements on the