diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 95cfc9c180f867fd1fd948dedb1c3a48bd10bb15..f4d697f160a59f2d8c149650cc4c02d56a14743a 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -16,7 +16,6 @@ import torchvision.transforms.functional as VF
 import h5py
 
 from ..utils.metric import base_metrics
-from ..utils.plot import precision_recall_f1iso_confintval
 
 import logging
 
@@ -50,7 +49,7 @@ def _posneg(pred, gt, threshold):
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
-def _sample_metrics(pred, gt):
+def _sample_metrics(pred, gt, bins):
     """
     Calculates metrics on one single sample and saves it to disk
 
@@ -64,6 +63,10 @@ def _sample_metrics(pred, gt):
     gt : torch.Tensor
         ground-truth (annotations)
 
+    bins : int
+        number of bins to use for the threshold analysis.  The threshold
+        step size is computed as ``1.0/bins``.
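+        For example, ``bins=100`` evaluates thresholds from ``0.00`` to
+        ``0.99`` in steps of ``0.01`` (the upper bound 1.0 is excluded by
+        :py:func:`numpy.arange`).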
+
 
     Returns
     -------
@@ -82,10 +85,10 @@ def _sample_metrics(pred, gt):
 
     """
 
-    step_size = 0.01
+    step_size = 1.0 / bins
     data = []
 
-    for threshold in numpy.arange(0.0, 1.0, step_size):
+    for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)):
 
         tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(
             pred, gt, threshold
@@ -107,6 +110,7 @@ def _sample_metrics(pred, gt):
 
         data.append(
             [
+                index,
                 threshold,
                 precision,
                 recall,
@@ -120,6 +124,7 @@ def _sample_metrics(pred, gt):
     return pandas.DataFrame(
         data,
         columns=(
+            "index",
             "threshold",
             "precision",
             "recall",
@@ -254,6 +259,7 @@ def run(
     """
 
     # Collect overall metrics
+    bins = 100  # number of thresholds to analyse
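+    # e.g. bins=100 evaluates thresholds 0.00, 0.01, ..., 0.99 (step 0.01)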
     data = {}
 
     for sample in tqdm(dataset):
@@ -268,7 +274,7 @@ def run(
             raise RuntimeError(
                 f"{stem} entry already exists in data. Cannot overwrite."
             )
-        data[stem] = _sample_metrics(pred, gt)
+        data[stem] = _sample_metrics(pred, gt, bins)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -286,8 +292,8 @@ def run(
     df_metrics = pandas.concat(data.values())
 
     # Report and Averages
-    avg_metrics = df_metrics.groupby("threshold").mean()
-    std_metrics = df_metrics.groupby("threshold").std()
+    avg_metrics = df_metrics.groupby("index").mean()
+    std_metrics = df_metrics.groupby("index").std()
 
     # Uncomment below for F1-score calculation based on average precision and
     # metrics instead of F1-scores of individual images. This method is in line
@@ -306,20 +312,24 @@ def run(
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
     maxf1 = avg_metrics["f1_score"].max()
-    optimal_f1_threshold = avg_metrics["f1_score"].idxmax()
+    maxf1_index = avg_metrics["f1_score"].idxmax()
+    maxf1_threshold = avg_metrics["threshold"][maxf1_index]
 
     logger.info(
         f"Maximum F1-score of {maxf1:.5f}, achieved at "
-        f"threshold {optimal_f1_threshold:.2f} (chosen *a posteriori*)"
+        f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)"
     )
 
     if threshold is not None:
 
-        f1_a_priori = avg_metrics["f1_score"][threshold]
+        # get the closest possible threshold we have
+        index = int(round(bins * threshold))
+        index = min(index, bins - 1)  # a threshold of 1.0 maps to the last bin
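+        # e.g. bins=100 and threshold=0.467 -> index 47 (threshold 0.47)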
+        f1_a_priori = avg_metrics["f1_score"][index]
+        actual_threshold = avg_metrics["threshold"][index]
 
         logger.info(
-                f"F1-score of {f1_a_priori:.5f}, at threshold {threshold:.5f} "
-                f"(chosen *a priori*)"
+                f"F1-score of {f1_a_priori:.5f}, at threshold "
+                f"{actual_threshold:.3f} (chosen *a priori*)"
         )
 
     if output_folder is not None:
@@ -335,22 +345,7 @@ def run(
         )
         avg_metrics.to_csv(metrics_path)
 
-        # Plotting
-        np_avg_metrics = avg_metrics.to_numpy().T
-        figure_path = os.path.join(output_folder, "precision-recall.pdf")
-        logger.info(f"Saving overall precision-recall plot at {figure_path}...")
-        fig = precision_recall_f1iso_confintval(
-            [np_avg_metrics[0]],
-            [np_avg_metrics[1]],
-            [np_avg_metrics[7]],
-            [np_avg_metrics[8]],
-            [np_avg_metrics[10]],
-            [np_avg_metrics[11]],
-            ["data"],
-        )
-        fig.savefig(figure_path)
-
-    return optimal_f1_threshold
+    return maxf1_threshold
 
 
 def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
@@ -395,7 +390,7 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
             raise RuntimeError(
                 f"{stem} entry already exists in data. " f"Cannot overwrite."
             )
-        data[stem] = _sample_metrics(pred, gt)
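+        # bins=2 yields thresholds 0.0 and 0.5; only the 0.5 row is kept below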
+        data[stem] = _sample_metrics(pred, gt, 2)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -413,8 +408,8 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
     df_metrics = pandas.concat(data.values())
 
     # Report and Averages
-    avg_metrics = df_metrics.groupby("threshold").mean()
-    std_metrics = df_metrics.groupby("threshold").std()
+    avg_metrics = df_metrics.groupby("index").mean()
+    std_metrics = df_metrics.groupby("index").std()
 
     # Uncomment below for F1-score calculation based on average precision and
     # metrics instead of F1-scores of individual images. This method is in line
@@ -432,9 +427,13 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None):
     avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
 
-    metrics_path = os.path.join(output_folder, "metrics.csv")
+    # with bins=2, the thresholds are 0.0 and 0.5 - the first row (threshold
+    # 0.0) is redundant, so we keep only the row at threshold == 0.5
+    avg_metrics.drop(0, inplace=True)
+
+    metrics_path = os.path.join(output_folder, "metrics-second-annotator.csv")
     logger.info(f"Saving averages over all input images at {metrics_path}...")
     avg_metrics.to_csv(metrics_path)
 
     maxf1 = avg_metrics["f1_score"].max()
-    logger.info(f"Maximum F1-score of {maxf1:.5f} (second annotator)")
+    logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py
index 3b2b0ae998e8dbf05015dda950975e78a042b176..e27b6296901587038184153c750a8e1310cec67b 100644
--- a/bob/ip/binseg/script/compare.py
+++ b/bob/ip/binseg/script/compare.py
@@ -8,12 +8,105 @@ from bob.extension.scripts.click_helper import (
     AliasedGroup,
 )
 
-from ..utils.plot import combined_precision_recall_f1iso_confintval
+import pandas
+
+from ..utils.plot import precision_recall_f1iso
 
 import logging
 logger = logging.getLogger(__name__)
 
 
+def _validate_threshold(t, dataset):
+    """Validates the user threshold selection.  Returns parsed threshold."""
+
+    if t is None:
+        return t
+
+    try:
+        # we try to convert it to float first
+        t = float(t)
+    except ValueError:
+        # it is a bit of text - assert a dataset with that name is available
+        if not isinstance(dataset, dict):
+            raise ValueError(
+                "Threshold should be a floating-point number "
+                "if you provide only a single dataset for evaluation"
+            )
+        if t not in dataset:
+            raise ValueError(
+                f"Text thresholds should match dataset names, "
+                f"but {t} is not available among the datasets provided "
+                f"({', '.join(dataset.keys())})"
+            )
+    else:
+        # the range check must live outside the ``try`` block, otherwise its
+        # ValueError would be swallowed by the text-threshold handler above
+        if not (0.0 <= t <= 1.0):
+            raise ValueError(
+                "Float thresholds must be within range [0.0, 1.0]"
+            )
+
+    return t
+
+
+def _load_and_plot(data, threshold=None):
+    """Plots comparison chart of all evaluated models
+
+    Parameters
+    ----------
+
+    data : dict
+        A dict in which keys are the names of the systems and the values are
+        paths to ``metrics.csv`` style files.
+
+    threshold : :py:class:`float`, :py:class:`str`, Optional
+        A value indicating which threshold to choose for plotting an
+        "F1-score" dot on the various curves.  If set to ``None``, then plot
+        the maximum F1-score of each curve.  If set to a floating-point
+        value, then plot the F1-score obtained at that particular threshold.
+        If set to a string, it should match one of the keys in ``data``: we
+        then first calculate the threshold reaching the maximum F1-score on
+        that particular dataset, and apply it to all other sets.
+
+
+    Returns
+    -------
+
+    figure : matplotlib.figure.Figure
+        A figure, with all systems combined into a single plot.
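+
+
+    Examples
+    --------
+
+    A minimal sketch (the path below is hypothetical)::
+
+        fig = _load_and_plot(
+            {"test": "analysis/test/metrics.csv"},
+            threshold=0.5,
+        )
+        fig.savefig("comparison.pdf")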
+
+    """
+
+    if isinstance(threshold, str):
+        logger.info(f"Calculating threshold from maximum F1-score at "
+                f"'{threshold}' dataset...")
+        metrics_path = data[threshold]
+        df = pandas.read_csv(metrics_path)
+
+        maxf1 = df["f1_score"].max()
+        use_threshold = df["threshold"][df["f1_score"].idxmax()]
+        logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'")
+
+    elif isinstance(threshold, float):
+        use_threshold = threshold
+        logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'")
+
+    names = []
+    dfs = []
+    thresholds = []
+
+    # loads all data
+    for name, metrics_path in data.items():
+
+        logger.info(f"Loading metrics from {metrics_path}...")
+        df = pandas.read_csv(metrics_path)
+
+        if threshold is None:
+            use_threshold = df["threshold"][df["f1_score"].idxmax()]
+            logger.info(f"Dataset '{name}': threshold = {use_threshold:.3f}'")
+
+        names.append(name)
+        dfs.append(df)
+        thresholds.append(use_threshold)
+
+    return precision_recall_f1iso(names, dfs, thresholds, confidence=True)
+
+
 @click.command(
     epilog="""Examples:
 
@@ -36,8 +129,23 @@ logger = logging.getLogger(__name__)
     default="comparison.pdf",
     type=click.Path(),
 )
+@click.option(
+    "--threshold",
+    "-t",
+    help="This number is used to select which F1-score to use for "
+    "representing a system performance.  If not set, we report the maximum "
+    "F1-score in the set, which is equivalent to threshold selection a "
+    "posteriori (biased estimator).  You can either set this value to a "
+    "floating-point number in the range [0.0, 1.0], or to a string, naming "
+    "one of the systems which will be used to calculate the threshold "
+    "leading to the maximum F1-score and then applied to all other sets.",
+    default=None,
+    show_default=False,
+    required=False,
+)
 @verbosity_option()
-def compare(label_path, output, **kwargs):
+def compare(label_path, output, threshold, **kwargs):
     """Compares multiple systems together"""
 
     # hack to get a dictionary from arguments passed to input
@@ -46,6 +154,11 @@ def compare(label_path, output, **kwargs):
                 " composed of name-path entries")
     data = dict(zip(label_path[::2], label_path[1::2]))
 
-    fig = combined_precision_recall_f1iso_confintval(data)
+    threshold = _validate_threshold(threshold, data)
+
+    fig = _load_and_plot(data, threshold=threshold)
     logger.info(f"Saving plot at {output}")
     fig.savefig(output)
+
+    # TODO: print table with all results
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index 5f82ace1ddaaf6e9dcfd462a254a860771013d72..c8448e6d5bfebfd2fc388c245a9854637b4c3b70 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -107,16 +107,6 @@ def _validate_threshold(t, dataset):
     cls=ResourceOption,
     show_default=True,
 )
-@click.option(
-    "--second-annotator-folder",
-    "-O",
-    help="Path where to store the analysis result for second annotator "
-    "comparisons (only used if --second-annotator is also passed)",
-    required=True,
-    default="second-annotator",
-    type=click.Path(),
-    cls=ResourceOption,
-)
 @click.option(
     "--overlayed",
     "-O",
@@ -133,11 +123,9 @@ def _validate_threshold(t, dataset):
 )
 @click.option(
     "--threshold",
-    "-T",
-    help="If you set --overlayed, then you can provide a value to be used as "
-    "threshold to be applied on probability maps and decide for positives and "
-    "negatives.  This binary output will be used to define true and false "
-    "positives, and false negatives for the overlay analysis.  This number "
+    "-t",
+    help="This number is used to define positives and negatives from "
+    "probability maps, and report F1-scores (a priori). It "
     "should either come from the training set or a separate validation set "
     "to avoid biasing the analysis.  Optionally, if you provide a multi-set "
     "dataset as input, this may also be the name of an existing set from "
@@ -155,7 +143,6 @@ def evaluate(
     predictions_folder,
     dataset,
     second_annotator,
-    second_annotator_folder,
     overlayed,
     threshold,
     **kwargs,
@@ -173,7 +160,6 @@ def evaluate(
             "dataset": dataset,
             "output_folder": output_folder,
             "second_annotator": second_annotator,
-            "second_annotator_folder": second_annotator_folder,
         }
     else:
         for k, v in dataset.items():
@@ -184,9 +170,6 @@ def evaluate(
                 "dataset": v,
                 "output_folder": os.path.join(output_folder, k),
                 "second_annotator": second_annotator.get(k),
-                "second_annotator_folder": os.path.join(
-                    second_annotator_folder, k
-                ),
             }
 
     if isinstance(threshold, str):
@@ -209,7 +192,7 @@ def evaluate(
             compare_annotators(
                 v["dataset"],
                 v["second_annotator"],
-                v["second_annotator_folder"],
+                v["output_folder"],
                 os.path.join(overlayed, "second-annotator")
                 if overlayed
                 else None,
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index 074056c1abce92656afcb3186cd0876f4e131f11..5458036828a09fadb76a6df65d43be79c0b7bef9 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -380,14 +380,12 @@ def experiment(
     logger.info(f"Setting --threshold={threshold}...")
 
     analysis_folder = os.path.join(output_folder, "analysis")
-    second_annotator_folder = os.path.join(analysis_folder, "second-annotator")
     ctx.invoke(
         evaluate,
         output_folder=analysis_folder,
         predictions_folder=predictions_folder,
         dataset=dataset,
         second_annotator=second_annotator,
-        second_annotator_folder=second_annotator_folder,
         overlayed=overlayed_folder,
         threshold=threshold,
         verbose=verbose,
@@ -412,8 +410,8 @@ def experiment(
             if k.startswith("_"):
                 logger.info(f"Skipping dataset '{k}' (not to be compared)")
                 continue
-            systems += [f"{k} (2nd. annot.)",
-                    os.path.join(second_annotator_folder, k, "metrics.csv")]
+            systems += [f"{k} (2nd. annot.)", os.path.join(analysis_folder, k,
+                "metrics-second-annotator.csv")]
     output_pdf = os.path.join(output_folder, "comparison.pdf")
     ctx.invoke(compare, label_path=systems, output=output_pdf, verbose=verbose)
 
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index e39fba54f952c0faa98f64b7b8849d8602a411b0..6e2d8be26a42ba0992af601b235afd98c46123e1 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -127,14 +127,13 @@ def _check_experiment_stare(overlay):
 
         # check evaluation outputs
         eval_folder = os.path.join(output_folder, "analysis")
-        second_folder = os.path.join(eval_folder, "second-annotator")
         assert os.path.exists(os.path.join(eval_folder, "train", "metrics.csv"))
         assert os.path.exists(os.path.join(eval_folder, "test", "metrics.csv"))
         assert os.path.exists(
-            os.path.join(second_folder, "train", "metrics.csv")
+            os.path.join(eval_folder, "train", "metrics-second-annotator.csv")
         )
         assert os.path.exists(
-            os.path.join(second_folder, "test", "metrics.csv")
+            os.path.join(eval_folder, "test", "metrics-second-annotator.csv")
         )
 
         overlay_folder = os.path.join(output_folder, "overlayed", "analysis")
@@ -172,8 +171,7 @@ def _check_experiment_stare(overlay):
             r"^Started evaluation$": 1,
             r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 3,
             r"^F1-score of.*\(chosen \*a priori\*\)$": 2,
-            r"^Maximum F1-score of .* \(second annotator\)$": 2,
-            r"^Saving overall precision-recall plot at .*$": 2,
+            r"^F1-score of.*\(second annotator; threshold=0.5\)$": 2,
             r"^Ended evaluation$": 1,
             r"^Started comparison$": 1,
             r"^Loading metrics from": 4,
@@ -341,7 +339,6 @@ def _check_evaluate(runner):
         config.flush()
 
         output_folder = "evaluations"
-        second_folder = "evaluations-2nd"
         overlay_folder = os.path.join("overlayed", "analysis")
         result = runner.invoke(
             evaluate,
@@ -351,13 +348,13 @@ def _check_evaluate(runner):
                 f"--output-folder={output_folder}",
                 "--predictions-folder=predictions",
                 f"--overlayed={overlay_folder}",
-                f"--second-annotator-folder={second_folder}",
             ],
         )
         _assert_exit_0(result)
 
         assert os.path.exists(os.path.join(output_folder, "metrics.csv"))
-        assert os.path.exists(os.path.join(second_folder, "metrics.csv"))
+        assert os.path.exists(os.path.join(output_folder,
+            "metrics-second-annotator.csv"))
 
         # check overlayed images are there (since we requested them)
         basedir = os.path.join(overlay_folder, "stare-images")
@@ -369,7 +366,7 @@ def _check_evaluate(runner):
             r"^Saving averages over all input images.*$": 2,
             r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 1,
             r"^F1-score of.*\(chosen \*a priori\*\)$": 1,
-            r"^Maximum F1-score of .* \(second annotator\)$": 1,
+            r"^F1-score of.*\(second annotator; threshold=0.5\)$": 1,
         }
         buf.seek(0)
         logging_output = buf.read()
@@ -393,7 +390,6 @@ def _check_compare(runner):
     with stdout_logging() as buf:
 
         output_folder = "evaluations"
-        second_folder = "evaluations-2nd"
         result = runner.invoke(
             compare,
             [
@@ -402,7 +398,7 @@ def _check_compare(runner):
                 "test",
                 os.path.join(output_folder, "metrics.csv"),
                 "test (2nd. human)",
-                os.path.join(second_folder, "metrics.csv"),
+                os.path.join(output_folder, "metrics-second-annotator.csv"),
             ],
         )
         _assert_exit_0(result)
diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py
index 9f897d5b0e70dd60e76185ed196ea2e34dd3425b..0bbea34ae37e035807bddfe604714b05a18445e8 100644
--- a/bob/ip/binseg/utils/plot.py
+++ b/bob/ip/binseg/utils/plot.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import contextlib
 from itertools import cycle
 
 import numpy
@@ -15,72 +16,49 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-def precision_recall_f1iso(precision, recall, names):
-    """Creates a precision-recall plot of the given data.
+@contextlib.contextmanager
+def _precision_recall_canvas(title=None):
+    """Generates a canvas to draw precision-recall curves
+
+    Works like a context manager, yielding a figure and an axes set into
+    which the precision-recall curves should be drawn.  The figure already
+    contains F1-ISO lines and is preset to a 0-1 square region.  Once the
+    context is exited, ``plt.tight_layout()`` is called.
 
-    The plot will be annotated with F1-score iso-lines (in which the F1-score
-    maintains the same value)
 
     Parameters
     ----------
 
-    precision : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the Y coordinates of the plot, or the
-        precision, or a 2D np array in which the rows correspond to each of the
-        system's precision coordinates.
-
-    recall : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the X coordinates of the plot, or the
-        recall, or a 2D np array in which the rows correspond to each of the
-        system's recall coordinates.
+    title : :py:class:`str`, Optional
+        Optional title to add to this plot
 
-    names : :py:class:`list`
-        An iterable over the names of each of the systems along the rows of
-        ``precision`` and ``recall``
 
-
-    Returns
-    -------
+    Yields
+    ------
 
     figure : matplotlib.figure.Figure
-        A matplotlib figure you can save or display
+        The figure that should be finally returned to the user
+
+    axes : matplotlib.axes.Axes
+        An axes set to which the precision-recall plots should be added
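+
+    Examples
+    --------
+
+    Sketch of the intended use (``recall`` and ``precision`` stand for 1D
+    arrays of matching length)::
+
+        with _precision_recall_canvas(title="my system") as (fig, axes):
+            axes.plot(recall, precision)
+        fig.savefig("plot.pdf")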
 
     """
 
-    fig, ax1 = plt.subplots(1)
-    lines = ["-", "--", "-.", ":"]
-    linecycler = cycle(lines)
-    for p, r, n in zip(precision, recall, names):
-        # Plots only from the point where recall reaches its maximum, otherwise, we
-        # don't see a curve...
-        i = r.argmax()
-        pi = p[i:]
-        ri = r[i:]
-        valid = (pi + ri) > 0
-        f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid])
-        # optimal point along the curve
-        argmax = f1.argmax()
-        opi = pi[argmax]
-        ori = ri[argmax]
-        # Plot Recall/Precision as threshold changes
-        ax1.plot(
-            ri[pi > 0],
-            pi[pi > 0],
-            next(linecycler),
-            label="[F={:.4f}] {}".format(f1.max(), n),
-        )
-        ax1.plot(
-            ori, opi, marker="o", linestyle=None, markersize=3, color="black"
-        )
-    ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
-    if len(names) > 1:
-        plt.legend(loc="lower left", framealpha=0.5)
-    ax1.set_xlabel("Recall")
-    ax1.set_ylabel("Precision")
-    ax1.set_xlim([0.0, 1.0])
-    ax1.set_ylim([0.0, 1.0])
+    fig, axes1 = plt.subplots(1)
+
+    # Names and bounds
+    axes1.set_xlabel("Recall")
+    axes1.set_ylabel("Precision")
+    axes1.set_xlim([0.0, 1.0])
+    axes1.set_ylim([0.0, 1.0])
+
+    if title is not None:
+        axes1.set_title(title)
+
+    axes1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
+    axes2 = axes1.twinx()
+
     # Annotates plot with F1-score iso-lines
-    ax2 = ax1.twinx()
     f_scores = numpy.linspace(0.1, 0.9, num=9)
     tick_locs = []
     tick_labels = []
@@ -90,197 +68,168 @@ def precision_recall_f1iso(precision, recall, names):
         (l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
         tick_locs.append(y[-1])
         tick_labels.append("%.1f" % f_score)
-    ax2.tick_params(axis="y", which="both", pad=0, right=False, left=False)
-    ax2.set_ylabel("iso-F", color="green", alpha=0.3)
-    ax2.set_ylim([0.0, 1.0])
-    ax2.yaxis.set_label_coords(1.015, 0.97)
-    ax2.set_yticks(tick_locs)  # notice these are invisible
-    for k in ax2.set_yticklabels(tick_labels):
+    axes2.tick_params(axis="y", which="both", pad=0, right=False, left=False)
+    axes2.set_ylabel("iso-F", color="green", alpha=0.3)
+    axes2.set_ylim([0.0, 1.0])
+    axes2.yaxis.set_label_coords(1.015, 0.97)
+    axes2.set_yticks(tick_locs)  # notice these are invisible
+    for k in axes2.set_yticklabels(tick_labels):
         k.set_color("green")
         k.set_alpha(0.3)
         k.set_size(8)
+
     # we should see some of axes 1 axes
-    ax1.spines["right"].set_visible(False)
-    ax1.spines["top"].set_visible(False)
-    ax1.spines["left"].set_position(("data", -0.015))
-    ax1.spines["bottom"].set_position(("data", -0.015))
+    axes1.spines["right"].set_visible(False)
+    axes1.spines["top"].set_visible(False)
+    axes1.spines["left"].set_position(("data", -0.015))
+    axes1.spines["bottom"].set_position(("data", -0.015))
+
     # we shouldn't see any of axes 2 axes
-    ax2.spines["right"].set_visible(False)
-    ax2.spines["top"].set_visible(False)
-    ax2.spines["left"].set_visible(False)
-    ax2.spines["bottom"].set_visible(False)
+    axes2.spines["right"].set_visible(False)
+    axes2.spines["top"].set_visible(False)
+    axes2.spines["left"].set_visible(False)
+    axes2.spines["bottom"].set_visible(False)
+
+    # yield execution, letting the user draw precision-recall plots (and the
+    # legend) before tightening the layout
+    yield fig, axes1
+
     plt.tight_layout()
-    return fig
 
 
-def precision_recall_f1iso_confintval(
-    precision, recall, pr_upper, pr_lower, re_upper, re_lower, names
-):
-    """Creates a precision-recall plot of the given data, with confidence
-    intervals
+def precision_recall_f1iso(label, df, threshold, confidence=True):
+    """Creates a precision-recall plot with confidence intervals
+
+    This function creates and returns a Matplotlib figure with a
+    precision-recall plot containing shaded confidence intervals.  The plot
+    will be annotated with F1-score iso-lines (in which the F1-score maintains
+    the same value).
 
-    The plot will be annotated with F1-score iso-lines (in which the F1-score
-    maintains the same value)
 
     Parameters
     ----------
 
-    precision : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the Y coordinates of the plot, or the
-        precision, or a 2D array in which the rows correspond to each
-        of the system's precision coordinates.
-
-    recall : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the X coordinates of the plot, or
-        the recall, or a 2D array in which the rows correspond to each
-        of the system's recall coordinates.
-
-    pr_upper : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the upper bound of the confidence
-        interval for the Y coordinates of the plot, or the precision upper
-        bound, or a 2D array in which the rows correspond to each of the
-        system's precision upper-bound coordinates.
-
-    pr_lower : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the lower bound of the confidence
-        interval for the Y coordinates of the plot, or the precision lower
-        bound, or a 2D array in which the rows correspond to each of the
-        system's precision lower-bound coordinates.
-
-    re_upper : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the upper bound of the confidence
-        interval for the Y coordinates of the plot, or the recall upper bound,
-        or a 2D array in which the rows correspond to each of the system's
-        recall upper-bound coordinates.
-
-    re_lower : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D arrays containing the lower bound of the confidence
-        interval for the Y coordinates of the plot, or the recall lower bound,
-        or a 2D array in which the rows correspond to each of the system's
-        recall lower-bound coordinates.
-
-    names : :py:class:`list`
-        An iterable over the names of each of the systems along the rows of
-        ``precision`` and ``recall``
+    label : :py:class:`list`
+        A list of names, one per system, to be associated with each line
+
+    df : :py:class:`list` of :py:class:`pandas.DataFrame`
+        A list of dataframes, one per system, as produced by our evaluator
+        engine.  Each dataframe is indexed by an integer bin number and
+        contains the following columns: ``threshold`` (sorted ascending),
+        ``precision``, ``recall``, ``f1_score``, ``pr_upper`` (upper
+        precision bound), ``pr_lower`` (lower precision bound), ``re_upper``
+        (upper recall bound), ``re_lower`` (lower recall bound).
+
+        Dataframes with a single entry are treated specially, as these are
+        considered "second-annotator" performances.  A single dot and a line
+        showing the variability are drawn in these cases.
+
+    threshold : :py:class:`list`
+        A list of thresholds, one per system, marking the spot on each curve
+        where a dot is drawn.  Specific threshold values do not affect
+        "second-annotator" dataframes.
+
+    confidence : :py:class:`bool`, Optional
+        If set, draw confidence intervals for each line, using ``*_upper`` and
+        ``*_lower`` entries.
 
 
     Returns
     -------
+
     figure : matplotlib.figure.Figure
-        A matplotlib figure you can save or display
+        A matplotlib figure you can save or display (uses an ``agg`` backend)
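+
+
+    Examples
+    --------
+
+    A minimal sketch, assuming a ``metrics.csv`` file produced by our
+    evaluator engine (the path is hypothetical)::
+
+        import pandas
+        df = pandas.read_csv("analysis/test/metrics.csv")
+        fig = precision_recall_f1iso(["test"], [df], [0.5])
+        fig.savefig("precision-recall.pdf")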
 
     """
 
-    fig, ax1 = plt.subplots(1)
     lines = ["-", "--", "-.", ":"]
     colors = [
-        "#1f77b4",
-        "#ff7f0e",
-        "#2ca02c",
-        "#d62728",
-        "#9467bd",
-        "#8c564b",
-        "#e377c2",
-        "#7f7f7f",
-        "#bcbd22",
-        "#17becf",
-    ]
+            "#1f77b4",
+            "#ff7f0e",
+            "#2ca02c",
+            "#d62728",
+            "#9467bd",
+            "#8c564b",
+            "#e377c2",
+            "#7f7f7f",
+            "#bcbd22",
+            "#17becf",
+            ]
     colorcycler = cycle(colors)
     linecycler = cycle(lines)
-    for p, r, pu, pl, ru, rl, n in zip(
-        precision, recall, pr_upper, pr_lower, re_upper, re_lower, names
-    ):
-        # Plots only from the point where recall reaches its maximum, otherwise, we
-        # don't see a curve...
-        i = r.argmax()
-        pi = p[i:]
-        ri = r[i:]
-        pui = pu[i:]
-        pli = pl[i:]
-        rui = ru[i:]
-        rli = rl[i:]
-        valid = (pi + ri) > 0
-        f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid])
-        # optimal point along the curve
-        argmax = f1.argmax()
-        opi = pi[argmax]
-        ori = ri[argmax]
-        # Plot Recall/Precision as threshold changes
-        ax1.plot(
-            ri[pi > 0],
-            pi[pi > 0],
-            next(linecycler),
-            label="[F={:.4f}] {}".format(f1.max(), n),
-        )
-        ax1.plot(
-            ori, opi, marker="o", linestyle=None, markersize=3, color="black"
-        )
-        # Plot confidence
-        # Upper bound
-        # ax1.plot(r95ui[p95ui>0], p95ui[p95ui>0])
-        # Lower bound
-        # ax1.plot(r95li[p95li>0], p95li[p95li>0])
-        # create the limiting polygon
-        vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1]))
-        vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1]))
-        # hacky workaround to plot 2nd human
-        if numpy.isclose(numpy.mean(rui), rui[1], rtol=1e-05):
-            logger.warning("Found 2nd human annotator in metrics - patching...")
-            p = plt.Polygon(
-                numpy.column_stack((vert_x, vert_y)),
-                facecolor="none",
-                alpha=0.2,
-                edgecolor=next(colorcycler),
-                lw=2,
-            )
-        else:
-            p = plt.Polygon(
-                numpy.column_stack((vert_x, vert_y)),
-                facecolor=next(colorcycler),
-                alpha=0.2,
-                edgecolor="none",
-                lw=0.2,
-            )
-        ax1.add_artist(p)
 
-    ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
-    if len(names) > 1:
-        plt.legend(loc="lower left", framealpha=0.5)
-    ax1.set_xlabel("Recall")
-    ax1.set_ylabel("Precision")
-    ax1.set_xlim([0.0, 1.0])
-    ax1.set_ylim([0.0, 1.0])
-    # Annotates plot with F1-score iso-lines
-    ax2 = ax1.twinx()
-    f_scores = numpy.linspace(0.1, 0.9, num=9)
-    tick_locs = []
-    tick_labels = []
-    for f_score in f_scores:
-        x = numpy.linspace(0.01, 1)
-        y = f_score * x / (2 * x - f_score)
-        (l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
-        tick_locs.append(y[-1])
-        tick_labels.append("%.1f" % f_score)
-    ax2.tick_params(axis="y", which="both", pad=0, right=False, left=False)
-    ax2.set_ylabel("iso-F", color="green", alpha=0.3)
-    ax2.set_ylim([0.0, 1.0])
-    ax2.yaxis.set_label_coords(1.015, 0.97)
-    ax2.set_yticks(tick_locs)  # notice these are invisible
-    for k in ax2.set_yticklabels(tick_labels):
-        k.set_color("green")
-        k.set_alpha(0.3)
-        k.set_size(8)
-    # we should see some of axes 1 axes
-    ax1.spines["right"].set_visible(False)
-    ax1.spines["top"].set_visible(False)
-    ax1.spines["left"].set_position(("data", -0.015))
-    ax1.spines["bottom"].set_position(("data", -0.015))
-    # we shouldn't see any of axes 2 axes
-    ax2.spines["right"].set_visible(False)
-    ax2.spines["top"].set_visible(False)
-    ax2.spines["left"].set_visible(False)
-    ax2.spines["bottom"].set_visible(False)
-    plt.tight_layout()
+    with _precision_recall_canvas(title=None) as (fig, axes):
+
+        legend = []
+
+        for kn, kdf, kt in zip(label, df, threshold):
+
+            # plots only from the point where recall reaches its maximum,
+            # otherwise, we don't see a curve...
+            max_recall = kdf["recall"].idxmax()
+            pi = kdf.precision[max_recall:]
+            ri = kdf.recall[max_recall:]
+
+            # locates the row closest to the requested threshold
+            bins = len(kdf)
+            index = int(round(bins * kt))
+            index = min(index, bins - 1)  # avoids out-of-range indexing
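+            # e.g. a 100-bin dataframe and kt=0.5 -> row 50 (threshold 0.50)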
+
+            # plots Recall/Precision as threshold changes; note: we must not
+            # rebind ``label`` (the parameter) - it is used after the loop
+            legend_text = f"{kn} (F1={kdf.f1_score[index]:.4f})"
+            color = next(colorcycler)
+
+            if len(kdf) == 1:
+                # single-point dataframe: plot a star marker at the only
+                # available threshold
+                marker, = axes.plot(kdf.recall[index], kdf.precision[index],
+                        marker="*", markersize=6, color=color, alpha=0.8,
+                        linestyle="None")
+                line, = axes.plot(kdf.recall[index], kdf.precision[index],
+                        linestyle="None", color=color, alpha=0.2)
+                legend.append(([marker, line], legend_text))
+            else:
+                # line first, so marker gets on top
+                style = next(linecycler)
+                line, = axes.plot(ri[pi > 0], pi[pi > 0], color=color,
+                        linestyle=style)
+                marker, = axes.plot(kdf.recall[index], kdf.precision[index],
+                        marker="o", linestyle=style, markersize=4,
+                        color=color, alpha=0.8)
+                legend.append(([marker, line], legend_text))
+
+            if confidence:
+
+                pui = kdf.pr_upper[max_recall:]
+                pli = kdf.pr_lower[max_recall:]
+                rui = kdf.re_upper[max_recall:]
+                rli = kdf.re_lower[max_recall:]
+
+                # plots the confidence margins as a limiting polygon: the
+                # upper-bound curve, followed by the reversed lower-bound curve
+                vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1]))
+                vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1]))
+
+                # hacky workaround to plot the 2nd human annotator
+                if len(kdf) == 1:  # single-point (second-annotator) entry
+                    logger.warning("Found 2nd human annotator - patching...")
+                    p, = axes.plot(vert_x, vert_y, color=color, alpha=0.1, lw=3)
+                else:
+                    p = plt.Polygon(
+                        numpy.column_stack((vert_x, vert_y)),
+                        facecolor=color,
+                        alpha=0.2,
+                        edgecolor="none",
+                        lw=0.2,
+                    )
+                legend[-1][0].append(p)
+                axes.add_artist(p)
+
+        if len(label) > 1:
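+            # tuples of artists are combined into a single legend entry by
+            # matplotlib's default legend handler for tuples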
+            axes.legend([tuple(k[0]) for k in legend], [k[1] for k in legend],
+                    loc="lower left", fancybox=True, framealpha=0.7)
+
     return fig
 
 
@@ -311,48 +260,3 @@ def loss_curve(df):
     plt.tight_layout()
     fig = ax1.get_figure()
     return fig
-
-
-def combined_precision_recall_f1iso_confintval(data):
-    """Plots comparison chart of all evaluated models
-
-    Parameters
-    ----------
-
-    data : dict
-        A dict in which keys are the names of the systems and the values are
-        paths to ``metrics.csv`` style files.
-
-
-    Returns
-    -------
-
-    figure : matplotlib.figure.Figure
-        A figure, with all systems combined into a single plot.
-
-    """
-
-    precisions = []
-    recalls = []
-    pr_ups = []
-    pr_lows = []
-    re_ups = []
-    re_lows = []
-    names = []
-
-    for name, metrics_path in data.items():
-        logger.info(f"Loading metrics from {metrics_path}...")
-        df = pandas.read_csv(metrics_path)
-        precisions.append(df.precision.to_numpy())
-        recalls.append(df.recall.to_numpy())
-        pr_ups.append(df.pr_upper.to_numpy())
-        pr_lows.append(df.pr_lower.to_numpy())
-        re_ups.append(df.re_upper.to_numpy())
-        re_lows.append(df.re_lower.to_numpy())
-        names.append(name)
-
-    fig = precision_recall_f1iso_confintval(
-        precisions, recalls, pr_ups, pr_lows, re_ups, re_lows, names
-    )
-
-    return fig
diff --git a/doc/evaluation.rst b/doc/evaluation.rst
index 3646969173ddde58f56b87d2f08c718f5c34477c..0368ad6ddd23ab5c231a0e3c29fe8f88f9c0e51b 100644
--- a/doc/evaluation.rst
+++ b/doc/evaluation.rst
@@ -65,7 +65,7 @@ Evaluation
 ----------
 
 In evaluation, we input an **annotated** dataset and predictions to generate
-performance figures that can help analysis of a trained model.  Evaluation is
+performance summaries that help the analysis of a trained model.  Evaluation is
 done using the :ref:`evaluate command <bob.ip.binseg.cli.evaluate>` followed
 by the model and the annotated dataset configuration, and the path to the
 pretrained weights via the ``--weight`` argument.