diff --git a/bob/ip/binseg/engine/inferencer.py b/bob/ip/binseg/engine/inferencer.py
index ccff70d4019c6856334e982fb861d315c2d60196..26de6cf733e301f2ec60a76e275822ec85491be3 100644
--- a/bob/ip/binseg/engine/inferencer.py
+++ b/bob/ip/binseg/engine/inferencer.py
@@ -194,8 +194,11 @@ def do_inference(
     avg_metrics = df_metrics.groupby('threshold').mean()
     std_metrics = df_metrics.groupby('threshold').std()
 
-    avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-        (avg_metrics["precision"]+avg_metrics["recall"])
+    # Uncomment below for F1-score calculation based on average precision and
+    # recall instead of the F1-scores of individual images. This method is in line with Maninis et al. (2016)
+    #avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
+    #    (avg_metrics["precision"]+avg_metrics["recall"])
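+    # (by default, f1_score thus remains the average of the per-image F1-scores)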
     
     avg_metrics["std_f1"] = std_metrics["f1_score"]
     
diff --git a/bob/ip/binseg/script/binseg.py b/bob/ip/binseg/script/binseg.py
index 780d85a30f13078af47e9e3ab79aa451b14f108e..44166070d13eeea92d77b6b54c4e1b49e62241c5 100644
--- a/bob/ip/binseg/script/binseg.py
+++ b/bob/ip/binseg/script/binseg.py
@@ -337,28 +337,6 @@ def compare(output_path_list, output_path, title, **kwargs):
     fig.savefig(fig_filename)
 
 
-# Plot overviews
-@binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
-@click.option(
-    '--output-path',
-    '-o',
-    required=True,
-    )
-@verbosity_option(cls=ResourceOption)
-def pdfoverview(output_path, **kwargs):
-    """ Creates an overview pdf with all precision vs recall curves present in the output directory.
-    Requires pdflatex to be available on the host."""
-    # PR curves
-    pr_filename = "precision_recall_comparison.pdf"
-    pr_filenames = get_paths(output_path,pr_filename)
-    create_pdf(output_path, pr_filenames, title='Precision vs Recall', tex_filename='pr_overview.tex')
-    
-    # Training curves
-    trainlog_filename = "*trainlog.pdf"
-    tl_file_names = get_paths(output_path,trainlog_filename)
-    create_pdf(output_path, tl_file_names, title='Training', tex_filename='training_overview.tex')
-
-
 # Create grid table with results
 @binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
 @click.option(
@@ -552,7 +530,7 @@ def ssltrain(model
             , rampup
             )
 
-# Apple image transforms to a fodler
+# Apply image transforms to a folder containing images
 @binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
 @click.option(
     '--source-path',
@@ -579,7 +557,7 @@ def transformfolder(source_path ,target_path,transforms,**kwargs):
     transfld(source_path,target_path,transforms)
 
 
-# Eval only 
+# Evaluate only. Runs evaluation on predicted probability maps (--prediction-folder)
 @binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
 @click.option(
     '--output-path',
@@ -591,6 +569,7 @@ def transformfolder(source_path ,target_path,transforms,**kwargs):
 @click.option(
     '--prediction-folder',
     '-p',
+    help='Path containing output probability maps',
     required=True,
     cls=ResourceOption
     )
@@ -600,12 +579,25 @@ def transformfolder(source_path ,target_path,transforms,**kwargs):
     required=True,
     cls=ResourceOption
     )
+@click.option(
+    '--title',
+    required=False,
+    help='Plot title',
+    cls=ResourceOption
+    )
+@click.option(
+    '--legend',
+    help='Legend entry for the plotted curve',
+    cls=ResourceOption
+    )
 
 @verbosity_option(cls=ResourceOption)
 def evalpred(
         output_path
         ,prediction_folder
         ,dataset
+        ,title
+        ,legend
         , **kwargs):
     """ Run inference and evalaute the model performance """
 
@@ -617,5 +609,5 @@ def evalpred(
         ,pin_memory = torch.cuda.is_available()
         )
     
-    # checkpointer, load last model in dir
-    do_eval(prediction_folder, data_loader, output_folder = output_path)
\ No newline at end of file
+    # Run eval
+    do_eval(prediction_folder, data_loader, output_folder=output_path, title=title, legend=legend)
\ No newline at end of file
diff --git a/bob/ip/binseg/utils/evaluate.py b/bob/ip/binseg/utils/evaluate.py
index 620b62db87a0e001151e0a00eeb17fe9829886a7..68257128d64ca899efbd98d2aa8cc3888d932755 100644
--- a/bob/ip/binseg/utils/evaluate.py
+++ b/bob/ip/binseg/utils/evaluate.py
@@ -13,7 +13,7 @@ import torchvision.transforms.functional as VF
 from tqdm import tqdm
 
 from bob.ip.binseg.utils.metric import SmoothedValue, base_metrics
-from bob.ip.binseg.utils.plot import precision_recall_f1iso
+from bob.ip.binseg.utils.plot import precision_recall_f1iso, precision_recall_f1iso_confintval
 from bob.ip.binseg.utils.summary import summary
 from PIL import Image
 from torchvision.transforms.functional import to_tensor
@@ -95,7 +95,9 @@ def batch_metrics(predictions, ground_truths, names, output_folder, logger):
 def do_eval(
     prediction_folder,
     data_loader,
-    output_folder = None
+    output_folder = None,
+    title='2nd human',
+    legend='2nd human'
 ):
 
     """
@@ -119,7 +121,7 @@ def do_eval(
     
     # Collect overall metrics 
     metrics = []
-
+    num_images = len(data_loader)
     for samples in tqdm(data_loader):
         names = samples[0]
         images = samples[1]
@@ -156,9 +158,19 @@ def do_eval(
     avg_metrics = df_metrics.groupby('threshold').mean()
     std_metrics = df_metrics.groupby('threshold').std()
 
-    avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-        (avg_metrics["precision"]+avg_metrics["recall"])
+    # Uncomment below for F1-score calculation based on average precision and
+    # recall instead of the F1-scores of individual images. This method is in line with Maninis et al. (2016)
+    #avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
+    #    (avg_metrics["precision"]+avg_metrics["recall"])
+    
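+    # confidence bands: mean curve plus/minus one standard deviation across test images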
     
+    avg_metrics["std_pr"] = std_metrics["precision"]
+    avg_metrics["pr_upper"] = avg_metrics['precision'] + avg_metrics["std_pr"]
+    avg_metrics["pr_lower"] = avg_metrics['precision'] - avg_metrics["std_pr"]
+    avg_metrics["std_re"] = std_metrics["recall"]
+    avg_metrics["re_upper"] = avg_metrics['recall'] + avg_metrics["std_re"]
+    avg_metrics["re_lower"] = avg_metrics['recall'] - avg_metrics["std_re"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
     
     avg_metrics.to_csv(metrics_path)
@@ -168,10 +180,11 @@ def do_eval(
     logger.info("Highest F1-score of {:.5f}, achieved at threshold {}".format(maxf1, optimal_f1_threshold))
     
     # Plotting
     np_avg_metrics = avg_metrics.to_numpy().T
     fig_name = "precision_recall.pdf"
     logger.info("saving {}".format(fig_name))
-    fig = precision_recall_f1iso([np_avg_metrics[0]],[np_avg_metrics[1]], ['2nd Human',None], title='2nd Human')
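+    # columns 7/8 hold the precision upper/lower bounds, 10/11 the recall upper/lower bounds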
+    fig = precision_recall_f1iso_confintval([np_avg_metrics[0]], [np_avg_metrics[1]], [np_avg_metrics[7]], [np_avg_metrics[8]], [np_avg_metrics[10]], [np_avg_metrics[11]], [legend, None], title=title)
     fig_filename = os.path.join(results_subfolder, fig_name)
     fig.savefig(fig_filename)
 
diff --git a/bob/ip/binseg/utils/metric.py b/bob/ip/binseg/utils/metric.py
index d1f5ec63fd1b1d7ff3818a976af98562e676f85b..bcb91511f533a8ed69843487ef8cbe793e42a925 100644
--- a/bob/ip/binseg/utils/metric.py
+++ b/bob/ip/binseg/utils/metric.py
@@ -60,5 +60,5 @@ def base_metrics(tp, fp, tn, fn):
     accuracy = (tp + tn) / (tp+fp+fn+tn)
     jaccard = tp / (tp+fp+fn + ( (tp+fp+fn) == 0) )
     f1_score = (2.0 * tp ) / (2.0 * tp + fp + fn + ( (2.0 * tp + fp + fn) == 0) )
-
+    # alternative formulation (without the zero-division guard): f1_score = (2.0 * precision * recall) / (precision + recall)
     return [precision, recall, specificity, accuracy, jaccard, f1_score]
\ No newline at end of file
diff --git a/bob/ip/binseg/utils/pdfcreator.py b/bob/ip/binseg/utils/pdfcreator.py
deleted file mode 100644
index fdd1acd393e40e2aa103e9e5f5484f603c1bb350..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/utils/pdfcreator.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from pathlib import Path
-import os
-
-
-def get_paths(output_path, filename):
-    """
-    Parameters
-    ----------
-    output_path : str
-        path in which to look for files
-    filename : str
-
-    Returns
-    -------
-    list 
-        list of file paths
-    """
-    datadir = Path(output_path)
-    file_paths = sorted(list(datadir.glob('**/{}'.format(filename))))
-    file_paths = [f.as_posix() for f in file_paths]
-    return file_paths
-
-
-def create_pdf(output_path, file_paths, title, tex_filename):
-    # setup tex doc
-    textitle = "\\section*{{{}}} \n".format(title, 42)
-    texinit = "\\documentclass{{article}} \\usepackage[utf8]{{inputenc}} \\usepackage[a4paper, margin=2cm]{{geometry}} \\usepackage{{graphicx}} \\begin{{document}} \n".format(42)
-    texclose = "\\end{{document}} \n".format(42)
-    with open (os.path.join(output_path,tex_filename), "w+") as outfile:
-        outfile.write(texinit)
-        outfile.write(textitle)
-        for f in file_paths:
-            outfile.write("\\includegraphics[width=0.5\\textwidth]{{{}}} \n".format(f,42))
-        outfile.write(texclose)
-    # create pdf
-    os.system("pdflatex -output-directory {} {}".format(output_path, os.path.join(output_path,tex_filename)))
\ No newline at end of file
diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py
index 0e6169b41cf3361e7388b7ed2a184ae2abc6f236..b5943e9dc75c3ce034bdc70a8e49c4d231038880 100644
--- a/bob/ip/binseg/utils/plot.py
+++ b/bob/ip/binseg/utils/plot.py
@@ -102,7 +102,126 @@ def precision_recall_f1iso(precision, recall, names, title=None):
     plt.tight_layout()  
     return fig  
 
+def precision_recall_f1iso_confintval(precision, recall, pr_upper, pr_lower, re_upper, re_lower, names, title=None):
+    """
+    Author: Andre Anjos (andre.anjos@idiap.ch).
+    
+    Creates a precision-recall plot of the given data, with shaded confidence
+    regions. The plot will be annotated with F1-score iso-lines (in which the
+    F1-score maintains the same value).
+    
+    Parameters
+    ----------  
+    precision : :py:class:`numpy.ndarray` or :py:class:`list`
+        A list of 1D np arrays containing the Y coordinates of the plot, or
+        the precision, or a 2D np array in which the rows correspond to each
+        of the system's precision coordinates.  
+    recall : :py:class:`numpy.ndarray` or :py:class:`list`
+        A list of 1D np arrays containing the X coordinates of the plot, or
+        the recall, or a 2D np array in which the rows correspond to each
+        of the system's recall coordinates. 
+    pr_upper : :py:class:`numpy.ndarray` or :py:class:`list`
+        The upper bounds of the precision confidence interval, one per system
+    pr_lower : :py:class:`numpy.ndarray` or :py:class:`list`
+        The lower bounds of the precision confidence interval, one per system
+    re_upper : :py:class:`numpy.ndarray` or :py:class:`list`
+        The upper bounds of the recall confidence interval, one per system
+    re_lower : :py:class:`numpy.ndarray` or :py:class:`list`
+        The lower bounds of the recall confidence interval, one per system
+    names : :py:class:`list`
+        An iterable over the names of each of the systems along the rows of
+        ``precision`` and ``recall``
+    title : :py:class:`str`, optional
+        A title for the plot. If not set, omits the title   
+
+    Returns
+    ------- 
+    matplotlib.figure.Figure
+        A matplotlib figure you can save or display 
+    """ 
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt 
+    from itertools import cycle
+    fig, ax1 = plt.subplots(1)  
+    lines = ["-","--","-.",":"]
+    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
+              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
+              '#bcbd22', '#17becf']
+    colorcycler = cycle(colors)
+    linecycler = cycle(lines)
+    for p, r, pu, pl, ru, rl, n in zip(precision, recall, pr_upper, pr_lower, re_upper, re_lower, names):   
+        # Plots only from the point where recall reaches its maximum, otherwise, we
+        # don't see a curve...
+        i = r.argmax()
+        pi = p[i:]
+        ri = r[i:]
+        pui = pu[i:]
+        pli = pl[i:]
+        rui = ru[i:]
+        rli = rl[i:]    
+        valid = (pi+ri) > 0
+        f1 = 2 * (pi[valid]*ri[valid]) / (pi[valid]+ri[valid])    
+        # optimal point along the curve
+        argmax = f1.argmax()
+        opi = pi[valid][argmax]
+        ori = ri[valid][argmax]
+        # Plot Recall/Precision as threshold changes
+        ax1.plot(ri[pi>0], pi[pi>0], next(linecycler), label='[F={:.4f}] {}'.format(f1.max(), n))
+        ax1.plot(ori, opi, marker='o', linestyle='none', markersize=3, color='black')
+        # Plot the confidence region as a shaded polygon bounded by the upper
+        # and lower precision/recall curves
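+        # (upper bound traversed forward, then lower bound reversed, to close the ring)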
+        vert_x = np.concatenate((rui[pui>0], rli[pli>0][::-1]))
+        vert_y = np.concatenate((pui[pui>0], pli[pli>0][::-1])) 
+        # workaround to single out the 2nd-human annotator, whose recall stays
+        # constant across thresholds: draw only the outline of its region
+        if np.isclose(np.mean(rui), rui[1], rtol=1e-05):
+            p = plt.Polygon(np.column_stack((vert_x, vert_y)), facecolor='none', alpha=.2, edgecolor=next(colorcycler), lw=2)
+        else:
+            p = plt.Polygon(np.column_stack((vert_x, vert_y)), facecolor=next(colorcycler), alpha=.2, edgecolor='none',lw=.2)
+        ax1.add_artist(p)
 
+    ax1.grid(linestyle='--', linewidth=1, color='gray', alpha=0.2)  
+    if len(names) > 1:
+        plt.legend(loc='lower left', framealpha=0.5)  
+    ax1.set_xlabel('Recall')
+    ax1.set_ylabel('Precision')
+    ax1.set_xlim([0.0, 1.0])
+    ax1.set_ylim([0.0, 1.0])    
+    if title is not None: ax1.set_title(title)  
+    # Annotates plot with F1-score iso-lines
+    ax2 = ax1.twinx()
+    f_scores = np.linspace(0.1, 0.9, num=9)
+    tick_locs = []
+    tick_labels = []
+    for f_score in f_scores:
+        x = np.linspace(0.01, 1)
+        y = f_score * x / (2 * x - f_score)
+        l, = plt.plot(x[y >= 0], y[y >= 0], color='green', alpha=0.1)
+        tick_locs.append(y[-1])
+        tick_labels.append('%.1f' % f_score)  
+    ax2.tick_params(axis='y', which='both', pad=0, right=False, left=False)
+    ax2.set_ylabel('iso-F', color='green', alpha=0.3)
+    ax2.set_ylim([0.0, 1.0])
+    ax2.yaxis.set_label_coords(1.015, 0.97) 
+    ax2.set_yticks(tick_locs) #notice these are invisible   
+    for k in ax2.set_yticklabels(tick_labels):
+        k.set_color('green')
+        k.set_alpha(0.3)
+        k.set_size(8) 
+    # we should see some of axes 1 axes
+    ax1.spines['right'].set_visible(False)
+    ax1.spines['top'].set_visible(False)
+    ax1.spines['left'].set_position(('data', -0.015))
+    ax1.spines['bottom'].set_position(('data', -0.015)) 
+    # we shouldn't see any of axes 2 axes
+    ax2.spines['right'].set_visible(False)
+    ax2.spines['top'].set_visible(False)
+    ax2.spines['left'].set_visible(False)
+    ax2.spines['bottom'].set_visible(False) 
+    plt.tight_layout()  
+    return fig  
 
 def loss_curve(df, title):
     """ Creates a loss curve given a Dataframe with column names:
@@ -152,10 +271,19 @@ def read_metricscsv(file):
         next(metricsreader)
         precision = []
         recall = []
+        pr_upper = []
+        pr_lower = []
+        re_upper = []
+        re_lower = []
         for row in metricsreader:
             precision.append(float(row[1]))
             recall.append(float(row[2]))
-    return np.array(precision), np.array(recall)
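+            # CSV layout: columns 8/9 are the precision upper/lower bounds, 11/12 the recall upper/lower bounds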
+            pr_upper.append(float(row[8]))
+            pr_lower.append(float(row[9]))
+            re_upper.append(float(row[11]))
+            re_lower.append(float(row[12]))
+    return np.array(precision), np.array(recall), np.array(pr_upper), np.array(pr_lower), np.array(re_upper), np.array(re_lower)
 
 
 def plot_overview(outputfolders,title):
@@ -174,14 +302,23 @@ def plot_overview(outputfolders,title):
     """
     precisions = []
     recalls = []
+    pr_ups = []
+    pr_lows = []
+    re_ups = []
+    re_lows = []
     names = []
     params = []
     for folder in outputfolders:
         # metrics 
         metrics_path = os.path.join(folder,'results/Metrics.csv')
-        pr, re = read_metricscsv(metrics_path)
+        pr, re, pr_upper, pr_lower, re_upper, re_lower = read_metricscsv(metrics_path)
         precisions.append(pr)
         recalls.append(re)
+        pr_ups.append(pr_upper)
+        pr_lows.append(pr_lower)
+        re_ups.append(re_upper)
+        re_lows.append(re_lower)
         modelname = folder.split('/')[-1]
         datasetname =  folder.split('/')[-2]
         # parameters
@@ -190,10 +320,11 @@ def plot_overview(outputfolders,title):
           rows = outfile.readlines()
           lastrow = rows[-1]
           parameter = int(lastrow.split()[1].replace(',',''))
-        name = '[P={:.2f}M] {} {}'.format(parameter/100**3, modelname, "")
+        # alternative legend entry including the parameter count:
+        #name = '[P={:.2f}M] {} {}'.format(parameter/100**3, modelname, "")
+        name = '{}'.format(modelname)
         names.append(name)
     #title = folder.split('/')[-4]
-    fig = precision_recall_f1iso(precisions,recalls,names,title)
+    fig = precision_recall_f1iso_confintval(precisions, recalls, pr_ups, pr_lows, re_ups, re_lows, names, title)
     return fig
 
 def metricsviz(dataset
diff --git a/bob/ip/binseg/utils/rsttable.py b/bob/ip/binseg/utils/rsttable.py
index 15a0b68d0f8a6decd08b80710ebd5d57b62a6dd9..1db04e45892c5993e711529cb6ebc4b4ea41a43a 100644
--- a/bob/ip/binseg/utils/rsttable.py
+++ b/bob/ip/binseg/utils/rsttable.py
@@ -8,11 +8,16 @@ def create_overview_grid(output_path):
     filename = 'Metrics.csv'
     metrics = get_paths(output_path,filename)
     f1s = []
+    stds = []
     models = []
     databases = []
     for m in metrics:
         metrics = pd.read_csv(m)
         maxf1 = metrics['f1_score'].max()
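+        # also record the F1 standard deviation at the threshold achieving the best F1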
+        idmaxf1 = metrics['f1_score'].idxmax()
+        std = metrics['std_f1'][idmaxf1]
+        stds.append(std)
         f1s.append(maxf1)
         model = m.split('/')[-3]
         models.append(model)
@@ -22,7 +27,11 @@ def create_overview_grid(output_path):
     df['database'] = databases
     df['model'] = models
     df['f1'] = f1s
+    df['std'] = stds
     pivot = df.pivot(index='database',columns='model',values='f1')
+    pivot2 = df.pivot(index='database',columns='model',values='std')
 
     with open (os.path.join(output_path,'Metrics_overview.rst'), "w+") as outfile:
-        outfile.write(tabulate(pivot,headers=pivot.columns, tablefmt="grid"))
\ No newline at end of file
+        outfile.write(tabulate(pivot,headers=pivot.columns, tablefmt="grid"))
+    with open (os.path.join(output_path,'Metrics_overview_std.rst'), "w+") as outfile:
+        outfile.write(tabulate(pivot2,headers=pivot2.columns, tablefmt="grid"))
\ No newline at end of file
diff --git a/doc/benchmarkresults.rst b/doc/benchmarkresults.rst
index c0d6ae0a689c16f19197352735aace7c38a58be7..2f391611db75358ffeb809058e5bf2242a474b93 100644
--- a/doc/benchmarkresults.rst
+++ b/doc/benchmarkresults.rst
@@ -16,55 +16,15 @@ F1 Scores
 +--------------------------------------------+------------------------------------------------+---------------------------------------------+-------------------------------------------+----------------------------------------------+---------------------------------------------+
 | F1 (std)                                   | :ref:`bob.ip.binseg.configs.datasets.chasedb1` | :ref:`bob.ip.binseg.configs.datasets.drive` | :ref:`bob.ip.binseg.configs.datasets.hrf` | :ref:`bob.ip.binseg.configs.datasets.iostar` | :ref:`bob.ip.binseg.configs.datasets.stare` |
 +--------------------------------------------+------------------------------------------------+---------------------------------------------+-------------------------------------------+----------------------------------------------+---------------------------------------------+
-| :ref:`bob.ip.binseg.configs.models.driu`   | `0.8114 (0.0206) <driu_chasedb1.pth_>`_        | `0.8226 (0.0142) <driu_drive.pth_>`_        | `0.7865 (0.0545) <driu_hrf.pth_>`_        | `0.8273 (0.0199) <driu_iostar.pth_>`_        | `0.8286 (0.0368) <driu_stare.pth_>`_        |
+| :ref:`bob.ip.binseg.configs.models.driu`   | `0.810 (0.021) <driu_chasedb1.pth_>`_          | `0.820 (0.014) <driu_drive.pth_>`_          | `0.783 (0.055) <driu_hrf.pth_>`_          | `0.825 (0.020) <driu_iostar.pth_>`_          | `0.827 (0.037) <driu_stare.pth_>`_          |
 +--------------------------------------------+------------------------------------------------+---------------------------------------------+-------------------------------------------+----------------------------------------------+---------------------------------------------+
-| :ref:`bob.ip.binseg.configs.models.hed`    | 0.8111 (0.0214)                                | 0.8192 (0.0136)                             | 0.7868 (0.0576)                           | 0.8275 (0.0201)                              | 0.8250 (0.0375)                             |
+| :ref:`bob.ip.binseg.configs.models.hed`    | 0.810 (0.022)                                  | 0.817 (0.013)                               | 0.783 (0.058)                             | 0.825 (0.020)                                | 0.823 (0.037)                               |
 +--------------------------------------------+------------------------------------------------+---------------------------------------------+-------------------------------------------+----------------------------------------------+---------------------------------------------+
-| :ref:`bob.ip.binseg.configs.models.m2unet` | `0.8035 (0.0195) <m2unet_chasedb1.pth_>`_      | `0.8051 (0.0141) <m2unet_drive.pth_>`_      | `0.7838 (0.0572) <m2unet_hrf.pth_>`_      | `0.8194 (0.0201) <m2unet_iostar.pth_>`_      | `0.8174 (0.0409) <m2unet_stare.pth_>`_      |
+| :ref:`bob.ip.binseg.configs.models.m2unet` | `0.802 (0.019) <m2unet_chasedb1.pth_>`_        | `0.803 (0.014) <m2unet_drive.pth_>`_        | `0.780 (0.057) <m2unet_hrf.pth_>`_        | `0.817 (0.020) <m2unet_iostar.pth_>`_        | `0.815 (0.041) <m2unet_stare.pth_>`_        |
 +--------------------------------------------+------------------------------------------------+---------------------------------------------+-------------------------------------------+----------------------------------------------+---------------------------------------------+
-| :ref:`bob.ip.binseg.configs.models.unet`   | 0.8136 (0.0209)                                | 0.8237 (0.0145)                             | 0.7914 (0.0516)                           | 0.8203 (0.0190)                              | 0.8306 (0.0421)                             |
+| :ref:`bob.ip.binseg.configs.models.unet`   | 0.812 (0.020)                                  | 0.822 (0.015)                               | 0.788 (0.051)                             | 0.818 (0.019)                                | 0.829 (0.042)                               |
 +--------------------------------------------+------------------------------------------------+---------------------------------------------+-------------------------------------------+----------------------------------------------+---------------------------------------------+
 
 
-.. figure:: img/pr_CHASEDB1.png
-   :scale: 30 %
-   :align: center
-   :alt: model comparisons
-
-   CHASE_DB1: Precision vs Recall curve, F1 scores and
-   number of parameter of each model.
-
-.. figure:: img/pr_DRIVE.png
-   :scale: 30 %
-   :align: center
-   :alt: model comparisons
-
-   DRIVE: Precision vs Recall curve, F1 scores and
-   number of parameter of each model.
-
-.. figure:: img/pr_HRF.png
-   :scale: 30 %
-   :align: center
-   :alt: model comparisons
-
-   HRF: Precision vs Recall curve, F1 scores and
-   number of parameter of each model.
-
-.. figure:: img/pr_IOSTARVESSEL.png
-   :scale: 30 %
-   :align: center
-   :alt: model comparisons
-
-   IOSTAR: Precision vs Recall curve, F1 scores and
-   number of parameter of each model.
-
-.. figure:: img/pr_STARE.png
-   :scale: 30 %
-   :align: center
-   :alt: model comparisons
-
-   STARE: Precision vs Recall curve, F1 scores and
-   number of parameter of each model.
-
 
 .. include:: links.rst
diff --git a/doc/covdresults.rst b/doc/covdresults.rst
index b3c5541408faf071f352865ef380fb0eef905aa0..95f7d7fe40f860b1c8af32f6beca365ee14e9eb1 100644
--- a/doc/covdresults.rst
+++ b/doc/covdresults.rst
@@ -12,26 +12,66 @@ F1 Scores
 F1 score together with standard deviation across test images.
 
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| F1 score                                                | :ref:`bob.ip.binseg.configs.models.driussl` | :ref:`bob.ip.binseg.configs.models.driubnssl` | :ref:`bob.ip.binseg.configs.models.m2unetssl`     |
+| F1 score                                                | :ref:`bob.ip.binseg.configs.models.driu`    | :ref:`bob.ip.binseg.configs.models.driubn`    | :ref:`bob.ip.binseg.configs.models.m2unet`        |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-drive`        | 0.7896 (0.0178)                             | 0.8000 (0.0182)                               | `0.7906 (0.0179) <m2unet_covd-drive.pth>`_        |
+| :ref:`bob.ip.binseg.configs.datasets.covd-drive`        | 0.788 (0.018)                               | 0.797 (0.019)                                 | `0.789 (0.018) <m2unet_covd-drive.pth>`_          |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-drive_ssl`    | 0.7870 (0.0176)                             | 0.8020 (0.0179)                               | `0.7938 (0.0142) <m2unet_covd-drive_ssl.pth>`_    |
+| :ref:`bob.ip.binseg.configs.datasets.covd-drive_ssl`    | 0.785 (0.018)                               | 0.783 (0.019)                                 | `0.791 (0.014) <m2unet_covd-drive_ssl.pth>`_      |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-stare`        | 0.7979 (0.1254)                             | 0.8129 (0.0986)                               | `0.8120 (0.0457) <m2unet_covd-stare.pth>`_        |
+| :ref:`bob.ip.binseg.configs.datasets.covd-stare`        | 0.778 (0.117)                               | 0.778 (0.122)                                 | `0.812 (0.046) <m2unet_covd-stare.pth>`_          |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-stare_ssl`    | 0.8062 (0.1033)                             | 0.8221 (0.0784)                               | `0.8222 (0.0441) <m2unet_covd-stare_ssl.pth>`_    |
+| :ref:`bob.ip.binseg.configs.datasets.covd-stare_ssl`    | 0.788 (0.102)                               | 0.811 (0.074)                                 | `0.820 (0.044) <m2unet_covd-stare_ssl.pth>`_      |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-chasedb1`     | 0.7979 (0.0284)                             | 0.7923 (0.0240)                               | `0.7898 (0.0236) <m2unet_covd-chasedb1.pth>`_     |
+| :ref:`bob.ip.binseg.configs.datasets.covd-chasedb1`     | 0.796 (0.027)                               | 0.791 (0.025)                                 | `0.788 (0.024) <m2unet_covd-chasedb1.pth>`_       |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-chasedb1_ssl` | 0.7976 (0.0242)                             | 0.7992 (0.0235)                               | `0.8000 (0.0268) <m2unet_covd-chasedb1_ssl.pth>`_ |
+| :ref:`bob.ip.binseg.configs.datasets.covd-chasedb1_ssl` | 0.796 (0.024)                               | 0.798 (0.025)                                 | `0.799 (0.026) <m2unet_covd-chasedb1_ssl.pth>`_   |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-hrf`          | 0.8013 (0.0436)                             | 0.8027 (0.0452)                               | `0.8036 (0.0442) <m2unet_covd-hrf.pth>`_          |
+| :ref:`bob.ip.binseg.configs.datasets.covd-hrf`          | 0.799 (0.044)                               | 0.800 (0.045)                                 | `0.802 (0.045) <m2unet_covd-hrf.pth>`_            |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-hrf_ssl`      | 0.8002 (0.0421)                             | 0.7916 (0.0468)                               | `0.7987 (0.0436) <m2unet_covd-hrf_ssl.pth>`_      |
+| :ref:`bob.ip.binseg.configs.datasets.covd-hrf_ssl`      | 0.799 (0.044)                               | 0.784 (0.048)                                 | `0.797 (0.044) <m2unet_covd-hrf_ssl.pth>`_        |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-iostar`       | 0.7934 (0.0206)                             | 0.7763 (0.0311)                               | `0.7953 (0.0152) <m2unet_covd-iostar.pth>`_       |
+| :ref:`bob.ip.binseg.configs.datasets.covd-iostar`       | 0.791 (0.021)                               | 0.777 (0.032)                                 | `0.793 (0.015) <m2unet_covd-iostar.pth>`_         |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
-| :ref:`bob.ip.binseg.configs.datasets.covd-iostar_ssl`   | 0.7995 (0.0174)                             | 0.7904 (0.0215)                               | `0.7868 (0.0182) <m2unet_covd-iostar_ssl.pth>`_   |
+| :ref:`bob.ip.binseg.configs.datasets.covd-iostar_ssl`   | 0.797 (0.017)                               | 0.811 (0.074)                                 | `0.785 (0.018) <m2unet_covd-iostar_ssl.pth>`_     |
 +---------------------------------------------------------+---------------------------------------------+-----------------------------------------------+---------------------------------------------------+
 
+M2U-Net Precision vs. Recall Curves
+===================================
+
+Note that here the F1-score is calculated on a macro level (see the paper for more details).
+
+.. figure:: img/pr_CHASEDB1.png
+   :scale: 50 %
+   :align: center
+   :alt: model comparisons
+
+   CHASE_DB1: Precision vs Recall curve and F1 scores
+
+.. figure:: img/pr_DRIVE.png
+   :scale: 50 %
+   :align: center
+   :alt: model comparisons
+
+   DRIVE: Precision vs Recall curve and F1 scores
+
+.. figure:: img/pr_HRF.png
+   :scale: 50 %
+   :align: center
+   :alt: model comparisons
+
+   HRF: Precision vs Recall curve and F1 scores
+
+.. figure:: img/pr_IOSTARVESSEL.png
+   :scale: 50 %
+   :align: center
+   :alt: model comparisons
+
+   IOSTAR: Precision vs Recall curve and F1 scores
+
+.. figure:: img/pr_STARE.png
+   :scale: 50 %
+   :align: center
+   :alt: model comparisons
+
+   STARE: Precision vs Recall curve and F1 scores
+
diff --git a/doc/img/pr_CHASEDB1.png b/doc/img/pr_CHASEDB1.png
index 7fe74f4e6178af9abc8fdda8c3d1142c992110c8..923a4af3445926fe46cb3ac58a3a73d28b12d9fc 100644
Binary files a/doc/img/pr_CHASEDB1.png and b/doc/img/pr_CHASEDB1.png differ
diff --git a/doc/img/pr_DRIVE.png b/doc/img/pr_DRIVE.png
index fc9e739e31c47bf319981dc6a561e335acfb261b..2aee69db83b3caaabcaa71e7c24d2eab7ca0eb7e 100644
Binary files a/doc/img/pr_DRIVE.png and b/doc/img/pr_DRIVE.png differ
diff --git a/doc/img/pr_HRF.png b/doc/img/pr_HRF.png
index ac6f870ece6c4fe9d439ba5c0d5e3914eea3bcbb..df479805f32dcd60770facfab285322d02534c68 100644
Binary files a/doc/img/pr_HRF.png and b/doc/img/pr_HRF.png differ
diff --git a/doc/img/pr_IOSTARVESSEL.png b/doc/img/pr_IOSTARVESSEL.png
index 97ed5c7a6b8f0d7ab6c0786db55588d7b163e9bb..e7c1b9a9203473d50a21adf938c9f4ccb0abe034 100644
Binary files a/doc/img/pr_IOSTARVESSEL.png and b/doc/img/pr_IOSTARVESSEL.png differ
diff --git a/doc/img/pr_STARE.png b/doc/img/pr_STARE.png
index 14603d2d3782292e66c813685fc61bca60953976..c485243f5ecb3350685a327fd61e9a12d457e71d 100644
Binary files a/doc/img/pr_STARE.png and b/doc/img/pr_STARE.png differ
diff --git a/doc/index.rst b/doc/index.rst
index 97d792b87b1b2e40dd566f373220540955dbab48..18d8d561e8e5aff351f4e411a6496edad5458c07 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -23,6 +23,7 @@ Users Guide
    benchmarkresults
    covdresults
    configs
+   plotting
    visualization
    api
 
diff --git a/doc/plotting.rst b/doc/plotting.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f05ee42d12572211ffe4037a1c7d706283042a63
--- /dev/null
+++ b/doc/plotting.rst
@@ -0,0 +1,30 @@
+.. -*- coding: utf-8 -*-
+.. _bob.ip.binseg.plotting:
+
+========
+Plotting
+========
+
+Precision vs recall curves for each evaluation run are generated by default and
+stored in the ``results`` subfolder of the model output directory.
+
+To generate a comparison chart of several models, use the ``compare`` command
+and pass the output paths of the models you would like to plot.
+
+E.g.:
+
+.. code-block:: bash
+
+    bob binseg compare -o myoutput -l myoutput/DRIVE/M2U-Net myoutput/DRIVE/U-Net myoutput/DRIVE/HED -t MyPlotTitle
+
+Use ``bob binseg compare --help`` for more information.
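+
+Existing probability maps can also be evaluated and plotted directly with the
+``evalpred`` command. A hypothetical invocation (the paths and the dataset
+configuration name ``drive`` are illustrative):
+
+.. code-block:: bash
+
+    bob binseg evalpred --output-path myoutput/DRIVE/M2U-Net/eval --prediction-folder myoutput/DRIVE/M2U-Net/images --dataset drive --title MyPlotTitle --legend M2U-Net
+
+Use ``bob binseg evalpred --help`` for more information.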