diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index af2c0a2d1e2cabe5b1eb6a226d87aebd89dca7cb..d6aff378e64cb5e2edb69bc0cb5150acf0f29564 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -5,6 +5,7 @@
 
 import os
 
+import PIL.Image
+import PIL.ImageOps
 import numpy
 import pandas
 from tqdm import tqdm
@@ -22,7 +23,34 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-def _sample_metrics(stem, pred, gt):
+def _posneg(pred, gt, threshold):
+    """Calculates true and false positives and negatives"""
+
+    gt = gt.byte()  # byte tensor
+
+    # threshold
+    binary_pred = torch.gt(pred, threshold).byte()
+
+    # equals and not-equals
+    equals = torch.eq(binary_pred, gt).type(torch.uint8)  # tensor
+    notequals = torch.ne(binary_pred, gt).type(torch.uint8)  # tensor
+
+    # true positives
+    tp_tensor = gt * binary_pred
+
+    # false positives
+    fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
+
+    # true negatives
+    tn_tensor = equals - tp_tensor
+
+    # false negatives
+    fn_tensor = notequals - fp_tensor.type(torch.uint8)
+
+    return tp_tensor, fp_tensor, tn_tensor, fn_tensor
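+
+# Usage sketch (assumes ``pred`` is a float probability map and ``gt`` a
+# binary ground-truth mask of the same shape):
+#
+#   tp, fp, tn, fn = _posneg(pred, gt, threshold=0.5)
+#   tp_count = torch.sum(tp).item()  # number of true-positive pixels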
+
+
+def _sample_metrics(pred, gt):
     """
     Calculates metrics on one single sample and saves it to disk
 
@@ -30,9 +58,6 @@ def _sample_metrics(stem, pred, gt):
     Parameters
     ----------
 
-    stem : str
-        original filename without extension and relative to its root-path
-
     pred : torch.Tensor
         pixel-wise predictions
 
@@ -58,36 +83,17 @@ def _sample_metrics(stem, pred, gt):
     """
 
     step_size = 0.01
-    gts = gt.byte()
-
     data = []
 
     for threshold in numpy.arange(0.0, 1.0, step_size):
 
-        # threshold
-        binary_pred = torch.gt(pred, threshold).byte()
-
-        # equals and not-equals
-        equals = torch.eq(binary_pred, gts).type(torch.uint8)  # tensor
-        notequals = torch.ne(binary_pred, gts).type(torch.uint8)  # tensor
+        tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
 
-        # true positives
-        tp_tensor = gt * binary_pred  # tensor
-        tp_count = torch.sum(tp_tensor).item()  # scalar
-
-        # false positives
-        fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
+        # compute scalar counts and derive metrics from them
+        tp_count = torch.sum(tp_tensor).item()
         fp_count = torch.sum(fp_tensor).item()
-
-        # true negatives
-        tn_tensor = equals - tp_tensor
         tn_count = torch.sum(tn_tensor).item()
-
-        # false negatives
-        fn_tensor = notequals - fp_tensor.type(torch.uint8)
         fn_count = torch.sum(fn_tensor).item()
-
-        # calc metrics
         precision, recall, specificity, accuracy, jaccard, f1_score = \
                 base_metrics(tp_count, fp_count, tn_count, fn_count)
 
@@ -105,7 +111,82 @@ def _sample_metrics(stem, pred, gt):
         ))
 
 
-def run(data_loader, predictions_folder, output_folder):
+def _sample_analysis(
+        img,
+        pred,
+        gt,
+        threshold,
+        tp_color=(0, 255, 0),  # green
+        fp_color=(0, 0, 255),  # blue
+        fn_color=(255, 0, 0),  # red
+        overlay=True,
+        ):
+    """Visualizes true positives, false positives and false negatives
+
+
+    Parameters
+    ----------
+
+    img : torch.Tensor
+        original image
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+    threshold : float
+        The threshold to be used while analyzing this image's probability map
+
+    tp_color : tuple
+        RGB value for true positives
+
+    fp_color : tuple
+        RGB value for false positives
+
+    fn_color : tuple
+        RGB value for false negatives
+
+    overlay : :py:class:`bool`, Optional
+        If set to ``True`` (which is the default), then overlay annotations on
+        top of the image.  Otherwise, represent data on a black canvas.
+
+
+    Returns
+    -------
+
+    figure : PIL.Image.Image
+
+        A PIL image that contains the overlayed analysis of true positives
+        (TP), false positives (FP) and false negatives (FN).
+
+    """
+
+    tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
+
+    # change to PIL representation
+    tp_pil = VF.to_pil_image(tp_tensor.float())
+    tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)
+
+    fp_pil = VF.to_pil_image(fp_tensor.float())
+    fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)
+
+    fn_pil = VF.to_pil_image(fn_tensor.float())
+    fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)
+
+    tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
+    tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)
+
+    if overlay:
+        img = VF.to_pil_image(img)  # PIL Image
+        tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.4)
+
+    return tp_pil_colored
+
+
+def run(data_loader, predictions_folder, output_folder, overlayed_folder=None,
+        overlay_threshold=None):
     """
     Runs inference and calculates metrics
 
@@ -123,6 +204,17 @@ def run(data_loader, predictions_folder, output_folder):
     output_folder : str
         folder where to store results
 
+    overlayed_folder : :py:class:`str`, Optional
+        if not ``None``, then it should be the name of a folder where to store
+        overlayed versions of the images and ground-truths
+
+    overlay_threshold : :py:class:`float`, Optional
+        if ``overlayed_folder`` is set, then this should be the threshold
+        (floating point) to apply to prediction maps to decide on positives
+        and negatives for the overlay analysis (graphical output).  This
+        number should come from the training set or a separate validation
+        set.  Using a test set value may bias your analysis.
+
     """
 
     logger.info("Start evaluation")
@@ -146,7 +238,18 @@ def run(data_loader, predictions_folder, output_folder):
         if stem in data:
             raise RuntimeError(f"{stem} entry already exists in data. "
                     f"Cannot overwrite.")
-        data[stem] = _sample_metrics(stem, pred, gt)
+        data[stem] = _sample_metrics(pred, gt)
+
+        if overlayed_folder is not None:
+            overlay_image = _sample_analysis(image, pred, gt,
+                    threshold=overlay_threshold, overlay=True)
+            fullpath = os.path.join(overlayed_folder, f"{stem}.png")
+            tqdm.write(f"Saving {fullpath}...")
+            fulldir = os.path.dirname(fullpath)
+            if not os.path.exists(fulldir):
+                tqdm.write(f"Creating directory {fulldir}...")
+                os.makedirs(fulldir, exist_ok=True)
+            overlay_image.save(fullpath)
 
     # Merges all dataframes together
     df_metrics = pandas.concat(data.values())
diff --git a/bob/ip/binseg/engine/ssltrainer.py b/bob/ip/binseg/engine/ssltrainer.py
index 4d1d1c2c2d778b01b7ffc7c31c8b2b7b1cbeb7cc..7bd4aba027cd70fd62286abfa154445516a3a48b 100644
--- a/bob/ip/binseg/engine/ssltrainer.py
+++ b/bob/ip/binseg/engine/ssltrainer.py
@@ -338,7 +338,7 @@ def run(
 
     # plots a version of the CSV trainlog into a PDF
     logdf = pd.read_csv(logfile_name, header=0, names=logfile_fields)
-    fig = loss_curve(logdf, title="Loss Evolution")
+    fig = loss_curve(logdf)
     figurefile_name = os.path.join(output_folder, "trainlog.pdf")
     logger.info(f"Saving {figurefile_name}")
     fig.savefig(figurefile_name)
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index da49327b605f9f3362ccc98724dc81dfc129a935..040e5651514a1b12d89f3a3c00b2f8b9b5eb90d8 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -180,7 +180,7 @@ def run(
 
     # plots a version of the CSV trainlog into a PDF
     logdf = pandas.read_csv(logfile_name, header=0, names=logfile_fields)
-    fig = loss_curve(logdf, title="Loss Evolution")
+    fig = loss_curve(logdf)
     figurefile_name = os.path.join(output_folder, "trainlog.pdf")
     logger.info(f"Saving {figurefile_name}")
     fig.savefig(figurefile_name)
diff --git a/bob/ip/binseg/script/binseg.py b/bob/ip/binseg/script/binseg.py
index 65a9b1664dd9dce2a70b215e4414c44cb1040844..0a301ddf33cd3f950d41d3dd705a805c7d573880 100644
--- a/bob/ip/binseg/script/binseg.py
+++ b/bob/ip/binseg/script/binseg.py
@@ -3,171 +3,12 @@
 
 """The main entry for bob ip binseg (click-based) scripts."""
 
-
-import os
 import pkg_resources
-
 import click
 from click_plugins import with_plugins
-
-import logging
-import torch
-
-from bob.extension.scripts.click_helper import (
-    verbosity_option,
-    ConfigCommand,
-    ResourceOption,
-    AliasedGroup,
-)
-
-from bob.ip.binseg.utils.checkpointer import DetectronCheckpointer
-from torch.utils.data import DataLoader
-from bob.ip.binseg.utils.plot import plot_overview
-from bob.ip.binseg.utils.click import OptionEatAll
-from bob.ip.binseg.utils.rsttable import create_overview_grid
-from bob.ip.binseg.utils.plot import metricsviz
-from bob.ip.binseg.utils.transformfolder import transformfolder as transfld
-
-logger = logging.getLogger(__name__)
-
+from bob.extension.scripts.click_helper import AliasedGroup
 
 @with_plugins(pkg_resources.iter_entry_points("bob.ip.binseg.cli"))
 @click.group(cls=AliasedGroup)
 def binseg():
     """Binary 2D Image Segmentation Benchmark commands."""
-
-
-# Plot comparison
-@binseg.command(entry_point_group="bob.ip.binseg.config", cls=ConfigCommand)
-@click.option(
-    "--output-path-list",
-    "-l",
-    required=True,
-    help="Pass all output paths as arguments",
-    cls=OptionEatAll,
-)
-@click.option(
-    "--output-path", "-o", required=True,
-)
-@click.option(
-    "--title", "-t", required=False,
-)
-@verbosity_option(cls=ResourceOption)
-def compare(output_path_list, output_path, title, **kwargs):
-    """ Compares multiple metrics files that are stored in the format mymodel/results/Metrics.csv """
-    logger.debug("Output paths: {}".format(output_path_list))
-    logger.info("Plotting precision vs recall curves for {}".format(output_path_list))
-    fig = plot_overview(output_path_list, title)
-    if not os.path.exists(output_path):
-        os.makedirs(output_path)
-    fig_filename = os.path.join(output_path, "precision_recall_comparison.pdf")
-    logger.info("saving {}".format(fig_filename))
-    fig.savefig(fig_filename)
-
-
-# Create grid table with results
-@binseg.command(entry_point_group="bob.ip.binseg.config", cls=ConfigCommand)
-@click.option(
-    "--output-path", "-o", required=True,
-)
-@verbosity_option(cls=ResourceOption)
-def gridtable(output_path, **kwargs):
-    """ Creates an overview table in grid rst format for all Metrics.csv in the output_path
-    tree structure:
-        ├── DATABASE
-        ├── MODEL
-            ├── images
-            └── results
-    """
-    logger.info("Creating grid for all results in {}".format(output_path))
-    create_overview_grid(output_path)
-
-
-# Create metrics viz
-@binseg.command(entry_point_group="bob.ip.binseg.config", cls=ConfigCommand)
-@click.option("--dataset", "-d", required=True, cls=ResourceOption)
-@click.option(
-    "--output-path", "-o", required=True,
-)
-@verbosity_option(cls=ResourceOption)
-def visualize(dataset, output_path, **kwargs):
-    """ Creates the following visualizations of the probabilties output maps:
-    overlayed: test images overlayed with prediction probabilities vessel tree
-    tpfnfpviz: highlights true positives, false negatives and false positives
-
-    Required tree structure:
-    ├── DATABASE
-        ├── MODEL
-            ├── images
-            └── results
-    """
-    logger.info("Creating TP, FP, FN visualizations for {}".format(output_path))
-    metricsviz(dataset=dataset, output_path=output_path)
-
-# Apply image transforms to a folder containing images
-@binseg.command(entry_point_group="bob.ip.binseg.config", cls=ConfigCommand)
-@click.option("--source-path", "-s", required=True, cls=ResourceOption)
-@click.option("--target-path", "-t", required=True, cls=ResourceOption)
-@click.option("--transforms", "-a", required=True, cls=ResourceOption)
-@verbosity_option(cls=ResourceOption)
-def transformfolder(source_path, target_path, transforms, **kwargs):
-    logger.info(
-        "Applying transforms to images in {} and saving them to {}".format(
-            source_path, target_path
-        )
-    )
-    transfld(source_path, target_path, transforms)
-
-
-# Evaluate only. Runs evaluation on predicted probability maps (--prediction-folder)
-@binseg.command(entry_point_group="bob.ip.binseg.config", cls=ConfigCommand)
-@click.option(
-    "--output-path", "-o", required=True, default="output", cls=ResourceOption
-)
-@click.option(
-    "--prediction-folder",
-    "-p",
-    help="Path containing output probability maps",
-    required=True,
-    cls=ResourceOption,
-)
-@click.option(
-    "--prediction-extension",
-    "-x",
-    help='Extension (e.g. ".png") for the prediction files',
-    default=".png",
-    required=False,
-    cls=ResourceOption,
-)
-@click.option("--dataset", "-d", required=True, cls=ResourceOption)
-@click.option("--title", required=False, cls=ResourceOption)
-@click.option("--legend", cls=ResourceOption)
-@verbosity_option(cls=ResourceOption)
-def evalpred(
-    output_path,
-    prediction_folder,
-    prediction_extension,
-    dataset,
-    title,
-    legend,
-    **kwargs
-):
-    """ Run inference and evalaute the model performance """
-
-    # PyTorch dataloader
-    data_loader = DataLoader(
-        dataset=dataset,
-        batch_size=1,
-        shuffle=False,
-        pin_memory=torch.cuda.is_available(),
-    )
-
-    # Run eval
-    do_eval(
-        prediction_folder,
-        data_loader,
-        output_folder=output_path,
-        title=title,
-        legend=legend,
-        prediction_extension=prediction_extension,
-    )
diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b2b0ae998e8dbf05015dda950975e78a042b176
--- /dev/null
+++ b/bob/ip/binseg/script/compare.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import click
+
+from bob.extension.scripts.click_helper import (
+    verbosity_option,
+    AliasedGroup,
+)
+
+from ..utils.plot import combined_precision_recall_f1iso_confintval
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+@click.command(
+    epilog="""Examples:
+
+\b
+    1. Compares systems A and B, with their own pre-computed metric files:
+\b
+       $ bob binseg compare -vv A path/to/A/metrics.csv B path/to/B/metrics.csv
+""",
+)
+@click.argument(
+        'label_path',
+        nargs=-1,
+        )
+@click.option(
+    "--output",
+    "-o",
+    help="Path where to write the output figure (PDF format)",
+    show_default=True,
+    required=True,
+    default="comparison.pdf",
+    type=click.Path(),
+)
+@verbosity_option()
+def compare(label_path, output, **kwargs):
+    """Compares multiple systems together"""
+
+    # hack to get a dictionary from arguments passed to input
+    if len(label_path) % 2 != 0:
+        raise click.ClickException("Input label-paths should be given in "
+                "pairs composed of name-path entries")
+    data = dict(zip(label_path[::2], label_path[1::2]))
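+    # e.g. ("A", "path/to/A/metrics.csv", "B", "path/to/B/metrics.csv") maps
+    # to {"A": "path/to/A/metrics.csv", "B": "path/to/B/metrics.csv"}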
+
+    fig = combined_precision_recall_f1iso_confintval(data)
+    logger.info(f"Saving plot at {output}")
+    fig.savefig(output)
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index 11ca9719aa98cb122f142e7ff73ca0d317b84abe..1576f3875ebe5a1ad328293d234ebf0bfdec714d 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -2,15 +2,12 @@
 # coding=utf-8
 
 import click
-from click_plugins import with_plugins
-
 from torch.utils.data import DataLoader
 
 from bob.extension.scripts.click_helper import (
     verbosity_option,
     ConfigCommand,
     ResourceOption,
-    AliasedGroup,
 )
 
 from ..engine.evaluator import run
@@ -47,6 +44,7 @@ logger = logging.getLogger(__name__)
     help="Path where to store the analysis result (created if does not exist)",
     required=True,
     default="results",
+    type=click.Path(),
     cls=ResourceOption,
 )
 @click.option(
@@ -54,6 +52,7 @@ logger = logging.getLogger(__name__)
     "-p",
     help="Path where predictions are currently stored",
     required=True,
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
     cls=ResourceOption,
 )
 @click.option(
@@ -65,7 +64,7 @@ logger = logging.getLogger(__name__)
 )
 @click.option(
     "--overlayed",
-    "-A",
+    "-O",
     help="Creates overlayed representations of the output probability maps, "
     "similar to --overlayed in prediction-mode, except it includes "
     "distinctive colours for true and false positives and false negatives.  "
@@ -77,10 +76,27 @@ logger = logging.getLogger(__name__)
     required=False,
     cls=ResourceOption,
 )
+@click.option(
+    "--overlay-threshold",
+    "-T",
+    help="If you set --overlayed, then you can provide a value to be used as "
+    "the threshold applied to probability maps to decide between positives "
+    "and negatives.  This binary output will be used to define true and "
+    "false positives, and false negatives for the overlay analysis.  This "
+    "number should come from the training set or a separate validation set "
+    "to avoid biasing the analysis",
+    default=0.5,
+    type=click.FloatRange(min=0.0, max=1.0),
+    show_default=True,
+    required=False,
+    cls=ResourceOption,
+)
 @verbosity_option(cls=ResourceOption)
-def evaluate(output_folder, predictions_folder, dataset, overlayed, **kwargs):
+def evaluate(output_folder, predictions_folder, dataset, overlayed,
+        overlay_threshold, **kwargs):
     """Evaluates an FCN on a binary segmentation task.
     """
     data_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False,
             pin_memory=False)
-    run(dataset, predictions_folder, output_folder)
+    run(dataset, predictions_folder, output_folder, overlayed,
+            overlay_threshold)
diff --git a/bob/ip/binseg/script/predict.py b/bob/ip/binseg/script/predict.py
index 2c7929ec2127f76f3afaeaae34287fcfe8b6e862..d4d8bd75677acd94517f52f6b63a0b375e49e809 100644
--- a/bob/ip/binseg/script/predict.py
+++ b/bob/ip/binseg/script/predict.py
@@ -4,8 +4,6 @@
 import os
 
 import click
-from click_plugins import with_plugins
-
 import torch
 from torch.utils.data import DataLoader
 
@@ -13,7 +11,6 @@ from bob.extension.scripts.click_helper import (
     verbosity_option,
     ConfigCommand,
     ResourceOption,
-    AliasedGroup,
 )
 
 from ..engine.predictor import run
@@ -52,6 +49,7 @@ logger = logging.getLogger(__name__)
     required=True,
     default="results",
     cls=ResourceOption,
+    type=click.Path(),
 )
 @click.option(
     "--model",
@@ -74,6 +72,7 @@ logger = logging.getLogger(__name__)
     required=True,
     show_default=True,
     default=1,
+    type=click.IntRange(min=1),
     cls=ResourceOption,
 )
 @click.option(
diff --git a/bob/ip/binseg/script/train.py b/bob/ip/binseg/script/train.py
index 3302e9ea59539ef6cf33fbbc6d34ee3f0e46cab2..7eb4bb992074e9005c32519b6661daa0f0b837b5 100644
--- a/bob/ip/binseg/script/train.py
+++ b/bob/ip/binseg/script/train.py
@@ -4,8 +4,6 @@
 import os
 
 import click
-from click_plugins import with_plugins
-
 import torch
 from torch.utils.data import DataLoader
 
@@ -13,7 +11,6 @@ from bob.extension.scripts.click_helper import (
     verbosity_option,
     ConfigCommand,
     ResourceOption,
-    AliasedGroup,
 )
 
 from ..utils.checkpointer import DetectronCheckpointer
@@ -52,6 +49,7 @@ logger = logging.getLogger(__name__)
     "-o",
     help="Path where to store the generated model (created if does not exist)",
     required=True,
+    type=click.Path(),
     default="results",
     cls=ResourceOption,
 )
@@ -115,6 +113,7 @@ logger = logging.getLogger(__name__)
     required=True,
     show_default=True,
     default=2,
+    type=click.IntRange(min=1),
     cls=ResourceOption,
 )
 @click.option(
@@ -136,6 +135,7 @@ logger = logging.getLogger(__name__)
     show_default=True,
     required=True,
     default=1000,
+    type=click.IntRange(min=1),
     cls=ResourceOption,
 )
 @click.option(
@@ -149,6 +149,7 @@ logger = logging.getLogger(__name__)
     show_default=True,
     required=True,
     default=0,
+    type=click.IntRange(min=0),
     cls=ResourceOption,
 )
 @click.option(
@@ -167,6 +168,7 @@ logger = logging.getLogger(__name__)
     show_default=True,
     required=False,
     default=42,
+    type=click.IntRange(min=0),
     cls=ResourceOption,
 )
 @click.option(
@@ -184,6 +186,7 @@ logger = logging.getLogger(__name__)
     show_default=True,
     required=True,
     default=900,
+    type=click.IntRange(min=0),
     cls=ResourceOption,
 )
 @verbosity_option(cls=ResourceOption)
diff --git a/bob/ip/binseg/test/test_batchmetrics.py b/bob/ip/binseg/test/test_batchmetrics.py
index 76b1313d4ed819cecfc8b2dd7e0f5125fd9723d9..172e66f2e92c9c130d0f5205d7cacaf53fefe7e3 100644
--- a/bob/ip/binseg/test/test_batchmetrics.py
+++ b/bob/ip/binseg/test/test_batchmetrics.py
@@ -31,9 +31,8 @@ class Tester(unittest.TestCase):
 
     def test_batch_metrics(self):
         dfs = []
-        for stem, pred, gt in zip(self.names, self.predictions,
-                self.ground_truths):
-            dfs.append(_sample_metrics(stem, pred, gt))
+        for pred, gt in zip(self.predictions, self.ground_truths):
+            dfs.append(_sample_metrics(pred, gt))
         bm = pandas.concat(dfs)
 
         self.assertEqual(len(bm), 2 * 100)
diff --git a/bob/ip/binseg/utils/FreeMono.ttf b/bob/ip/binseg/utils/FreeMono.ttf
deleted file mode 100644
index 7485f9e4c84d5a372c81e11df2cd9f5e2eb2064a..0000000000000000000000000000000000000000
Binary files a/bob/ip/binseg/utils/FreeMono.ttf and /dev/null differ
diff --git a/bob/ip/binseg/utils/click.py b/bob/ip/binseg/utils/click.py
deleted file mode 100644
index 792cebfd34f50d3fbe2a08e6d829fb17aaba91e3..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/utils/click.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import click
-
-
-class OptionEatAll(click.Option):
-    """
-    Allows for ``*args`` and ``**kwargs`` to be passed to click
-
-    https://stackoverflow.com/questions/48391777/nargs-equivalent-for-options-in-click
-    """
-
-    def __init__(self, *args, **kwargs):
-        self.save_other_options = kwargs.pop("save_other_options", True)
-        nargs = kwargs.pop("nargs", -1)
-        assert nargs == -1, "nargs, if set, must be -1 not {}".format(nargs)
-        super(OptionEatAll, self).__init__(*args, **kwargs)
-        self._previous_parser_process = None
-        self._eat_all_parser = None
-
-    def add_to_parser(self, parser, ctx):
-        def parser_process(value, state):
-            # method to hook to the parser.process
-            done = False
-            value = [value]
-            if self.save_other_options:
-                # grab everything up to the next option
-                while state.rargs and not done:
-                    for prefix in self._eat_all_parser.prefixes:
-                        if state.rargs[0].startswith(prefix):
-                            done = True
-                    if not done:
-                        value.append(state.rargs.pop(0))
-            else:
-                # grab everything remaining
-                value += state.rargs
-                state.rargs[:] = []
-            value = tuple(value)
-
-            # call the actual process
-            self._previous_parser_process(value, state)
-
-        retval = super(OptionEatAll, self).add_to_parser(parser, ctx)
-        for name in self.opts:
-            our_parser = parser._long_opt.get(name) or parser._short_opt.get(name)
-            if our_parser:
-                self._eat_all_parser = our_parser
-                self._previous_parser_process = our_parser.process
-                our_parser.process = parser_process
-                break
-        return retval
diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py
index ecfbe92bbcb9b438f2d9a7cb8d06fa777f7a6584..29a0d28f1eef949a103c757530a4605078546d74 100644
--- a/bob/ip/binseg/utils/plot.py
+++ b/bob/ip/binseg/utils/plot.py
@@ -1,54 +1,51 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import os
-import csv
+from itertools import cycle
 
-import numpy as np
-import pandas as pd
-import PIL
-
-import torchvision.transforms.functional as VF
-import torch
+import numpy
+import pandas
 
 import matplotlib
 matplotlib.use("agg")
 
+import matplotlib.pyplot as plt
 
-def precision_recall_f1iso(precision, recall, names, title=None):
-    """
-    Author: Andre Anjos (andre.anjos@idiap.ch).
+import logging
+logger = logging.getLogger(__name__)
+
+
+def precision_recall_f1iso(precision, recall, names):
+    """Creates a precision-recall plot of the given data.
 
-    Creates a precision-recall plot of the given data.
     The plot will be annotated with F1-score iso-lines (in which the F1-score
     maintains the same value)
 
     Parameters
     ----------
+
     precision : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D np arrays containing the Y coordinates of the plot, or
-        the precision, or a 2D np array in which the rows correspond to each
-        of the system's precision coordinates.
+        A list of 1D arrays containing the Y coordinates of the plot, or the
+        precision, or a 2D array in which the rows correspond to each of the
+        system's precision coordinates.
+
     recall : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D np arrays containing the X coordinates of the plot, or
-        the recall, or a 2D np array in which the rows correspond to each
-        of the system's recall coordinates.
+        A list of 1D arrays containing the X coordinates of the plot, or the
+        recall, or a 2D array in which the rows correspond to each of the
+        system's recall coordinates.
+
     names : :py:class:`list`
         An iterable over the names of each of the systems along the rows of
         ``precision`` and ``recall``
-    title : :py:class:`str`, optional
-        A title for the plot. If not set, omits the title
+
 
     Returns
     -------
-    matplotlib.figure.Figure
+
+    figure : matplotlib.figure.Figure
         A matplotlib figure you can save or display
-    """
-    import matplotlib
 
-    matplotlib.use("agg")
-    import matplotlib.pyplot as plt
-    from itertools import cycle
+    """
 
     fig, ax1 = plt.subplots(1)
     lines = ["-", "--", "-.", ":"]
@@ -72,7 +69,9 @@ def precision_recall_f1iso(precision, recall, names, title=None):
             next(linecycler),
             label="[F={:.4f}] {}".format(f1.max(), n),
         )
-        ax1.plot(ori, opi, marker="o", linestyle=None, markersize=3, color="black")
+        ax1.plot(
+            ori, opi, marker="o", linestyle=None, markersize=3, color="black"
+        )
     ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
     if len(names) > 1:
         plt.legend(loc="lower left", framealpha=0.5)
@@ -80,15 +79,13 @@ def precision_recall_f1iso(precision, recall, names, title=None):
     ax1.set_ylabel("Precision")
     ax1.set_xlim([0.0, 1.0])
     ax1.set_ylim([0.0, 1.0])
-    if title is not None:
-        ax1.set_title(title)
     # Annotates plot with F1-score iso-lines
     ax2 = ax1.twinx()
-    f_scores = np.linspace(0.1, 0.9, num=9)
+    f_scores = numpy.linspace(0.1, 0.9, num=9)
     tick_locs = []
     tick_labels = []
     for f_score in f_scores:
-        x = np.linspace(0.01, 1)
+        x = numpy.linspace(0.01, 1)
         y = f_score * x / (2 * x - f_score)
         (l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
         tick_locs.append(y[-1])
@@ -117,42 +114,62 @@ def precision_recall_f1iso(precision, recall, names, title=None):
 
 
 def precision_recall_f1iso_confintval(
-    precision, recall, pr_upper, pr_lower, re_upper, re_lower, names, title=None
+    precision, recall, pr_upper, pr_lower, re_upper, re_lower, names
 ):
-    """
-    Creates a precision-recall plot of the given data.
+    """Creates a precision-recall plot of the given data, with confidence
+    intervals
+
     The plot will be annotated with F1-score iso-lines (in which the F1-score
     maintains the same value)
 
     Parameters
     ----------
+
     precision : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D np arrays containing the Y coordinates of the plot, or
-        the precision, or a 2D np array in which the rows correspond to each
+        A list of 1D arrays containing the Y coordinates of the plot, or the
+        precision, or a 2D array in which the rows correspond to each
         of the system's precision coordinates.
 
     recall : :py:class:`numpy.ndarray` or :py:class:`list`
-        A list of 1D np arrays containing the X coordinates of the plot, or
-        the recall, or a 2D np array in which the rows correspond to each
+        A list of 1D arrays containing the X coordinates of the plot, or
+        the recall, or a 2D array in which the rows correspond to each
         of the system's recall coordinates.
 
+    pr_upper : :py:class:`numpy.ndarray` or :py:class:`list`
+        A list of 1D arrays containing the upper bound of the confidence
+        interval for the Y coordinates of the plot, or the precision upper
+        bound, or a 2D array in which the rows correspond to each of the
+        system's precision upper-bound coordinates.
+
+    pr_lower : :py:class:`numpy.ndarray` or :py:class:`list`
+        A list of 1D arrays containing the lower bound of the confidence
+        interval for the Y coordinates of the plot, or the precision lower
+        bound, or a 2D array in which the rows correspond to each of the
+        system's precision lower-bound coordinates.
+
+    re_upper : :py:class:`numpy.ndarray` or :py:class:`list`
+        A list of 1D arrays containing the upper bound of the confidence
+        interval for the X coordinates of the plot, or the recall upper bound,
+        or a 2D array in which the rows correspond to each of the system's
+        recall upper-bound coordinates.
+
+    re_lower : :py:class:`numpy.ndarray` or :py:class:`list`
+        A list of 1D arrays containing the lower bound of the confidence
+        interval for the X coordinates of the plot, or the recall lower bound,
+        or a 2D array in which the rows correspond to each of the system's
+        recall lower-bound coordinates.
+
     names : :py:class:`list`
         An iterable over the names of each of the systems along the rows of
         ``precision`` and ``recall``
 
-    title : :py:class:`str`, optional
-        A title for the plot. If not set, omits the title
 
     Returns
     -------
-    matplotlib.figure.Figure
+    figure : matplotlib.figure.Figure
         A matplotlib figure you can save or display
-    """
-    import matplotlib
 
-    matplotlib.use("agg")
-    import matplotlib.pyplot as plt
-    from itertools import cycle
+    """
 
     fig, ax1 = plt.subplots(1)
     lines = ["-", "--", "-.", ":"]
@@ -195,20 +212,22 @@ def precision_recall_f1iso_confintval(
             next(linecycler),
             label="[F={:.4f}] {}".format(f1.max(), n),
         )
-        ax1.plot(ori, opi, marker="o", linestyle=None, markersize=3, color="black")
+        ax1.plot(
+            ori, opi, marker="o", linestyle=None, markersize=3, color="black"
+        )
         # Plot confidence
         # Upper bound
         # ax1.plot(r95ui[p95ui>0], p95ui[p95ui>0])
         # Lower bound
         # ax1.plot(r95li[p95li>0], p95li[p95li>0])
         # create the limiting polygon
-        vert_x = np.concatenate((rui[pui > 0], rli[pli > 0][::-1]))
-        vert_y = np.concatenate((pui[pui > 0], pli[pli > 0][::-1]))
+        vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1]))
+        vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1]))
         # hacky workaround to plot 2nd human
-        if np.isclose(np.mean(rui), rui[1], rtol=1e-05):
+        if numpy.isclose(numpy.mean(rui), rui[1], rtol=1e-05):
             print("found human")
             p = plt.Polygon(
-                np.column_stack((vert_x, vert_y)),
+                numpy.column_stack((vert_x, vert_y)),
                 facecolor="none",
                 alpha=0.2,
                 edgecolor=next(colorcycler),
@@ -216,7 +235,7 @@ def precision_recall_f1iso_confintval(
             )
         else:
             p = plt.Polygon(
-                np.column_stack((vert_x, vert_y)),
+                numpy.column_stack((vert_x, vert_y)),
                 facecolor=next(colorcycler),
                 alpha=0.2,
                 edgecolor="none",
@@ -231,15 +250,13 @@ def precision_recall_f1iso_confintval(
     ax1.set_ylabel("Precision")
     ax1.set_xlim([0.0, 1.0])
     ax1.set_ylim([0.0, 1.0])
-    if title is not None:
-        ax1.set_title(title)
     # Annotates plot with F1-score iso-lines
     ax2 = ax1.twinx()
-    f_scores = np.linspace(0.1, 0.9, num=9)
+    f_scores = numpy.linspace(0.1, 0.9, num=9)
     tick_locs = []
     tick_labels = []
     for f_score in f_scores:
-        x = np.linspace(0.01, 1)
+        x = numpy.linspace(0.01, 1)
         y = f_score * x / (2 * x - f_score)
         (l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1)
         tick_locs.append(y[-1])
@@ -267,7 +284,7 @@ def precision_recall_f1iso_confintval(
     return fig
 
 
-def loss_curve(df, title=None):
+def loss_curve(df):
     """Creates a loss curve in a Matplotlib figure.
 
     Parameters
@@ -277,9 +294,6 @@ def loss_curve(df, title=None):
         A dataframe containing, at least, "epoch", "median-loss" and
         "learning-rate" columns, that will be plotted.
 
-    title : :py:class:`str`, Optional
-        Optional title, that will be set on the figure if passed
-
     Returns
     -------
 
@@ -287,10 +301,8 @@ def loss_curve(df, title=None):
         A figure, that may be saved or displayed
 
     """
-    import matplotlib.pyplot as plt
 
     ax1 = df.plot(x="epoch", y="median-loss", grid=True)
-    if title is not None: ax1.set_title(title)
     ax1.set_ylabel("Median Loss")
     ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2)
     ax2 = df["learning-rate"].plot(secondary_y=True, legend=True, grid=True,)
@@ -301,61 +313,25 @@ def loss_curve(df, title=None):
     return fig
 
 
-def read_metricscsv(file):
-    """
-    Read precision and recall from csv file
+def combined_precision_recall_f1iso_confintval(data):
+    """Plots a comparison chart of all evaluated systems
 
     Parameters
     ----------
-    file : str
-        path to file
+
+    data : dict
+        A dict in which keys are the names of the systems and the values are
+        paths to ``metrics.csv`` style files.
+
 
     Returns
     -------
-    :py:class:`numpy.ndarray`
-    :py:class:`numpy.ndarray`
-    """
-    with open(file, "r") as infile:
-        metricsreader = csv.reader(infile)
-        # skip header row
-        next(metricsreader)
-        precision = []
-        recall = []
-        pr_upper = []
-        pr_lower = []
-        re_upper = []
-        re_lower = []
-        for row in metricsreader:
-            precision.append(float(row[1]))
-            recall.append(float(row[2]))
-            pr_upper.append(float(row[8]))
-            pr_lower.append(float(row[9]))
-            re_upper.append(float(row[11]))
-            re_lower.append(float(row[12]))
-    return (
-        np.array(precision),
-        np.array(recall),
-        np.array(pr_upper),
-        np.array(pr_lower),
-        np.array(re_upper),
-        np.array(re_lower),
-    )
 
+    figure : matplotlib.figure.Figure
+        A figure, with all systems combined into a single plot.
 
-def plot_overview(outputfolders, title):
     """
-    Plots comparison chart of all trained models
 
-    Parameters
-    ----------
-    outputfolder : list
-        list containing output paths of all evaluated models (e.g. ``['DRIVE/model1', 'DRIVE/model2']``)
-    title : str
-        title of plot
-    Returns
-    -------
-    matplotlib.figure.Figure
-    """
     precisions = []
     recalls = []
     pr_ups = []
@@ -363,103 +339,20 @@ def plot_overview(outputfolders, title):
     re_ups = []
     re_lows = []
     names = []
-    for folder in outputfolders:
-        # metrics
-        metrics_path = os.path.join(folder, "results/Metrics.csv")
-        pr, re, pr_upper, pr_lower, re_upper, re_lower = read_metricscsv(metrics_path)
-        precisions.append(pr)
-        recalls.append(re)
-        pr_ups.append(pr_upper)
-        pr_lows.append(pr_lower)
-        re_ups.append(re_upper)
-        re_lows.append(re_lower)
-        modelname = folder.split("/")[-1]
-        name = "{} ".format(modelname)
+
+    for name, metrics_path in data.items():
+        logger.info(f"Loading metrics from {metrics_path}...")
+        df = pandas.read_csv(metrics_path)
+        precisions.append(df.precision.to_numpy())
+        recalls.append(df.recall.to_numpy())
+        pr_ups.append(df.pr_upper.to_numpy())
+        pr_lows.append(df.pr_lower.to_numpy())
+        re_ups.append(df.re_upper.to_numpy())
+        re_lows.append(df.re_lower.to_numpy())
         names.append(name)
-    # title = folder.split('/')[-4]
+
     fig = precision_recall_f1iso_confintval(
-        precisions, recalls, pr_ups, pr_lows, re_ups, re_lows, names, title
+        precisions, recalls, pr_ups, pr_lows, re_ups, re_lows, names
     )
-    return fig
-
-
-def metricsviz(
-    dataset,
-    output_path,
-    tp_color=(0, 255, 0),  # (128,128,128) Gray
-    fp_color=(0, 0, 255),  # (70, 240, 240) Cyan
-    fn_color=(255, 0, 0),  # (245, 130, 48) Orange
-    overlayed=True,
-):
-    """ Visualizes true positives, false positives and false negatives
-    Default colors TP: Gray, FP: Cyan, FN: Orange
 
-    Parameters
-    ----------
-    dataset : :py:class:`torch.utils.data.Dataset`
-    output_path : str
-        path where results and probability output images are stored. E.g. ``'DRIVE/MODEL'``
-    tp_color : tuple
-        RGB values, by default (128,128,128)
-    fp_color : tuple
-        RGB values, by default (70, 240, 240)
-    fn_color : tuple
-        RGB values, by default (245, 130, 48)
-    """
-
-    for sample in dataset:
-        # get sample
-        name = sample[0]
-        img = VF.to_pil_image(sample[1])  # PIL Image
-        gt = sample[2].byte()  # byte tensor
-
-        # read metrics
-        metrics = pd.read_csv(os.path.join(output_path, "results", "Metrics.csv"))
-        optimal_threshold = metrics["threshold"][metrics["f1_score"].idxmax()]
-
-        # read probability output
-        pred = Image.open(os.path.join(output_path, "images", name))
-        pred = pred.convert(mode="L")
-        pred = VF.to_tensor(pred)
-        binary_pred = torch.gt(pred, optimal_threshold).byte()
-
-        # calc metrics
-        # equals and not-equals
-        equals = torch.eq(binary_pred, gt)  # tensor
-        notequals = torch.ne(binary_pred, gt)  # tensor
-        # true positives
-        tp_tensor = gt * binary_pred  # tensor
-        tp_pil = VF.to_pil_image(tp_tensor.float())
-        tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)
-        # false positives
-        fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
-        fp_pil = VF.to_pil_image(fp_tensor.float())
-        fp_pil_colored = PIL.ImageOps.colorize(fp_pil, (0, 0, 0), fp_color)
-        # false negatives
-        fn_tensor = notequals - fp_tensor
-        fn_pil = VF.to_pil_image(fn_tensor.float())
-        fn_pil_colored = PIL.ImageOps.colorize(fn_pil, (0, 0, 0), fn_color)
-
-        # paste together
-        tp_pil_colored.paste(fp_pil_colored, mask=fp_pil)
-        tp_pil_colored.paste(fn_pil_colored, mask=fn_pil)
-
-        if overlayed:
-            tp_pil_colored = PIL.Image.blend(img, tp_pil_colored, 0.4)
-            img_metrics = pd.read_csv(
-                os.path.join(output_path, "results", name + ".csv")
-            )
-            f1 = img_metrics[" f1_score"].max()
-            # add f1-score
-            fnt_size = tp_pil_colored.size[1] // 25
-            draw = PIL.ImageDraw.Draw(tp_pil_colored)
-            fnt = PIL.ImageFont.truetype("FreeMono.ttf", fnt_size)
-            draw.text((0, 0), "F1: {:.4f}".format(f1), (255, 255, 255), font=fnt)
-
-        # save to disk
-        overlayed_path = os.path.join(output_path, "tpfnfpviz")
-        fullpath = os.path.join(overlayed_path, name)
-        fulldir = os.path.dirname(fullpath)
-        if not os.path.exists(fulldir):
-            os.makedirs(fulldir)
-        tp_pil_colored.save(fullpath)
+    return fig
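+
+
+# Usage sketch (system names and paths below are only illustrative):
+#
+#   fig = combined_precision_recall_f1iso_confintval(
+#       {"system-A": "A/metrics.csv", "system-B": "B/metrics.csv"})
+#   fig.savefig("comparison.pdf")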
diff --git a/bob/ip/binseg/utils/rsttable.py b/bob/ip/binseg/utils/rsttable.py
deleted file mode 100644
index c5329d8aee9ea28fd202fb057076d0f6a11eca24..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/utils/rsttable.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import pandas as pd
-from tabulate import tabulate
-import os
-from pathlib import Path
-
-
-def get_paths(output_path, filename):
-    """
-    Parameters
-    ----------
-    output_path : str
-        path in which to look for files
-    filename : str
-
-    Returns
-    -------
-    list 
-        list of file paths
-    """
-    datadir = Path(output_path)
-    file_paths = sorted(list(datadir.glob("**/{}".format(filename))))
-    file_paths = [f.as_posix() for f in file_paths]
-    return file_paths
-
-
-def create_overview_grid(output_path):
-    """ Reads all Metrics.csv in a certain output path and pivots them to a rst grid table"""
-    filename = "Metrics.csv"
-    metrics = get_paths(output_path, filename)
-    f1s = []
-    stds = []
-    models = []
-    databases = []
-    for m in metrics:
-        metrics = pd.read_csv(m)
-        maxf1 = metrics["f1_score"].max()
-        idmaxf1 = metrics["f1_score"].idxmax()
-        std = metrics["std_f1"][idmaxf1]
-        stds.append(std)
-        f1s.append(maxf1)
-        model = m.split("/")[-3]
-        models.append(model)
-        database = m.split("/")[-4]
-        databases.append(database)
-    df = pd.DataFrame()
-    df["database"] = databases
-    df["model"] = models
-    df["f1"] = f1s
-    df["std"] = stds
-    pivot = df.pivot(index="database", columns="model", values="f1")
-    pivot2 = df.pivot(index="database", columns="model", values="std")
-
-    with open(os.path.join(output_path, "Metrics_overview.rst"), "w+") as outfile:
-        outfile.write(tabulate(pivot, headers=pivot.columns, tablefmt="grid"))
-    with open(os.path.join(output_path, "Metrics_overview_std.rst"), "w+") as outfile:
-        outfile.write(tabulate(pivot2, headers=pivot2.columns, tablefmt="grid"))
diff --git a/bob/ip/binseg/utils/summary.py b/bob/ip/binseg/utils/summary.py
index 97fc09da9b89404d96e2230ca49c8a0b81db9c8f..493d9d163833c64f26599a69e087381d54a2a62e 100644
--- a/bob/ip/binseg/utils/summary.py
+++ b/bob/ip/binseg/utils/summary.py
@@ -10,11 +10,11 @@ from torch.nn.modules.module import _addindent
 
 def summary(model, file=sys.stderr):
     """Counts the number of paramters in each layers
-    
+
     Parameters
     ----------
     model : :py:class:`torch.nn.Module`
-    
+
     Returns
     -------
     int
diff --git a/bob/ip/binseg/utils/transformfolder.py b/bob/ip/binseg/utils/transformfolder.py
deleted file mode 100644
index 95c3353947530e6ff33423db6adf3fc222a481b8..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/utils/transformfolder.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from pathlib import Path, PurePosixPath
-from PIL import Image
-from torchvision.transforms.functional import to_pil_image
-
-
-def transformfolder(source_path, target_path, transforms):
-    """Applies a set of transfroms on an image folder 
-    
-    Parameters
-    ----------
-    source_path : str
-        [description]
-    target_path : str
-        [description]
-    transforms : [type]
-        transform function
-    """
-    source_path = Path(source_path)
-    target_path = Path(target_path)
-    file_paths = sorted(list(source_path.glob("*?.*")))
-    for f in file_paths:
-        timg_path = PurePosixPath(target_path).joinpath(f.name)
-        img = Image.open(f).convert(mode="1", dither=None)
-        img, _ = transforms(img, img)
-        img = to_pil_image(img)
-        img.save(str(timg_path))
diff --git a/doc/cli.rst b/doc/cli.rst
index fc3778926a56ee2db2845aa80d9028273c6c2d08..03f7c2dc7c692c1aba580d3f05393396f585f4e1 100644
--- a/doc/cli.rst
+++ b/doc/cli.rst
@@ -95,4 +95,15 @@ a series of analysis figures which are useful to understand model performance.
 .. command-output:: bob binseg evaluate --help
 
 
+.. _bob.ip.binseg.cli.compare:
+
+Performance Comparison
+----------------------
+
+Performance comparison takes the performance evaluation results and generates
+combined figures and tables that compare the results of multiple systems.
+
+.. command-output:: bob binseg compare --help
+
+
 .. include:: links.rst
diff --git a/doc/evaluation.rst b/doc/evaluation.rst
index 95ab253844b6f21c13449eb0b1fb98054947ac25..48f7a91efbbec8da0bec6723ddbadf47a4d4d358 100644
--- a/doc/evaluation.rst
+++ b/doc/evaluation.rst
@@ -84,4 +84,15 @@ E.g. run inference on predictions from the DRIVE test set, do the following:
     bob binseg evaluate -vv drive-test -p /predictions/folder -o /eval/results/folder
 
 
+Comparing Systems
+=================
+
+To compare multiple systems together and generate combined plots and tables,
+use ``bob binseg compare``.  Use ``--help`` for a quick guide.
+
+.. code-block:: bash
+
+   $ bob binseg compare -vv A A/metrics.csv B B/metrics.csv
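+
+By default, the comparison figure is written to ``comparison.pdf`` in the
+current working directory.  Use ``--output`` (``-o``) to choose another
+location (the path below is only illustrative):
+
+.. code-block:: bash
+
+   $ bob binseg compare -vv A A/metrics.csv B B/metrics.csv --output=plots/comparison.pdf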
+
+
 .. include:: links.rst
diff --git a/doc/usage.rst b/doc/usage.rst
index 24c3fa1ada6058ffbb5668de475d5b3f939c55fc..d9c1ef87c00264666f78e71c3294ecd6e486345d 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -49,8 +49,6 @@ modifying one of our configuration resources.
    training
    models
    evaluation
-   plotting
-   visualization
 
 
 .. include:: links.rst
diff --git a/doc/visualization.rst b/doc/visualization.rst
deleted file mode 100644
index 56728e9562003c676dab0a5d51ca4640b12df1e8..0000000000000000000000000000000000000000
--- a/doc/visualization.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. -*- coding: utf-8 -*-
-.. _bob.ip.binseg.visualization:
-
-=============
-Visualization
-=============
-
-Two visualization are generated via the ``bob binseg visualize`` command:
-
-1. Visualizations of true positives, false positives and false negatives
-overlayed over the test images
-2. Visualizations of the probability map outputs overlayed over the test images
-
-The following directory structure is expected:
-
-.. code-block:: bash
-
-    ├── DATABASE
-        ├── MODEL
-            ├── images
-            └── results
-
-Example to generate visualization for outputs for the DRIVE dataset:
-
-.. code-block:: bash
-
-    # Visualizations are stored in the same output folder.
-    bob binseg visualize DRIVETEST -o /DRIVE/M2UNet/output
-
-Use ``bob binseg visualize --help`` for more information.
diff --git a/setup.py b/setup.py
index 452c77c099d133454e8376cbc23094ddc425bff4..5a234f7c70c84706a1ee531d23954f799754db46 100644
--- a/setup.py
+++ b/setup.py
@@ -31,14 +31,11 @@ setup(
         "bob.cli": ["binseg = bob.ip.binseg.script.binseg:binseg"],
         # bob binseg sub-commands
         "bob.ip.binseg.cli": [
-            "compare =  bob.bin.binseg.script.binseg:compare",
-            "evalpred = bob.ip.binseg.script.binseg:evalpred",
-            "gridtable = bob.ip.binseg.script.binseg:testcheckpoints",
-            "visualize = bob.ip.binseg.script.binseg:visualize",
             "config = bob.ip.binseg.script.config:config",
             "train = bob.ip.binseg.script.train:train",
             "predict = bob.ip.binseg.script.predict:predict",
             "evaluate = bob.ip.binseg.script.evaluate:evaluate",
+            "compare =  bob.ip.binseg.script.compare:compare",
         ],
         # bob train configurations
         "bob.ip.binseg.config": [