diff --git a/bob/ip/binseg/script/binseg.py b/bob/ip/binseg/script/binseg.py
index ee8118afaa3dab9385e8e440a8ad37519b29c29a..780d85a30f13078af47e9e3ab79aa451b14f108e 100644
--- a/bob/ip/binseg/script/binseg.py
+++ b/bob/ip/binseg/script/binseg.py
@@ -31,6 +31,8 @@ from bob.ip.binseg.utils.click import OptionEatAll
 from bob.ip.binseg.utils.pdfcreator import create_pdf, get_paths
 from bob.ip.binseg.utils.rsttable import create_overview_grid
 from bob.ip.binseg.utils.plot import metricsviz, overlay,savetransformedtest
+from bob.ip.binseg.utils.transformfolder import transformfolder as transfld
+from bob.ip.binseg.utils.evaluate import do_eval
 
 logger = logging.getLogger(__name__)
 
@@ -548,4 +550,72 @@ def ssltrain(model
             , arguments
             , output_path
             , rampup
-            )
\ No newline at end of file
+            )
+
+# Apply image transforms to a folder
+@binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
+@click.option(
+    '--source-path',
+    '-s',
+    required=True,
+    cls=ResourceOption
+    )
+@click.option(
+    '--target-path',
+    '-t',
+    required=True,
+    cls=ResourceOption
+    )
+@click.option(
+    '--transforms',
+    '-a',
+    required=True,
+    cls=ResourceOption
+    )
+
+@verbosity_option(cls=ResourceOption)
+def transformfolder(source_path, target_path, transforms, **kwargs):
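+    """ Applies a set of transforms to every image in a folder and saves the results
+
+    Usage sketch (paths are illustrative; ``<transforms-config>`` stands for a
+    config resource from the ``bob.ip.binseg.config`` entry point group)::
+
+        bob binseg transformfolder -s /path/to/images -t /path/to/transformed -a <transforms-config>
+    """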
+    logger.info('Applying transforms to images in {} and saving them to {}'.format(source_path, target_path))
+    transfld(source_path, target_path, transforms)
+
+
+# Evaluate saved predictions only (no training or inference)
+@binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
+@click.option(
+    '--output-path',
+    '-o',
+    required=True,
+    default="output",
+    cls=ResourceOption
+    )
+@click.option(
+    '--prediction-folder',
+    '-p',
+    required=True,
+    cls=ResourceOption
+    )
+@click.option(
+    '--dataset',
+    '-d',
+    required=True,
+    cls=ResourceOption
+    )
+
+@verbosity_option(cls=ResourceOption)
+def evalpred(
+        output_path,
+        prediction_folder,
+        dataset,
+        **kwargs):
+    """ Evaluates saved prediction images against the dataset ground truth """
+
+    # PyTorch dataloader
+    data_loader = DataLoader(
+        dataset=dataset,
+        batch_size=1,
+        shuffle=False,
+        pin_memory=torch.cuda.is_available()
+        )
+
+    # evaluate the saved predictions against the ground truth
+    do_eval(prediction_folder, data_loader, output_folder=output_path)
\ No newline at end of file
diff --git a/bob/ip/binseg/utils/evaluate.py b/bob/ip/binseg/utils/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..620b62db87a0e001151e0a00eeb17fe9829886a7
--- /dev/null
+++ b/bob/ip/binseg/utils/evaluate.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Only used to evaluate the predictions of a 2nd human annotator
+#
+import os
+import logging
+
+import numpy as np
+import torch
+import pandas as pd
+from tqdm import tqdm
+from PIL import Image
+from torchvision.transforms.functional import to_tensor
+
+from bob.ip.binseg.utils.metric import base_metrics
+from bob.ip.binseg.utils.plot import precision_recall_f1iso
+
+
+def batch_metrics(predictions, ground_truths, names, output_folder, logger):
+    """
+    Calculates metrics on the batch and saves them to disk
+
+    Parameters
+    ----------
+    predictions : :py:class:`torch.Tensor`
+        tensor with pixel-wise probabilities
+    ground_truths : :py:class:`torch.Tensor`
+        tensor with binary ground-truth
+    names : list
+        list of file names 
+    output_folder : str
+        output path
+    logger : :py:class:`logging.Logger`
+        python logger
+
+    Returns
+    -------
+    list 
+        list containing batch metrics: ``[name, threshold, precision, recall, specificity, accuracy, jaccard, f1_score]``
+    """
+    step_size = 0.01
+    batch_metrics = []
+
+    for j in range(predictions.size()[0]):
+        # ground truth byte
+        gts = ground_truths[j].byte()
+
+        file_name = "{}.csv".format(names[j])
+        logger.info("saving {}".format(file_name))
+        
+        with open(os.path.join(output_folder, file_name), "w+") as outfile:
+
+            outfile.write("threshold, precision, recall, specificity, accuracy, jaccard, f1_score\n")
+
+            for threshold in np.arange(0.0, 1.0, step_size):
+                # threshold
+                binary_pred = torch.gt(predictions[j], threshold).byte()
+
+                # equals and not-equals
+                equals = torch.eq(binary_pred, gts) # tensor
+                notequals = torch.ne(binary_pred, gts) # tensor
+                
+                # true positives 
+                tp_tensor = (gts * binary_pred ) # tensor
+                tp_count = torch.sum(tp_tensor).item() # scalar
+
+                # false positives 
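+                # (binary_pred + tp_tensor equals 2 at true positives, so entries equal
+                #  to 1 are predicted positives that are absent from the ground truth)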
+                fp_tensor = torch.eq((binary_pred + tp_tensor), 1) 
+                fp_count = torch.sum(fp_tensor).item()
+
+                # true negatives
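+                # (equals is 1 wherever prediction and ground truth agree; removing the
+                #  true positives leaves only the positions where both are 0)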
+                tn_tensor = equals - tp_tensor
+                tn_count = torch.sum(tn_tensor).item()
+
+                # false negatives
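+                # (notequals is 1 wherever prediction and ground truth disagree; removing
+                #  the false positives leaves the missed ground-truth positives)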
+                fn_tensor = notequals - fp_tensor
+                fn_count = torch.sum(fn_tensor).item()
+
+                # calc metrics
+                metrics = base_metrics(tp_count, fp_count, tn_count, fn_count)    
+                
+                # write to disk 
+                outfile.write("{:.2f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f} \n".format(threshold, *metrics))
+                
+                batch_metrics.append([names[j], threshold, *metrics])
+
+    return batch_metrics
+
+
+
+def do_eval(
+    prediction_folder,
+    data_loader,
+    output_folder=None
+):
+    """
+    Calculates metrics on saved prediction images (requires ``batch_size = 1``!)
+
+    Parameters
+    ----------
+    prediction_folder : str
+        folder containing the saved probability maps (one file per test image)
+    data_loader : :py:class:`torch.utils.data.DataLoader`
+        loader yielding ``(name, image, ground_truth)`` samples
+    output_folder : str
+        folder where the results are written
+    """
+    logger = logging.getLogger("bob.ip.binseg.engine.evaluate")
+    logger.info("Start evaluation")
+    logger.info("Prediction folder {}".format(prediction_folder))
+    results_subfolder = os.path.join(output_folder, 'results')
+    os.makedirs(results_subfolder, exist_ok=True)
+
+    # Collect overall metrics
+    metrics = []
+
+    for samples in tqdm(data_loader):
+        names = samples[0]
+        ground_truths = samples[2]
+
+        pred_file = os.path.join(prediction_folder, names[0])
+        probabilities = Image.open(pred_file)
+        probabilities = probabilities.convert(mode='L')
+        probabilities = to_tensor(probabilities)
+
+        b_metrics = batch_metrics(probabilities, ground_truths, names, results_subfolder, logger)
+        metrics.extend(b_metrics)
+
+    # DataFrame 
+    df_metrics = pd.DataFrame(metrics, columns=[
+        "name",
+        "threshold",
+        "precision",
+        "recall",
+        "specificity",
+        "accuracy",
+        "jaccard",
+        "f1_score"])
+
+    # Report and Averages
+    metrics_file = "Metrics.csv"
+    metrics_path = os.path.join(results_subfolder, metrics_file)
+    logger.info("Saving average over all input images: {}".format(metrics_file))
+    
+    avg_metrics = df_metrics.groupby('threshold').mean()
+    std_metrics = df_metrics.groupby('threshold').std()
+
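+    # recompute the F1-score from the averaged precision and recall rather than
+    # averaging the per-image F1-scores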
+    avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
+        (avg_metrics["precision"]+avg_metrics["recall"])
+    
+    avg_metrics["std_f1"] = std_metrics["f1_score"]
+    
+    avg_metrics.to_csv(metrics_path)
+    maxf1 = avg_metrics['f1_score'].max()
+    optimal_f1_threshold = avg_metrics['f1_score'].idxmax()
+    
+    logger.info("Highest F1-score of {:.5f}, achieved at threshold {}".format(maxf1, optimal_f1_threshold))
+    
+    # Plotting
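+    # after groupby('threshold').mean() the threshold becomes the index, so rows 0 and 1
+    # of the transposed array hold the averaged precision and recall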
+    np_avg_metrics = avg_metrics.to_numpy().T
+    fig_name = "precision_recall.pdf"
+    logger.info("saving {}".format(fig_name))
+    fig = precision_recall_f1iso([np_avg_metrics[0]], [np_avg_metrics[1]], ['2nd Human', None], title='2nd Human')
+    fig_filename = os.path.join(results_subfolder, fig_name)
+    fig.savefig(fig_filename)
+
+
+
diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py
index d87de5f5bf3b2d014f6a3415e23ee6a7e68774db..0e6169b41cf3361e7388b7ed2a184ae2abc6f236 100644
--- a/bob/ip/binseg/utils/plot.py
+++ b/bob/ip/binseg/utils/plot.py
@@ -230,6 +230,7 @@ def metricsviz(dataset
         
         # read probability output 
         pred = Image.open(os.path.join(output_path,'images',name))
+        pred = pred.convert(mode='L')
         pred = VF.to_tensor(pred)
         binary_pred = torch.gt(pred, optimal_threshold).byte()
         
@@ -291,7 +292,7 @@ def overlay(dataset, output_path):
         #f1 = metrics[' f1_score'].max()
         
         # read probability output 
-        pred = Image.open(os.path.join(output_path,'images',name))
+        pred = Image.open(os.path.join(output_path,'images',name)).convert(mode='L')
         # color and overlay
         pred_green = PIL.ImageOps.colorize(pred, (0,0,0), (0,255,0))
         overlayed = PIL.Image.blend(img, pred_green, 0.4)
diff --git a/bob/ip/binseg/utils/transformfolder.py b/bob/ip/binseg/utils/transformfolder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9308d64705adeba36ae0edefc7f07b3f56b8069f
--- /dev/null
+++ b/bob/ip/binseg/utils/transformfolder.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from pathlib import Path, PurePosixPath
+from PIL import Image
+from torchvision.transforms.functional import to_pil_image
+
+def transformfolder(source_path, target_path, transforms):
+    """Applies a set of transfroms on an image folder 
+    
+    Parameters
+    ----------
+    source_path : str
+        [description]
+    target_path : str
+        [description]
+    transforms : [type]
+        transform function
+    """
+    source_path = Path(source_path)
+    target_path = Path(target_path)
+    file_paths = sorted(list(source_path.glob('*?.*')))
+    for f in file_paths:
+        timg_path = PurePosixPath(target_path).joinpath(f.name)
+        img = Image.open(f).convert(mode='1', dither=None)
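+        # the transforms operate on (image, ground-truth) pairs, so the image is
+        # passed twice and the second return value is discarded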
+        img, _ = transforms(img, img)
+        img = to_pil_image(img)
+        img.save(str(timg_path))
\ No newline at end of file
diff --git a/doc/index.rst b/doc/index.rst
index 629de2b872bcc47f8806a60f46995b3acb065706..97d792b87b1b2e40dd566f373220540955dbab48 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -23,6 +23,7 @@ Users Guide
    benchmarkresults
    covdresults
    configs
+   visualization
    api
 
 .. todolist::
diff --git a/doc/visualization.rst b/doc/visualization.rst
new file mode 100644
index 0000000000000000000000000000000000000000..56728e9562003c676dab0a5d51ca4640b12df1e8
--- /dev/null
+++ b/doc/visualization.rst
@@ -0,0 +1,30 @@
+.. -*- coding: utf-8 -*-
+.. _bob.ip.binseg.visualization:
+
+=============
+Visualization
+=============
+
+Two types of visualizations are generated via the ``bob binseg visualize`` command:
+
+1. Visualizations of true positives, false positives and false negatives
+   overlaid over the test images
+2. Visualizations of the probability map outputs overlaid over the test images
+
+The following directory structure is expected:
+
+.. code-block:: bash
+
+    ├── DATABASE
+        ├── MODEL
+            ├── images
+            └── results
+
+Example for generating visualizations of the model outputs on the DRIVE dataset:
+
+.. code-block:: bash
+
+    # Visualizations are stored in the same output folder.
+    bob binseg visualize DRIVETEST -o /DRIVE/M2UNet/output
+
+Use ``bob binseg visualize --help`` for more information.