diff --git a/bob/ip/binseg/configs/datasets/imagefolderinference.py b/bob/ip/binseg/configs/datasets/imagefolderinference.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba760e876811ef5bc80259a7008e6e66e23958f4
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/imagefolderinference.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from bob.ip.binseg.data.transforms import *
+from bob.ip.binseg.data.imagefolderinference import ImageFolderInference
+
+#### Config ####
+
+# add your transforms below
+transforms = Compose([  
+                        CenterCrop((544,544))
+                        ,ToTensor()
+                    ])
+
+# PyTorch dataset
+path = '/path/to/folder/containing/images'
+dataset = ImageFolderInference(path,transform=transforms)
diff --git a/bob/ip/binseg/configs/datasets/imagefoldertest.py b/bob/ip/binseg/configs/datasets/imagefoldertest.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d9b0383163cee1f3b102783ceb3b50defe5459
--- /dev/null
+++ b/bob/ip/binseg/configs/datasets/imagefoldertest.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from bob.ip.binseg.data.transforms import *
+from bob.ip.binseg.data.imagefolder import ImageFolder
+
+#### Config ####
+
+# add your transforms below
+transforms = Compose([  
+                        CenterCrop((544,544))
+                        ,ToTensor()
+                    ])
+
+# PyTorch dataset
+path = '/path/to/testdataset'
+dataset = ImageFolder(path,transform=transforms)
diff --git a/bob/ip/binseg/data/imagefolder.py b/bob/ip/binseg/data/imagefolder.py
index da6670241afd4d2533fca8021a15e4e85d91bdb8..7ec9dd9dbd558b8a1b405076946dca0919df5990 100644
--- a/bob/ip/binseg/data/imagefolder.py
+++ b/bob/ip/binseg/data/imagefolder.py
@@ -6,6 +6,7 @@ import numpy as np
 from PIL import Image
 import torch
 import torchvision.transforms.functional as VF
+import bob.io.base
 
 def get_file_lists(data_path):
     data_path = Path(data_path)
@@ -60,7 +61,14 @@ class ImageFolder(Dataset):
         img = Image.open(img_path).convert(mode='RGB')
     
         gt_path = self.gt_file_list[index]
-        gt = Image.open(gt_path).convert(mode='1', dither=None)
+        if gt_path.suffix == '.hdf5':
+            gt = bob.io.base.load(str(gt_path)).astype('float32')
+            # not elegant but since transforms require PIL images we do this hacky workaround here
+            gt = torch.from_numpy(gt)
+            gt = VF.to_pil_image(gt).convert(mode='1', dither=None)
+        else:
+            gt = Image.open(gt_path).convert(mode='1', dither=None)
+        
         sample = [img, gt]
         
         if self.transform :
diff --git a/bob/ip/binseg/data/imagefolderinference.py b/bob/ip/binseg/data/imagefolderinference.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e57e245ffbc3bd369f29523f36f1d09cabc833
--- /dev/null
+++ b/bob/ip/binseg/data/imagefolderinference.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from torch.utils.data import Dataset
+from pathlib import Path
+import numpy as np
+from PIL import Image
+import torch
+import torchvision.transforms.functional as VF
+import bob.io.base
+
+def get_file_lists(data_path):
+    data_path = Path(data_path)
+    image_file_names = np.array(sorted(list(data_path.glob('*'))))
+    return image_file_names
+
+class ImageFolderInference(Dataset):
+    """
+    Generic ImageFolder containing images for inference
+    
+    Parameters
+    ----------
+    path : str
+        full path to root of dataset
+    
+    """
+    def __init__(self, path, transform = None):
+        self.transform = transform
+        self.img_file_list = get_file_lists(path)
+
+    def __len__(self):
+        """
+        Returns
+        -------
+        int
+            size of the dataset
+        """
+        return len(self.img_file_list)
+    
+    def __getitem__(self,index):
+        """
+        Parameters
+        ----------
+        index : int
+        
+        Returns
+        -------
+        list
+            dataitem [img_name, img]
+        """
+        img_path = self.img_file_list[index]
+        img_name = img_path.name
+        img = Image.open(img_path).convert(mode='RGB')
+    
+        sample = [img]
+        
+        if self.transform :
+            sample = self.transform(*sample)
+        
+        sample.insert(0,img_name)
+        
+        return sample
diff --git a/bob/ip/binseg/engine/inferencer.py b/bob/ip/binseg/engine/inferencer.py
index 26de6cf733e301f2ec60a76e275822ec85491be3..c2dd6443152cef96fba9dfdbe5a3907f948e8ebf 100644
--- a/bob/ip/binseg/engine/inferencer.py
+++ b/bob/ip/binseg/engine/inferencer.py
@@ -11,8 +11,10 @@ import pandas as pd
 import torchvision.transforms.functional as VF
 from tqdm import tqdm
 
+import bob.io.base
+
 from bob.ip.binseg.utils.metric import SmoothedValue, base_metrics
-from bob.ip.binseg.utils.plot import precision_recall_f1iso
+from bob.ip.binseg.utils.plot import precision_recall_f1iso_confintval
 from bob.ip.binseg.utils.summary import summary
 
 
@@ -108,10 +110,32 @@ def save_probability_images(predictions, names, output_folder, logger):
     if not os.path.exists(images_subfolder): os.makedirs(images_subfolder)
     for j in range(predictions.size()[0]):
         img = VF.to_pil_image(predictions.cpu().data[j])
-        filename = '{}'.format(names[j])
+        filename = '{}.png'.format(names[j].split(".")[0])
         logger.info("saving {}".format(filename))
         img.save(os.path.join(images_subfolder, filename))
 
+def save_hdf(predictions, names, output_folder, logger):
+    """
+    Saves the pixel-wise probability maps as arrays in hdf5 format
+
+    Parameters
+    ----------
+    predictions : :py:class:`torch.Tensor`
+        tensor with pixel-wise probabilities
+    names : list
+        list of file names 
+    output_folder : str
+        output path
+    logger : :py:class:`logging.Logger`
+        python logger
+    """
+    hdf5_subfolder = os.path.join(output_folder,'hdf5') 
+    if not os.path.exists(hdf5_subfolder): os.makedirs(hdf5_subfolder)
+    for j in range(predictions.size()[0]):
+        img = predictions.cpu().data[j].squeeze(0).numpy()
+        filename = '{}.hdf5'.format(names[j].split(".")[0])
+        logger.info("saving {}".format(filename))
+        bob.io.base.save(img, os.path.join(hdf5_subfolder, filename))
 
 def do_inference(
     model,
@@ -174,6 +198,8 @@ def do_inference(
             
             # Create probability images
             save_probability_images(probabilities, names, output_folder, logger)
+            # save hdf5
+            save_hdf(probabilities, names, output_folder, logger)
 
     # DataFrame 
     df_metrics = pd.DataFrame(metrics,columns= \
@@ -199,6 +225,12 @@ def do_inference(
     #avg_metrics["f1_score"] =  (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
     #    (avg_metrics["precision"]+avg_metrics["recall"])
     
+    avg_metrics["std_pr"] = std_metrics["precision"]
+    avg_metrics["pr_upper"] = avg_metrics['precision'] + avg_metrics["std_pr"]
+    avg_metrics["pr_lower"] = avg_metrics['precision'] - avg_metrics["std_pr"]
+    avg_metrics["std_re"] = std_metrics["recall"]
+    avg_metrics["re_upper"] = avg_metrics['recall'] + avg_metrics["std_re"]
+    avg_metrics["re_lower"] = avg_metrics['recall'] - avg_metrics["std_re"]
     avg_metrics["std_f1"] = std_metrics["f1_score"]
     
     avg_metrics.to_csv(metrics_path)
@@ -211,7 +243,7 @@ def do_inference(
     np_avg_metrics = avg_metrics.to_numpy().T
     fig_name = "precision_recall.pdf"
     logger.info("saving {}".format(fig_name))
-    fig = precision_recall_f1iso([np_avg_metrics[0]],[np_avg_metrics[1]], [model.name,None], title=output_folder.split('/')[-2:])
+    fig = precision_recall_f1iso_confintval([np_avg_metrics[0]],[np_avg_metrics[1]],[np_avg_metrics[7]],[np_avg_metrics[8]],[np_avg_metrics[10]],[np_avg_metrics[11]], [model.name,None], title=output_folder)
     fig_filename = os.path.join(results_subfolder, fig_name)
     fig.savefig(fig_filename)
     
diff --git a/bob/ip/binseg/engine/predicter.py b/bob/ip/binseg/engine/predicter.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e8ad06da54b43a538cf4fc7805cc63e6966cee
--- /dev/null
+++ b/bob/ip/binseg/engine/predicter.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os 
+import logging
+import time
+import datetime
+import numpy as np
+import torch
+import torchvision.transforms.functional as VF
+from tqdm import tqdm
+
+from bob.ip.binseg.utils.summary import summary
+from bob.ip.binseg.engine.inferencer import save_probability_images
+from bob.ip.binseg.engine.inferencer import save_hdf
+
+
+def do_predict(
+    model,
+    data_loader,
+    device,
+    output_folder = None
+):
+
+    """
+    Run inference and save predictions (no ground truth required, no metrics computed)
+    
+    Parameters
+    ----------
+    model : :py:class:`torch.nn.Module`
+        neural network model (e.g. DRIU, HED, UNet)
+    data_loader : py:class:`torch.torch.utils.data.DataLoader`
+    device : str
+        device to use ``'cpu'`` or ``'cuda'``
+    output_folder : str
+    """
+    logger = logging.getLogger("bob.ip.binseg.engine.inference")
+    logger.info("Start evaluation")
+    logger.info("Output folder: {}, Device: {}".format(output_folder, device))
+    results_subfolder = os.path.join(output_folder,'results') 
+    os.makedirs(results_subfolder,exist_ok=True)
+    
+    model.eval().to(device)
+    # Sigmoid for probabilities 
+    sigmoid = torch.nn.Sigmoid() 
+
+    # Setup timers
+    start_total_time = time.time()
+    times = []
+
+    for samples in tqdm(data_loader):
+        names = samples[0]
+        images = samples[1].to(device)
+        with torch.no_grad():
+            start_time = time.perf_counter()
+
+            outputs = model(images)
+            
+            # necessary check for hed architecture that uses several outputs 
+            # for loss calculation instead of just the last concatfuse block
+            if isinstance(outputs,list):
+                outputs = outputs[-1]
+            
+            probabilities = sigmoid(outputs)
+            
+            batch_time = time.perf_counter() - start_time
+            times.append(batch_time)
+            logger.info("Batch time: {:.5f} s".format(batch_time))
+            
+            # Create probability images
+            save_probability_images(probabilities, names, output_folder, logger)
+            # Save hdf5
+            save_hdf(probabilities, names, output_folder, logger)
+
+ 
+    # Report times
+    total_inference_time = str(datetime.timedelta(seconds=int(sum(times))))
+    average_batch_inference_time = np.mean(times)
+    total_evalution_time = str(datetime.timedelta(seconds=int(time.time() - start_total_time )))
+
+    logger.info("Average batch inference time: {:.5f}s".format(average_batch_inference_time))
+
+    times_file = "Times.txt"
+    logger.info("saving {}".format(times_file))
+ 
+    with open (os.path.join(results_subfolder,times_file), "w+") as outfile:
+        date = datetime.datetime.now()
+        outfile.write("Date: {} \n".format(date.strftime("%Y-%m-%d %H:%M:%S")))
+        outfile.write("Total evaluation run-time: {} \n".format(total_evalution_time))
+        outfile.write("Average batch inference time: {} \n".format(average_batch_inference_time))
+        outfile.write("Total inference time: {} \n".format(total_inference_time))
+
+
diff --git a/bob/ip/binseg/script/binseg.py b/bob/ip/binseg/script/binseg.py
index 3e4e5681ff70045e4663ebe137ce28f0427b3a4f..f78fe4045d766f892283c892e849ae0aa26ee3ab 100644
--- a/bob/ip/binseg/script/binseg.py
+++ b/bob/ip/binseg/script/binseg.py
@@ -32,6 +32,7 @@ from bob.ip.binseg.utils.rsttable import create_overview_grid
 from bob.ip.binseg.utils.plot import metricsviz, overlay,savetransformedtest
 from bob.ip.binseg.utils.transformfolder import transformfolder as transfld
 from bob.ip.binseg.utils.evaluate import do_eval
+from bob.ip.binseg.engine.predicter import do_predict
 
 logger = logging.getLogger(__name__)
 
@@ -492,6 +493,77 @@ def transformfolder(source_path ,target_path,transforms,**kwargs):
     transfld(source_path,target_path,transforms)
 
 
+# Run inference and create predictions only (no ground truth available)
+@binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
+@click.option(
+    '--output-path',
+    '-o',
+    required=True,
+    default="output",
+    cls=ResourceOption
+    )
+@click.option(
+    '--model',
+    '-m',
+    required=True,
+    cls=ResourceOption
+    )
+@click.option(
+    '--dataset',
+    '-d',
+    required=True,
+    cls=ResourceOption
+    )
+@click.option(
+    '--batch-size',
+    '-b',
+    required=True,
+    default=2,
+    cls=ResourceOption)
+@click.option(
+    '--device',
+    '-d',
+    help='A string indicating the device to use (e.g. "cpu" or "cuda:0"',
+    show_default=True,
+    required=True,
+    default='cpu',
+    cls=ResourceOption)
+@click.option(
+    '--weight',
+    '-w',
+    help='Path or URL to pretrained model',
+    required=False,
+    default=None,
+    cls=ResourceOption
+    )
+@verbosity_option(cls=ResourceOption)
+def predict(model
+        ,output_path
+        ,device
+        ,batch_size
+        ,dataset
+        ,weight
+        , **kwargs):
+    """ Run inference and save predictions (no ground truth required) """
+
+    # PyTorch dataloader
+    data_loader = DataLoader(
+        dataset = dataset
+        ,batch_size = batch_size
+        ,shuffle= False
+        ,pin_memory = torch.cuda.is_available()
+        )
+    
+    # checkpointer, load last model in dir
+    checkpointer = DetectronCheckpointer(model, save_dir = output_path, save_to_disk=False)
+    checkpointer.load(weight)
+    do_predict(model, data_loader, device, output_path)
+
+    # Overlayed images
+    overlay(dataset=dataset, output_path=output_path)
+
+
+
 # Evaluate only. Runs evaluation on predicted probability maps (--prediction-folder)
 @binseg.command(entry_point_group='bob.ip.binseg.config', cls=ConfigCommand)
 @click.option(
@@ -544,4 +616,6 @@ def evalpred(
     
     # Run eval
     do_eval(prediction_folder, data_loader, output_folder = output_path, title= title, legend=legend)
+
+
     
\ No newline at end of file
diff --git a/bob/ip/binseg/utils/checkpointer.py b/bob/ip/binseg/utils/checkpointer.py
index e2090caa2c5f337fcae10247446f5676713f33d8..f3899e1dc2a23e6b6c8c250bebfd9a40ebc1fc93 100644
--- a/bob/ip/binseg/utils/checkpointer.py
+++ b/bob/ip/binseg/utils/checkpointer.py
@@ -62,7 +62,7 @@ class Checkpointer:
             f = self.get_checkpoint_file()
         if not f:
             # no checkpoint could be found
-            self.logger.info("No checkpoint found. Initializing model from scratch")
+            self.logger.warn("No checkpoint found. Initializing model from scratch")
             return {}
         self.logger.info("Loading checkpoint from {}".format(f))
         checkpoint = self._load_file(f)
diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py
index b5943e9dc75c3ce034bdc70a8e49c4d231038880..ceb268d06ee29c42c750e58fc37a1cfd438af2fb 100644
--- a/bob/ip/binseg/utils/plot.py
+++ b/bob/ip/binseg/utils/plot.py
@@ -416,11 +416,6 @@ def overlay(dataset, output_path):
         # get sample
         name  = sample[0]
         img = VF.to_pil_image(sample[1]) # PIL Image
-        gt = sample[2].byte() # byte tensor
-        
-        # read metrics 
-        #metrics = pd.read_csv(os.path.join(output_path,'results',name+'.csv'))
-        #f1 = metrics[' f1_score'].max()
         
         # read probability output 
         pred = Image.open(os.path.join(output_path,'images',name)).convert(mode='L')
diff --git a/doc/configs.rst b/doc/configs.rst
index a45e2986ec489bc5216e3512c4b7acbe39454051..e25e66c6afc3e2fc629c915f25475d101f81033c 100644
--- a/doc/configs.rst
+++ b/doc/configs.rst
@@ -14,6 +14,17 @@ ImageFolder
 ----------------
 .. literalinclude:: ../bob/ip/binseg/configs/datasets/imagefolder.py
 
+.. _bob.ip.binseg.configs.datasets.imagefoldertest:
+
+ImageFolderTest
+----------------
+.. literalinclude:: ../bob/ip/binseg/configs/datasets/imagefoldertest.py
+
+.. _bob.ip.binseg.configs.datasets.imagefolderinference:
+
+ImageFolderInference
+---------------------
+.. literalinclude:: ../bob/ip/binseg/configs/datasets/imagefolderinference.py
 
 .. _bob.ip.binseg.configs.datasets.chasedb1:
 
diff --git a/doc/datasets.rst b/doc/datasets.rst
index 9e00c4398734b28be0990ef77c3ecc7b59935822..5b82d3b12f8909a414055a17a3360c73d8449e1e 100644
--- a/doc/datasets.rst
+++ b/doc/datasets.rst
@@ -42,7 +42,23 @@ dataset folder structure for images and ground-truth (gt):
        |- images
        |- gt 
 
-In the dataset config :ref:`bob.ip.binseg.configs.datasets.imagefolder` the full path of the dataset has to be amended. Training can then for example be started with
-``bob binseg train M2UNet IMAGEFOLDER -b 4 -d cuda -o /my/output/path -vv``
+The file names should have the same stem. Currently, all image formats that can be read via PIL are supported. Additionally, we support hdf5 binary files.
+
+For training, a new dataset config needs to be created. You can copy the template :ref:`bob.ip.binseg.configs.datasets.imagefolder` and amend it accordingly, 
+e.g. set the full path of the dataset and, if necessary, any preprocessing steps such as resizing, cropping, padding, etc.
+
+Training can then be started with
+
+.. code-block:: bash
+
+    bob binseg train M2UNet /path/to/myimagefolderconfig.py -b 4 -d cuda -o /my/output/path -vv
+
+Similarly, for testing, a test dataset config needs to be created. You can copy the template :ref:`bob.ip.binseg.configs.datasets.imagefoldertest` and amend it accordingly.
+
+Testing can then be started with 
+
+.. code-block:: bash
+
+    bob binseg test M2UNet /path/to/myimagefoldertestconfig.py -b 2 -d cuda -o /my/output/path -vv
 
 .. include:: links.rst
diff --git a/doc/evaluation.rst b/doc/evaluation.rst
index e807c954ce2eb867e7a37669904a3d25c56d2bbf..2fb923a00978a3a6aa172722697472f77c94a118 100644
--- a/doc/evaluation.rst
+++ b/doc/evaluation.rst
@@ -26,6 +26,34 @@ E.g. run inference on model M2U-Net on the DRIVE test set:
     # The evaluation results will be stored in the same folder
     bob binseg test M2UNet DRIVETEST -o /DRIVE/M2UNet/output
 
+Outputs
+========
+The inference run generates the following output files:
+
+.. code-block:: bash
+
+    .
+    ├── images  # the predicted probabilities as grayscale images in .png format 
+    ├── hdf5    # the predicted probabilities in hdf5 format
+    ├── last_checkpoint  # text file that keeps track of the last checkpoint 
+    ├── M2UNet_trainlog.csv # training log 
+    ├── M2UNet_trainlog.pdf # training log plot
+    ├── model_*.pth # model checkpoints
+    └── results
+        ├── image*.jpg.csv # evaluation metrics for each image
+        ├── Metrics.csv # average evaluation metrics
+        ├── ModelSummary.txt # model summary and parameter count
+        ├── precision_recall.pdf # precision vs recall plot
+        └── Times.txt # inference times
+
+Inference Only Mode
+====================
+
+If you wish to run inference only on a folder containing images, use the ``predict`` function in combination with a :ref:`bob.ip.binseg.configs.datasets.imagefolderinference` config. E.g.:
+
+.. code-block:: bash
+
+    bob binseg predict M2UNet /path/to/myinferencedatasetconfig.py -b 1 -d cpu -o /my/output/path -w /path/to/pretrained/weight/model_final.pth -vv
 
 Pretrained Models
 =================