diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 82a8ba8de373826b636c9e5f8c07a3642231bfea..134ea2e0700b1eb5b09553e3942ee78d43f89e85 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -24,7 +24,39 @@ logger = logging.getLogger(__name__)
 
 
 def _posneg(pred, gt, threshold):
-    """Calculates true and false positives and negatives"""
+    """Calculates true and false positives and negatives
+
+
+    Parameters
+    ----------
+
+    pred : torch.Tensor
+        pixel-wise predictions
+
+    gt : torch.Tensor
+        ground-truth (annotations)
+
+    threshold : float
+        a particular threshold in which to calculate the performance
+        measures
+
+
+    Returns
+    -------
+
+    tp_tensor : torch.Tensor
+        boolean tensor with true positives, considering all observations
+
+    fp_tensor : torch.Tensor
+        boolean tensor with false positives, considering all observations
+
+    tn_tensor : torch.Tensor
+        boolean tensor with true negatives, considering all observations
+
+    fn_tensor : torch.Tensor
+        boolean tensor with false negatives, considering all observations
+
+    """
 
     gt = gt.byte()  # byte tensor
 
@@ -39,18 +71,18 @@ def _posneg(pred, gt, threshold):
     tp_tensor = gt * binary_pred
 
     # false positives
-    fp_tensor = torch.eq((binary_pred + tp_tensor), 1)
+    fp_tensor = torch.eq((binary_pred + tp_tensor), 1).byte()
 
     # true negatives
     tn_tensor = equals - tp_tensor
 
     # false negatives
-    fn_tensor = notequals - fp_tensor.type(torch.uint8)
+    fn_tensor = notequals - fp_tensor
 
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
-def _sample_measures_for_threshold(pred, gt, threshold):
+def sample_measures_for_threshold(pred, gt, mask, threshold):
     """
     Calculates measures on one single sample, for a specific threshold
 
@@ -64,6 +96,9 @@ def _sample_measures_for_threshold(pred, gt, threshold):
     gt : torch.Tensor
         ground-truth (annotations)
 
+    mask : torch.Tensor
+        region mask (used only if available).  May be set to ``None``.
+
     threshold : float
         a particular threshold in which to calculate the performance
         measures
@@ -88,15 +123,25 @@ def _sample_measures_for_threshold(pred, gt, threshold):
 
     tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
 
+    # if a mask is provided, consider only TP/FP/TN/FN **within** the region of
+    # interest defined by the mask
+    if mask is not None:
+        antimask = torch.le(mask, 0.5)
+        tp_tensor[antimask] = 0
+        fp_tensor[antimask] = 0
+        tn_tensor[antimask] = 0
+        fn_tensor[antimask] = 0
+
     # calc measures from scalars
     tp_count = torch.sum(tp_tensor).item()
     fp_count = torch.sum(fp_tensor).item()
     tn_count = torch.sum(tn_tensor).item()
     fn_count = torch.sum(fn_tensor).item()
+
     return base_measures(tp_count, fp_count, tn_count, fn_count)
 
 
-def _sample_measures(pred, gt, steps):
+def _sample_measures(pred, gt, mask, steps):
     """
     Calculates measures on one single sample
 
@@ -110,6 +155,9 @@ def _sample_measures(pred, gt, steps):
     gt : torch.Tensor
         ground-truth (annotations)
 
+    mask : torch.Tensor
+        region mask (used only if available).  May be set to ``None``.
+
     steps : int
         number of steps to use for threshold analysis.  The step size is
         calculated from this by dividing ``1.0/steps``
@@ -134,7 +182,8 @@ def _sample_measures(pred, gt, steps):
 
     step_size = 1.0 / steps
     data = [
-        (index, threshold) + _sample_measures_for_threshold(pred, gt, threshold)
+        (index, threshold) + sample_measures_for_threshold(pred, gt, mask,
+            threshold)
         for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size))
     ]
 
@@ -157,6 +206,7 @@ def _sample_analysis(
     img,
     pred,
     gt,
+    mask,
     threshold,
     tp_color=(0, 255, 0),  # (128,128,128) Gray
     fp_color=(0, 0, 255),  # (70, 240, 240) Cyan
@@ -178,6 +228,9 @@ def _sample_analysis(
     gt : torch.Tensor
         ground-truth (annotations)
 
+    mask : torch.Tensor
+        region mask (used only if available).  May be set to ``None``.
+
     threshold : float
         The threshold to be used while analyzing this image's probability map
 
@@ -207,6 +260,15 @@ def _sample_analysis(
 
     tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg(pred, gt, threshold)
 
+    # if a mask is provided, consider only TP/FP/TN/FN **within** the region of
+    # interest defined by the mask
+    if mask is not None:
+        antimask = torch.le(mask, 0.5)
+        tp_tensor[antimask] = 0
+        fp_tensor[antimask] = 0
+        tn_tensor[antimask] = 0
+        fn_tensor[antimask] = 0
+
     # change to PIL representation
     tp_pil = VF.to_pil_image(tp_tensor.float())
     tp_pil_colored = PIL.ImageOps.colorize(tp_pil, (0, 0, 0), tp_color)
@@ -295,6 +357,7 @@ def run(
         stem = sample[0]
         image = sample[1]
         gt = sample[2]
+        mask = None if len(sample) <= 3 else sample[3]
         pred_fullpath = os.path.join(use_predictions_folder, stem + ".hdf5")
         with h5py.File(pred_fullpath, "r") as f:
             pred = f["array"][:]
@@ -303,7 +366,7 @@ def run(
             raise RuntimeError(
                 f"{stem} entry already exists in data. Cannot overwrite."
             )
-        data[stem] = _sample_measures(pred, gt, steps)
+        data[stem] = _sample_measures(pred, gt, mask, steps)
 
         if output_folder is not None:
             fullpath = os.path.join(output_folder, name, f"{stem}.csv")
@@ -313,7 +376,7 @@ def run(
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
-                image, pred, gt, threshold=threshold, overlay=True
+                image, pred, gt, mask, threshold=threshold, overlay=True
             )
             fullpath = os.path.join(overlayed_folder, name, f"{stem}.png")
             tqdm.write(f"Saving {fullpath}...")
@@ -432,11 +495,12 @@ def compare_annotators(
         image = baseline_sample[1]
         gt = baseline_sample[2]
         pred = other_sample[2]  # works as a prediction
+        mask = None if len(baseline_sample) <= 3 else baseline_sample[3]
         if stem in data:
             raise RuntimeError(
                 f"{stem} entry already exists in data. " f"Cannot overwrite."
             )
-        data[stem] = _sample_measures(pred, gt, 2)
+        data[stem] = _sample_measures(pred, gt, mask, 2)
 
         if output_folder is not None:
             fullpath = os.path.join(
@@ -448,7 +512,7 @@ def compare_annotators(
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
-                image, pred, gt, threshold=0.5, overlay=True
+                image, pred, gt, mask, threshold=0.5, overlay=True
             )
             fullpath = os.path.join(
                 overlayed_folder, "second-annotator", name, f"{stem}.png"
diff --git a/bob/ip/binseg/test/test_measures.py b/bob/ip/binseg/test/test_measures.py
index 71c2b241f10569284da2e20ad183a95841e74db4..cddcd0f4c30513adee22d07b411839469257d57f 100644
--- a/bob/ip/binseg/test/test_measures.py
+++ b/bob/ip/binseg/test/test_measures.py
@@ -5,9 +5,11 @@ import random
 import unittest
 
 import math
+import torch
 import nose.tools
 
 from ..utils.measure import base_measures, auc
+from ..engine.evaluator import sample_measures_for_threshold
 
 
 class Tester(unittest.TestCase):
@@ -103,3 +105,85 @@ def test_auc_raises_assertion_error():
 
     # x is **not** the same size as y
     assert math.isclose(auc([0.0, 0.5, 1.0], [1.0, 1.0]), 1.0)
+
+
+def test_sample_measures_mask_checkerbox():
+    # all-ones prediction, 2x2-tile checkerboard gt, mask keeps the centre 2x2
+    prediction = torch.ones((4, 4), dtype=float)
+    ground_truth = torch.ones((4, 4), dtype=float)
+    ground_truth[2:, :2] = 0.0
+    ground_truth[:2, 2:] = 0.0
+    mask = torch.zeros((4, 4), dtype=float)
+    mask[1:3, 1:3] = 1.0
+    threshold = 0.5
+
+    # the 4 unmasked pixels all predict positive: 2 on gt=1 (TP), 2 on gt=0 (FP)
+    tp = 2
+    fp = 2
+    tn = 0
+    fn = 0
+
+    nose.tools.eq_(
+        base_measures(tp, fp, tn, fn),
+        sample_measures_for_threshold(
+            prediction, ground_truth, mask, threshold
+        ),
+    )
+
+
+def test_sample_measures_mask_cross():
+    # prediction misses the first/last rows; the mask is the two diagonals (X)
+    prediction = torch.ones((10, 10), dtype=float)
+    prediction[0,:] = 0.0
+    prediction[9,:] = 0.0
+    ground_truth = torch.ones((10, 10), dtype=float)
+    ground_truth[:5,] = 0.0  # rows 0-4 are negatives, rows 5-9 stay positive
+    mask = torch.zeros((10, 10), dtype=float)
+    mask[(0,1,2,3,4,5,6,7,8,9),(0,1,2,3,4,5,6,7,8,9)] = 1.0
+    mask[(0,1,2,3,4,5,6,7,8,9),(9,8,7,6,5,4,3,2,1,0)] = 1.0
+    threshold = 0.5
+
+    # 2 masked pixels per row: row 0 -> TN, rows 1-4 -> FP, 5-8 -> TP, 9 -> FN
+    tp = 8
+    fp = 8
+    tn = 2
+    fn = 2
+
+    nose.tools.eq_(
+        base_measures(tp, fp, tn, fn),
+        sample_measures_for_threshold(
+            prediction, ground_truth, mask, threshold
+        ),
+    )
+
+
+def test_sample_measures_mask_border():
+    # gt is two full columns (4, 5); the mask drops a 1-pixel image border
+    prediction = torch.zeros((10, 10), dtype=float)
+    prediction[:,4] = 1.0
+    prediction[:,5] = 1.0
+    prediction[0,4] = 0.0
+    prediction[8,4] = 0.0
+    prediction[1,6] = 1.0
+    ground_truth = torch.zeros((10, 10), dtype=float)
+    ground_truth[:,4] = 1.0
+    ground_truth[:,5] = 1.0
+    mask = torch.ones((10, 10), dtype=float)
+    mask[:,0] = 0.0
+    mask[0,:] = 0.0
+    mask[:,9] = 0.0
+    mask[9,:] = 0.0
+    threshold = 0.5
+
+    # inside the 8x8 region: 16 gt pixels, one missed (8,4), one extra (1,6)
+    tp = 15
+    fp = 1
+    tn = 47
+    fn = 1
+
+    nose.tools.eq_(
+        base_measures(tp, fp, tn, fn),
+        sample_measures_for_threshold(
+            prediction, ground_truth, mask, threshold
+        ),
+    )
diff --git a/doc/datasets.rst b/doc/datasets.rst
index a1fb29e04e27da12f0d508f15be1a498ff589c70..32d195321f92cb825c439a05318c1e51fb89cdd8 100644
--- a/doc/datasets.rst
+++ b/doc/datasets.rst
@@ -12,7 +12,7 @@ can be downloaded.  We include the reference of the data split protocols used
 to generate iterators for training and testing.
 
 
-.. list-table::
+.. list-table:: Supported Datasets (``*`` provided within this package)
 
    * - Dataset
      - Reference
@@ -40,7 +40,7 @@ to generate iterators for training and testing.
      - [STARE-2000]_
      - 605 x 700
      - 20
-     -
+     - \*
      - x
      -
      -
@@ -51,7 +51,7 @@ to generate iterators for training and testing.
      - [CHASEDB1-2012]_
      - 960 x 999
      - 28
-     -
+     - \*
      - x
      -
      -