diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 1bac73519c91419d3fb908375092468a7547767c..6e957cadca8867205188ea6026286c51d8b4f60a 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -15,7 +15,7 @@ import torchvision.transforms.functional as VF
 
 import h5py
 
-from ..utils.metric import base_metrics
+from ..utils.measure import base_measures
 
 import logging
 
@@ -106,7 +106,7 @@ def _sample_metrics(pred, gt, bins):
             accuracy,
             jaccard,
             f1_score,
-        ) = base_metrics(tp_count, fp_count, tn_count, fn_count)
+        ) = base_measures(tp_count, fp_count, tn_count, fn_count)
 
         data.append(
             [
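For reference, here is a minimal sketch of the renamed helper as the evaluator
now consumes it (the counts are made up for illustration; the six return values
follow the order documented in bob/ip/binseg/utils/measure.py):

    from bob.ip.binseg.utils.measure import base_measures

    # hypothetical counts, not taken from the patch
    precision, recall, specificity, accuracy, jaccard, f1_score = base_measures(
        tp=10, fp=2, tn=80, fn=8
    )

Note that every denominator in the implementation carries a boolean guard such
as ((tp + fp) == 0), so an all-zero denominator yields 0.0 instead of raising
ZeroDivisionError.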
diff --git a/bob/ip/binseg/test/test_basemetrics.py b/bob/ip/binseg/test/test_basemetrics.py
deleted file mode 100644
index 969894f5e453bfdf6fc86fe07448d8e1c8f7ece2..0000000000000000000000000000000000000000
--- a/bob/ip/binseg/test/test_basemetrics.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import unittest
-from bob.ip.binseg.utils.metric import base_metrics
-import random
-
-
-class Tester(unittest.TestCase):
-    """
-    Unit test for base metrics
-    """
-
-    def setUp(self):
-        self.tp = random.randint(1, 100)
-        self.fp = random.randint(1, 100)
-        self.tn = random.randint(1, 100)
-        self.fn = random.randint(1, 100)
-
-    def test_precision(self):
-        precision = base_metrics(self.tp, self.fp, self.tn, self.fn)[0]
-        self.assertEqual((self.tp) / (self.tp + self.fp), precision)
-
-    def test_recall(self):
-        recall = base_metrics(self.tp, self.fp, self.tn, self.fn)[1]
-        self.assertEqual((self.tp) / (self.tp + self.fn), recall)
-
-    def test_specificity(self):
-        specificity = base_metrics(self.tp, self.fp, self.tn, self.fn)[2]
-        self.assertEqual((self.tn) / (self.tn + self.fp), specificity)
-
-    def test_accuracy(self):
-        accuracy = base_metrics(self.tp, self.fp, self.tn, self.fn)[3]
-        self.assertEqual(
-            (self.tp + self.tn) / (self.tp + self.tn + self.fp + self.fn), accuracy
-        )
-
-    def test_jaccard(self):
-        jaccard = base_metrics(self.tp, self.fp, self.tn, self.fn)[4]
-        self.assertEqual(self.tp / (self.tp + self.fp + self.fn), jaccard)
-
-    def test_f1(self):
-        f1 = base_metrics(self.tp, self.fp, self.tn, self.fn)[5]
-        self.assertEqual((2.0 * self.tp) / (2.0 * self.tp + self.fp + self.fn), f1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/bob/ip/binseg/test/test_measures.py b/bob/ip/binseg/test/test_measures.py
new file mode 100644
index 0000000000000000000000000000000000000000..71c2b241f10569284da2e20ad183a95841e74db4
--- /dev/null
+++ b/bob/ip/binseg/test/test_measures.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import random
+import unittest
+
+import nose.tools
+
+from ..utils.measure import base_measures, auc
+
+
+class Tester(unittest.TestCase):
+    """
+    Unit test for base measures
+    """
+
+    def setUp(self):
+        self.tp = random.randint(1, 100)
+        self.fp = random.randint(1, 100)
+        self.tn = random.randint(1, 100)
+        self.fn = random.randint(1, 100)
+
+    def test_precision(self):
+        precision = base_measures(self.tp, self.fp, self.tn, self.fn)[0]
+        self.assertEqual((self.tp) / (self.tp + self.fp), precision)
+
+    def test_recall(self):
+        recall = base_measures(self.tp, self.fp, self.tn, self.fn)[1]
+        self.assertEqual((self.tp) / (self.tp + self.fn), recall)
+
+    def test_specificity(self):
+        specificity = base_measures(self.tp, self.fp, self.tn, self.fn)[2]
+        self.assertEqual((self.tn) / (self.tn + self.fp), specificity)
+
+    def test_accuracy(self):
+        accuracy = base_measures(self.tp, self.fp, self.tn, self.fn)[3]
+        self.assertEqual(
+            (self.tp + self.tn) / (self.tp + self.tn + self.fp + self.fn),
+            accuracy,
+        )
+
+    def test_jaccard(self):
+        jaccard = base_measures(self.tp, self.fp, self.tn, self.fn)[4]
+        self.assertEqual(self.tp / (self.tp + self.fp + self.fn), jaccard)
+
+    def test_f1(self):
+        p, r, s, a, j, f1 = base_measures(self.tp, self.fp, self.tn, self.fn)
+        self.assertEqual(
+            (2.0 * self.tp) / (2.0 * self.tp + self.fp + self.fn), f1
+        )
+        self.assertAlmostEqual((2 * p * r) / (p + r), f1)  # base definition
+
+
+def test_auc():
+
+    # basic tests
+    assert math.isclose(auc([0.0, 0.5, 1.0], [1.0, 1.0, 1.0]), 1.0)
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0], [1.0, 0.5, 0.0]), 0.5, rel_tol=0.001
+    )
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0], [0.0, 0.0, 0.0]), 0.0, rel_tol=0.001
+    )
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0], [0.0, 1.0, 0.0]), 0.5, rel_tol=0.001
+    )
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0], [0.0, 0.5, 0.0]), 0.25, rel_tol=0.001
+    )
+
+    # the same assertions hold with the input sequences reversed
+    assert math.isclose(auc([0.0, 0.5, 1.0][::-1], [1.0, 1.0, 1.0][::-1]), 1.0)
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0][::-1], [1.0, 0.5, 0.0][::-1]), 0.5, rel_tol=0.001
+    )
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0][::-1], [0.0, 0.0, 0.0][::-1]), 0.0, rel_tol=0.001
+    )
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0][::-1], [0.0, 1.0, 0.0][::-1]), 0.5, rel_tol=0.001
+    )
+    assert math.isclose(
+        auc([0.0, 0.5, 1.0][::-1], [0.0, 0.5, 0.0][::-1]), 0.25, rel_tol=0.001
+    )
+
+
+@nose.tools.raises(ValueError)
+def test_auc_raises_value_error():
+
+    # x is **not** monotonically increasing or decreasing
+    assert math.isclose(auc([0.0, 0.5, 0.0], [1.0, 1.0, 1.0]), 1.0)
+
+
+@nose.tools.raises(AssertionError)
+def test_auc_raises_assertion_error():
+
+    # x is **not** the same size as y
+    assert math.isclose(auc([0.0, 0.5, 1.0], [1.0, 1.0]), 1.0)
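The behaviour exercised above can be checked interactively; this snippet simply
mirrors a few of the new assertions (values taken from the tests, module path
from the rename below):

    from bob.ip.binseg.utils.measure import auc

    # a flat curve at y=1.0 over x in [0, 1] integrates to ~1.0
    print(auc([0.0, 0.5, 1.0], [1.0, 1.0, 1.0]))

    # a straight line from (0, 1) down to (1, 0) integrates to ~0.5
    print(auc([0.0, 0.5, 1.0], [1.0, 0.5, 0.0]))

    # a non-monotonic x is rejected with a ValueError
    auc([0.0, 0.5, 0.0], [1.0, 1.0, 1.0])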
diff --git a/bob/ip/binseg/utils/metric.py b/bob/ip/binseg/utils/measure.py
similarity index 55%
rename from bob/ip/binseg/utils/metric.py
rename to bob/ip/binseg/utils/measure.py
index b49f4ede3265fc18a38125178622f2dfe288c59d..881ac7c8ce7488db55fe93f2bdee5763e0008ecd 100644
--- a/bob/ip/binseg/utils/metric.py
+++ b/bob/ip/binseg/utils/measure.py
@@ -28,33 +28,62 @@ class SmoothedValue:
         return d.mean().item()
 
 
-def base_metrics(tp, fp, tn, fn):
-    """
-    Calculates Precision, Recall (=Sensitivity), Specificity, Accuracy, Jaccard and F1-score (Dice)
+def base_measures(tp, fp, tn, fn):
+    r"""
+    Calculates standard machine learning measures from TP/FP/TN/FN counts
+
+    This function returns standard machine learning measures computed from
+    counts of true and false positives and negatives.
+
+    For a thorough look into these and alternate names for the returned values,
+    please check Wikipedia's entry on `Precision and Recall`_.
 
 
     Parameters
     ----------
 
-    tp : float
-        True positives
+    tp : int
+        True positive count, AKA "hit"
 
-    fp : float
-        False positives
+    fp : int
+        False positive count, AKA "false alarm", or "Type I error"
 
-    tn : float
-        True negatives
+    tn : int
+        True negative count, AKA "correct rejection"
 
-    fn : float
-        False Negatives
+    fn : int
+        False Negative count, AKA "miss", or "Type II error"
 
 
     Returns
     -------
 
-    metrics : list
+    precision : float
+        P, AKA positive predictive value (PPV)
+        :math:`\frac{tp}{tp+fp}`
+
+    recall : float
+        R, AKA sensitivity, hit rate, or true positive rate (TPR)
+        :math:`\frac{tp}{p} = \frac{tp}{tp+fn}`
+
+    specificity : float
+        S, AKA selectivity or true negative rate (TNR).
+        :math:`\frac{tn}{n} = \frac{tn}{tn+fp}`
+
+    accuracy : float
+        A, :math:`\frac{tp + tn}{p + n} = \frac{tp + tn}{tp + fp + tn + fn}`
+
+    jaccard : float
+        J, :math:`\frac{tp}{tp+fp+fn}`, see `Jaccard Index`_
+
+    f1_score : float
+        F1, :math:`\frac{2 P R}{P + R} = \frac{2tp}{2tp + fp + fn}`, see
+        `F1-score`_
 
     """
+
+    # cast all counts to float so the divisions below always produce floats
+    tp, fp, tn, fn = float(tp), float(fp), float(tn), float(fn)
     precision = tp / (tp + fp + ((tp + fp) == 0))
     recall = tp / (tp + fn + ((tp + fn) == 0))
     specificity = tn / (fp + tn + ((fp + tn) == 0))
@@ -87,7 +116,10 @@ def auc(x, y):
 
     """
 
-    assert len(x) == len(y)
+    x = numpy.array(x)
+    y = numpy.array(y)
+
+    assert len(x) == len(y), "x and y sequences must have the same length"
 
     dx = numpy.diff(x)
     if numpy.any(dx < 0):
@@ -99,18 +131,11 @@ def auc(x, y):
             raise ValueError("x is neither increasing nor decreasing "
                              ": {}.".format(x))
 
-    # avoids repeated sums for every y
-    y_unique, y_unique_ndx = numpy.unique(y, return_index=True)
-    x_unique = x[y_unique_ndx]
-
-    if y_unique.shape[0] > 1:
-        x_interp = numpy.interp(
-            numpy.arange(0, 1, 0.001),
-            y_unique,
-            x_unique,
-            left=0.0,
-            right=0.0,
-        )
-        return x_interp.sum() * 0.001
-
-    return 0.0
+    y_interp = numpy.interp(
+        numpy.arange(0, 1, 0.001),
+        x,
+        y,
+        left=1.0,
+        right=0.0,
+    )
+    return y_interp.sum() * 0.001
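The new integration strategy above amounts to a fixed-grid Riemann sum: y is
linearly interpolated on a 0.001-spaced grid over [0, 1), padded with 1.0 to
the left of the observed x range and 0.0 to its right, then summed and scaled
by the grid step. A self-contained sketch of just that step (helper name
hypothetical; the monotonicity checks of the real function are omitted):

    import numpy

    def auc_sketch(x, y):
        # interpolate y on a regular grid over [0, 1) and take a
        # Riemann sum with step 0.001, as in the patched auc()
        grid = numpy.arange(0, 1, 0.001)
        y_interp = numpy.interp(
            grid, numpy.asarray(x), numpy.asarray(y), left=1.0, right=0.0
        )
        return y_interp.sum() * 0.001

    print(auc_sketch([0.0, 0.5, 1.0], [1.0, 0.5, 0.0]))  # ~0.5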