diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 6e957cadca8867205188ea6026286c51d8b4f60a..1163584894f1b95c64363ff5d4675a5ea39fce85 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -15,7 +15,7 @@ import torchvision.transforms.functional as VF
 
 import h5py
 
-from ..utils.metric import base_measures
+from ..utils.measure import base_measures
 
 import logging
 
@@ -49,9 +49,9 @@ def _posneg(pred, gt, threshold):
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
-def _sample_metrics(pred, gt, bins):
+def _sample_measures(pred, gt, bins):
     """
-    Calculates metrics on one single sample and saves it to disk
+    Calculates measures on a single sample
 
 
     Parameters
@@ -71,7 +71,7 @@ def _sample_metrics(pred, gt, bins):
     Returns
     -------
 
-    metrics : pandas.DataFrame
+    measures : pandas.DataFrame
 
         A pandas dataframe with the following columns:
 
@@ -94,7 +94,7 @@ def _sample_metrics(pred, gt, bins):
             pred, gt, threshold
         )
 
-        # calc metrics from scalars
+        # calc measures from scalars
         tp_count = torch.sum(tp_tensor).item()
         fp_count = torch.sum(fp_tensor).item()
         tn_count = torch.sum(tn_tensor).item()
@@ -221,7 +221,7 @@ def run(
     threshold=None,
 ):
     """
-    Runs inference and calculates metrics
+    Runs inference and calculates measures
 
 
     Parameters
@@ -232,7 +232,7 @@ def run(
 
     name : str
         the local name of this dataset (e.g. ``train``, or ``test``), to be
-        used when saving metrics files.
+        used when saving measures files.
 
     predictions_folder : str
         folder where predictions for the dataset images has been previously
@@ -263,7 +263,7 @@ def run(
 
     """
 
-    # Collect overall metrics
+    # Collect overall measures
     bins = 1000  # number of thresholds to analyse for
     data = {}
 
@@ -279,7 +279,7 @@ def run(
             raise RuntimeError(
                 f"{stem} entry already exists in data. Cannot overwrite."
             )
-        data[stem] = _sample_metrics(pred, gt, bins)
+        data[stem] = _sample_measures(pred, gt, bins)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -291,31 +291,31 @@ def run(
             overlay_image.save(fullpath)
 
     # Merges all dataframes together
-    df_metrics = pandas.concat(data.values())
+    df_measures = pandas.concat(data.values())
 
     # Report and Averages
-    avg_metrics = df_metrics.groupby("index").mean()
-    std_metrics = df_metrics.groupby("index").std()
+    avg_measures = df_measures.groupby("index").mean()
+    std_measures = df_measures.groupby("index").std()
 
     # Uncomment below for F1-score calculation based on average precision and
-    # metrics instead of F1-scores of individual images. This method is in line
+    # average recall, instead of the F1-scores of individual images. This method is in line
     # with Maninis et. al. (2016)
     #
-    # avg_metrics["f1_score"] = \
-    #         (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-    #         (avg_metrics["precision"]+avg_metrics["recall"])
-
-    avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
-    avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
-    avg_metrics["std_f1"] = std_metrics["f1_score"]
-
-    maxf1 = avg_metrics["f1_score"].max()
-    maxf1_index = avg_metrics["f1_score"].idxmax()
-    maxf1_threshold = avg_metrics["threshold"][maxf1_index]
+    # avg_measures["f1_score"] = \
+    #         (2* avg_measures["precision"]*avg_measures["recall"])/ \
+    #         (avg_measures["precision"]+avg_measures["recall"])
+
+    avg_measures["std_pr"] = std_measures["precision"]
+    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
+    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["std_re"] = std_measures["recall"]
+    avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
+    avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
+    avg_measures["std_f1"] = std_measures["f1_score"]
+
+    maxf1 = avg_measures["f1_score"].max()
+    maxf1_index = avg_measures["f1_score"].idxmax()
+    maxf1_threshold = avg_measures["threshold"][maxf1_index]
 
     logger.info(
         f"Maximum F1-score of {maxf1:.5f}, achieved at "
@@ -326,8 +326,8 @@ def run(
 
         # get the closest possible threshold we have
         index = int(round(bins * threshold))
-        f1_a_priori = avg_metrics["f1_score"][index]
-        actual_threshold = avg_metrics["threshold"][index]
+        f1_a_priori = avg_measures["f1_score"][index]
+        actual_threshold = avg_measures["threshold"][index]
 
         logger.info(
             f"F1-score of {f1_a_priori:.5f}, at threshold "
@@ -337,11 +337,11 @@ def run(
     if output_folder is not None:
         logger.info(f"Output folder: {output_folder}")
         os.makedirs(output_folder, exist_ok=True)
-        metrics_path = os.path.join(output_folder, f"{name}.csv")
+        measures_path = os.path.join(output_folder, f"{name}.csv")
         logger.info(
-            f"Saving averages over all input images at {metrics_path}..."
+            f"Saving averages over all input images at {measures_path}..."
         )
-        avg_metrics.to_csv(metrics_path)
+        avg_measures.to_csv(measures_path)
 
     return maxf1_threshold
 
@@ -364,7 +364,7 @@ def compare_annotators(baseline, other, name, output_folder,
 
     name : str
         the local name of this dataset (e.g. ``train-second-annotator``, or
-        ``test-second-annotator``), to be used when saving metrics files.
+        ``test-second-annotator``), to be used when saving measures files.
 
     output_folder : str
         folder where to store results
@@ -378,7 +378,7 @@ def compare_annotators(baseline, other, name, output_folder,
     logger.info(f"Output folder: {output_folder}")
     os.makedirs(output_folder, exist_ok=True)
 
-    # Collect overall metrics
+    # Collect overall measures
     data = {}
 
     for baseline_sample, other_sample in tqdm(
@@ -392,7 +392,7 @@ def compare_annotators(baseline, other, name, output_folder,
             raise RuntimeError(
                 f"{stem} entry already exists in data. " f"Cannot overwrite."
             )
-        data[stem] = _sample_metrics(pred, gt, 2)
+        data[stem] = _sample_measures(pred, gt, 2)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -405,33 +405,33 @@ def compare_annotators(baseline, other, name, output_folder,
             overlay_image.save(fullpath)
 
     # Merges all dataframes together
-    df_metrics = pandas.concat(data.values())
-    df_metrics.drop(0, inplace=True)
+    df_measures = pandas.concat(data.values())
+    df_measures.drop(0, inplace=True)
 
     # Report and Averages
-    avg_metrics = df_metrics.groupby("index").mean()
-    std_metrics = df_metrics.groupby("index").std()
+    avg_measures = df_measures.groupby("index").mean()
+    std_measures = df_measures.groupby("index").std()
 
     # Uncomment below for F1-score calculation based on average precision and
     # {name} instead of F1-scores of individual images. This method is in line
     # with Maninis et. al. (2016)
     #
-    # avg_metrics["f1_score"] = \
-    #         (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-    #         (avg_metrics["precision"]+avg_metrics["recall"])
-
-    avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
-    avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
-    avg_metrics["std_f1"] = std_metrics["f1_score"]
-
-    metrics_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
-    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
-    logger.info(f"Saving averages over all input images at {metrics_path}...")
-    avg_metrics.to_csv(metrics_path)
-
-    maxf1 = avg_metrics["f1_score"].max()
+    # avg_measures["f1_score"] = \
+    #         (2* avg_measures["precision"]*avg_measures["recall"])/ \
+    #         (avg_measures["precision"]+avg_measures["recall"])
+
+    avg_measures["std_pr"] = std_measures["precision"]
+    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
+    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["std_re"] = std_measures["recall"]
+    avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
+    avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
+    avg_measures["std_f1"] = std_measures["f1_score"]
+
+    measures_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
+    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
+    logger.info(f"Saving averages over all input images at {measures_path}...")
+    avg_measures.to_csv(measures_path)
+
+    maxf1 = avg_measures["f1_score"].max()
     logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
diff --git a/bob/ip/binseg/engine/ssltrainer.py b/bob/ip/binseg/engine/ssltrainer.py
index d56202074bd77c4513b243eff0f71dfec4e8307e..d8b66b69d5e729de87abafd4963a9fc71a4a87d9 100644
--- a/bob/ip/binseg/engine/ssltrainer.py
+++ b/bob/ip/binseg/engine/ssltrainer.py
@@ -12,7 +12,7 @@ import pandas
 import torch
 from tqdm import tqdm
 
-from ..utils.metric import SmoothedValue
+from ..utils.measure import SmoothedValue
 from ..utils.plot import loss_curve
 
 import logging
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index d5591526fc149248f950e69694443335c85728a0..00f9318212e999c742f0c87380f6b45cf1c61a5a 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -11,7 +11,7 @@ import distutils.version
 import torch
 from tqdm import tqdm
 
-from ..utils.metric import SmoothedValue
+from ..utils.measure import SmoothedValue
 from ..utils.summary import summary
 from ..utils.resources import cpu_constants, gpu_constants, cpu_log, gpu_log
 
diff --git a/bob/ip/binseg/script/analyze.py b/bob/ip/binseg/script/analyze.py
index bd66611d635c5a31b7163c0b69eb9da1ee5e955e..8a7e502139a47ae71292c173f211c3a92ac973f9 100644
--- a/bob/ip/binseg/script/analyze.py
+++ b/bob/ip/binseg/script/analyze.py
@@ -149,7 +149,7 @@ def analyze(
              └── second-annotator/  #if set, store overlayed images for the
                                     #second annotator here
           └── analysis /  #the outputs of the analysis of both train/test sets
-                          #includes second-annotator "metrics" as well, if
+                          #includes second-annotator "measures" as well, if
                           # configured
 
     N.B.: The tool is designed to prevent analysis bias and allows one to
diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py
index 813a5cb8c392ceb2eb280d9e45a36752b01839b5..dd06106de0a5a4ba7871a9a5910aaa4fa887f2f7 100644
--- a/bob/ip/binseg/script/compare.py
+++ b/bob/ip/binseg/script/compare.py
@@ -55,11 +55,11 @@ def _load(data, threshold=None):
 
     data : dict
         A dict in which keys are the names of the systems and the values are
-        paths to ``metrics.csv`` style files.
+        paths to ``measures.csv`` style files.
 
     threshold : :py:class:`float`, :py:class:`str`, Optional
         A value indicating which threshold to choose for selecting a "F1-score"
-        If set to ``None``, then use the maximum F1-score on that metrics file.
+        If set to ``None``, then use the maximum F1-score on that measures file.
         If set to a floating-point value, then use the F1-score that is
         obtained on that particular threshold.  If set to a string, it should
         match one of the keys in ``data``.  It then first calculate the
@@ -74,7 +74,7 @@ def _load(data, threshold=None):
         A dict in which keys are the names of the systems and the values are
         dictionaries that contain two keys:
 
-        * ``df``: A :py:class:`pandas.DataFrame` with the metrics data loaded
+        * ``df``: A :py:class:`pandas.DataFrame` with the measures data loaded
           to
         * ``threshold``: A threshold to be used for summarization, depending on
           the ``threshold`` parameter set on the input
@@ -84,8 +84,8 @@ def _load(data, threshold=None):
     if isinstance(threshold, str):
         logger.info(f"Calculating threshold from maximum F1-score at "
                 f"'{threshold}' dataset...")
-        metrics_path = data[threshold]
-        df = pandas.read_csv(metrics_path)
+        measures_path = data[threshold]
+        df = pandas.read_csv(measures_path)
 
         maxf1 = df.f1_score.max()
         use_threshold = df.threshold[df.f1_score.idxmax()]
@@ -101,10 +101,10 @@ def _load(data, threshold=None):
 
     # loads all data
     retval = {}
-    for name, metrics_path in data.items():
+    for name, measures_path in data.items():
 
-        logger.info(f"Loading metrics from {metrics_path}...")
-        df = pandas.read_csv(metrics_path)
+        logger.info(f"Loading measures from {measures_path}...")
+        df = pandas.read_csv(measures_path)
 
         if threshold is None:
             use_threshold = df.threshold[df.f1_score.idxmax()]
@@ -119,9 +119,9 @@ def _load(data, threshold=None):
     epilog="""Examples:
 
 \b
-    1. Compares system A and B, with their own pre-computed metric files:
+    1. Compares system A and B, with their own pre-computed measure files:
 \b
-       $ bob binseg compare -vv A path/to/A/metrics.csv B path/to/B/metrics.csv
+       $ bob binseg compare -vv A path/to/A/train.csv B path/to/B/test.csv
 """,
 )
 @click.argument(
@@ -182,7 +182,7 @@ def compare(label_path, output_figure, table_format, output_table, threshold,
 
     threshold = _validate_threshold(threshold, data)
 
-    # load all data metrics
+    # load all measures data
     data = _load(data, threshold=threshold)
 
     if output_figure is not None:
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index cbbfd56f0754327b6bb93abde03b4718c387d930..050910c38c29745382de7c7d9e310db427f7f9ea 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -247,7 +247,7 @@ def experiment(
              └── second-annotator/  #if set, store overlayed images for the
                                     #second annotator here
           └── analysis /  #the outputs of the analysis of both train/test sets
-                          #includes second-annotator "metrics" as well, if
+                          #includes second-annotator "measures" as well, if
                           # configured
 
     Training is performed for a configurable number of epochs, and generates at
diff --git a/bob/ip/binseg/test/test_batchmetrics.py b/bob/ip/binseg/test/test_batchmeasures.py
similarity index 85%
rename from bob/ip/binseg/test/test_batchmetrics.py
rename to bob/ip/binseg/test/test_batchmeasures.py
index 09ffe250a805a00718d2cb4687ea2bbe5e49daf4..d6fb2cb8e768546cff2be14558ccae25d619d716 100644
--- a/bob/ip/binseg/test/test_batchmetrics.py
+++ b/bob/ip/binseg/test/test_batchmeasures.py
@@ -9,7 +9,7 @@ import torch
 import pandas
 import numpy
 
-from ..engine.evaluator import _sample_metrics
+from ..engine.evaluator import _sample_measures
 
 import logging
 logger = logging.getLogger(__name__)
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 
 class Tester(unittest.TestCase):
     """
-    Unit test for batch metrics
+    Unit test for batch measures
     """
 
     def setUp(self):
@@ -29,10 +29,10 @@ class Tester(unittest.TestCase):
         self.ground_truths = torch.randint(low=0, high=2, size=(2, 1, 420, 420))
         self.names = ["Bob", "Tim"]
 
-    def test_batch_metrics(self):
+    def test_batch_measures(self):
         dfs = []
         for pred, gt in zip(self.predictions, self.ground_truths):
-            dfs.append(_sample_metrics(pred, gt, 100))
+            dfs.append(_sample_measures(pred, gt, 100))
         bm = pandas.concat(dfs)
 
         self.assertEqual(len(bm), 2 * 100)
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 0b0d20af4fb3fe312afb2c95ed7acd577d737b40..74187b1ca1842dca8d0043a85a14a25cfd5d0ed1 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -178,7 +178,7 @@ def _check_experiment_stare(overlay):
             r"^F1-score of.*\(second annotator; threshold=0.5\)$": 2,
             r"^Ended evaluation$": 1,
             r"^Started comparison$": 1,
-            r"^Loading metrics from": 4,
+            r"^Loading measures from": 4,
             r"^Creating and saving plot at": 1,
             r"^Tabulating performance summary...": 1,
             r"^Saving table at": 1,
@@ -403,7 +403,7 @@ def _check_compare(runner):
             compare,
             [
                 "-vv",
-                # label - path to metrics
+                # label - path to measures
                 "test",
                 os.path.join(output_folder, "test.csv"),
                 "test (2nd. human)",
@@ -418,7 +418,7 @@ def _check_compare(runner):
         assert os.path.exists("comparison.rst")
 
         keywords = {
-            r"^Loading metrics from": 2,
+            r"^Loading measures from": 2,
             r"^Creating and saving plot at": 1,
             r"^Tabulating performance summary...": 1,
             r"^Saving table at": 1,
diff --git a/bob/ip/binseg/utils/table.py b/bob/ip/binseg/utils/table.py
index e8c4d64c3b65c52c0741c7f44b62284cf214ad71..097891260e9171b19b1f89fc40246b344e7815ad 100644
--- a/bob/ip/binseg/utils/table.py
+++ b/bob/ip/binseg/utils/table.py
@@ -3,7 +3,7 @@
 
 
 import tabulate
-from .metric import auc
+from .measure import auc
 
 
 def performance_table(data, fmt):