diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 1163584894f1b95c64363ff5d4675a5ea39fce85..eab56b66290bf8c5a92ce01cdb10595de9d0e47e 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -49,7 +49,7 @@ def _posneg(pred, gt, threshold):
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
-def _sample_measures(pred, gt, bins):
+def _sample_measures(pred, gt, steps):
     """
     Calculates measures on one single sample and saves it to disk
 
@@ -63,9 +63,9 @@ def _sample_measures(pred, gt, bins):
     gt : torch.Tensor
         ground-truth (annotations)
 
-    bins : int
-        number of bins to use for threshold analysis.  The step size is
-        calculated from this by dividing ``1.0/bins``.
+    steps : int
+        number of steps to use for threshold analysis.  The step size is
+        calculated as ``1.0/steps``.
 
 
     Returns
@@ -85,7 +85,7 @@ def _sample_measures(pred, gt, bins):
 
     """
 
-    step_size = 1.0 / bins
+    step_size = 1.0 / steps
     data = []
 
     for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)):
@@ -219,6 +219,7 @@ def run(
     output_folder=None,
     overlayed_folder=None,
     threshold=None,
+    steps=1000,
 ):
     """
     Runs inference and calculates measures
@@ -254,6 +255,9 @@ def run(
         may bias your analysis.  This number is also used to print the a priori
         F1-score on the evaluated set.
 
+    steps : :py:class:`int`, Optional
+        number of steps to use for threshold analysis.
+
 
     Returns
     -------
@@ -264,7 +268,6 @@ def run(
     """
 
     # Collect overall measures
-    bins = 1000  # number of thresholds to analyse for
     data = {}
 
     for sample in tqdm(dataset):
@@ -279,7 +282,7 @@ def run(
             raise RuntimeError(
                 f"{stem} entry already exists in data. Cannot overwrite."
             )
-        data[stem] = _sample_measures(pred, gt, bins)
+        data[stem] = _sample_measures(pred, gt, steps)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -325,7 +328,7 @@ def run(
     if threshold is not None:
 
         # get the closest possible threshold we have
-        index = int(round(bins * threshold))
+        index = int(round(steps * threshold))
         f1_a_priori = avg_measures["f1_score"][index]
         actual_threshold = avg_measures["threshold"][index]
 
diff --git a/bob/ip/binseg/script/analyze.py b/bob/ip/binseg/script/analyze.py
index 8a7e502139a47ae71292c173f211c3a92ac973f9..8ce4fde614efcf0e93679e3b409794ef5a07c24c 100644
--- a/bob/ip/binseg/script/analyze.py
+++ b/bob/ip/binseg/script/analyze.py
@@ -117,6 +117,16 @@ logger = logging.getLogger(__name__)
     required=True,
     cls=ResourceOption,
 )
+@click.option(
+    "--steps",
+    "-S",
+    help="This number is used to define the number of threshold steps to "
+    "consider when evaluating the highest possible F1-score on test data.",
+    default=1000,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
 @verbosity_option(cls=ResourceOption)
 @click.pass_context
 def analyze(
@@ -129,6 +139,7 @@ def analyze(
     device,
     overlayed,
     weight,
+    steps,
     verbose,
     **kwargs,
 ):
@@ -230,6 +241,7 @@ def analyze(
         second_annotator=second_annotator,
         overlayed=overlayed_folder,
         threshold=threshold,
+        steps=steps,
         verbose=verbose,
     )
 
diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py
index 8a4b33d1a7a44991bbc827dc25f83f127fd47875..2e558671d1d7bd4919f37f3a63d160387c1988f2 100644
--- a/bob/ip/binseg/script/evaluate.py
+++ b/bob/ip/binseg/script/evaluate.py
@@ -137,6 +137,16 @@ def _validate_threshold(t, dataset):
     required=False,
     cls=ResourceOption,
 )
+@click.option(
+    "--steps",
+    "-S",
+    help="This number is used to define the number of threshold steps to "
+    "consider when evaluating the highest possible F1-score on test data.",
+    default=1000,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
 @verbosity_option(cls=ResourceOption)
 def evaluate(
     output_folder,
@@ -145,6 +155,7 @@ def evaluate(
     second_annotator,
     overlayed,
     threshold,
+    steps,
     **kwargs,
 ):
     """Evaluates an FCN on a binary segmentation task.
@@ -164,7 +175,8 @@ def evaluate(
     if isinstance(threshold, str):
         # first run evaluation for reference dataset, do not save overlays
         logger.info(f"Evaluating threshold on '{threshold}' set")
-        threshold = run(dataset[threshold], threshold, predictions_folder)
+        threshold = run(dataset[threshold], threshold, predictions_folder,
+                steps=steps)
         logger.info(f"Set --threshold={threshold:.5f}")
 
     # now run with the
@@ -173,7 +185,8 @@ def evaluate(
             logger.info(f"Skipping dataset '{k}' (not to be evaluated)")
             continue
         logger.info(f"Analyzing '{k}' set...")
-        run(v, k, predictions_folder, output_folder, overlayed, threshold)
+        run(v, k, predictions_folder, output_folder, overlayed, threshold,
+                steps=steps)
         second = second_annotator.get(k)
         if second is not None:
             compare_annotators(v, second, k, output_folder, overlayed)
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index 050910c38c29745382de7c7d9e310db427f7f9ea..9afa6f5bf0b9096e726042f9778cb35e04c417a6 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -205,6 +205,16 @@ logger = logging.getLogger(__name__)
     required=False,
     cls=ResourceOption,
 )
+@click.option(
+    "--steps",
+    "-S",
+    help="This number is used to define the number of threshold steps to "
+    "consider when evaluating the highest possible F1-score on test data.",
+    default=1000,
+    show_default=True,
+    required=True,
+    cls=ResourceOption,
+)
 @verbosity_option(cls=ResourceOption)
 @click.pass_context
 def experiment(
@@ -226,6 +236,7 @@ def experiment(
     ssl,
     rampup,
     overlayed,
+    steps,
     verbose,
     **kwargs,
 ):
@@ -323,5 +334,6 @@ def experiment(
             device=device,
             overlayed=overlayed,
             weight=model_file,
+            steps=steps,
             verbose=verbose,
             )
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 74187b1ca1842dca8d0043a85a14a25cfd5d0ed1..3d3f728f1f9d07c694f2c4cc5149593fbbed95a5 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -93,6 +93,7 @@ def _check_experiment_stare(overlay):
                 "-vv",
                 "--epochs=1",
                 "--batch-size=1",
+                "--steps=10",
                 f"--output-folder={output_folder}",
                 ]
         if overlay:
@@ -354,6 +355,7 @@ def _check_evaluate(runner):
             [
                 config.name,
                 "-vv",
+                "--steps=10",
                 f"--output-folder={output_folder}",
                 "--predictions-folder=predictions",
                 f"--overlayed={overlay_folder}",