diff --git a/src/mednet/scripts/experiment.py b/src/mednet/scripts/experiment.py
index 9f7aff12e53d4b5f26c54c53099f585fcd59e16e..538ddf63fa83314db862fc0e60553b1385d79a42 100644
--- a/src/mednet/scripts/experiment.py
+++ b/src/mednet/scripts/experiment.py
@@ -141,7 +141,7 @@ def experiment(
     ctx.invoke(
         evaluate,
         predictions=predictions_output,
-        output=output_folder,
+        output_folder=output_folder,
         threshold="validation",
     )
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e4923621c87091c51e1eaad355d44c532be010ce..4ee0e2c6ab454161dbf871030b30b989593d1c85 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -404,23 +404,24 @@ def test_evaluate_pasa_montgomery(temporary_basedir):
 
     with stdout_logging() as buf:
         prediction_path = temporary_basedir / "predictions.json"
-        output_path = temporary_basedir / "evaluation.json"
+        evaluation_filename = "evaluation.json"
+        evaluation_file = temporary_basedir / evaluation_filename
         result = runner.invoke(
             evaluate,
             [
                 "-vv",
                 "montgomery",
                 f"--predictions={str(prediction_path)}",
-                f"--output={str(output_path)}",
+                f"--output-folder={str(temporary_basedir)}",
                 "--threshold=test",
             ],
         )
         _assert_exit_0(result)
 
-        assert output_path.exists()
-        assert output_path.with_suffix(".meta.json").exists()
-        assert output_path.with_suffix(".rst").exists()
-        assert output_path.with_suffix(".pdf").exists()
+        assert evaluation_file.exists()
+        assert evaluation_file.with_suffix(".meta.json").exists()
+        assert evaluation_file.with_suffix(".rst").exists()
+        assert evaluation_file.with_suffix(".pdf").exists()
 
         keywords = {
             r"^Setting --threshold=.*$": 1,