diff --git a/src/mednet/scripts/evaluate.py b/src/mednet/scripts/evaluate.py
index 9e137cce84048b6894dbca9c7130c2baaa52aa83..49f8ad4e89e3f6cd41c69550e31a58c663dc5afb 100644
--- a/src/mednet/scripts/evaluate.py
+++ b/src/mednet/scripts/evaluate.py
@@ -97,10 +97,16 @@ def evaluate(
         aggregate_summaries,
         run_binary,
     )
+    from .utils import execution_metadata, save_json_with_backup
 
     with predictions.open("r") as f:
         predict_data = json.load(f)
 
+    # write execution metadata (dash-separated keys) to a sibling .meta.json file
+    json_data: dict[str, typing.Any] = execution_metadata()
+    json_data = {k.replace("_", "-"): v for k, v in json_data.items()}
+    save_json_with_backup(output.with_suffix(".meta.json"), json_data)
+
     if threshold in predict_data:
         # it is the name of a split
         # first run evaluation for reference dataset
diff --git a/tests/test_cli.py b/tests/test_cli.py
index bd672a62acad68e83df33816a3046aa3f94a1a05..381ef539bc7a4dec82e468456124fa42086be7e5 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -392,6 +392,7 @@ def test_evaluate_pasa_montgomery(temporary_basedir):
         _assert_exit_0(result)
 
         assert output_path.exists()
+        assert output_path.with_suffix(".meta.json").exists()
         assert output_path.with_suffix(".rst").exists()
         assert output_path.with_suffix(".pdf").exists()
 
@@ -462,6 +463,7 @@ def test_experiment(temporary_basedir):
         == 1
     )
     assert (output_folder / "evaluation.json").exists()
+    assert (output_folder / "evaluation.meta.json").exists()
     assert (output_folder / "evaluation.rst").exists()
     assert (output_folder / "evaluation.pdf").exists()
     assert (output_folder / "gradcam" / "saliencies").exists()