Commit bd4d1f75 authored by André Anjos
[tests] Separate test output folders

parent c2483554
1 merge request: !46 Create common library
@@ -137,8 +137,19 @@ def evaluate(
    # register metadata
    json_data: dict[str, typing.Any] = execution_metadata()
+   json_data.update(
+       dict(
+           predictions=str(predictions),
+           output_folder=str(output_folder),
+           threshold=threshold,
+           binning=binning,
+           plot=plot,
+       ),
+   )
    json_data = {k.replace("_", "-"): v for k, v in json_data.items()}
-   save_json_with_backup(evaluation_file.with_suffix(".meta.json"), json_data)
+   evaluation_meta = evaluation_file.with_suffix(".meta.json")
+   logger.info(f"Saving evaluation metadata at `{str(evaluation_meta)}`...")
+   save_json_with_backup(evaluation_meta, json_data)
    if threshold in predict_data:
        # it is the name of a split
@@ -161,7 +172,7 @@ def evaluate(
    results: dict[str, dict[str, typing.Any]] = dict()
    for k, v in predict_data.items():
-       logger.info(f"Analyzing split `{k}`...")
+       logger.info(f"Computing performance on split `{k}`...")
        results[k] = run_binary(
            name=k,
            predictions=v,
@@ -170,7 +181,7 @@ def evaluate(
        )
    # records full result analysis to a JSON file
-   logger.info(f"Saving evaluation results at `{evaluation_file}`...")
+   logger.info(f"Saving evaluation results at `{str(evaluation_file)}`...")
    with evaluation_file.open("w") as f:
        json.dump(results, f, indent=2, cls=NumpyJSONEncoder)
@@ -190,11 +201,10 @@ def evaluate(
    with table_path.open("w") as f:
        f.write(table)
-   # dump evaluation plots in file
-   figure_path = evaluation_file.with_suffix(".pdf")
-   logger.info(f"Saving evaluation figures at `{figure_path}`...")
    if plot:
+       figure_path = evaluation_file.with_suffix(".pdf")
+       logger.info(f"Saving evaluation figures at `{str(figure_path)}`...")
        with PdfPages(figure_path) as pdf:
            pr_curves = {k: v["curves"]["precision_recall"] for k, v in results.items()}
            pr_fig = aggregate_pr(pr_curves)
...
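Editorial note: the hunk above funnels several figures into a single PDF only when plotting is requested. The pattern (one PdfPages handle collecting pages) is standard matplotlib; here is a minimal, self-contained sketch of it, with all file names illustrative rather than taken from this repository:

# Minimal sketch: several figures written into one PDF, as the change above
# does for evaluation plots.  Names are illustrative, not from mednet.
import matplotlib

matplotlib.use("agg")  # headless backend, no X11 display required

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

with PdfPages("evaluation.pdf") as pdf:
    for title in ("precision-recall", "roc"):
        fig, ax = plt.subplots()
        ax.set_title(title)
        ax.plot([0.0, 0.5, 1.0], [1.0, 0.8, 0.0], label="test")
        ax.legend(loc="best")
        pdf.savefig(fig)  # appends one page per figure
        plt.close(fig)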
@@ -61,8 +61,8 @@ def experiment(
    \b
    └─ <output-folder>/
       ├── model/  # the generated model will be here
-      ├── predictions.json  # the prediction outputs for the sets
-      └── evaluation/  # the outputs of the evaluations for the sets
+      ├── predictions.json  # the prediction outputs
+      └── evaluation.json  # the evaluation outputs
    """
    experiment_start_timestamp = datetime.now()
@@ -112,11 +112,9 @@ def experiment(
    from .predict import predict
-   predictions_output = output_folder / "predictions"
    ctx.invoke(
        predict,
-       output_folder=predictions_output,
+       output_folder=output_folder,
        model=model,
        datamodule=datamodule,
        device=device,
@@ -134,9 +132,9 @@ def experiment(
    from .evaluate import evaluate
-   predictions_file = predictions_output / "predictions.json"
-   with (predictions_output / "predictions.json").open() as pf:
+   predictions_file = output_folder / "predictions.json"
+   with (output_folder / "predictions.json").open() as pf:
        splits = json.load(pf).keys()
    if "validation" in splits:
@@ -159,56 +157,6 @@ def experiment(
        f"Prediction runtime: {evaluation_stop_timestamp-evaluation_start_timestamp}"
    )
-   saliency_map_generation_start_timestamp = datetime.now()
-   logger.info(
-       f"Started saliency map generation at {saliency_map_generation_start_timestamp}"
-   )
-   from .saliency.generate import generate
-   saliencies_gen_folder = output_folder / "gradcam" / "saliencies"
-   ctx.invoke(
-       generate,
-       model=model,
-       datamodule=datamodule,
-       weight=train_output_folder,
-       output_folder=saliencies_gen_folder,
-   )
-   saliency_map_generation_stop_timestamp = datetime.now()
-   logger.info(
-       f"Ended saliency map generation at {saliency_map_generation_stop_timestamp}"
-   )
-   logger.info(
-       f"Saliency map generation runtime: {saliency_map_generation_stop_timestamp-saliency_map_generation_start_timestamp}"
-   )
-   saliency_images_generation_start_timestamp = datetime.now()
-   logger.info(
-       f"Started generating saliency images at {saliency_images_generation_start_timestamp}"
-   )
-   from .saliency.view import view
-   saliencies_view_folder = output_folder / "gradcam" / "visualizations"
-   ctx.invoke(
-       view,
-       model=model,
-       datamodule=datamodule,
-       input_folder=saliencies_gen_folder,
-       output_folder=saliencies_view_folder,
-   )
-   saliency_images_generation_stop_timestamp = datetime.now()
-   logger.info(
-       f"Ended saliency images generation at {saliency_images_generation_stop_timestamp}"
-   )
-   logger.info(
-       f"Saliency images generation runtime: {saliency_images_generation_stop_timestamp-saliency_images_generation_start_timestamp}"
-   )
    experiment_stop_timestamp = datetime.now()
    logger.info(
        f"Total experiment runtime: {experiment_stop_timestamp-experiment_start_timestamp}"
...
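Editorial note: the experiment wrapper above chains the predict and evaluate commands through click's Context.invoke, now passing the same output folder to both steps. A minimal sketch of that chaining pattern follows; the command bodies are hypothetical placeholders, not the repository's code:

# Hypothetical sketch of chaining click commands with Context.invoke,
# mirroring how `experiment` calls `predict` and then `evaluate`.
import pathlib

import click


@click.command()
@click.option("--output-folder", type=click.Path(path_type=pathlib.Path))
def predict(output_folder: pathlib.Path) -> None:
    output_folder.mkdir(parents=True, exist_ok=True)
    (output_folder / "predictions.json").write_text("{}")


@click.command()
@click.option("--predictions", type=click.Path(path_type=pathlib.Path))
def evaluate(predictions: pathlib.Path) -> None:
    click.echo(f"evaluating {predictions}")


@click.command()
@click.option("--output-folder", type=click.Path(path_type=pathlib.Path))
@click.pass_context
def experiment(ctx: click.Context, output_folder: pathlib.Path) -> None:
    # both steps write into the same folder, as in the change above
    ctx.invoke(predict, output_folder=output_folder)
    ctx.invoke(evaluate, predictions=output_folder / "predictions.json")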
@@ -14,6 +14,9 @@ from mednet.libs.segmentation.engine.evaluator import SUPPORTED_METRIC_TYPE
logger = setup("mednet")
+# avoids X11/graphical desktop requirement when creating plots
+__import__("matplotlib").use("agg")
def validate_threshold(threshold: float | str, splits: list[str]):
    """Validate the user threshold selection and returns parsed threshold.
@@ -89,17 +92,6 @@ def validate_threshold(threshold: float | str, splits: list[str]):
    default="results",
    cls=ResourceOption,
)
-# @click.option(
-#     "--second-annotator",
-#     "-a",
-#     help="""A datamodule containing annotations from another annotator, that
-#     will be compared to the ground-truth (reference annotator) in each
-#     sample.""",
-#     required=False,
-#     default=None,
-#     cls=ResourceOption,
-#     show_default=True,
-# )
@click.option(
    "--threshold",
    "-t",
@@ -138,14 +130,43 @@ def validate_threshold(threshold: float | str, splits: list[str]):
    required=True,
    cls=ResourceOption,
)
+@click.option(
+    "--compare-annotator",
+    "-a",
+    help="""Path to a JSON file as produced by the CLI ``dump-annotations``,
+    containing splits and sample lists with associated HDF5 files where we can
+    find pre-processed annotation masks.  These annotations will be compared
+    with the target annotations on the main predictions.  In this case, a row
+    is added for each available split in the evaluation table.""",
+    required=False,
+    default=None,
+    type=click.Path(
+        file_okay=True,
+        dir_okay=False,
+        writable=False,
+        path_type=pathlib.Path,
+    ),
+    cls=ResourceOption,
+)
+@click.option(
+    "--plot/--no-plot",
+    "-P",
+    help="""If set, then also produces figures containing the plots of
+    performance curves and score histograms.""",
+    required=True,
+    show_default=True,
+    default=True,
+    cls=ResourceOption,
+)
@verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
def evaluate(
    predictions: pathlib.Path,
    output_folder: pathlib.Path,
    threshold: str | float,
    metric: str,
-   # second_annotator,
    steps: int,
+   compare_annotator: pathlib.Path,
+   plot: bool,
    **_,  # ignored
):  # numpydoc ignore=PR01
    """Evaluate predictions (from a model) on a segmentation task."""
@@ -185,10 +206,14 @@ def evaluate(
            threshold=threshold,
            metric=metric,
            steps=steps,
+           compare_annotator=str(compare_annotator),
+           plot=plot,
        ),
    )
    json_data = {k.replace("_", "-"): v for k, v in json_data.items()}
-   save_json_with_backup(evaluation_file.with_suffix(".meta.json"), json_data)
+   evaluation_meta = evaluation_file.with_suffix(".meta.json")
+   logger.info(f"Saving evaluation metadata at `{str(evaluation_meta)}`...")
+   save_json_with_backup(evaluation_meta, json_data)
    threshold = validate_threshold(threshold, predict_data)
    threshold_list = numpy.arange(
@@ -211,7 +236,7 @@ def evaluate(
    if isinstance(threshold, str):
        # Compute threshold on specified split, if required
-       logger.info(f"Evaluating threshold on `{threshold}` split using " f"`{metric}`")
+       logger.info(f"Evaluating threshold on split `{threshold}` using " f"`{metric}`")
        metric_list = compute_metric(
            eval_json_data[threshold]["counts"].values(),
            name2metric(typing.cast(SUPPORTED_METRIC_TYPE, metric)),
@@ -230,14 +255,12 @@ def evaluate(
        threshold_index = (numpy.abs(threshold_list - threshold)).argmin()
        logger.info(f"Set --threshold={threshold_list[threshold_index]:.4f}")
-   logger.info("Tabulating performance summary...")
-   table_format = "rst"
-   output_table = output_folder / "evaluation.rst"
    metrics_available = list(typing.get_args(SUPPORTED_METRIC_TYPE))
    table_headers = ["Dataset", "threshold"] + metrics_available + ["auroc", "avgprec"]
    table_data = []
    for split_name in predict_data.keys():
+       logger.info("Computing performance on split `{split_name}`...")
        counts = list(eval_json_data[split_name]["counts"].values())
        base_metrics = all_metrics(*counts[threshold_index])
        table_data.append([split_name, threshold_list[threshold_index]] + base_metrics)
@@ -266,10 +289,11 @@ def evaluate(
    # records full result analysis to a JSON file
    evaluation_file = output_folder / "evaluation.json"
-   logger.info(f"Saving evaluation results at `{evaluation_file}`...")
+   logger.info(f"Saving evaluation results at `{str(evaluation_file)}`...")
    with evaluation_file.open("w") as f:
        json.dump(eval_json_data, f, indent=2, cls=NumpyJSONEncoder)
+   table_format = "rst"
    table = tabulate.tabulate(
        table_data,
        table_headers,
@@ -278,41 +302,44 @@ def evaluate(
        stralign="right",
    )
    click.echo(table)
-   logger.info(f"Saving table at {output_table}...")
+   output_table = output_folder / "evaluation.rst"
+   logger.info(f"Saving tabulated performance summary at `{str(output_table)}`...")
    output_table.parent.mkdir(parents=True, exist_ok=True)
    with output_table.open("w") as f:
        f.write(table)
-   logger.info("Plotting performance curves...")
-   output_figure = output_folder / "evaluation.pdf"
-   logger.info(f"Saving figures at {output_figure}...")
-   with matplotlib.backends.backend_pdf.PdfPages(output_figure) as pdf:
-       with credible.plot.tight_layout(
-           ("False Positive Rate", "True Positive Rate"), "ROC"
-       ) as (
-           fig,
-           ax,
-       ):
-           for split_name, data in eval_json_data.items():
-               ax.plot(
-                   data["curves"]["roc"]["fpr"],
-                   data["curves"]["roc"]["tpr"],
-                   label=f"{split_name} (AUC: {data['auc_score']:.2f})",
-               )
-           ax.legend(loc="best", fancybox=True, framealpha=0.7)
-           pdf.savefig(fig)
-       with credible.plot.tight_layout_f1iso(
-           ("Recall", "Precision"), "Precison-Recall"
-       ) as (
-           fig,
-           ax,
-       ):
-           for split_name, data in eval_json_data.items():
-               ax.plot(
-                   data["curves"]["precision_recall"]["precision"],
-                   data["curves"]["precision_recall"]["recall"],
-                   label=f"{split_name} (AP: {data['average_precision_score']:.2f})",
-               )
-           ax.legend(loc="best", fancybox=True, framealpha=0.7)
-           pdf.savefig(fig)
+   if plot:
+       figure_path = evaluation_file.with_suffix(".pdf")
+       logger.info(f"Saving evaluation figures at `{str(figure_path)}`...")
+       with matplotlib.backends.backend_pdf.PdfPages(figure_path) as pdf:
+           with credible.plot.tight_layout(
+               ("False Positive Rate", "True Positive Rate"), "ROC"
+           ) as (
+               fig,
+               ax,
+           ):
+               for split_name, data in eval_json_data.items():
+                   ax.plot(
+                       data["curves"]["roc"]["fpr"],
+                       data["curves"]["roc"]["tpr"],
+                       label=f"{split_name} (AUC: {data['auc_score']:.2f})",
+                   )
+               ax.legend(loc="best", fancybox=True, framealpha=0.7)
+               pdf.savefig(fig)
+           with credible.plot.tight_layout_f1iso(
+               ("Recall", "Precision"), "Precison-Recall"
+           ) as (
+               fig,
+               ax,
+           ):
+               for split_name, data in eval_json_data.items():
+                   ax.plot(
+                       data["curves"]["precision_recall"]["precision"],
+                       data["curves"]["precision_recall"]["recall"],
+                       label=f"{split_name} (AP: {data['average_precision_score']:.2f})",
+                   )
+               ax.legend(loc="best", fancybox=True, framealpha=0.7)
+               pdf.savefig(fig)
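Editorial note: two small patterns carry this hunk: selecting matplotlib's "agg" backend at import time so figures can be rendered without an X11 display, and a click on/off flag (--plot/--no-plot) that gates figure generation. A condensed sketch of both, with the command body as a placeholder only:

# Sketch of a headless-plotting setup plus a boolean click flag, mirroring
# the --plot/--no-plot option added above.  The command body is illustrative.
import matplotlib

matplotlib.use("agg")  # must run before pyplot creates a figure

import click


@click.command()
@click.option(
    "--plot/--no-plot",
    "-P",
    default=True,
    show_default=True,
    help="Also produce PDF figures with performance curves.",
)
def evaluate(plot: bool) -> None:
    if plot:
        click.echo("would save evaluation.pdf")
    else:
        click.echo("skipping figure generation")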
@@ -60,8 +60,8 @@ def experiment(
    \b
    └─ <output-folder>/
       ├── model/  # the generated model will be here
-      ├── predictions  # the prediction outputs for the sets
-      └── evaluation/  # the outputs of the evaluations for the sets
+      ├── predictions.json  # the prediction outputs
+      └── evaluation.json  # the evaluation outputs
    """
    experiment_start_timestamp = datetime.now()
@@ -110,11 +110,9 @@ def experiment(
    from .predict import predict
-   predictions_output = output_folder / "predictions"
    ctx.invoke(
        predict,
-       output_folder=predictions_output,
+       output_folder=output_folder,
        model=model,
        datamodule=datamodule,
        device=device,
@@ -132,11 +130,9 @@ def experiment(
    from .evaluate import evaluate
-   evaluation_output = output_folder / "evaluation"
-   predictions_file = predictions_output / "predictions.json"
-   with (predictions_output / "predictions.json").open() as pf:
+   predictions_file = output_folder / "predictions.json"
+   with (predictions_file).open() as pf:
        splits = json.load(pf).keys()
    if "validation" in splits:
@@ -149,7 +145,7 @@ def experiment(
    ctx.invoke(
        evaluate,
        predictions=predictions_file,
-       output_folder=evaluation_output,
+       output_folder=output_folder,
        threshold=evaluation_threshold,
        # metric="f1",
        # steps=100,
...
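Editorial note: both experiment wrappers now write predictions.json and the evaluation outputs side by side in a single output folder, and the evaluation scripts derive the sibling file names from one base path with pathlib. A small illustration of that derivation (the folder name is hypothetical):

# Illustration of deriving sibling output files from one base path with
# pathlib, as done for evaluation.json / evaluation.meta.json / evaluation.pdf.
import pathlib

evaluation_file = pathlib.Path("results") / "evaluation.json"

print(evaluation_file.with_suffix(".meta.json"))  # results/evaluation.meta.json
print(evaluation_file.with_suffix(".pdf"))        # results/evaluation.pdf
print(evaluation_file.parent / "evaluation.rst")  # results/evaluation.rst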
@@ -185,7 +185,7 @@ def test_upload_help():
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.montgomery")
-def test_train_pasa_montgomery(temporary_basedir):
+def test_train_pasa_montgomery(session_tmp_path):
    from mednet.libs.classification.scripts.train import train
    from mednet.libs.common.utils.checkpointer import (
        CHECKPOINT_EXTENSION,
@@ -195,7 +195,7 @@ def test_train_pasa_montgomery(temporary_basedir):
    runner = CliRunner()
    with stdout_logging() as buf:
-       output_folder = temporary_basedir / "classification" / "results"
+       output_folder = session_tmp_path / "classification-standalone"
        result = runner.invoke(
            train,
            [
@@ -241,8 +241,8 @@ def test_train_pasa_montgomery(temporary_basedir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.montgomery")
-def test_train_pasa_montgomery_from_checkpoint(temporary_basedir):
-   from mednet.libs.classification.scripts.train import train
+def test_predict_pasa_montgomery(session_tmp_path):
+   from mednet.libs.classification.scripts.predict import predict
    from mednet.libs.common.utils.checkpointer import (
        CHECKPOINT_EXTENSION,
        _get_checkpoint_from_alias,
@@ -250,62 +250,36 @@ def test_train_pasa_montgomery_from_checkpoint(temporary_basedir):
    runner = CliRunner()
-   output_folder = temporary_basedir / "classification" / "results" / "pasa_checkpoint"
-   result0 = runner.invoke(
-       train,
-       [
-           "pasa",
-           "montgomery",
-           "-vv",
-           "--epochs=1",
-           "--batch-size=1",
-           f"--output-folder={str(output_folder)}",
-       ],
-   )
-   _assert_exit_0(result0)
-   # asserts checkpoints are there, or raises FileNotFoundError
-   last = _get_checkpoint_from_alias(output_folder, "periodic")
-   assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
-   best = _get_checkpoint_from_alias(output_folder, "best")
-   assert best.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
-   assert (output_folder / "meta.json").exists()
-   assert len(list((output_folder / "logs").glob("events.out.tfevents.*"))) == 1
    with stdout_logging() as buf:
+       output_folder = session_tmp_path / "classification-standalone"
+       last = _get_checkpoint_from_alias(output_folder, "periodic")
+       assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
        result = runner.invoke(
-           train,
+           predict,
            [
                "pasa",
                "montgomery",
                "-vv",
-               "--epochs=2",
                "--batch-size=1",
-               f"--output-folder={output_folder}",
+               f"--weight={str(last)}",
+               f"--output-folder={str(output_folder)}",
            ],
        )
        _assert_exit_0(result)
-       # asserts checkpoints are there, or raises FileNotFoundError
-       last = _get_checkpoint_from_alias(output_folder, "periodic")
-       assert last.name.endswith("epoch=1" + CHECKPOINT_EXTENSION)
-       best = _get_checkpoint_from_alias(output_folder, "best")
-       assert (output_folder / "meta.json").exists()
-       assert len(list((output_folder / "logs").glob("events.out.tfevents.*"))) == 2
+       assert (output_folder / "predictions.meta.json").exists()
+       assert (output_folder / "predictions.json").exists()
        keywords = {
-           r"^Loading dataset:`train` without caching. Trade-off: CPU RAM usage: less | Disk I/O: more.$": 1,
-           r"^Loading dataset:`validation` without caching. Trade-off: CPU RAM usage: less | Disk I/O: more.$": 1,
-           r"^Applying train/valid loss balancing...$": 1,
-           r"^Training for at most 2 epochs.$": 1,
-           r"^Resuming from epoch 0 \(checkpoint file: .*$": 1,
-           r"^Writing run metadata at.*$": 1,
-           r"^Dataset `train` is already setup. Not re-instantiating it.$": 1,
-           r"^Dataset `validation` is already setup. Not re-instantiating it.$": 1,
+           r"^Loading dataset: * without caching. Trade-off: CPU RAM usage: less | Disk I/O: more$": 3,
+           r"^Loading checkpoint from .*$": 1,
            r"^Restoring normalizer from checkpoint.$": 1,
+           r"^Running prediction on `train` split...$": 1,
+           r"^Running prediction on `validation` split...$": 1,
+           r"^Running prediction on `test` split...$": 1,
+           r"^Predictions saved to .*$": 1,
        }
        buf.seek(0)
        logging_output = buf.read()
@@ -319,47 +293,37 @@ def test_train_pasa_montgomery_from_checkpoint(temporary_basedir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.montgomery")
-def test_predict_pasa_montgomery(temporary_basedir, datadir):
-   from mednet.libs.classification.scripts.predict import predict
-   from mednet.libs.common.utils.checkpointer import (
-       CHECKPOINT_EXTENSION,
-       _get_checkpoint_from_alias,
-   )
+def test_evaluate_pasa_montgomery(session_tmp_path):
+   from mednet.libs.classification.scripts.evaluate import evaluate
    runner = CliRunner()
    with stdout_logging() as buf:
-       output = temporary_basedir / "classification" / "predictions"
-       last = _get_checkpoint_from_alias(
-           temporary_basedir / "classification" / "results",
-           "periodic",
-       )
-       assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
+       output_folder = session_tmp_path / "classification-standalone"
        result = runner.invoke(
-           predict,
+           evaluate,
            [
-               "pasa",
-               "montgomery",
                "-vv",
-               "--batch-size=1",
-               f"--weight={str(last)}",
-               f"--output-folder={str(output)}",
+               f"--predictions={str(output_folder / 'predictions.json')}",
+               f"--output-folder={str(output_folder)}",
+               "--threshold=test",
            ],
        )
        _assert_exit_0(result)
-       assert output.exists()
+       assert (output_folder / "evaluation.json").exists()
+       assert (output_folder / "evaluation.meta.json").exists()
+       assert (output_folder / "evaluation.rst").exists()
+       assert (output_folder / "evaluation.pdf").exists()
        keywords = {
-           r"^Loading dataset: * without caching. Trade-off: CPU RAM usage: less | Disk I/O: more$": 3,
-           r"^Loading checkpoint from .*$": 1,
-           r"^Restoring normalizer from checkpoint.$": 1,
-           r"^Running prediction on `train` split...$": 1,
-           r"^Running prediction on `validation` split...$": 1,
-           r"^Running prediction on `test` split...$": 1,
-           r"^Predictions saved to .*$": 1,
+           r"^Saving evaluation metadata at .*$": 1,
+           r"^Setting --threshold=.*$": 1,
+           r"^Computing performance on split .*...$": 3,
+           r"^Saving evaluation results at .*$": 1,
+           r"^Saving evaluation results in table format at .*$": 1,
+           r"^Saving evaluation figures at .*$": 1,
        }
        buf.seek(0)
        logging_output = buf.read()
@@ -373,39 +337,69 @@ def test_predict_pasa_montgomery(temporary_basedir, datadir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.montgomery")
-def test_evaluate_pasa_montgomery(temporary_basedir):
-   from mednet.libs.classification.scripts.evaluate import evaluate
+def test_train_pasa_montgomery_from_checkpoint(tmp_path):
+   from mednet.libs.classification.scripts.train import train
+   from mednet.libs.common.utils.checkpointer import (
+       CHECKPOINT_EXTENSION,
+       _get_checkpoint_from_alias,
+   )
    runner = CliRunner()
-   with stdout_logging() as buf:
-       prediction_path = temporary_basedir / "classification" / "predictions"
-       predictions_file = prediction_path / "predictions.json"
-       evaluation_path = temporary_basedir / "classification" / "evaluations"
+   result0 = runner.invoke(
+       train,
+       [
+           "pasa",
+           "montgomery",
+           "-vv",
+           "--epochs=1",
+           "--batch-size=1",
+           f"--output-folder={str(tmp_path)}",
+       ],
+   )
+   _assert_exit_0(result0)
+   # asserts checkpoints are there, or raises FileNotFoundError
+   last = _get_checkpoint_from_alias(tmp_path, "periodic")
+   assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
+   best = _get_checkpoint_from_alias(tmp_path, "best")
+   assert best.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
+   assert (tmp_path / "meta.json").exists()
+   assert len(list((tmp_path / "logs").glob("events.out.tfevents.*"))) == 1
+   with stdout_logging() as buf:
        result = runner.invoke(
-           evaluate,
+           train,
            [
+               "pasa",
+               "montgomery",
                "-vv",
-               "montgomery",
-               f"--predictions={predictions_file}",
-               f"--output-folder={evaluation_path}",
-               "--threshold=test",
+               "--epochs=2",
+               "--batch-size=1",
+               f"--output-folder={tmp_path}",
            ],
        )
        _assert_exit_0(result)
-       assert (evaluation_path / "evaluation.json").exists()
-       assert (evaluation_path / "evaluation.meta.json").exists()
-       assert (evaluation_path / "evaluation.pdf").exists()
-       assert (evaluation_path / "evaluation.rst").exists()
+       # asserts checkpoints are there, or raises FileNotFoundError
+       last = _get_checkpoint_from_alias(tmp_path, "periodic")
+       assert last.name.endswith("epoch=1" + CHECKPOINT_EXTENSION)
+       best = _get_checkpoint_from_alias(tmp_path, "best")
+       assert (tmp_path / "meta.json").exists()
+       assert len(list((tmp_path / "logs").glob("events.out.tfevents.*"))) == 2
        keywords = {
-           r"^Setting --threshold=.*$": 1,
-           r"^Analyzing split `train`...$": 1,
-           r"^Analyzing split `validation`...$": 1,
-           r"^Analyzing split `test`...$": 1,
-           r"^Saving evaluation results .*$": 2,
-           r"^Saving evaluation figures at .*$": 1,
+           r"^Loading dataset:`train` without caching. Trade-off: CPU RAM usage: less | Disk I/O: more.$": 1,
+           r"^Loading dataset:`validation` without caching. Trade-off: CPU RAM usage: less | Disk I/O: more.$": 1,
+           r"^Applying train/valid loss balancing...$": 1,
+           r"^Training for at most 2 epochs.$": 1,
+           r"^Resuming from epoch 0 \(checkpoint file: .*$": 1,
+           r"^Writing run metadata at.*$": 1,
+           r"^Dataset `train` is already setup. Not re-instantiating it.$": 1,
+           r"^Dataset `validation` is already setup. Not re-instantiating it.$": 1,
+           r"^Restoring normalizer from checkpoint.$": 1,
        }
        buf.seek(0)
        logging_output = buf.read()
@@ -420,12 +414,11 @@ def test_evaluate_pasa_montgomery(temporary_basedir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.montgomery")
-def test_experiment(temporary_basedir):
+def test_experiment(tmp_path):
    from mednet.libs.classification.scripts.experiment import experiment
    runner = CliRunner()
-   output_folder = temporary_basedir / "classification" / "experiment"
    num_epochs = 2
    result = runner.invoke(
        experiment,
@@ -434,61 +427,39 @@ def test_experiment(temporary_basedir):
            "pasa",
            "montgomery",
            f"--epochs={num_epochs}",
-           f"--output-folder={str(output_folder)}",
+           f"--output-folder={str(tmp_path)}",
        ],
    )
    _assert_exit_0(result)
-   assert (output_folder / "model" / "meta.json").exists()
-   assert (output_folder / "model" / f"model-at-epoch={num_epochs-1}.ckpt").exists()
-   assert (output_folder / "predictions" / "predictions.json").exists()
-   assert (output_folder / "predictions" / "predictions.meta.json").exists()
+   assert (tmp_path / "model" / "meta.json").exists()
+   assert (tmp_path / "model" / f"model-at-epoch={num_epochs-1}.ckpt").exists()
+   assert (tmp_path / "predictions.json").exists()
+   assert (tmp_path / "predictions.meta.json").exists()
    # Need to glob because we cannot be sure of the checkpoint with lowest validation loss
    assert (
        len(
            list(
-               (output_folder / "model").glob(
-                   "model-at-lowest-validation-loss-epoch=*.ckpt",
-               ),
-           ),
+               (tmp_path / "model").glob(
+                   "model-at-lowest-validation-loss-epoch=*.ckpt"
+               )
+           )
        )
        == 1
    )
-   assert (output_folder / "model" / "trainlog.pdf").exists()
+   assert (tmp_path / "model" / "trainlog.pdf").exists()
    assert (
        len(
            list(
-               (output_folder / "model" / "logs").glob(
+               (tmp_path / "model" / "logs").glob(
                    "events.out.tfevents.*",
                ),
            ),
        )
        == 1
    )
-   assert (output_folder / "evaluation.json").exists()
-   assert (output_folder / "evaluation.meta.json").exists()
-   assert (output_folder / "evaluation.rst").exists()
-   assert (output_folder / "evaluation.pdf").exists()
-   assert (output_folder / "gradcam" / "saliencies").exists()
-   assert (
-       len(
-           list(
-               (output_folder / "gradcam" / "saliencies" / "CXR_png").glob(
-                   "MCUCXR_*.npy",
-               ),
-           ),
-       )
-       == 138
-   )
-   assert (output_folder / "gradcam" / "visualizations").exists()
-   assert (
-       len(
-           list(
-               (output_folder / "gradcam" / "visualizations" / "CXR_png").glob(
-                   "MCUCXR_*.png",
-               ),
-           ),
-       )
-       == 58
-   )
+   assert (tmp_path / "evaluation.json").exists()
+   assert (tmp_path / "evaluation.meta.json").exists()
+   assert (tmp_path / "evaluation.rst").exists()
+   assert (tmp_path / "evaluation.pdf").exists()
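Editorial note: the keywords dictionaries in these tests map a regular expression to the number of lines of captured log output that must match it. The repository's actual checking helper is not shown in this diff; the following is only an assumed sketch of what such an assertion could look like:

# Assumed sketch of checking a `keywords` dictionary against captured log
# output; mednet's real helper may differ.
import re


def assert_log_counts(logging_output: str, keywords: dict[str, int]) -> None:
    for pattern, expected in keywords.items():
        hits = sum(
            1 for line in logging_output.splitlines() if re.match(pattern, line)
        )
        assert hits == expected, (
            f"pattern {pattern!r} matched {hits} lines, expected {expected}"
        )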
@@ -95,7 +95,7 @@ def rc_variable_set(name):
@pytest.fixture(scope="session")
-def temporary_basedir(tmp_path_factory):
+def session_tmp_path(tmp_path_factory):
    return tmp_path_factory.mktemp("test-cli")
...
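Editorial note: renaming the fixture to session_tmp_path underlines that it is created once per test session, so the predict and evaluate tests can reuse artifacts written by the preceding train test, while self-contained tests (such as checkpoint resumption) keep using pytest's per-test tmp_path. A condensed sketch of the sharing pattern, with placeholder test bodies rather than the real mednet tests:

# Condensed sketch of session-shared vs. per-test output folders in pytest.
import pytest


@pytest.fixture(scope="session")
def session_tmp_path(tmp_path_factory):
    # one folder shared by all tests in the session
    return tmp_path_factory.mktemp("test-cli")


def test_train(session_tmp_path):
    (session_tmp_path / "model.ckpt").write_text("weights")


def test_predict(session_tmp_path):
    # relies on test_train having run earlier in the same session
    assert (session_tmp_path / "model.ckpt").exists()


def test_resume_from_checkpoint(tmp_path):
    # unrelated state goes into an isolated, per-test folder instead
    assert tmp_path.exists()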
@@ -153,7 +153,7 @@ def test_evaluate_help():
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.drive")
-def test_train_lwnet_drive(temporary_basedir):
+def test_train_lwnet_drive(session_tmp_path):
    from mednet.libs.common.utils.checkpointer import (
        CHECKPOINT_EXTENSION,
        _get_checkpoint_from_alias,
@@ -163,7 +163,7 @@ def test_train_lwnet_drive(temporary_basedir):
    runner = CliRunner()
    with stdout_logging() as buf:
-       output_folder = temporary_basedir / "segmentation" / "results"
+       output_folder = session_tmp_path / "segmentation-standalone"
        result = runner.invoke(
            train,
            [
@@ -206,68 +206,44 @@ def test_train_lwnet_drive(temporary_basedir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.drive")
-def test_train_lwnet_drive_from_checkpoint(temporary_basedir):
+def test_predict_lwnet_drive(session_tmp_path):
    from mednet.libs.common.utils.checkpointer import (
        CHECKPOINT_EXTENSION,
        _get_checkpoint_from_alias,
    )
-   from mednet.libs.segmentation.scripts.train import train
+   from mednet.libs.segmentation.scripts.predict import predict
    runner = CliRunner()
-   output_folder = temporary_basedir / "segmentation" / "results" / "lwnet_checkpoint"
-   result0 = runner.invoke(
-       train,
-       [
-           "lwnet",
-           "drive",
-           "-vv",
-           "--epochs=1",
-           "--batch-size=1",
-           f"--output-folder={str(output_folder)}",
-       ],
-   )
-   _assert_exit_0(result0)
-   # asserts checkpoints are there, or raises FileNotFoundError
-   last = _get_checkpoint_from_alias(output_folder, "periodic")
-   assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
-   best = _get_checkpoint_from_alias(output_folder, "best")
-   assert best.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
-   assert (output_folder / "meta.json").exists()
-   assert len(list((output_folder / "logs").glob("events.out.tfevents.*"))) == 1
    with stdout_logging() as buf:
+       output_folder = session_tmp_path / "segmentation-standalone"
+       last_ckpt = _get_checkpoint_from_alias(output_folder, "periodic")
+       assert last_ckpt.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
        result = runner.invoke(
-           train,
+           predict,
            [
                "lwnet",
                "drive",
                "-vv",
-               "--epochs=2",
                "--batch-size=1",
-               f"--output-folder={output_folder}",
+               f"--weight={str(last_ckpt)}",
+               f"--output-folder={str(output_folder)}",
            ],
        )
        _assert_exit_0(result)
-       # asserts checkpoints are there, or raises FileNotFoundError
-       last = _get_checkpoint_from_alias(output_folder, "periodic")
-       assert last.name.endswith("epoch=1" + CHECKPOINT_EXTENSION)
-       best = _get_checkpoint_from_alias(output_folder, "best")
-       assert (output_folder / "meta.json").exists()
-       assert len(list((output_folder / "logs").glob("events.out.tfevents.*"))) == 2
+       assert (output_folder / "predictions.meta.json").exists()
+       assert (output_folder / "predictions.json").exists()
        keywords = {
-           r"^Loading dataset:`train` without caching. Trade-off: CPU RAM usage: less | Disk I/O: more.$": 1,
-           r"^Training for at most 2 epochs.$": 1,
-           r"^Resuming from epoch 0 \(checkpoint file: .*$": 1,
-           r"^Writing run metadata at.*$": 1,
-           r"^Dataset `train` is already setup. Not re-instantiating it.$": 3,
+           r"^Loading dataset: * without caching. Trade-off: CPU RAM usage: less | Disk I/O: more$": 2,
+           r"^Loading checkpoint from .*$": 1,
            r"^Restoring normalizer from checkpoint.$": 1,
+           r"^Running prediction on `train` split...$": 1,
+           r"^Running prediction on `test` split...$": 1,
+           r"^Predictions saved to .*$": 1,
        }
        buf.seek(0)
        logging_output = buf.read()
@@ -281,46 +257,38 @@ def test_train_lwnet_drive_from_checkpoint(temporary_basedir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.drive")
-def test_predict_lwnet_drive(temporary_basedir, datadir):
-   from mednet.libs.common.utils.checkpointer import (
-       CHECKPOINT_EXTENSION,
-       _get_checkpoint_from_alias,
-   )
-   from mednet.libs.segmentation.scripts.predict import predict
+def test_evaluate_lwnet_drive(session_tmp_path):
+   from mednet.libs.segmentation.scripts.evaluate import evaluate
    runner = CliRunner()
    with stdout_logging() as buf:
-       output = temporary_basedir / "segmentation" / "predictions"
-       last = _get_checkpoint_from_alias(
-           temporary_basedir / "segmentation" / "results",
-           "periodic",
-       )
-       assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
+       output_folder = session_tmp_path / "segmentation-standalone"
        result = runner.invoke(
-           predict,
+           evaluate,
            [
-               "lwnet",
-               "drive",
                "-vv",
-               "--batch-size=1",
-               f"--weight={str(last)}",
-               f"--output-folder={str(output)}",
+               f"--predictions={str(output_folder / 'predictions.json')}",
+               f"--output-folder={str(output_folder)}",
+               "--threshold=test",
            ],
        )
        _assert_exit_0(result)
-       assert output.exists()
+       assert (output_folder / "evaluation.json").exists()
+       assert (output_folder / "evaluation.meta.json").exists()
+       assert (output_folder / "evaluation.rst").exists()
+       assert (output_folder / "evaluation.pdf").exists()
        keywords = {
-           r"^Loading dataset: * without caching. Trade-off: CPU RAM usage: less | Disk I/O: more$": 2,
-           r"^Loading checkpoint from .*$": 1,
-           r"^Restoring normalizer from checkpoint.$": 1,
-           r"^Running prediction on `train` split...$": 1,
-           r"^Running prediction on `test` split...$": 1,
-           r"^Predictions saved to .*$": 1,
+           r"^Saving evaluation metadata at .*$": 1,
+           r"^Counting true/false positive/negatives at split.*$": 2,
+           r"^Evaluating threshold on split .*$": 1,
+           r"^Computing performance on split .*...$": 2,
+           r"^Saving evaluation results at .*$": 1,
+           r"^Saving tabulated performance summary at .*$": 1,
+           r"^Saving evaluation figures at .*$": 1,
        }
        buf.seek(0)
        logging_output = buf.read()
@@ -334,41 +302,66 @@ def test_predict_lwnet_drive(temporary_basedir, datadir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.drive")
-def test_evaluate_lwnet_drive(temporary_basedir):
-   from mednet.libs.segmentation.scripts.evaluate import evaluate
+def test_train_lwnet_drive_from_checkpoint(tmp_path):
+   from mednet.libs.common.utils.checkpointer import (
+       CHECKPOINT_EXTENSION,
+       _get_checkpoint_from_alias,
+   )
+   from mednet.libs.segmentation.scripts.train import train
    runner = CliRunner()
+   result0 = runner.invoke(
+       train,
+       [
+           "lwnet",
+           "drive",
+           "-vv",
+           "--epochs=1",
+           "--batch-size=1",
+           f"--output-folder={str(tmp_path)}",
+       ],
+   )
+   _assert_exit_0(result0)
+   # asserts checkpoints are there, or raises FileNotFoundError
+   last = _get_checkpoint_from_alias(tmp_path, "periodic")
+   assert last.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
+   best = _get_checkpoint_from_alias(tmp_path, "best")
+   assert best.name.endswith("epoch=0" + CHECKPOINT_EXTENSION)
+   assert (tmp_path / "meta.json").exists()
+   assert len(list((tmp_path / "logs").glob("events.out.tfevents.*"))) == 1
    with stdout_logging() as buf:
-       prediction_path = temporary_basedir / "segmentation" / "predictions"
-       predictions_file = prediction_path / "predictions.json"
-       evaluation_path = temporary_basedir / "segmentation" / "evaluations"
        result = runner.invoke(
-           evaluate,
+           train,
            [
-               "-vv",
+               "lwnet",
                "drive",
-               f"--predictions={predictions_file}",
-               f"--output-folder={evaluation_path}",
-               "--threshold=test",
+               "-vv",
+               "--epochs=2",
+               "--batch-size=1",
+               f"--output-folder={tmp_path}",
            ],
        )
        _assert_exit_0(result)
-       assert (evaluation_path / "evaluation.json").exists()
-       assert (evaluation_path / "evaluation.meta.json").exists()
-       assert (evaluation_path / "evaluation.pdf").exists()
-       assert (evaluation_path / "evaluation.rst").exists()
+       # asserts checkpoints are there, or raises FileNotFoundError
+       last = _get_checkpoint_from_alias(tmp_path, "periodic")
+       assert last.name.endswith("epoch=1" + CHECKPOINT_EXTENSION)
+       best = _get_checkpoint_from_alias(tmp_path, "best")
+       assert (tmp_path / "meta.json").exists()
+       assert len(list((tmp_path / "logs").glob("events.out.tfevents.*"))) == 2
        keywords = {
-           r"^Writing run metadata at.*$": 1,
-           r"^Counting true/false positive/negatives at split.*$": 2,
-           r"^Evaluating threshold on.*$": 1,
-           r"^Tabulating performance summary...": 1,
-           r"^Saving evaluation results at.*$": 1,
-           r"^Saving table at .*$": 1,
-           r"^Plotting performance curves...": 1,
-           r"^Saving figures at .*$": 1,
+           r"^Loading dataset:`train` without caching. Trade-off: CPU RAM usage: less | Disk I/O: more.$": 1,
+           r"^Training for at most 2 epochs.$": 1,
+           r"^Resuming from epoch 0 \(checkpoint file: .*$": 1,
+           r"^Writing run metadata at.*$": 1,
+           r"^Dataset `train` is already setup. Not re-instantiating it.$": 3,
+           r"^Restoring normalizer from checkpoint.$": 1,
        }
        buf.seek(0)
        logging_output = buf.read()
@@ -383,12 +376,11 @@ def test_evaluate_lwnet_drive(temporary_basedir):
@pytest.mark.slow
@pytest.mark.skip_if_rc_var_not_set("datadir.drive")
-def test_experiment(temporary_basedir):
+def test_experiment(tmp_path):
    from mednet.libs.segmentation.scripts.experiment import experiment
    runner = CliRunner()
-   output_folder = temporary_basedir / "segmentation" / "experiment"
    num_epochs = 2
    result = runner.invoke(
        experiment,
@@ -397,39 +389,40 @@ def test_experiment(temporary_basedir):
            "lwnet",
            "drive",
            f"--epochs={num_epochs}",
-           f"--output-folder={str(output_folder)}",
+           f"--output-folder={str(tmp_path)}",
        ],
    )
    _assert_exit_0(result)
-   assert (output_folder / "model" / "meta.json").exists()
-   assert (output_folder / "model" / f"model-at-epoch={num_epochs-1}.ckpt").exists()
-   assert (output_folder / "predictions" / "predictions.json").exists()
-   assert (output_folder / "predictions" / "predictions.meta.json").exists()
+   assert (tmp_path / "model" / "meta.json").exists()
+   assert (tmp_path / "model" / f"model-at-epoch={num_epochs-1}.ckpt").exists()
    # Need to glob because we cannot be sure of the checkpoint with lowest validation loss
    assert (
        len(
            list(
-               (output_folder / "model").glob(
+               (tmp_path / "model").glob(
                    "model-at-lowest-validation-loss-epoch=*.ckpt",
                ),
            ),
        )
        == 1
    )
-   assert (output_folder / "model" / "trainlog.pdf").exists()
+   assert (tmp_path / "model" / "trainlog.pdf").exists()
    assert (
        len(
            list(
-               (output_folder / "model" / "logs").glob(
+               (tmp_path / "model" / "logs").glob(
                    "events.out.tfevents.*",
                ),
            ),
        )
        == 1
    )
-   assert (output_folder / "evaluation" / "evaluation.json").exists()
-   assert (output_folder / "evaluation" / "evaluation.meta.json").exists()
-   assert (output_folder / "evaluation" / "evaluation.pdf").exists()
-   assert (output_folder / "evaluation" / "evaluation.rst").exists()
+   assert (tmp_path / "predictions.json").exists()
+   assert (tmp_path / "predictions.meta.json").exists()
+   assert (tmp_path / "evaluation.json").exists()
+   assert (tmp_path / "evaluation.meta.json").exists()
+   assert (tmp_path / "evaluation.pdf").exists()
+   assert (tmp_path / "evaluation.rst").exists()