diff --git a/src/mednet/libs/classification/scripts/upload.py b/src/mednet/libs/classification/scripts/upload.py index 7e95426a7496a72ccdeb9a753161f339a42f577a..1224edf62024090e1dca8a731654f3d38212f488 100644 --- a/src/mednet/libs/classification/scripts/upload.py +++ b/src/mednet/libs/classification/scripts/upload.py @@ -54,123 +54,26 @@ def upload( upload_limit_mb: int, **_, # ignored ) -> None: # numpydoc ignore=PR01 - """Upload results from an experiment folder to GitLab's MLFlow server.""" - - import json - import os - import tempfile - - import mlflow - from mednet.libs.common.utils.checkpointer import ( - get_checkpoint_to_run_inference, - ) - from mednet.libs.common.utils.gitlab import ( - gitlab_instance_and_token, - sanitize_filename, - size_in_mb, - ) - - logger.info("Retrieving GitLab credentials for access to hosted MLFlow server...") - gitlab, token = gitlab_instance_and_token() - project = gitlab.projects.get(project_path) - os.environ["MLFLOW_TRACKING_TOKEN"] = token - os.environ["MLFLOW_TRACKING_URI"] = ( - gitlab.api_url + f"/projects/{project.id}/ml/mlflow" - ) - - # get train files - train_folder = experiment_folder - train_log_file = train_folder / "trainlog.pdf" - train_meta_file = train_folder / "train.meta.json" - train_model_file = get_checkpoint_to_run_inference(train_folder) - train_files = [train_meta_file, train_model_file, train_log_file] - - # get evaluation files - evaluation_file = experiment_folder / "evaluation.json" - evaluation_meta_file = experiment_folder / "evaluation.meta.json" - evaluation_meta_file = experiment_folder / "evaluation.rst" - evaluation_log_file = experiment_folder / "evaluation.pdf" - evaluation_files = [ - evaluation_file, - evaluation_meta_file, - evaluation_log_file, + """Upload results from a classification experiment folder to GitLab's MLFlow server.""" + + from mednet.libs.common.scripts.upload import upload as upload_ + + metrics = [ + "threshold", + "precision", + "recall", + "f1_score", + 
"average_precision_score", + "specificity", + "auc_score", + "accuracy", ] - # checks for maximum upload limit - total_size_mb = sum([size_in_mb(f) for f in train_files + evaluation_files]) - if upload_limit_mb != 0 and total_size_mb > upload_limit_mb: - raise RuntimeError( - f"Total size of upload ({total_size_mb:.2f} MB) exceeds " - f"permitted maximum ({upload_limit_mb:.2f} MB)." - ) - - with train_meta_file.open("r") as meta_file: - train_data = json.load(meta_file) - - with evaluation_file.open("r") as meta_file: - evaluation_data = json.load(meta_file) - evaluation_data = evaluation_data["test"] - - # get lowest validation epoch - best_epoch = str(train_model_file).split(".")[0].split("=")[1] - - experiment_name = ( - experiment_name or f"{train_data['model-name']}-{train_data['database-name']}" - ) - run_name = run_name or train_data["datetime"] - - click.secho( - f"Uploading entry `{run_name}` to experiment `{experiment_name}` " - f"on GitLab project {project_path} (id: {project.id})...", - bold=True, - fg="green", - ) - exp_meta = mlflow.set_experiment(experiment_name=experiment_name) - with mlflow.start_run(run_name=run_name): - click.echo("Uploading package metadata...") - click.echo(f" -> `version` ({train_data['package-version']})") - mlflow.log_param("package version", train_data["package-version"]) - - click.echo("Uploading metrics...") - - for k in [ - "epochs", - "batch-size", - ]: - click.secho(f" -> `{k}` ({train_data[k]})") - mlflow.log_param(k, train_data[k]) - - click.secho(f" -> `#accumulations` ({train_data['accumulate-grad-batches']})") - mlflow.log_param("#Accumulations", train_data["accumulate-grad-batches"]) - click.secho(f" -> `epoch (best)` ({best_epoch})") - mlflow.log_param("Epoch (best)", best_epoch) - - for k in [ - "threshold", - "precision", - "recall", - "f1_score", - "average_precision_score", - "specificity", - "auc_score", - "accuracy", - ]: - click.secho(f" -> `{k}` ({evaluation_data[k]:.3g})") - mlflow.log_metric(k, 
evaluation_data[k]) - - click.echo("Uploading artifacts (files)...") - - with tempfile.TemporaryDirectory() as tmpdir_name: - tmpdir = pathlib.Path(tmpdir_name) - for f in train_files + evaluation_files: - assert f.exists(), f"File `{f}` does not exist - cannot upload!" - clean_path = str(sanitize_filename(tmpdir, f)) - click.secho(f" -> `{clean_path}` ({size_in_mb(f):.2f} MB)") - mlflow.log_artifact(clean_path) - - click.secho(f"Uploaded {total_size_mb:.2f} MB to server.", bold=True, fg="green") - click.secho( - f"Visit {gitlab.url}/{project.path_with_namespace}/-/ml/experiments/{exp_meta.experiment_id}", - bold=True, - fg="blue", + upload_( + project_path, + experiment_folder, + experiment_name, + run_name, + metrics, + upload_limit_mb, ) diff --git a/src/mednet/libs/common/scripts/upload.py b/src/mednet/libs/common/scripts/upload.py index 5302adbc59ac4625413e0831dde20d675b21b45f..3d51efbbbf817de48e648918336e7b7803560748 100644 --- a/src/mednet/libs/common/scripts/upload.py +++ b/src/mednet/libs/common/scripts/upload.py @@ -81,3 +81,140 @@ def reusable_options(f): return f(*args, **kwargs) return wrapper_reusable_options + + +def upload( + project_path: str, + experiment_folder: pathlib.Path, + experiment_name: str, + run_name: str, + metrics: list[str], + upload_limit_mb: int, +) -> None: + """Upload results from an experiment folder to GitLab's MLFlow server. + + Parameters + ---------- + project_path + Path to the project where to upload model entries. + experiment_folder + Directory from which to upload results. + experiment_name + A string indicating the experiment name (e.g. "exp-pasa-mc" or "exp-densenet-mc-ch"). + run_name + A string indicating the run name (e.g. "run-1"). + metrics + List of metrics to upload. + upload_limit_mb + Maximum upload size in MB (set to 0 for no limit). 
+ """ + + import json + import os + import tempfile + + import mlflow + from mednet.libs.common.utils.checkpointer import ( + get_checkpoint_to_run_inference, + ) + from mednet.libs.common.utils.gitlab import ( + gitlab_instance_and_token, + sanitize_filename, + size_in_mb, + ) + + logger.info("Retrieving GitLab credentials for access to hosted MLFlow server...") + gitlab, token = gitlab_instance_and_token() + project = gitlab.projects.get(project_path) + os.environ["MLFLOW_TRACKING_TOKEN"] = token + os.environ["MLFLOW_TRACKING_URI"] = ( + gitlab.api_url + f"/projects/{project.id}/ml/mlflow" + ) + + # get train files + train_folder = experiment_folder + train_log_file = train_folder / "trainlog.pdf" + train_meta_file = train_folder / "train.meta.json" + train_model_file = get_checkpoint_to_run_inference(train_folder) + train_files = [train_meta_file, train_model_file, train_log_file] + + # get evaluation files + evaluation_file = experiment_folder / "evaluation.json" + evaluation_meta_file = experiment_folder / "evaluation.meta.json" + evaluation_meta_file = experiment_folder / "evaluation.rst" + evaluation_log_file = experiment_folder / "evaluation.pdf" + evaluation_files = [ + evaluation_file, + evaluation_meta_file, + evaluation_log_file, + ] + + # checks for maximum upload limit + total_size_mb = sum([size_in_mb(f) for f in train_files + evaluation_files]) + if upload_limit_mb != 0 and total_size_mb > upload_limit_mb: + raise RuntimeError( + f"Total size of upload ({total_size_mb:.2f} MB) exceeds " + f"permitted maximum ({upload_limit_mb:.2f} MB)." 
+ ) + + with train_meta_file.open("r") as meta_file: + train_data = json.load(meta_file) + + with evaluation_file.open("r") as meta_file: + evaluation_data = json.load(meta_file) + evaluation_data = evaluation_data["test"] + + # get lowest validation epoch + best_epoch = str(train_model_file).split(".")[0].split("=")[1] + + experiment_name = ( + experiment_name or f"{train_data['model-name']}-{train_data['database-name']}" + ) + run_name = run_name or train_data["datetime"] + + click.secho( + f"Uploading entry `{run_name}` to experiment `{experiment_name}` " + f"on GitLab project {project_path} (id: {project.id})...", + bold=True, + fg="green", + ) + exp_meta = mlflow.set_experiment(experiment_name=experiment_name) + with mlflow.start_run(run_name=run_name): + click.echo("Uploading package metadata...") + click.echo(f" -> `version` ({train_data['package-version']})") + mlflow.log_param("package version", train_data["package-version"]) + + click.echo("Uploading metrics...") + + for k in [ + "epochs", + "batch-size", + ]: + click.secho(f" -> `{k}` ({train_data[k]})") + mlflow.log_param(k, train_data[k]) + + click.secho(f" -> `#accumulations` ({train_data['accumulate-grad-batches']})") + mlflow.log_param("#Accumulations", train_data["accumulate-grad-batches"]) + click.secho(f" -> `epoch (best)` ({best_epoch})") + mlflow.log_param("Epoch (best)", best_epoch) + + for k in metrics: + click.secho(f" -> `{k}` ({evaluation_data[k]:.3g})") + mlflow.log_metric(k, evaluation_data[k]) + + click.echo("Uploading artifacts (files)...") + + with tempfile.TemporaryDirectory() as tmpdir_name: + tmpdir = pathlib.Path(tmpdir_name) + for f in train_files + evaluation_files: + assert f.exists(), f"File `{f}` does not exist - cannot upload!" 
+ clean_path = str(sanitize_filename(tmpdir, f)) + click.secho(f" -> `{clean_path}` ({size_in_mb(f):.2f} MB)") + mlflow.log_artifact(clean_path) + + click.secho(f"Uploaded {total_size_mb:.2f} MB to server.", bold=True, fg="green") + click.secho( + f"Visit {gitlab.url}/{project.path_with_namespace}/-/ml/experiments/{exp_meta.experiment_id}", + bold=True, + fg="blue", + ) diff --git a/src/mednet/libs/segmentation/scripts/upload.py b/src/mednet/libs/segmentation/scripts/upload.py index 9abbbd277b37855f0bd8a29960908617d78065e7..bd523442a773d159ea4758a91cee047d6b166351 100644 --- a/src/mednet/libs/segmentation/scripts/upload.py +++ b/src/mednet/libs/segmentation/scripts/upload.py @@ -54,122 +54,25 @@ def upload( upload_limit_mb: int, **_, # ignored ) -> None: # numpydoc ignore=PR01 - """Upload results from an experiment folder to GitLab's MLFlow server.""" - - import json - import os - import tempfile - - import mlflow - from mednet.libs.common.utils.checkpointer import ( - get_checkpoint_to_run_inference, - ) - from mednet.libs.common.utils.gitlab import ( - gitlab_instance_and_token, - sanitize_filename, - size_in_mb, - ) - - logger.info("Retrieving GitLab credentials for access to hosted MLFlow server...") - gitlab, token = gitlab_instance_and_token() - project = gitlab.projects.get(project_path) - os.environ["MLFLOW_TRACKING_TOKEN"] = token - os.environ["MLFLOW_TRACKING_URI"] = ( - gitlab.api_url + f"/projects/{project.id}/ml/mlflow" - ) - - # get train files - train_folder = experiment_folder - train_log_file = train_folder / "trainlog.pdf" - train_meta_file = train_folder / "train.meta.json" - train_model_file = get_checkpoint_to_run_inference(train_folder) - train_files = [train_meta_file, train_model_file, train_log_file] - - # get evaluation files - evaluation_file = experiment_folder / "evaluation.json" - evaluation_meta_file = experiment_folder / "evaluation.meta.json" - evaluation_meta_file = experiment_folder / "evaluation.rst" - evaluation_log_file = 
experiment_folder / "evaluation.pdf" - evaluation_files = [ - evaluation_file, - evaluation_meta_file, - evaluation_log_file, + """Upload results from a segmentation experiment folder to GitLab's MLFlow server.""" + + from mednet.libs.common.scripts.upload import upload as upload_ + + metrics = [ + "precision", + "recall", + "f1", + "average_precision_score", + "specificity", + "auc_score", + "accuracy", ] - # checks for maximum upload limit - total_size_mb = sum([size_in_mb(f) for f in train_files + evaluation_files]) - if upload_limit_mb != 0 and total_size_mb > upload_limit_mb: - raise RuntimeError( - f"Total size of upload ({total_size_mb:.2f} MB) exceeds " - f"permitted maximum ({upload_limit_mb:.2f} MB)." - ) - - with train_meta_file.open("r") as meta_file: - train_data = json.load(meta_file) - - with evaluation_file.open("r") as meta_file: - evaluation_data = json.load(meta_file) - evaluation_data = evaluation_data["test"] - - # get lowest validation epoch - best_epoch = str(train_model_file).split(".")[0].split("=")[1] - - experiment_name = ( - experiment_name or f"{train_data['model-name']}-{train_data['database-name']}" - ) - run_name = run_name or train_data["datetime"] - - click.secho( - f"Uploading entry `{run_name}` to experiment `{experiment_name}` " - f"on GitLab project {project_path} (id: {project.id})...", - bold=True, - fg="green", - ) - exp_meta = mlflow.set_experiment(experiment_name=experiment_name) - with mlflow.start_run(run_name=run_name): - click.echo("Uploading package metadata...") - click.echo(f" -> `version` ({train_data['package-version']})") - mlflow.log_param("package version", train_data["package-version"]) - - click.echo("Uploading metrics...") - - for k in [ - "epochs", - "batch-size", - ]: - click.secho(f" -> `{k}` ({train_data[k]})") - mlflow.log_param(k, train_data[k]) - - click.secho(f" -> `#accumulations` ({train_data['accumulate-grad-batches']})") - mlflow.log_param("#Accumulations", train_data["accumulate-grad-batches"]) 
- click.secho(f" -> `epoch (best)` ({best_epoch})") - mlflow.log_param("Epoch (best)", best_epoch) - - for k in [ - "precision", - "recall", - "f1", - "average_precision_score", - "specificity", - "auc_score", - "accuracy", - ]: - click.secho(f" -> `{k}` ({evaluation_data[k]:.3g})") - mlflow.log_metric(k, evaluation_data[k]) - - click.echo("Uploading artifacts (files)...") - - with tempfile.TemporaryDirectory() as tmpdir_name: - tmpdir = pathlib.Path(tmpdir_name) - for f in train_files + evaluation_files: - assert f.exists(), f"File `{f}` does not exist - cannot upload!" - clean_path = str(sanitize_filename(tmpdir, f)) - click.secho(f" -> `{clean_path}` ({size_in_mb(f):.2f} MB)") - mlflow.log_artifact(clean_path) - - click.secho(f"Uploaded {total_size_mb:.2f} MB to server.", bold=True, fg="green") - click.secho( - f"Visit {gitlab.url}/{project.path_with_namespace}/-/ml/experiments/{exp_meta.experiment_id}", - bold=True, - fg="blue", + upload_( + project_path, + experiment_folder, + experiment_name, + run_name, + metrics, + upload_limit_mb, )