diff --git a/helpers/visceral_make_splits.py b/helpers/visceral_make_splits.py new file mode 100644 index 0000000000000000000000000000000000000000..1cbe3f56f0bf101555265ad22743dd64a1c45092 --- /dev/null +++ b/helpers/visceral_make_splits.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch> +# +# SPDX-License-Identifier: GPL-3.0-or-later +"""Generate visceral default JSON dataset for 3d binary classification tasks in mednet. + +Arguments of the scripts are as follow: + root-of-preprocessed-visceral-dataset + Full path to the root of the preprocessed visceral dataset. + Filenames in the resulting json are relative to this path. + See output format below. + output-folder + Full path to the folder where to output the default.json file containing the default split of data. + organ_1_id + Integer representing the ID of the first organ to include in the split dataset. + This organ will be labeled as 0 for the binary classification task. + For example, 237 corresponds to bladder. + organ_2_id + Integer representing the ID of the second organ to include in the split dataset. + This organ will be labeled as 1 for the binary classification task. + For example, 237 corresponds to bladder. + +Output format is the following: + +.. code:: json + + { + "train": [ + [ + "<size>/<filename>", + # label is one of: + # 0: organ_1 / 1: organ_2 + <label>, + ], + ... + ], + "validation": [ + # same format as for train + ... + ] + "test": [ + # same format as for train + ... + ] +""" + +import json +import os +import pathlib +import sys + +from sklearn.model_selection import train_test_split + + +def split_files( + files: list[str], + train_size: float = 0.7, + test_size: float = 0.2, + validation_size: float = 0.1, +): + train_files, temp_files = train_test_split(files, test_size=(1 - train_size)) + test_files, validation_files = train_test_split( + temp_files, test_size=(validation_size / (test_size + validation_size)) + ) + return train_files, test_files, validation_files + + +def save_to_json( + train_files: list[str], + test_files: list[str], + validation_files: list[str], + output_file: str, + organ_1_id: str, +): + data = { + "train": [ + [filename, 0 if organ_1_id in filename else 1] for filename in train_files + ], + "test": [ + [filename, 0 if organ_1_id in filename else 1] for filename in test_files + ], + "validation": [ + [filename, 0 if organ_1_id in filename else 1] + for filename in validation_files + ], + } + + with pathlib.Path(output_file).open("w") as json_file: + json.dump(data, json_file, indent=2) + + +def main(): + if len(sys.argv) != 6: + print(__doc__) + print( + f"Usage: python3 {sys.argv[0]} <root-of-preprocessed-visceral-dataset> <output-folder> <organ_1_id> <organ_2_id> <size>" + ) + sys.exit(0) + + root_folder = sys.argv[1] + output_folder = sys.argv[2] + organ_1_id = sys.argv[3] + organ_2_id = sys.argv[4] + size = sys.argv[5] + output_file = pathlib.Path(output_folder) / "default.json" + input_folder = pathlib.Path(root_folder) / size + files = [ + f"{size}/{file}" + for file in os.listdir(input_folder) + if organ_1_id in file or organ_2_id in file + ] + train_files, test_files, validation_files = split_files(files) + + save_to_json(train_files, test_files, validation_files, output_file, organ_1_id) + print(f"Data saved to {output_file}") + + +if __name__ == "__main__": + main()