diff --git a/.reuse/dep5 b/.reuse/dep5 index 9ddad99c1508e91ea62e11d08860343ad177173d..5314ea6e597528ba571ecdaa3e5686b76102efe5 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -11,6 +11,7 @@ Files: doc/extras.inv doc/extras.txt doc/catalog.json + doc/img/*.png doc/usage/img/*.png doc/results/img/*.jpg doc/results/img/*.png diff --git a/doc/data-model.rst b/doc/data-model.rst new file mode 100644 index 0000000000000000000000000000000000000000..d8b152993eaa97465f539a9e30a6ffd1cabbf579 --- /dev/null +++ b/doc/data-model.rst @@ -0,0 +1,98 @@ +.. Copyright © 2023 Idiap Research Institute <contact@idiap.ch> +.. +.. SPDX-License-Identifier: GPL-3.0-or-later + +.. _mednet.datamodel: + +============ + Data model +============ + +The data model implemented in this package is summarized in the following +figure: + +.. figure:: img/data-model.png + + +Each of the elements is described next. + + +Database +-------- + +Data that is downloaded from a data provider, and contains samples in their raw +data format. The database may contain both data and metadata, and is supposed +to exist on disk (or any other storage device) in an arbitrary location that is +user-configurable, in the user environment. For example, databases 1 and 2 for +user A may be under ``/home/user-a/databases/database-1`` and +``/home/user-a/databases/database-2``, while for user B, they may sit in +``/groups/medical-data/DatabaseOne`` and ``/groups/medical-data/DatabaseTwo``. + + +Sample +------ + +The in-memory representation of the raw database samples. In this package, it +is specified as a two-tuple with a tensor, and metadata (typically label, name, +etc.). + + +RawDataLoader +------------- + +A concrete "functor" that allows one to load the raw data and associated +metadata, to create an in-memory Sample representation. RawDataLoaders are +typically Database-specific due to raw data and metadata encoding varying quite +a lot across different databases. 
RawDataLoaders may also embed various +pre-processing transformations to render data readily usable, such as +pre-cropping of black pixel areas, or 16-bit to 8-bit auto-level conversion. + + +TransformSequence +----------------- + +A sequence of callables that allows one to transform torch.Tensor objects into +other torch.Tensor objects, typically to crop, resize, convert color spaces, +and the like on raw data. + + +DatabaseSplit +------------- + +A dictionary that represents an organization of the available raw data in the +database to perform an evaluation protocol (e.g. train, validation, test) +through datasets (or subsets). It is represented as a dictionary mapping dataset +names to lists of "raw-data" sample representations, which vary in format +depending on Database metadata availability. RawDataLoaders receive these raw +representations and can convert them to in-memory Samples. + + +ConcatDatabaseSplit +------------------- + +An extension of a DatabaseSplit, in which the split can be formed by +cannibalising various other DatabaseSplits to construct a new evaluation +protocol. Examples of this are cross-database tests, or the construction of +multi-Database training and validation subsets. + + +Dataset +------- + +An iterable object over in-memory Samples, inherited from the pytorch Dataset +definition. A dataset in our framework may be completely cached in memory or +have in-memory representations of samples loaded on demand. After data loading, +our datasets can optionally apply a TransformSequence, composed of +pre-processing steps defined on a per-model level before optionally caching +in-memory Sample representations. The "raw" representation of a dataset is given by the +split dictionary values (i.e. not the keys). + + +DataModule +---------- + +A DataModule aggregates Splits and RawDataLoaders to provide lightning with a +known interface to the complete evaluation protocol (train, validation, +prediction and testing) required for a full experiment to take place. 
It +automates control over data loading parallelisation and caching inside our +framework, providing final access to readily-usable pytorch DataLoaders. diff --git a/doc/data_model.rst b/doc/data_model.rst deleted file mode 100644 index d028419572a6a56145fbe098ea9e20f785327942..0000000000000000000000000000000000000000 --- a/doc/data_model.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. Copyright © 2023 Idiap Research Institute <contact@idiap.ch> -.. -.. SPDX-License-Identifier: GPL-3.0-or-later - -.. _mednet.datamodel: - -============ - Data model -============ - -The following describes the various parts of our data model, which are used in this documentation and throughout the codebase. - - -Database --------- -Data that is downloaded from a data provider, and contains samples in their raw data format. -The database may contain both data and metadata, and is supposed to exist on disk (or any other storage device) -in an arbitrary location that is user-configurable, in the user environment. -For example, databases 1 and 2 for user A may be under /home/user-a/databases/database-1 and /home/user-a/databases/database-2, -while for user B, they may sit in /groups/medical-data/DatabaseOne and /groups/medical-data/DatabaseTwo. - - -Sample ------- -The in-memory representation of the raw database samples. -In this package, it is specified as a two-tuple with a tensor, and metadata (typically label, name, etc.). - - -RawDataLoader -------------- -A concrete "functor" that allows one to load the raw data and associated metadata, to create a in-memory Sample representation. -RawDataLoaders are typically Database-specific due to raw data and metadata encoding varying quite a lot on different databases. -RawDataLoaders may also embed various pre-processing transformations to render data readily usable such as pre-cropping of black pixel areas, -or 16-bit to 8-bit auto-level conversion. 
- - -TransformSequence ------------------ -A sequence of callables that allows one to transform torch.Tensor objects into other torch.Tensor objects, -typically to crop, resize, convert Color-spaces, and the such on raw-data. - - -DatabaseSplit -------------- -A dictionary that represents an organization of the available raw data in the database to perform -an evaluation protocol (e.g. train, validation, test) through datasets (or subsets). -It is represented as dictionary mapping dataset names to lists of "raw-data" sample representations, which vary in format -depending on Database metadata availability. RawDataLoaders receive this raw representations and can convert these to in-memory Sample's. - - -ConcatDatabaseSplit -------------------- -An extension of a DatabaseSplit, in which the split can be formed by cannibalising various other DatabaseSplits to construct a new evaluation protocol. -Examples of this are cross-database tests, or the construction of multi-Database training and validation subsets. - - -Dataset -------- -An iterable object over in-memory Samples, inherited from the pytorch Dataset definition. -A dataset in our framework may be completely cached in memory or have in-memory representation of samples loaded on demand. -After data loading, our datasets can optionally apply a TransformSequence, composed of pre-processing steps defined on a per-model level -before optionally caching in-memory Sample representations. The "raw" representation of a dataset are the split dictionary values (ie. not the keys). - - -DataModule ----------- -A DataModule aggregates Splits and RawDataLoaders to provide lightning a known-interface to the complete evaluation protocol (train, validation, prediction and testing) -required for a full experiment to take place. It automates control over data loading parallelisation and caching inside our framework, -providing final access to readily-usable pytorch DataLoaders. 
diff --git a/doc/img/data-model.dot b/doc/img/data-model.dot new file mode 100644 index 0000000000000000000000000000000000000000..9c7671d94d2d06cdb29438b88bbfa21ef9586062 --- /dev/null +++ b/doc/img/data-model.dot @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright © 2024 Idiap Research Institute <contact@idiap.ch> +# +# SPDX-License-Identifier: GPL-3.0-or-later + +digraph G { + rankdir = TB; /* top-to-bottom layout; valid values are TB, LR, BT, RL */ + + fontname = "Helvetica" + + node [ + fontname = "Helvetica" + shape = "record" + ] + + edge [ + fontname = "Helvetica" + ] + + Database [ + label = "Database\l(on storage)" + shape = "cylinder" + ] + + DatabaseSplit [ + label = "{DatabaseSplit|+ __init__(description: JSON)\l+ splits() : dict[str, list]\l}" + ] + + RawDataLoader [ + label = "{RawDataLoader|+ datadir : path\l|+ sample(description : JSON) : Sample \l+ label(description : JSON) : int\l}" + ] + + DataModule [ + label = "{DataModule|- datasets : dict[str, torch.Dataset]\l+ model_transforms : TransformSequence\l|+ setup(stage: str)\l+ train_dataloader() : DataLoader\l+ val_dataloader() : dict[str, DataLoader]\l+ test_dataloader() : dict[str, DataLoader]\l+ predict_dataloader() : dict[str, DataLoader]\l}" + ] + + CachingDataModule [ + label = "{CachingDataModule (lightning.DataModule)}" + style = "dashed" + ] + + Sample [ + label = "{Sample (tuple)|+ tensor: torch.Tensor\l+ metadata: dict[str, Any]\l}" + ] + + DataLoader [ + label = "{DataLoader (torch.DataLoader)|+ __getitem__(key: int)\l+ __iter__()\l}" + ] + + edge [ + arrowhead = "empty" + ] + + DataModule -> CachingDataModule + + edge [ + arrowhead = "diamond" + taillabel = "1..1" + ] + + DatabaseSplit -> DataModule + RawDataLoader -> DataModule + + edge [ + arrowhead = "diamond" + taillabel = "1..*" + ] + + Sample -> DataLoader + + edge [ + arrowhead = "none" + taillabel = "" + label = "generates" + ] + + DataModule -> DataLoader + + edge [ + arrowhead = "none" + headlabel = "1..1" + label = "reads" + ] + + RawDataLoader -> Database + + { rank = same; 
Database; CachingDataModule; Sample; } + { rank = same; RawDataLoader; DatabaseSplit; DataLoader; } + +} diff --git a/doc/img/data-model.png b/doc/img/data-model.png new file mode 100644 index 0000000000000000000000000000000000000000..e0f39233babf91a7c88408b92e589e90c39cab1f Binary files /dev/null and b/doc/img/data-model.png differ diff --git a/doc/index.rst b/doc/index.rst index 1864f3db1b95e61854160189f13f1f82d1bd53ca..f6a8ab67795144fcde0173141397bbe1b677e12b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -52,7 +52,7 @@ User Guide install usage/index results/index - data_model + data-model references cli config