diff --git a/doc/algorithms.rst b/doc/algorithms.rst deleted file mode 100644 index c15e884aa7cdf55ac3cf826d42cc70e61640585b..0000000000000000000000000000000000000000 --- a/doc/algorithms.rst +++ /dev/null @@ -1,886 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-algorithms: - -=========== -Algorithms -=========== - -Algorithms are user-defined piece of software that run within the blocks of a -toolchain. An algorithm can read data on the input(s) of the block and write -processed data on its output(s). They are, hence, key components for -scientific experiments, since they formally describe how to transform raw -data into higher level concept such as classes. - - -An algorithm lies at the core of each processing block and may be subject to -parametrization. Inputs and outputs of an algorithm have well-defined data -formats. 
The format of the data on each input and output of the block is -defined at a higher level in the platform. It is expected that the -implementation of the algorithm respects whatever was declared on the -platform. - -By default, the algorithm is **data-driven**; the algorithm is typically provided -one data sample at a time and must immediately produce some output data. -Furthermore, the way the algorithm handles the data is highly configurable and -covers a huge range of possible scenarios. - -:numref:`beat-core-overview-block` displays the relationship between a -processing block and its algorithm. - -.. _beat-core-overview-block: -.. figure:: ./img/block.* - - Relationship between a processing block and its algorithm - -This section contains information on the definition of algorithms and -their programmatic use in Python-based language bindings. - - -.. _beat-core-algorithms-definition: - -Definition ----------- - -An algorithm is defined by two distinct components: - -* a `JSON`_ object with several fields, specifying the inputs, the outputs, - the parameters and additional information such as the language in which it - is implemented. -* source code (and/or [later] binary code) describing how to transform the input - data. - - -.. _beat-core-algorithms-definition-json: - -JSON Declaration -................ - -A `JSON`_ declaration of an algorithm consists of several fields. For example, -the following declaration is the one of an algorithm implementing -principal component analysis (PCA): - -.. 
code-block:: javascript - - { - "language": "python", - "splittable": false, - "groups": [ - { - "inputs": { - "image": { - "type": "system/array_2d_uint8/1" - } - }, - "outputs": { - "subspace": { - "type": "tutorial/linear_machine/1" - } - } - } - ], - "parameters": { - "number-of-components": { - "default": 5, - "type": "uint32" - } - }, - "description": "Principal Component Analysis (PCA)" - } - -The field `language` specifies the language in which the algorithm is -implemented. The field `splittable` indicates whether the algorithm can be -parallelized into chunks or not. The field `parameters` lists the parameters -of the algorithm, describing both default values and their types. The field -`groups` gives information about the inputs and outputs of the algorithm. -They are provided as a list of dictionaries, each element in this list being -associated with a database `channel`. The group that contains outputs is -the **synchronization channel**. By default, a loop is automatically performed -by the platform on the synchronization channel, and user-code must not loop -on this group. In contrast, it is the responsibility of the user to load data -from the other groups. This is described in more detail in the following -subsections. Finally, the field `description` is optional and gives a short -description of the algorithm. - -The web client of the BEAT platform provides a graphical editor for algorithms, -which simplifies their `JSON`_ declaration definition. - - -.. _beat-core-algorithms-definition-analyzer: - -Analyzer -........ - -At the end of the processing workflow of an experiment, there is a special -kind of algorithm, which does not yield any output but instead produces so-called -`results`. These algorithms are called **analyzers**. - -`Results` of an experiment are reported back to the user. 
Since the platform -is concerned about data privacy, only a limited number of data formats can be -employed as results in an analyzer, such as boolean, integers, floating point -values, strings (of limited size), as well as plots (such as scatter or bar -plots). - -For example, the following declaration is the one of a simple analyzer, which -generates an ROC curve as well as few other metrics. - -.. code-block:: javascript - - { - "language": "python", - "groups": [ - { - "inputs": { - "scores": { - "type": "tutorial/probe_scores/1" - } - } - } - ], - "results": { - "far": { - "type": "float32", - "display": true - }, - "roc": { - "type": "plot/scatter/1", - "display": false - }, - "number_of_positives": { - "type": "int32", - "display": false - }, - "frr": { - "type": "float32", - "display": true - }, - "eer": { - "type": "float32", - "display": true - }, - "threshold": { - "type": "float32", - "display": false - }, - "number_of_negatives": { - "type": "int32", - "display": false - } - } - } - - -.. _beat-core-algorithms-definition-code: - -Source Code -........... - -The BEAT platform has been designed to support algorithms written in different -programming languages. However, for each language, a corresponding back-end -needs to be implemented, which is in charge of connecting the inputs and -outputs to the algorithm and running its code as expected. In this section, -we describe the implementation of algorithms in the Python programming -language. - -To implement a new algorithm, one must write a class following a few -conventions. In the following, examples of such classes are provided. - - -.. _beat-core-algorithms-examples: - -Examples --------- - -.. _beat-core-algorithms-examples-simple: - -Simple algorithm (no parametrization) -..................................... - -At the very minimum, an algorithm class must look like this: - -.. 
code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - # Read data from inputs, compute something, and write the result - # of the computation on outputs - ... - return True - -The class must be called ``Algorithm`` and must have a method called -``process()``, that takes as parameters a list of inputs (see section -:ref:`beat-core-algorithms-input-inputlist`) and a list of outputs (see -section :ref:`beat-core-algorithms-output-outputlist`). This method must -return ``True`` if everything went correctly, and ``False`` if an error -occurred. - -The platform will call this method once per block of data available on the -`synchronized` inputs of the block. - - -.. _beat-core-algorithms-examples-parametrizable: - -Parametrizable algorithm -........................ - -To implement a parametrizable algorithm, two things must be added to the class: -(1) a field in the JSON declaration of the algorithm containing their default -values as well as the type of the parameters, and (2) a method called -``setup()``, that takes one argument, a map containing the parameters of the -algorithm. - -.. code-block:: javascript - - { - ... - "parameters": { - "threshold": { - "default": 0.5, - "type": "float32" - } - }, - ... - } - -.. code-block:: python - - class Algorithm: - - def setup(self, parameters): - # Retrieve the value of the parameters - self.threshold = parameters['threshold'] - return True - - def process(self, inputs, outputs): - # Read data from inputs, compute something, and write the result - # of the computation on outputs - ... - return True - -When retrieving the value of the parameters, one must not assume that a value -was provided for each parameter. This is why we may use a *try: ... except: ...* -construct in the ``setup()`` method. - -.. _beat-core-algorithms-input: - -Handling input data -------------------- - -.. _beat-core-algorithms-input-inputlist: - -Input list -.......... 
- -An algorithm is given access to the **list of the inputs** of the processing -block. This list can be used to access each input individually, either by -their name (see section :ref:`beat-core-algorithms-input-name`), their index -or by iterating over the list: - -.. code-block:: python - - # 'inputs' is the list of inputs of the processing block - - print(inputs['labels'].data_format) - - for index in range(0, inputs.length): - print(inputs[index].data_format) - - for input in inputs: - print(input.data_format) - - for input in inputs[0:2]: - print(input.data_format) - -Additionally, the following method is useable on a **list of inputs**: - -.. py:method:: hasMoreData() - - Indicates if there is (at least) another block of data to process on some of - the inputs - - -.. _beat-core-algorithms-input-input: - -Input -..... - -Each input provides the following information: - -.. py:attribute:: name - - *(string)* Name of the input - -.. py:attribute:: data_format - - *(string)* Data format accepted by the input - -.. py:attribute:: data_index - - *(integer)* Index of the last block of data received on the input (See section - :ref:`beat-core-algorithms-input-synchronization`) - -.. py:attribute:: data - - *(object)* The last block of data received on the input - -The structure of the ``data`` object is dependent on the data format assigned to -the input. Note that ``data`` can be *None*. - -.. _beat-core-algorithms-input-name: - -Input naming -............ - -Each algorithm assigns a name of its choice to each input (and output, see -section :ref:`beat-core-algorithms-output-name`). This mechanism ensures that algorithms -are easily shareable between users. - -For instance, in :numref:`beat-core-algorithms-input-naming`, two different users -(Joe and Bill) are using two different toolchains. 
Both toolchains have one -block with two entries and one output, with a similar set of data formats -(*image/rgb* and *label* on the inputs, *array/float* on the output), although -not in the same order. The two blocks use different algorithms, which both -refers to their inputs and outputs using names of their choice - -Nevertheless, Joe can choose to use Bill's algorithm instead of his own one. -When the algorithm to use is changed on the web interface, the platform will -attempt to match each input with the names (and types) declared by the -algorithm. In case of ambiguity, the user will be asked to manually resolve it. - -In other words: the way the block is connected in the toolchain doesn't force a -naming scheme or a specific order of inputs to the algorithms used in that -block. As long as the set of data types (on the inputs and outputs) is -compatible for both the block and the algorithm, the algorithm can be used in -the block. - -.. _beat-core-algorithms-input-naming: -.. figure:: ./img/inputs-naming.* - - Different toolchains, but interchangeable algorithms - -The name of the inputs are assigned in the JSON declaration of the algorithm, -such as: - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "name1": { - "type": "data_format_1" - }, - "name2": { - "type": "data_format_2" - } - } - } - ], - ... - } - - -.. _beat-core-algorithms-input-synchronization: - -Inputs synchronization -...................... - -The data available on the different inputs from the synchronized channels -are (of course) synchronized. Let's consider the example toolchain on -:numref:`beat-core-algorithms-input-synchronization-example`, where: - -* The image database provides two kind of data: some *images* and their - associated *labels* -* The *block A* receives both data via its inputs -* The *block B* only receives the *labels* -* Both algorithms are *data-driven* - -The system will ask the *block A* to process 6 images, one by one. 
On the -second input, the algorithm will find the correct label for the current image. -The *block B* will only be asked to process 2 labels. - -The algorithm can retrieve the index of the current block of data of each of -its inputs by looking at their ``data_index`` attribute. For simplicity, the -list of inputs has two attributes (``current_data_index`` and -``current_end_data_index``) that indicate the data indexes currently used by -the synchronization mechanism of the platform. - -.. _beat-core-algorithms-input-synchronization-example: -.. figure:: ./img/inputs-synchronization.* - :width: 80% - - Synchronization example - - -.. _beat-core-algorithms-input-unsynchronized: - -Additional input methods for unsynchronized channels -.................................................... - -Unsynchronized input channels of algorithms can be accessed at will, and -algorithms can use them any way they want. To be able to perform their job, they -have access to additional methods. - -The following method is useable on a **list of inputs**: - -.. py:method:: next() - - Retrieve the next block of data on all the inputs **in a synchronized - manner** - - -Let's come back to the example toolchain on -:numref:`beat-core-algorithms-input-synchronization-example`, and assume -that *block A* uses an autonomous algorithm. To iterate over all the data on -its inputs, the algorithm would do: - -.. code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Iterate over all the unsynchronized data - while inputs.hasMoreData(): - inputs.next() - - # Do something with inputs['images'].data and inputs['labels'].data - ... - - # At this point, there is no more data available on inputs['images'] and - # inputs['labels'] - - return True - - -The following methods are useable on an ``input``, in cases where the algorithm -doesn't care about the synchronization of some of its inputs: - -.. 
py:method:: hasMoreData() - - Indicates if there is (at least) another block of data available on the input - -.. py:method:: next() - - Retrieve the next block of data - - .. warning:: - - Once this method has been called by an algorithm, the input is no more - automatically synchronized with the other inputs of the block. - -In the following example, the algorithm desynchronizes one of its inputs but -keeps the others synchronized and iterate over all their data: - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - }, - "desynchronized": { - "type": "number" - } - } - } - ], - ... - } - - -.. code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Desynchronize the third input. From now on, inputs['desynchronized'].data - # and inputs['desynchronized'].data_index won't change - inputs['desynchronized'].next() - - # Iterate over all the data on the inputs still synchronized - while inputs.hasMoreData(): - inputs.next() - - # Do something with inputs['images'].data and inputs['labels'].data - ... - - # At this point, there is no more data available on inputs['images'] and - # inputs['labels'], but there might be more on inputs['desynchronized'] - - return True - - -.. _beat-core-algorithms-input-feedbackloop: - -Feedback inputs -............... - -The :numref:`beat-core-algorithms-input-feedbackloop-example` shows a toolchain -containing a feedback loop. A special kind of input is needed in this scenario: -a *feedback input*, that isn't synchronized with the other inputs, and can be -freely used by the algorithm. - -Those feedback inputs aren't yet implemented in the prototype of the platform. -This will be addressed in a later version. - -.. _beat-core-algorithms-input-feedbackloop-example: -.. figure:: ./img/feedback-loop.* - - Feedback loop - - -.. _beat-core-algorithms-output: - -Handling output data --------------------- - -.. 
_beat-core-algorithms-output-outputlist: - -Output list -........... - -An algorithm is given access to the **list of the outputs** of the processing -block. This list can be used to access each output individually, either by -their name (see section :ref:`beat-core-algorithms-output-name`), their index -or by iterating over the list: - -.. code-block:: python - - # 'outputs' is the list of outputs of the processing block - - print(outputs['features'].data_format) - - for index in range(0, outputs.length): - outputs[index].write(...) - - for output in outputs: - output.write(...) - - for output in outputs[0:2]: - output.write(...) - - -.. _beat-core-algorithms-output-output: - -Output -...... - -Each output provides the following information: - -.. py:attribute:: name - - *(string)* Name of the output - -.. py:attribute:: data_format - - *(string)* Format of the data written on the output - - -And the following methods: - -.. py:method:: createData() - - Retrieve an initialized block of data corresponding to the data format of - the output - -.. py:method:: write(data, end_data_index=None) - - Write a block of data on the output - - -We'll look at the usage of those methods through some examples in the following -sections. - - -.. _beat-core-algorithms-output-name: - -Output naming -............. - -As for its inputs, each algorithm assigns a name of its choice to each output -(see section :ref:`beat-core-algorithms-input-name` for more details) by -including them in the JSON declaration of the algorithm. - - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - ... - }, - "outputs": { - "name1": { - "type": "data_format1" - }, - "name2": { - "type": "data_format2" - } - } - } - ], - ... - } - - -.. _beat-core-algorithms-output-example1: - -Example 1: Write one block of data for each received block of data -.................................................................. - -.. _beat-core-algorithms-output-example1-figure: -.. 
figure:: ./img/outputs-example1.* - - Example 1: 6 images as input, 6 blocks of data produced - -Consider the example toolchain on -:numref:`beat-core-algorithms-output-example1-figure`. We will implement a -*data-driven* algorithm that will write one block of data on the output of the -block for each image received on its inputs. This is the simplest case. - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - } - }, - "outputs": { - "features": { - "type": "array/float" - } - } - } - ], - ... - } - - -.. code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Ask the output to create a data object according to its data format - data = outputs['features'].createData() - - # Compute something from inputs['images'].data and inputs['labels'].data - # and store the result in 'data' - ... - - # Write our data block on the output - outputs['features'].write(data) - - return True - - -The structure of the ``data`` object is dependent of the data format assigned -to the output. - - -.. _beat-core-algorithms-output-example2: - -Example 2: Skip some blocks of data -................................... - -.. _beat-core-algorithms-output-example2-figure: -.. figure:: ./img/outputs-example2.* - - Example 2: 6 images as input, 4 blocks of data produced, 2 blocks of data - skipped - -Consider the example toolchain on -:numref:`beat-core-algorithms-output-example2-figure`. This time, our algorithm -will use a criterion to decide if it can perform its computation on an image or -not, and tell the platform that, for a particular data index, no data is -available. - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - } - }, - "outputs": { - "features": { - "type": "array/float" - } - } - } - ], - ... - } - -.. 
code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Use a criterion on the image to determine if we can perform our - # computation on it or not - if self.can_compute(inputs['images'].data): - # Ask the output to create a data object according to its data format - data = outputs['features'].createData() - - # Compute something from inputs['images'].data and inputs['labels'].data - # and store the result in 'data' - ... - - # Write our data block on the output - outputs['features'].write(data) - else: - # Tell the platform that no data is available for this image - outputs['features'].write(None) - - return True - - def can_compute(self, image): - # Implementation of our criterion - ... - return True # or False - - -.. _beat-core-algorithms-output-example3: - -Example 3: Write one block of data related to several received blocks of data -............................................................................. - -.. _beat-core-algorithms-output-example3-figure: -.. figure:: ./img/outputs-example3.* - - Example 3: 6 images as input, 2 blocks of data produced - -Consider the example toolchain on -:numref:`beat-core-algorithms-output-example3-figure`. This time, our algorithm -will compute something using all the images with the same label (all the dogs, -all the cats) and write only one block of data related to all those images. - -The key here is the correct usage of the **current end data index** of the -input list to specify the indexes of the blocks of data we write on the output. -This ensures that the data will be synchronized everywhere in the toolchain: the -platform can now tell, for each of our data blocks, which image and label it -relates to (see section :ref:`beat-core-algorithms-input-synchronization`). 
- -Additionally, since we can't know in advance if the image currently processed -is the last one with the current label, we need to memorize the current data -index of the input list to correctly assign it later when we actually write -the data block on the output. - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - } - }, - "outputs": { - "features": { - "type": "array/float" - } - } - } - ], - ... - } - -.. code-block:: python - - class Algorithm: - - def __init__(self): - self.data = None # Block of data updated each time we - # receive a new image - self.current_label = None # Label of the images currently processed - self.previous_data_index = None # Data index of the input list during the - # processing of the previous image - - def process(self, inputs, outputs): - # Determine if we already processed some image(s) - if self.data is not None: - # Determine if the label has changed since the last image we processed - if inputs['labels'].data.name != self.current_label: - # Write the block of data on the output - outputs['features'].write(self.data, self.previous_data_index) - self.data = None - - # Memorize the current data index of the input list - self.previous_data_index = inputs.current_end_data_index - - # Create a new block of data if necessary - if self.data is None: - # Ask the output to create a data object according to its data format - self.data = outputs['features'].createData() - - # Remember the label we are currently processing - self.current_label = inputs['labels'].data.name - - # Compute something from inputs['images'].data and inputs['labels'].data - # and update the content of 'self.data' - ... - - # Determine if this was the last block of data or not - if not(inputs.hasMoreData()): - # Write the block of data on the output - outputs['features'].write(self.data, inputs.current_end_data_index) - - return True - - -.. 
include:: links.rst diff --git a/doc/databases.rst b/doc/databases.rst deleted file mode 100644 index 749974bc33778bf5dcf0da6d40d37f6da1ef7758..0000000000000000000000000000000000000000 --- a/doc/databases.rst +++ /dev/null @@ -1,94 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -========== -Databases -========== - -A database is a collection of data files, one for each output of the database. -This data are inputs to the BEAT toolchains. Therefore, it is important to -define evaluation protocols, which describe how a specific system must use the -raw data of a given database. - -For instance, a recognition system will typically use a subset of the data to -train a recognition `model`, while another subset of data will be used to -evaluate the performance of this model. - - -Structure of a database ------------------------ - -A database has the following structure on disk:: - - database_name/ - output1_name.data - output2_name.data - ... 
- -outputN_name.data - -For a given database, the BEAT platform will typically store information -about the root folder containing this raw data as well as a description of -it. - - -Evaluation protocols --------------------- - -A BEAT evaluation protocol consists of several ``datasets``, each dataset -having several ``outputs`` with well-defined data formats. In practice, -each dataset will typically be used for a different purpose. - -For instance, in the case of a simple face recognition protocol, the -database may be split into three datasets: one for training, one for enrolling -client-specific models, and one for testing these models. -The training dataset may have two outputs: grayscale images as two-dimensional -arrays of type `uint8` and client ids as `uint64` integers. - -The BEAT platform is data-driven, which means that all the outputs of a given -dataset are synchronized. The way the data is generated by each template -is defined in a piece of code called the ``database view``. It is important -that a database view has a deterministic behavior for reproducibility -purposes. - - -Database set templates ----------------------- - -In practice, different databases used for the same purpose may have the exact -same datasets with the exact same outputs (and attached data formats). In this -case, it is interesting to abstract the definition of the database sets from -a given database. BEAT defines ``database set templates`` for this purpose. - -For instance, the simple face recognition evaluation protocol described above, -which consists of three datasets and a few inputs, may be abstracted in a -database set template. This template defines the datasets, their outputs -as well as their corresponding data formats. Next, if several databases -implement such a protocol, they may rely on the same `database set template`. -Similarly, evaluation protocols testing different conditions (such as -enrolling on clean and testing on clean data vs. 
enrolling on clean and -testing on noisy data) may rely on the same database set template. - -In practice, this reduces the amount of work to integrate new databases and/or -new evaluation protocols into the platform. Besides, at the experiment level, -this allows to re-use a toolchain on a different database, with almost no -configuration changes from the user. diff --git a/doc/dataformats.rst b/doc/dataformats.rst deleted file mode 100644 index e97b636fe78e1e6fee8ceb2634e77d5f72e27a9d..0000000000000000000000000000000000000000 --- a/doc/dataformats.rst +++ /dev/null @@ -1,361 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-dataformats: - -============= -Data formats -============= - -Data formats formalize the interaction between algorithms and data sets, so -they can communicate data in an orderly manner. All data formats produced or -consumed by these objects must be formally declared. 
Two algorithms which must -directly communicate data must produce and consume the same type of data -objects. - -A data format specifies a list of typed fields. An algorithm or data set -generating a block of data (via one of its outputs) **must** fill all the -fields declared in that data format. An algorithm consuming a block of data -(via one of its inputs) **must not** expect the presence of any other field -than the ones defined by the data format. - -This section contains information on the definition of data formats and their -programmatic use in Python-based language bindings. - - -Definition ----------- - -A data format is declared as a `JSON`_ object with several fields. For example, -the following declaration could represent the coordinates of a rectangular -region in an image: - -.. code-block:: json - - { - "x": "int32", - "y": "int32", - "width": "int32", - "height": "int32" - } - -.. note:: - - We have chosen to define objects inside the BEAT platform using JSON - declarations as JSON files can be easily validated, transferred through - web-based APIs and provide an easy-to-read format for local inspection. - -Each field must be named according to typical programming rules for variable -names. For example, these are valid names: - - * ``my_field`` - * ``_my_field`` - * ``number1`` - -These are invalid field names: - - * ``1number`` - * ``my field`` - -The following regular expression is used to validate field names: -``^[a-zA-Z_][a-zA-Z0-9_-]*$``. In short, a field name has to start with a -letter or an underscore character and can contain, immediately after, any -number of alphanumeric characters, underscores or hyphens. - -By convention, fields prefixed and suffixed with a double underscore (``__``) -are reserved and should be avoided. - -The special field ``#description`` can be used to store a short description of -the declared data format; it is otherwise ignored: - -.. 
code-block:: json - - { - "#description": "A rectangle in an pixeled image", - "x": "int32", - "y": "int32", - "width": "int32", - "height": "int32" - } - -The ``#description`` field is ignored in practice and only used for -informational purposes. - -Each field in a declaration has a well-defined type, which can be one of: - - * a primitive, simple type (see :ref:`beat-core-dataformats-simple`) - * a directly nested object (see :ref:`beat-core-dataformats-complex`) - * another data format (see :ref:`beat-core-dataformats-aggregation`) - * an array (see :ref:`beat-core-dataformats-array`) - -A data format can also extend another one, as explained further down (see -ref:`beat-core-dataformats-extension`). - - -.. _beat-core-dataformats-simple: - -Simple types ------------- - -The following primitive data types are available in the BEAT platform: - - * Integers: ``int8``, ``int16``, ``int32``, ``int64`` - * Unsigned integers: ``uint8``, ``uint16``, ``uint32``, ``uint64`` - * Floating-point numbers: ``float32``, ``float64`` - * Complex numbers: ``complex64``, ``complex128`` - * ``bool`` - * ``string`` - -.. note:: - - All primitive types are implemented using their :py:mod:`numpy` - counterparts. - -When determining if a block of data corresponds to a data format, the platform -will check that the value of each field can safely (without loss of precision) -be converted to the type declared by the data format. An error is generated if -you fail to follow these requirements. - -For example, an ``int8`` *can* be converted, without a precision loss, to an -``int16``, but a ``float32`` **cannot** be losslessly converted to an -``int32``. In case of doubt, you can manually test for `NumPy safe-casting -rules`_ yourself in order to understand imposed restrictions. If you wish to -allow for a precision loss on your code, you must do it explicitly (`Zen of -Python`_). - - -.. 
_beat-core-dataformats-complex: - -Complex types -------------- - -A data format can be composed of complex objects formed by nesting other types. -The coordinates of a rectangular region in an image be represented like this: - -.. code-block:: json - - { - "coords": { - "x": "int32", - "y": "int32" - }, - "size": { - "width": "int32", - "height": "int32" - } - } - - -.. _beat-core-dataformats-aggregation: - -Aggregation ------------ - -.. note:: - - Data formats are named using 3 values joined by a ``/`` (slash) separator: - the username who is the author of the dataformat, an identifier and the - object version (integer starting from 1). Here are examples of data format - names: - - * ``user/my_format/1`` - * ``johndoe/integers/37`` - * ``mary_mary/rectangle/2`` - - -A field can use the declaration of another data format instead of specifying -its own declaration. Consider the following data formats, on their first -version, for user ``user``: - -.. code-block:: json - :caption: Two dimensional coordinates (``user/coordinates/1``) - - { - "x": "int32", - "y": "int32" - } - -.. code-block:: json - :caption: Two dimensional size (``user/size/1``): - - { - "width": "int32", - "height": "int32" - } - -Now let's aggregate both previous formats in order to declare a new data format -for describing a rectangle: - -.. code-block:: json - :caption: The definition of a rectangle - - { - "coords": "user/coordinates/1", - "size": "user/size/1" - } - - -.. _beat-core-dataformats-array: - -Arrays ------- - -A field can be a multi-dimensional array of any other type. For instance, -consider the following example: - -.. code-block:: json - - { - "field1": [10, "int32"], - "field2": [10, 5, "bool"] - } - -Here we declare that ``field1`` is a one-dimensional array of 10 32-bit signed -integers (``int32``), and ``field2`` is a two-dimensional array with 10 rows -and 5 columns of booleans. - -.. 
note:: - - In the Python language representation of data formats, multi-dimensional - arrays are implemented using :py:class:`numpy.ndarray`'s. - - -An array can have as many dimensions as you want. It can also contain objects -(either declared inline, or using another data format): - -.. code-block:: json - - { - "inline": [10, { - "x": "int32", - "y": "int32" - }], - "imported": [10, "beat/coordinates/1"] - } - -It is also possible to declare an array without specifying the number of -elements in some of its dimensions, by using a size of 0 (zero): - -.. code-block:: json - - { - "field1": [0, "int32"], - "field2": [0, 0, "bool"], - "field3": [10, 0, "float32"] - } - -Here, ``field1`` is a one-dimensional array of 32-bit signed integers -(``int32``), ``field2`` is a two-dimensional array of booleans, and ``field3`` -is a two-dimensional array of floating-point numbers (``float32``) whose the -first dimension is fixed to 10 (number of rows). - -Note that the following declaration isn't valid (you can't fix a dimension if -the preceding one isn't fixed too): - -.. code-block:: json - - { - "error": [0, 10, "int32"] - } - -.. note:: - - When determining if that a block of data corresponds to a data format - containing an array, the platform automatically checks that: - - * the number of dimensions is correct - * the size of each declared dimension that isn't 0 is correct - * the type of each value in the array is correct - - -.. _beat-core-dataformats-extension: - -Extensions ----------- - -Besides aggregation, it is possible to extend data formats through inheritance. -In practice, inheriting from a data format is the same as pasting its -declaration right on the top of the new format. - -For example, one might implement a face detector algorithm and may want to -create a data format containing all the informations about a face (say its -position, its size and the position of each eye). 
This could be done by -extending the type ``user/rectangular_area/1`` defined earlier: - -.. code-block:: json - - { - "#extends": "user/rectangular_area/1", - "left_eye": "coordinates", - "right_eye": "coordinates" - } - - -.. _beat-core-dataformats-usage: - -Python API ----------- - -Data formats are useful descriptions of data blocks that are consumed by -algorithmic code inside the platform. In BEAT, the user never instantiates data -formats directly. Instead, when a new object representing a data format needs -to be created, the user may just create a dictionary in which the keys are the -format field names, whereas the values are instances of the type defined for -such a field. If the type is a reference to another format, the user may nest -dictionaries so as to build objects of any complexity. When the dictionary -representing a data format is written to an algorithm output, the data is -properly validated. - -This concept will become clearer when you'll read about algorithms and the way -they receive and produce data. Here is just a simple illustrative example: - -.. testsetup:: test-output-write - - import numpy - from beat.core.dataformat import DataFormat - from beat.core.test.mocks import MockDataSink - from beat.core.outputs import Output - - dataformat = DataFormat('/not/needed', { - "x": "int32", - "y": "int32", - "width": "int32", - "height": "int32" - }) - assert dataformat.valid - data_sink = MockDataSink(dataformat) - output = Output('test', data_sink) - -.. testcode:: test-output-write - - # suppose, for this example, `output' is provided to your algorithm - output.write({ - "x": numpy.int32(10), - "y": numpy.int32(20), - "width": numpy.int32(100), - "height": numpy.int32(100), - }) - - -.. include:: links.rst - diff --git a/doc/experiments.rst b/doc/experiments.rst deleted file mode 100644 index d392eda25fa865beca98fde38fb43b712b3eab19..0000000000000000000000000000000000000000 --- a/doc/experiments.rst +++ /dev/null @@ -1,200 +0,0 @@ -.. 
vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-experiments: - -============ -Experiments -============ - -An experiment is the reunion of algorithms, datasets, a toolchain and -parameters that allow the platform to schedule and run the prescribed recipe -to produce displayable results. Defining a BEAT experiment can be seen as -configuring the processing blocks of a toolchain, such as selecting which -database, algorithms and algorithm parameters to use. - -.. _beat-core-experiments-declaration: - -Declaration of an experiment ----------------------------- - -.. note:: - - One needs only to declare an experiment using those specifications when not - using the web interface (i.e. when doing local development or using the web - api). The web interface provides a user-friendly way to configure an - experiment. - -An experiment is declared in a JSON file, and must contain at least the following -fields: - -.. 
code-block:: javascript - - { - "datasets": [ - ], - "blocks": [ - ], - "analyzers": [ - ], - "globals": [ - ] - } - - -.. _beat-core-experiments-datasets: - -Declaration of the dataset(s) ------------------------------ - -The dataset inputs are defined by the toolchain. However, the toolchain does -not describe which data to plug in each dataset input. - -This is the role of the field `datasets` from an experiment. -For each dataset, an experiment must specify three attributes as follows: - -.. code-block:: javascript - - { - "datasets": [ - "templates": { - "set": "templates", - "protocol": "idiap", - "database": "atnt/1" - }, - ... - ], - ... - } - - -The key of an experiment dataset must correspond to the desired dataset name -from the toolchain. Then, three fields must be given: - -* `database`: the database name and version -* `protocol`: the protocol name -* `set`: the dataset name of this database to associate to this toolchain - dataset - - -.. _beat-core-experiments-blocks: - -Declaration of the block(s) ---------------------------- - -The blocks are defined by the toolchain. However, the toolchain does not -describe which algorithm to run in each processing block, and how each of these -algorithms are parametrized. - -This is the role of the field `blocks` from an experiment. -For each block, an experiment must specify four attributes as follows: - -.. code-block:: javascript - - { - "blocks": { - "linear_machine_training": { - "inputs": { - "image": "image" - }, - "parameters": {}, - "algorithm": "tutorial/pca/1", - "outputs": { - "subspace": "subspace" - } - }, - ... - }, - ... - } - -The key of an experiment block must correspond to the desired block from the -toolchain. Then, four fields must be given: - -* `algorithm`: the algorithm to use (author_name/algorithm_name/version) -* `inputs`: the list of inputs. The key is the algorithm input, while the - value is the corresponding toolchain input. -* `outputs`: the list of outputs. 
The key is the algorithm output, while the - value is the corresponding toolchain output. -* `parameters`: the algorithm parameters to use for this processing block - - -.. note:: - - When setting an algorithm in a processing block, this will also set the - dataformats of the outputs (and inputs) of this block. In particular, - this has an impact on all the inputs of blocks connected to those outputs, - which must have the same data formats (or be an extension of these data - formats). The platform automatically validate that the data formats of - consecutive blocks are compatible. - - -.. _beat-core-experiments-analyzers: - -Declaration of the analyzer(s) ------------------------------- - -Analyzers are similar to algorithms, except that they run on toolchain -endpoints. There configuration is very similar to the one of regular blocks, -except that they have no `outputs`: - -.. code-block:: javascript - - { - "analyzers": { - "analysis": { - "inputs": { - "scores": "scores" - }, - "algorithm": "tutorial/postperf/1" - } - }, - } - - -Global parameters ------------------ - -Each block and analyzer may rely on its own local parameters. However, several -blocks may rely on the exact same parameters. In this case, it is more -convenient to define those globally. - -For an experiment, this is achieved using the `globals` field in its JSON -declaration. For instance: - -.. code-block:: javascript - - { - "globals": { - "queue": "Default", - "environment": { - "version": "0.0.3", - "name": "Scientific Python 2.7" - }, - "tutorial/pca/1": { - "number-of-components": "5" - } - }, - ... - } - diff --git a/doc/index.rst b/doc/index.rst index cbc414dbf30ae5b28e92a1fd4ecd7165829fbc78..7b1e9d9773f7648e8c58d259933a43322a48608a 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -27,19 +27,12 @@ Core BEAT components ====================== -This user guide contains information about BEAT core components, defining -experiments, toolchains and user algorithms among others. 
+This package provides the core components of the BEAT ecosystem, which are the building blocks used by all other BEAT packages. .. toctree:: introduction - dataformats - algorithms - libraries - toolchains - experiments - databases io backend_api develop diff --git a/doc/introduction.rst b/doc/introduction.rst index bdf95746ee08f9dbe363fa36d2faf88eca8a5cb9..9556e17db733d16a7570f47113f1a4b467c91d32 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -27,183 +27,6 @@ Introduction ============= -The BEAT platform is a web-based system for certifying results for -software-based data-driven workflows that can be sub-divided functionally (into -processing blocks). The platform takes all burden of hosting data and software -away from users by providing a capable computing farm that handles both aspects -graciously. Data is kept sequestered inside the platform. The user provides the -description of data formats, algorithms, data flows (also known as toolchains) -and experimental details (parameters), which are mashed inside the platform to -produce beautiful results, easily exportable into computer graphics or tables -for scientific reports. - -It is intended as a fundamental building-block in `Reproducible Research`_, -allowing academic and industrial parties to prescribe system behavior and have -it reproducible through software, hardware and staff generations. Here are some -known applications: - -* Challenges and competitions on defined data, protocols and workflow - components; -* Study group exercises and exams; -* Support to publication submission; -* System and algorithm performance optimization; -* Reproduction of experiments through communities; -* Support for industry-academy relationship. - -This package, in particular, defines a set of core components useful for the -whole platform: the building blocks used by all other packages in the BEAT -software suite.
These are: - -* **Data formats**: the specification of data which is transmitted between - blocks of a toolchain; -* **Libraries**: routines (source-code or binaries) that can be incorporated - into other libraries or user code on algorithms; -* **Algorithms**: the program (source-code or binaries) that defines the user - algorithm to be run within the blocks of a toolchain; -* **Databases** and **Datasets**: means to read raw-data from a disk and feed - into a toolchain, respecting a certain usage protocol; -* **Toolchain**: the definition of the data flow in an experiment; -* **Experiment**: the reunion of algorithms, datasets, a toolchain and - parameters that allow the platform to schedule and run the prescribed recipe - to produce displayable results. - - -.. _beat-core-introduction-example: - -A Simple Example ----------------- - -The next figure shows a representation of a very simple toolchain, composed of -only a few color-coded components: - -* To the left, the reader can identify two datasets, named ``set`` and ``set2`` - respectively. They emit data (of, at this point, an unspecified type) into - the following processing blocks; -* Following the datasets, two processing blocks named ``echo1`` and ``echo2`` - receive the input from the dataset and emit data into a third block, named - ``echo3``; -* The final component receives the inputs emitted from ``echo3`` and it is - called ``analysis``. Because this block has no output, it is considered a - final block, from which the BEAT platform expects to collect experiment - results (that, at this point, are also unspecified). - -.. Simple toolchain representation for the BEAT platform -.. graphviz:: img/toolchain-triangle.dot - -The toolchain only defines the very basic data flow and connections that must -be respected by experiments. It does not define what is the type of data that -is produced or consumed by any of the existing blocks, the algorithms or -databases and protocols to use. 
From the toolchain description, it is possible -to devise a possible execution order, by taking into consideration the imposed -data flow. In this simple example, the datasets called ``set`` and ``set2`` -may yield data in parallel, allowing the execution of blocks ``echo1`` and -``echo2``. Block ``echo3`` must come next, before the ``analysis`` block, which -comes by last. - -In typical problems that can be implemented in the BEAT platform, datasets are -composed of multiple instances of raw data. For example, these could be images -for an object recognition problem, speech sequences for a speech recognition -task or model data for biometric recognition tasks. Computing blocks must -process these data by looping on these atomic data samples. The color-coding in -the figure indicates this extra data-flow information: for each dataset in the -drawing, it indicates how blocks loop on their atomic data. For the proposed, -toolchain, we can observe that blocks ``echo1``, ``echo3`` and ``analysis`` -loop over the "raw" data samples from ``set``, while ``echo2`` loop over the -samples from ``set2``. - -The next figure shows a complete experimental setup for the above toolchain. -The input blocks use a given database, called ``simple/1`` (the name is -``simple`` and the version is ``1``), using one of its protocols called -``protocol``. Each block is set to a specific data set inside the -database/protocol combination. Both datasets on this database/protocol yield -objects of type ``beat/integer/1`` (a format called ``integer`` from user -``beat``, version ``1``), which are consumed by algorithms running on the next -blocks. The block ``echo1`` uses the algorithm ``user/integers_echo/1`` (an -algorithm called ``integers_echo`` from user ``user``, version ``1``) and -also yields ``beat/integer/1`` objects. The same is valid for the algorithm -running on block ``echo2``. 
- -The algorithm for block ``echo3`` cannot possibly be the same - it must deal -with 2 inputs, generated by blocks looping on different raw data. We'll be more -detailed about conceptual differences while writing algorithms which are not -synchronized with all of their inputs next. For this introduction, it suffices -you understand the organization of algorithms in an experiment is constrained -by its neighboring block requirements as well as the input and output -data flows determined for a given block. - -Block ``echo3`` yields elements to the algorithm on the ``analysis`` block, -called ``user/integers_echo_analyzer/1``, which produces a single result named -``out_data``, which is of type ``int32`` (that is, a signed integer with 32 -bits). Algorithms that do not communicate with other algorithms are typically -called ``analyzers``. They are set-up on the end of experiments so as to -produce quantifiable results you can use to measure the performance of your -experimental setup. - -.. Simple experiment representation for the BEAT platform -.. graphviz:: img/experiment-triangle.dot - - -.. _beat-core-introduction-design: - -Design ------- - -The next figure shows an UML representation of main BEAT components, showing -some of their interaction and interdependence. Experiments use algorithms, data -sets and a toolchain in order to define a complete runnable setup. Data sets -are grouped into protocols which are, in turn, grouped into databases. -Algorithms use data formats to defined input and output patterns. Most objects -are subject to versioning, possess a name and belong to a specific user. By -contracting those markers, it is possible to define unique identifiers for all -objects in the platform. In the example above, you can identify some examples. - -.. High-level component interaction in the BEAT platform core -.. 
graphviz:: - - digraph hierarchy { - graph [fontname="helvetica", compound=true, splines=polyline] - node [fontname="helvetica", shape=record, style=filled, fillcolor=gray95] - edge [fontname="helvetica"] - - subgraph "algorithm_cluster" { - 1[label = "{Dataformat|...|+user\n+name\n+version}"] - 2[label = "{Algorithm|...|+user\n+name\n+version\n+code\n+language}"] - 6[label = "{Library|...|+user\n+name\n+version\n+code\n+language}"] - } - subgraph "database_cluster" { - graph [label=datasets] - 3[label = "{Database|...|+name\n+version}"] - 4[label = "{Protocol|...|+template}"] - 5[label = "Set"] - } - subgraph "experiment_cluster" { - graph [label=experiments] - 7[label = "{Toolchain|+execution_order()|+user\n+name\n+version}"] - 8[label = "{Experiment|...|+user\n+label}"] - } - - 1->1 [label = "0..*", arrowhead=empty] - 2->1 [label = "1..*", arrowhead=empty] - 2->6 [label = "0..*", arrowhead=empty] - 6->6 [label = "0..*", arrowhead=empty] - 4->3 [label = "1..*", arrowhead=odiamond] - 5->4 [label = "1..*", arrowhead=odiamond] - 5->1 [label = "1..*", arrowhead=empty] - 8->7 [label = "1..1", arrowhead=empty] - 8->2 [label = "1..*", arrowhead=empty] - 8->5 [label = "1..*", arrowhead=empty] - - } - - -The BEAT platform provides a graphical user interface so that you can program -data formats, algorithms, toolchains and define experiments rather intuitively. -This package provides the core building blocks of the BEAT platform. For expert -users, we provide a command-line interface to the platform, allowing such -users to create, modify and dispose of such objects using their own private -editors. For developers and programmers, the rest of this guide details each of -those building blocks, their relationships and how to use such a command-line -interface to interact with the platform efficiently. .. 
include:: links.rst diff --git a/doc/libraries.rst b/doc/libraries.rst deleted file mode 100644 index 3bd225f3b37d00427e47a678d0fd575221febcd9..0000000000000000000000000000000000000000 --- a/doc/libraries.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-libraries: - -========== -Libraries -========== - -Algorithms are fundamental elements in the platform that formally describe how -to process data. In particular, they are always attached to a specific -processing block with a given set of inputs and outputs. When an algorithm -needs to be applied in a slightly different processing block, this may, hence, -lead to a lot of code duplication. Duplicate code is undesirable for a number -of reasons such as high maintenance cost. - -To address this problem, the platform defines the concept of **libraries**. 
-Libraries allow users to put code required by several different algorithms -into a common location. Once done, code from a library may be used by any -algorithm as long as the algorithm mentions its dependency to it in its -JSON declaration. In addition, a library may depend on another library. - - -Definition ----------- - -Similarly to algorithms, a library consists of two folds: - -* A ``JSON declaration`` indicating: - - - The language in which the library is written - - Library dependencies of this library - -.. code-block:: javascript - - { - "uses": { - "otherlib": "user/otherlibrary/1" - }, - "language": "python" - } - - -* ``Source code``. For the Python back-end, this may consist of any Python - function and classes, as long as dependencies are fulfilled. - -.. code-block:: python - - def simple_function(array): - return len([v for v in array if v != 0]) - - class MyLibraryClass: - - def __init__(self, multiplier=37): - self.multiplier = multiplier - - def function_from_my_library(value): - return value * self.multiplier - -The web client of the BEAT platform provides a graphical editor for algorithm, -which simplifies its `JSON`_ declaration definition. It also includes a simple -Python code editor. - - -Usage ------ - -To use a defined library in an algorithm or in another library, it is -sufficient to: - -* Add the library dependency into the `JSON`_ declaration of the algorithm - (or of the library). The name given as a key is the one used to import - the library, while the corresponding value is the fullname, that is - `author/name/version` of the library. - -.. code-block:: javascript - - { - ... - "uses": { - "mylib": "user/mylibrary/1" - }, - ... - } - -* Import the library and use its desired functionalities. - -.. code-block:: python - - import mylib - ... - array = [0, 1, 2, 3] - array_processed = mylib.simple_function(array) - - -.. 
include:: links.rst diff --git a/doc/toolchains.rst b/doc/toolchains.rst deleted file mode 100644 index 08586892f9d7841cfc2816d2da006bb828b61689..0000000000000000000000000000000000000000 --- a/doc/toolchains.rst +++ /dev/null @@ -1,501 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-toolchains: - -=========== -Toolchains -=========== - -A toolchain describes the workflow of a particular experiment. Its declaration -defines: - - * a collection of *processing blocks*, including for each of them: - - * a *name* (unique in the toolchain) - * a list of *inputs* - * a list of *outputs* - - * the *interconnections* between those blocks (from an output to an input) - * a list of *datasets*, that yield raw input data for the experiment - * a list of *result outputs*, that produce the results of the experiment - - -.. _beat-core-toolchains-declaration: - -Declaration of a toolchain --------------------------- - -.. 
note:: - - One needs only to declare a toolchain using those specifications when not - using the web interface (i.e. when doing local development or using the web - api). The web interface provides a user-friendly way to declare and modify - toolchains. - -A toolchain is declared in a JSON file, and must contain at least the following -fields: - -.. code-block:: javascript - - { - "datasets": [ - ], - "blocks": [ - ], - "connections": [ - ], - "analyzers": [ - ] - } - -Note that this toolchain is considered as a correct one by the platform (i.e. -it doesn't contain any error, and thus can be modified via the web interface), -but not as an executable one, as there is nothing to execute. - -For display purposes, the JSON file may contain an additional field called -`representation`, which provides insight on how to display the workflow in -a graphical way. - - -.. _beat-core-toolchains-datasets: - -Declaration of the datasets ---------------------------- - -Datasets are starting points of a toolchain. They provide raw input data for a -scientific experiment, and they have, hence, outputs but no input. Several -datasets are typically attached to a given a protocol of a database, each of -them having a particular role. For instance, a protocol for a classification -task may provide three distinct datasets, one for training a generic model, -one for enrolling class-specific models, and one for generating probe samples -that are compared against the enrolled models. - -To define the dataset, its name as well as its corresponding outputs have to -be defined in the JSON declaration of the toolchain. Considering the example -mentioned above, this would look like: - -.. code-block:: javascript - - { - ... 
- "datasets": [ - { - "outputs": [ - "image", - "eye_centers" - ], - "name": "train" - }, - { - "outputs": [ - "template_id", - "client_id", - "image", - "eye_centers" - ], - "name": "templates" - }, - { - "outputs": [ - "probe_id", - "client_id", - "template_ids", - "image", - "eye_centers" - ], - "name": "probes" - }, - ... - } - - -.. _beat-core-toolchains-blocks: - -Declaration of the processing blocks ------------------------------------- - -To define the processing blocks contained in a toolchain, just add some entries -into the ``blocks`` array, such as: - -.. code-block:: javascript - - { - ... - "blocks": [ - { - "inputs": [ - "image", - "eye_centers" - ], - "synchronized_channel": "train", - "name": "cropping_rgb_train", - "outputs": [ - "image_gray" - ] - } - ] - ... - } - -Here we defined a block named `cropping_rgb_train`, which expects two inputs, -`image` and `eye_centers`, and returns one output, `image_gray`. The -synchronization channel indicates against which dataset the outputs are -synchronized. However, the toolchain does specify neither the data format of -the inputs and outputs, nor the algorithm that is going to run inside the -block. This is performed in the experiment definition, which combines -dataformats, algorithms and a toolchain together. - -As with the datasets, to define more blocks just add more entries into the -``blocks`` array: - -.. code-block:: javascript - - { - ... - "blocks": [ - { - "inputs": [ - "image", - "eye_centers" - ], - "synchronized_channel": "train", - "name": "cropping_rgb_train", - "outputs": [ - "image_cropped" - ] - }, - { - "inputs": [ - "image_cropped" - ], - "synchronized_channel": "train", - "name": "feature_extraction_train", - "outputs": [ - "feature_vector" - ] - } - ] - ... - } - - -.. 
_beat-core-toolchains-connections: - -Declaration of the connections between the processing blocks ------------------------------------------------------------- - -To define a connection between two processing blocks (or one dataset and one -processing block), just add one entry into the ``connections`` array, with one -of the following forms: - -.. code-block:: javascript - - { - "from": "block1_name.output_name", - "to": "block2_name.input_name" - } - -or: - -.. code-block:: javascript - - { - "from": "dataset_name.output_name", - "to": "block_name.input_name" - } - -For example: - -.. code-block:: javascript - - { - ... - "connections": [{ - "from": "cropping_rgb_train.image_cropped", - "to": "feature_extraction_train.image_cropped" - } - ], - "blocks": [ - { - "inputs": [ - "image", - "eye_centers" - ], - "synchronized_channel": "train", - "name": "cropping_rgb_train", - "outputs": [ - "image_cropped" - ] - }, - { - "inputs": [ - "image_cropped" - ], - "synchronized_channel": "train", - "name": "feature_extraction_train", - "outputs": [ - "feature_vector" - ] - } - ] - ... - } - - -Several important things to note: - - * The names of the connected output and input don't need to be the same. Use - whatever makes sense in the context of each block - * An output can be connected to several inputs - * An input can only be connected to one output - * The names of the blocks and of the datasets must be unique in the - toolchain - - -.. _beat-core-toolchains-results: - -Declaration of the outputs to use as results --------------------------------------------- - -To declare that a particular processing block output produces the result of the -toolchain (or a part of it), just add one entry into the ``analyzers`` field, -with the following form: - -.. code-block:: javascript - - { - ... - "analyzers": [ - { - "inputs": [ - "scoring_dev_scores", - "scoring_test_scores" - ], - "synchronized_channel": "probes", - "name": "analysis" - } - ] - ...
- } - -The field `inputs` lists the results, while the field `synchronized_channel` -indicates the dataset against which to automatically perform the loop as for -any regular toolchain block. - -The data written on those `inputs` will be used to display results and plots -on the web interface. - - -.. _beat-core-toolchains-example: - -Putting it all together: a complete example -------------------------------------------- - -.. _beat-core-toolchains-example-figure: -.. figure:: img/toolchain-example-2.* - - A complete toolchain that trains and tests an Eigenfaces system - -The following example describes the toolchain shown in -:numref:`beat-core-toolchains-example-figure`, a complete toolchain that: - - #. trains an Eigenfaces face recognition system on one set of images (*train*) - #. enrolls client-specific models on another set of images (*templates*) - #. tests these models using samples from a third set of images (*probes*) - -.. note:: - - A toolchain is still not executable, since it contains no mention of the - algorithms that must be used in each processing block, nor of the - database to use. - - -..
code-block:: javascript - - { - "datasets": [ - { - "outputs": [ - "image" - ], - "name": "train" - }, - { - "outputs": [ - "template_id", - "client_id", - "image" - ], - "name": "templates" - }, - { - "outputs": [ - "probe_id", - "client_id", - "template_ids", - "image" - ], - "name": "probes" - } - ], - "blocks": [ - { - "inputs": [ - "image" - ], - "synchronized_channel": "train", - "name": "linear_machine_training", - "outputs": [ - "subspace" - ] - }, - { - "inputs": [ - "image", - "template_id", - "subspace" - ], - "synchronized_channel": "templates", - "name": "template_builder", - "outputs": [ - "projections" - ] - }, - { - "inputs": [ - "image", - "probe_id", - "subspace" - ], - "synchronized_channel": "probes", - "name": "probe_builder", - "outputs": [ - "projections" - ] - }, - { - "inputs": [ - "templates_client_id", - "templates_template_id", - "template_builder_projections", - "probes_client_id", - "probes_probe_id", - "probe_builder_projections", - "probes_template_ids" - ], - "synchronized_channel": "probes", - "name": "scoring", - "outputs": [ - "scores" - ] - } - ], - "analyzers": [ - { - "inputs": [ - "scores" - ], - "synchronized_channel": "probes", - "name": "analysis" - } - ], - "connections": [ - { - "to": "linear_machine_training.image", - "from": "train.image", - "channel": "train" - }, - { - "to": "template_builder.image", - "from": "templates.image", - "channel": "templates" - }, - { - "to": "template_builder.template_id", - "from": "templates.template_id", - "channel": "templates" - }, - { - "to": "template_builder.subspace", - "from": "linear_machine_training.subspace", - "channel": "train" - }, - { - "to": "probe_builder.image", - "from": "probes.image", - "channel": "probes" - }, - { - "to": "probe_builder.probe_id", - "from": "probes.probe_id", - "channel": "probes" - }, - { - "to": "probe_builder.subspace", - "from": "linear_machine_training.subspace", - "channel": "train" - }, - { - "to": "scoring.templates_client_id", - "from": 
"templates.client_id", - "channel": "templates" - }, - { - "to": "scoring.templates_template_id", - "from": "templates.template_id", - "channel": "templates" - }, - { - "to": "scoring.template_builder_projections", - "from": "template_builder.projections", - "channel": "templates" - }, - { - "to": "scoring.probes_client_id", - "from": "probes.client_id", - "channel": "probes" - }, - { - "to": "scoring.probes_probe_id", - "from": "probes.probe_id", - "channel": "probes" - }, - { - "to": "scoring.probe_builder_projections", - "from": "probe_builder.projections", - "channel": "probes" - }, - { - "to": "scoring.probes_template_ids", - "from": "probes.template_ids", - "channel": "probes" - }, - { - "to": "analysis.scores", - "from": "scoring.scores", - "channel": "probes" - } - ] - } -