diff --git a/doc/algorithms.rst b/doc/algorithms.rst deleted file mode 100644 index c15e884aa7cdf55ac3cf826d42cc70e61640585b..0000000000000000000000000000000000000000 --- a/doc/algorithms.rst +++ /dev/null @@ -1,886 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-algorithms: - -=========== -Algorithms -=========== - -Algorithms are user-defined piece of software that run within the blocks of a -toolchain. An algorithm can read data on the input(s) of the block and write -processed data on its output(s). They are, hence, key components for -scientific experiments, since they formally describe how to transform raw -data into higher level concept such as classes. - - -An algorithm lies at the core of each processing block and may be subject to -parametrization. Inputs and outputs of an algorithm have well-defined data -formats. 
The format of the data on each input and output of the block is -defined at a higher-level in the platform. It is expected that the -implementation of the algorithm respects whatever was declared on the -platform. - -By default, the algorithm is **data-driven**; algorithm is typically provided -one data sample at a time and must immediately produce some output data. -Furthermore, the way the algorithm handle the data is highly configurable and -covers a huge range of possible scenarios. - -:numref:`beat-core-overview-block` displays the relationship between a -processing block and its algorithm. - -.. _beat-core-overview-block: -.. figure:: ./img/block.* - - Relationship between a processing block and its algorithm - -This section contains information on the definition of algorithm and -its programmatic use on Python-based language bindings. - - -.. _beat-core-algorithms-definition: - -Definition ----------- - -An algorithm is defined by two distinct components: - -* a `JSON`_ object with several fields, specifying the inputs, the outputs, - the parameters and additional information such as the language in which it - is implemented. -* source code (and/or [later] binary code) describing how to transform the input - data. - - -.. _beat-core-algorithms-definition-json: - -JSON Declaration -................ - -A `JSON`_ declaration of an algorithm consists of several fields. For example, -the following declaration is the one of an algorithm implementing -probabilistic component analysis (PCA): - -.. 
code-block:: javascript - - { - "language": "python", - "splittable": false, - "groups": [ - { - "inputs": { - "image": { - "type": "system/array_2d_uint8/1" - } - }, - "outputs": { - "subspace": { - "type": "tutorial/linear_machine/1" - } - } - } - ], - "parameters": { - "number-of-components": { - "default": 5, - "type": "uint32" - } - }, - "description": "Principal Component Analysis (PCA)" - } - -The field `language` specifies the language in which the algorithm is -implemented. The field `splittable` indicates, whether the algorithm can be -parallelized into chunks or not. The field `parameters` lists the parameters -of the algorithm, describing both default values and their types. The field -`groups` gives information about the inputs and outputs of the algorithm. -They are provided into a list of dictionary, each element in this list being -associated to a database `channel`. The group, which contains outputs, is -the **synchronization channel**. By default, a loop is automatically performs -by the platform on the synchronization channel, and user-code must not loop -on this group. In contrast, it is the responsability of the user to load data -from the other groups. This is described in more details in the following -subsections. Finally, the field `description` is optional and gives a short -description of the algorithm. - -The web client of the BEAT platform provides a graphical editor for algorithm, -which simplifies its `JSON`_ declaration definition. - - -.. _beat-core-algorithms-definition-analyzer: - -Analyzer -........ - -At the end of the processing workflow of an experiment, there is a special -kind of algorithm, which does not yield any output, but in contrast so called -`results`. These algorithms are called **analyzers**. - -`Results` of an experiment are reported back to the user. 
Since the platform -is concerned about data privacy, only a limited number of data formats can be -employed as results in an analyzer, such as boolean, integers, floating point -values, strings (of limited size), as well as plots (such as scatter or bar -plots). - -For example, the following declaration is the one of a simple analyzer, which -generates an ROC curve as well as few other metrics. - -.. code-block:: javascript - - { - "language": "python", - "groups": [ - { - "inputs": { - "scores": { - "type": "tutorial/probe_scores/1" - } - } - } - ], - "results": { - "far": { - "type": "float32", - "display": true - }, - "roc": { - "type": "plot/scatter/1", - "display": false - }, - "number_of_positives": { - "type": "int32", - "display": false - }, - "frr": { - "type": "float32", - "display": true - }, - "eer": { - "type": "float32", - "display": true - }, - "threshold": { - "type": "float32", - "display": false - }, - "number_of_negatives": { - "type": "int32", - "display": false - } - } - } - - -.. _beat-core-algorithms-definition-code: - -Source Code -........... - -The BEAT platform has been designed to support algorithms written in different -programming languages. However, for each language, a corresponding back-end -needs to be implemented, which is in charge of connecting the inputs and -outputs to the algorithm and running its code as expected. In this section, -we describe the implementation of algorithms in the Python programming -language. - -To implement a new algorithm, one must write a class following a few -conventions. In the following, examples of such classes are provided. - - -.. _beat-core-algorithms-examples: - -Examples --------- - -.. _beat-core-algorithms-examples-simple: - -Simple algorithm (no parametrization) -..................................... - -At the very minimum, an algorithm class must look like this: - -.. 
code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - # Read data from inputs, compute something, and write the result - # of the computation on outputs - ... - return True - -The class must be called ``Algorithm`` and must have a method called -``process()``, that takes as parameters a list of inputs (see section -:ref:`beat-core-algorithms-input-inputlist`) and a list of outputs (see -section :ref:`beat-core-algorithms-output-outputlist`). This method must -return ``True`` if everything went correctly, and ``False`` if an error -occurred. - -The platform will call this method once per block of data available on the -`synchronized` inputs of the block. - - -.. _beat-core-algorithms-examples-parametrizable: - -Parametrizable algorithm -........................ - -To implement a parametrizable algorithm, two things must be added to the class: -(1) a field in the JSON declaration of the algorithm containing their default -values as well as the type of the parameters, and (2) a method called -``setup()``, that takes one argument, a map containing the parameters of the -algorithm. - -.. code-block:: javascript - - { - ... - "parameters": { - "threshold": { - "default": 0.5, - "type": "float32" - } - }, - ... - } - -.. code-block:: python - - class Algorithm: - - def setup(self, parameters): - # Retrieve the value of the parameters - self.threshold = parameters['threshold'] - return True - - def process(self, inputs, outputs): - # Read data from inputs, compute something, and write the result - # of the computation on outputs - ... - return True - -When retrieving the value of the parameters, one must not assume that a value -was provided for each parameter. This is why we may use a *try: ... except: ...* -construct in the ``setup()`` method. - -.. _beat-core-algorithms-input: - -Handling input data -------------------- - -.. _beat-core-algorithms-input-inputlist: - -Input list -.......... 
- -An algorithm is given access to the **list of the inputs** of the processing -block. This list can be used to access each input individually, either by -their name (see section :ref:`beat-core-algorithms-input-name`), their index -or by iterating over the list: - -.. code-block:: python - - # 'inputs' is the list of inputs of the processing block - - print(inputs['labels'].data_format) - - for index in range(0, inputs.length): - print(inputs[index].data_format) - - for input in inputs: - print(input.data_format) - - for input in inputs[0:2]: - print(input.data_format) - -Additionally, the following method is useable on a **list of inputs**: - -.. py:method:: hasMoreData() - - Indicates if there is (at least) another block of data to process on some of - the inputs - - -.. _beat-core-algorithms-input-input: - -Input -..... - -Each input provides the following informations: - -.. py:attribute:: name - - *(string)* Name of the input - -.. py:attribute:: data_format - - *(string)* Data format accepted by the input - -.. py:attribute:: data_index - - *(integer)* Index of the last block of data received on the input (See section - :ref:`beat-core-algorithms-input-synchronization`) - -.. py:attribute:: data - - *(object)* The last block of data received on the input - -The structure of the ``data`` object is dependent of the data format assigned to -the input. Note that ``data`` can be *None*. - -.. _beat-core-algorithms-input-name: - -Input naming -............ - -Each algorithm assign a name of its choice to each input (and output, see -section :ref:`beat-core-algorithms-output-name`). This mechanism ensures that algorithms -are easily shareable between users. - -For instance, in :numref:`beat-core-algorithms-input-naming`, two different users -(Joe and Bill) are using two different toolchains. 
Both toolchains have one -block with two entries and one output, with a similar set of data formats -(*image/rgb* and *label* on the inputs, *array/float* on the output), although -not in the same order. The two blocks use different algorithms, which both -refers to their inputs and outputs using names of their choice - -Nevertheless, Joe can choose to use Bill's algorithm instead of his own one. -When the algorithm to use is changed on the web interface, the platform will -attempt to match each input with the names (and types) declared by the -algorithm. In case of ambiguity, the user will be asked to manually resolve it. - -In other words: the way the block is connected in the toolchain doesn't force a -naming scheme or a specific order of inputs to the algorithms used in that -block. As long as the set of data types (on the inputs and outputs) is -compatible for both the block and the algorithm, the algorithm can be used in -the block. - -.. _beat-core-algorithms-input-naming: -.. figure:: ./img/inputs-naming.* - - Different toolchains, but interchangeable algorithms - -The name of the inputs are assigned in the JSON declaration of the algorithm, -such as: - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "name1": { - "type": "data_format_1" - }, - "name2": { - "type": "data_format_2" - } - } - } - ], - ... - } - - -.. _beat-core-algorithms-input-synchronization: - -Inputs synchronization -...................... - -The data available on the different inputs from the synchronized channels -are (of course) synchronized. Let's consider the example toolchain on -:numref:`beat-core-algorithms-input-synchronization-example`, where: - -* The image database provides two kind of data: some *images* and their - associated *labels* -* The *block A* receives both data via its inputs -* The *block B* only receives the *labels* -* Both algorithms are *data-driven* - -The system will ask the *block A* to process 6 images, one by one. 
On the -second input, the algorithm will find the correct label for the current image. -The ``block B`` will only be asked to process 2 labels. - -The algorithm can retrieve the index of the current block of data of each of -its input by looking at their ``data_index`` attribute. For simplicity, the -list of inputs has two attributes (``current_data_index`` and -``current_end_data_index``) that indicates the data indexes currently used by -the synchronization mechanism of the platform. - -.. _beat-core-algorithms-input-synchronization-example: -.. figure:: ./img/inputs-synchronization.* - :width: 80% - - Synchronization example - - -.. _beat-core-algorithms-input-unsynchronized: - -Additional input methods for unsynchronized channels -.................................................... - -Unsynchronized input channels of algorithms can be accessed at will, and -algorithms can use it any way they want. To be able to perform their job, they -have access to additional methods. - -The following method is useable on a **list of inputs**: - -.. py:method:: next() - - Retrieve the next block of data on all the inputs **in a synchronized - manner** - - -Let's come back at the example toolchain on -:numref:`beat-core-algorithms-input-synchronization-example`, and assume -that *block A* uses an autonomous algorithm. To iterate over all the data on -its inputs, the algorithm would do: - -.. code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Iterate over all the unsynchronized data - while inputs.hasMoreData(): - inputs.next() - - # Do something with inputs['images'].data and inputs['labels'].data - ... - - # At this point, there is no more data available on inputs['images'] and - # inputs['labels'] - - return True - - -The following methods are useable on an ``input``, in cases where the algorithm -doesn't care about the synchronization of some of its inputs: - -.. 
py:method:: hasMoreData() - - Indicates if there is (at least) another block of data available on the input - -.. py:method:: next() - - Retrieve the next block of data - - .. warning:: - - Once this method has been called by an algorithm, the input is no more - automatically synchronized with the other inputs of the block. - -In the following example, the algorithm desynchronizes one of its inputs but -keeps the others synchronized and iterate over all their data: - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - }, - "desynchronized": { - "type": "number" - } - } - } - ], - ... - } - - -.. code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Desynchronize the third input. From now on, inputs['desynchronized'].data - # and inputs['desynchronized'].data_index won't change - inputs['desynchronized'].next() - - # Iterate over all the data on the inputs still synchronized - while inputs.hasMoreData(): - inputs.next() - - # Do something with inputs['images'].data and inputs['labels'].data - ... - - # At this point, there is no more data available on inputs['images'] and - # inputs['labels'], but there might be more on inputs['desynchronized'] - - return True - - -.. _beat-core-algorithms-input-feedbackloop: - -Feedback inputs -............... - -The :numref:`beat-core-algorithms-input-feedbackloop-example` shows a toolchain -containing a feedback loop. A special kind of input is needed in this scenario: -a *feedback input*, that isn't synchronized with the other inputs, and can be -freely used by the algorithm. - -Those feedback inputs aren't yet implemented in the prototype of the platform. -This will be addressed in a later version. - -.. _beat-core-algorithms-input-feedbackloop-example: -.. figure:: ./img/feedback-loop.* - - Feedback loop - - -.. _beat-core-algorithms-output: - -Handling output data --------------------- - -.. 
_beat-core-algorithms-output-outputlist: - -Output list -........... - -An algorithm is given access to the **list of the outputs** of the processing -block. This list can be used to access each output individually, either by -their name (see section :ref:`beat-core-algorithms-output-name`), their index -or by iterating over the list: - -.. code-block:: python - - # 'outputs' is the list of outputs of the processing block - - print outputs['features'].data_format - - for index in range(0, outputs.length): - outputs[index].write(...) - - for output in outputs: - output.write(...) - - for output in outputs[0:2]: - output.write(...) - - -.. _beat-core-algorithms-output-output: - -Output -...... - -Each output provides the following informations: - -.. py:attribute:: name - - *(string)* Name of the output - -.. py:attribute:: data_format - - *(string)* Format of the data written on the output - - -And the following methods: - -.. py:method:: createData() - - Retrieve an initialized block of data corresponding to the data format of - the output - -.. py:method:: write(data, end_data_index=None) - - Write a block of data on the output - - -We'll look at the usage of those methods through some examples in the following -sections. - - -.. _beat-core-algorithms-output-name: - -Output naming -............. - -Like for its inputs, each algorithm assign a name of its choice to each output -(see section :ref:`beat-core-algorithms-input-name` for more details) by -including them in the JSON declaration of the algorithm. - - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - ... - }, - "outputs": { - "name1": { - "type": "data_format1" - }, - "name2": { - "type": "data_format2" - } - } - } - ], - ... - } - - -.. _beat-core-algorithms-output-example1: - -Example 1: Write one block of data for each received block of data -.................................................................. - -.. _beat-core-algorithms-output-example1-figure: -.. 
figure:: ./img/outputs-example1.* - - Example 1: 6 images as input, 6 blocks of data produced - -Consider the example toolchain on -:numref:`beat-core-algorithms-output-example1-figure`. We will implement a -*data-driven* algorithm that will write one block of data on the output of the -block for each image received on its inputs. This is the simplest case. - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - } - }, - "outputs": { - "features": { - "type": "array/float" - } - } - } - ], - ... - } - - -.. code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Ask the output to create a data object according to its data format - data = outputs['features'].createData() - - # Compute something from inputs['images'].data and inputs['labels'].data - # and store the result in 'data' - ... - - # Write our data block on the output - outputs['features'].write(data) - - return True - - -The structure of the ``data`` object is dependent of the data format assigned -to the output. - - -.. _beat-core-algorithms-output-example2: - -Example 2: Skip some blocks of data -................................... - -.. _beat-core-algorithms-output-example2-figure: -.. figure:: ./img/outputs-example2.* - - Example 2: 6 images as input, 4 blocks of data produced, 2 blocks of data - skipped - -Consider the example toolchain on -:numref:`beat-core-algorithms-output-example2-figure`. This time, our algorithm -will use a criterion to decide if it can perform its computation on an image or -not, and tell the platform that, for a particular data index, no data is -available. - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - } - }, - "outputs": { - "features": { - "type": "array/float" - } - } - } - ], - ... - } - -.. 
code-block:: python - - class Algorithm: - - def process(self, inputs, outputs): - - # Use a criterion on the image to determine if we can perform our - # computation on it or not - if can_compute(inputs['images'].data): - # Ask the output to create a data object according to its data format - data = outputs['features'].createData() - - # Compute something from inputs['images'].data and inputs['labels'].data - # and store the result in 'data' - ... - - # Write our data block on the output - outputs['features'].write(data) - else: - # Tell the platform that no data is available for this image - outputs['features'].write(None) - - return True - - def can_compute(self, image): - # Implementation of our criterion - ... - return True # or False - - -.. _beat-core-algorithms-output-example3: - -Example 3: Write one block of data related to several received blocks of data -............................................................................. - -.. _beat-core-algorithms-output-example3-figure: -.. figure:: ./img/outputs-example3.* - - Example 3: 6 images as input, 2 blocks of data produced - -Consider the example toolchain on -:numref:`beat-core-algorithms-output-example3-figure`. This time, our algorithm -will compute something using all the images with the same label (all the dogs, -all the cats) and write only one block of data related to all those images. - -The key here is the correct usage of the **current end data index** of the -input list to specify the indexes of the blocks of data we write on the output. -This ensure that the data will be synchronized everywhere in the toolchain: the -platform can now tell, for each of our data block, which image and label it -relates to (See section :ref:`beat-core-algorithms-input-synchronization`). 
- -Additionally, since we can't know in advance if the image currently processed -is the last one with the current label, we need to memorize the current data -index of the input list to correctly assign it later when we effectively write -the data block on the output. - -.. code-block:: javascript - - { - ... - "groups": [ - { - "inputs": { - "images": { - "type": "image/rgb" - }, - "labels": { - "type": "label" - } - }, - "outputs": { - "features": { - "type": "array/float" - } - } - } - ], - ... - } - -.. code-block:: python - - class Algorithm: - - def __init__(self): - self.data = None # Block of data updated each time we - # receive a new image - self.current_label = None # Label of the images currently processed - self.previous_data_index = None # Data index of the input list during the - # processing of the previous image - - def process(self, inputs, outputs): - # Determine if we already processed some image(s) - if self.data is not None: - # Determine if the label has changed since the last image we processed - if inputs['labels'].data.name != self.current_label: - # Write the block of data on the output - outputs['features'].write(data, self.previous_data_index) - self.data = None - - # Memorize the current data index of the input list - self.previous_data_index = inputs.current_end_data_index - - # Create a new block of data if necessary - if self.data is None: - # Ask the output to create a data object according to its data format - self.data = outputs['features'].createData() - - # Remember the label we are currently processing - self.current_label = inputs['labels'].data.name - - # Compute something from inputs['images'].data and inputs['labels'].data - # and update the content of 'self.data' - ... - - # Determine if this was the last block of data or not - if not(inputs.hasMoreData()): - # Write the block of data on the output - outputs['features'].write(self.data, inputs.current_end_data_index) - - return True - - -.. 
include:: links.rst diff --git a/doc/backend_api.rst b/doc/backend_api.rst index 88ea148bd60b8d9633ae8269885095967110e90e..31484c5eb835b425ffce31a1f68c7a50f0db870e 100644 --- a/doc/backend_api.rst +++ b/doc/backend_api.rst @@ -34,11 +34,70 @@ with a hybrid set of algorithms that execute on different backends. Each backend can be implemented in a different programming language and contain any number of (pre-installed) libraries users can call on their algorithms. -This document describes the API required by such backend implementations. The +The requirements for BEAT when reading/writing data are: + + * Ability to manage large and complex data + * Portability to allow the use of heterogeneous environments + +Based on our experience and on these requirements, we investigated +the use of HDF5. Unfortunately, HDF5 is not convenient to handle +structures such as arrays of variable-size elements, for instance, +arrays of strings. +Therefore, we decided to rely on our own binary format. + + +This document describes the binary formats in BEAT and the API required by BEAT to handle multiple backend implementations. The package `beat.env.python27`_ provides the *reference* Python backend implementation based on `Python 2.7`_. +Binary Format +------------- + +Our binary format does *not* contain information about the format of the data +itself, and it is hence necessary to know this format a priori. This means that +the format cannot be inferred from the content of a file. + +We rely on the following fundamental C-style formats: + + * int8 + * int16 + * int32 + * int64 + * uint8 + * uint16 + * uint32 + * uint64 + * float32 + * float64 + * complex64 (first real value, and then imaginary value) + * complex128 (first real value, and then imaginary value) + * bool (written as a byte) + * string + +An element of such a basic format is written in the C-style way, using +little-endian byte ordering. 
+ +Besides, dataformats always consist of arrays or dictionaries of such fundamental +formats or compound formats. + +An array of elements is saved as follows. First, the shape of the array is +saved using a *uint64* value for each dimension. Next, the elements of the +arrays are saved in C-style order. + +A dictionary of elements is saved as follows. First, the keys are ordered +according to the lexicographic ordering. Then, the values associated with each of +these keys are saved following this ordering. + +The platform is data-driven and always processes chunks of data. Therefore, +data are always written by chunks, each chunk being preceded by a text-formatted +header indicating the start and end indices followed by the size (in bytes) of +the chunk. + +Considering the Python backend of the platform, this binary format has been +successfully implemented using the ``struct`` module. + + Filesystem Organization ----------------------- diff --git a/doc/databases.rst b/doc/databases.rst deleted file mode 100644 index 749974bc33778bf5dcf0da6d40d37f6da1ef7758..0000000000000000000000000000000000000000 --- a/doc/databases.rst +++ /dev/null @@ -1,94 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. 
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -========== -Databases -========== - -A database is a collection of data files, one for each output of the database. -This data are inputs to the BEAT toolchains. Therefore, it is important to -define evaluation protocols, which describe how a specific system must use the -raw data of a given database. - -For instance, a recognition system will typically use a subset of the data to -train a recognition `model`, while another subset of data will be used to -evaluate the performance of this model. - - -Structure of a database ------------------------ - -A database has the following structure on disk:: - - database_name/ - output1_name.data - output2_name.data - ... - outputN_name.data - -For a given database, the BEAT platform will typically stores information -about the root folder containing this raw data as well as a description of -it. - - -Evaluation protocols --------------------- - -A BEAT evaluation protocol consists of several ``datasets``, each datasets -having several ``outputs`` with well-defined data formats. In practice, -each dataset will typically be used for a different purpose. - -For instance, in the case of a simple face recognition protocol, the -database may be split into three datasets: one for training, one for enrolling -client-specific model, and one for testing these models. -The training dataset may have two outputs: grayscale images as two-dimensional -array of type `uint8` and client id as `uint64` integers. - -The BEAT platform is data-driven, which means that all the outputs of a given -dataset are synchronized. The way the data is generated by each template -is defined in a piece of code called the ``database view``. 
It is important -that a database view has a deterministic behavior for reproducibility -purposes. - - -Database set templates ----------------------- - -In practice, different databases used for the same purpose may have the exact -same datasets with the exact same outputs (and attached data formats). In this -case, it is interesting to abstract the definition of the database sets from -a given database. BEAT defines ``database set templates`` for this purpose. - -For instance, the simple face recognition evaluation protocol described above, -which consists of three datasets and few inputs may be abstracted in a -database set template. This template defines both the datasets, their outputs -as well as their corresponding data formats. Next, if several databases -implements such a protocol, they may rely on the same `database set template`. -Similarly, evaluation protocols testing different conditions (such as -enrolling on clean and testing on clean data vs. enrolling on clean and -testing on noisy data) may rely on the same database set template. - -In practice, this reduces the amount of work to integrate new databases and/or -new evaluation protocols into the platform. Besides, at the experiment level, -this allows to re-use a toolchain on a different database, with almost no -configuration changes from the user. diff --git a/doc/dataformats.rst b/doc/dataformats.rst deleted file mode 100644 index e97b636fe78e1e6fee8ceb2634e77d5f72e27a9d..0000000000000000000000000000000000000000 --- a/doc/dataformats.rst +++ /dev/null @@ -1,361 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. 
For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-dataformats: - -============= -Data formats -============= - -Data formats formalize the interaction between algorithms and data sets, so -they can communicate data in an orderly manner. All data formats produced or -consumed by these objects must be formally declared. Two algorithms which must -directly communicate data must produce and consume the same type of data -objects. - -A data format specifies a list of typed fields. An algorithm or data set -generating a block of data (via one of its outputs) **must** fill all the -fields declared in that data format. An algorithm consuming a block of data -(via one of its inputs) **must not** expect the presence of any other field -than the ones defined by the data format. - -This section contains information on the definition of dataformats, its -programmatic use on Python-based language bindings. - - -Definition ----------- - -A data format is declared as a `JSON`_ object with several fields. For example, -the following declaration could represent the coordinates of a rectangular -region in an image: - -.. code-block:: json - - { - "x": "int32", - "y": "int32", - "width": "int32", - "height": "int32" - } - -.. 
note:: - - We have chosen to define objects inside the BEAT platform using JSON - declarations as JSON files can be easily validated, transferred through - web-based APIs and provide and easy to read format for local inspection. - -Each field must be named according to typical programming rules for variable -names. For example, these are valid names: - - * ``my_field`` - * ``_my_field`` - * ``number1`` - -These are invalid field names: - - * ``1number`` - * ``my field`` - -The following regular expression is used to validate field names: -``^[a-zA-Z_][a-zA-Z0-9_-]*$``. In short, a field name has to start with a -letter or an underscore character and can contain, immediately after, any -number of alpha-numerical characters or underscores. - -By convention, fields prefixed and suffixed with a double underscore (``__``) -are reserved and should be avoided. - -The special field ``#description`` can be used to store a short description of -the declared data format and also ignored: - -.. code-block:: json - - { - "#description": "A rectangle in an pixeled image", - "x": "int32", - "y": "int32", - "width": "int32", - "height": "int32" - } - -The ``#description`` field is ignored in practice and only used for -informational purposes. - -Each field in a declaration has a well-defined type, which can be one of: - - * a primitive, simple type (see :ref:`beat-core-dataformats-simple`) - * a directly nested object (see :ref:`beat-core-dataformats-complex`) - * another data format (see :ref:`beat-core-dataformats-aggregation`) - * an array (see :ref:`beat-core-dataformats-array`) - -A data format can also extend another one, as explained further down (see -ref:`beat-core-dataformats-extension`). - - -.. 
_beat-core-dataformats-simple: - -Simple types ------------- - -The following primitive data types are available in the BEAT platform: - - * Integers: ``int8``, ``int16``, ``int32``, ``int64`` - * Unsigned integers: ``uint8``, ``uint16``, ``uint32``, ``uint64`` - * Floating-point numbers: ``float32``, ``float64`` - * Complex numbers: ``complex64``, ``complex128`` - * ``bool`` - * ``string`` - -.. note:: - - All primitive types are implemented using their :py:mod:`numpy` - counterparts. - -When determining if a block of data corresponds to a data format, the platform -will check that the value of each field can safely (without loss of precision) -be converted to the type declared by the data format. An error is generated if -you fail to follow these requirements. - -For example, an ``int8`` *can* be converted, without a precision loss, to an -``int16``, but a ``float32`` **cannot** be losslessly converted to an -``int32``. In case of doubt, you can manually test for `NumPy safe-casting -rules`_ yourself in order to understand imposed restrictions. If you wish to -allow for a precision loss on your code, you must do it explicitly (`Zen of -Python`_). - - -.. _beat-core-dataformats-complex: - -Complex types -------------- - -A data format can be composed of complex objects formed by nesting other types. -The coordinates of a rectangular region in an image be represented like this: - -.. code-block:: json - - { - "coords": { - "x": "int32", - "y": "int32" - }, - "size": { - "width": "int32", - "height": "int32" - } - } - - -.. _beat-core-dataformats-aggregation: - -Aggregation ------------ - -.. note:: - - Data formats are named using 3 values joined by a ``/`` (slash) separator: - the username who is the author of the dataformat, an identifier and the - object version (integer starting from 1). 
Here are examples of data format - names: - - * ``user/my_format/1`` - * ``johndoe/integers/37`` - * ``mary_mary/rectangle/2`` - - -A field can use the declaration of another data format instead of specifying -its own declaration. Consider the following data formats, on their first -version, for user ``user``: - -.. code-block:: json - :caption: Two dimensional coordinates (``user/coordinates/1``) - - { - "x": "int32", - "y": "int32" - } - -.. code-block:: json - :caption: Two dimensional size (``user/size/1``): - - { - "width": "int32", - "height": "int32" - } - -Now let's aggregate both previous formats in order to declare a new data format -for describing a rectangle: - -.. code-block:: json - :caption: The definition of a rectangle - - { - "coords": "user/coordinates/1", - "size": "user/size/1" - } - - -.. _beat-core-dataformats-array: - -Arrays ------- - -A field can be a multi-dimensional array of any other type. For instance, -consider the following example: - -.. code-block:: json - - { - "field1": [10, "int32"], - "field2": [10, 5, "bool"] - } - -Here we declare that ``field1`` is a one-dimensional array of 10 32-bit signed -integers (``int32``), and ``field2`` is a two-dimensional array with 10 rows -and 5 columns of booleans. - -.. note:: - - In the Python language representation of data formats, multi-dimensional - arrays are implemented using :py:class:`numpy.ndarray`'s. - - -An array can have as many dimensions as you want. It can also contain objects -(either declared inline, or using another data format): - -.. code-block:: json - - { - "inline": [10, { - "x": "int32", - "y": "int32" - }], - "imported": [10, "beat/coordinates/1"] - } - -It is also possible to declare an array without specifying the number of -elements in some of its dimensions, by using a size of 0 (zero): - -.. 
code-block:: json - - { - "field1": [0, "int32"], - "field2": [0, 0, "bool"], - "field3": [10, 0, "float32"] - } - -Here, ``field1`` is a one-dimensional array of 32-bit signed integers -(``int32``), ``field2`` is a two-dimensional array of booleans, and ``field3`` -is a two-dimensional array of floating-point numbers (``float32``) whose the -first dimension is fixed to 10 (number of rows). - -Note that the following declaration isn't valid (you can't fix a dimension if -the preceding one isn't fixed too): - -.. code-block:: json - - { - "error": [0, 10, "int32"] - } - -.. note:: - - When determining if that a block of data corresponds to a data format - containing an array, the platform automatically checks that: - - * the number of dimensions is correct - * the size of each declared dimension that isn't 0 is correct - * the type of each value in the array is correct - - -.. _beat-core-dataformats-extension: - -Extensions ----------- - -Besides aggregation, it is possible to extend data formats through inheritance. -In practice, inheriting from a data format is the same as pasting its -declaration right on the top of the new format. - -For example, one might implement a face detector algorithm and may want to -create a data format containing all the informations about a face (say its -position, its size and the position of each eye). This could be done by -extending the type ``user/rectangular_area/1`` defined earlier: - -.. code-block:: json - - { - "#extends": "user/rectangular_area/1", - "left_eye": "coordinates", - "right_eye": "coordinates" - } - - -.. _beat-core-dataformats-usage: - -Python API ----------- - -Data formats are useful descriptions of data blocks that are consumed by -algorithmic code inside the platform. In BEAT, the user never instantiates data -formats directly. 
Instead, when a new object representing a data format needs -to be created, the user may just create a dictionary in which the keys are the -format field names, whereas the values are instances of the type defined for -such a field. If the type is a reference to another format, the user may nest -dictionaries so as to build objects of any complexity. When the dictionary -representing a data format is written to an algorithm output, the data is -properly validated. - -This concept will become clearer when you'll read about algorithms and the way -they receive and produce data. Here is just a simple illustrative example: - -.. testsetup:: test-output-write - - import numpy - from beat.core.dataformat import DataFormat - from beat.core.test.mocks import MockDataSink - from beat.core.outputs import Output - - dataformat = DataFormat('/not/needed', { - "x": "int32", - "y": "int32", - "width": "int32", - "height": "int32" - }) - assert dataformat.valid - data_sink = MockDataSink(dataformat) - output = Output('test', data_sink) - -.. testcode:: test-output-write - - # suppose, for this example, `output' is provided to your algorithm - output.write({ - "x": numpy.int32(10), - "y": numpy.int32(20), - "width": numpy.int32(100), - "height": numpy.int32(100), - }) - - -.. include:: links.rst - diff --git a/doc/develop.rst b/doc/develop.rst index 6a8092f206af00cb340eb556ac0f120a8c4ce929..d2b3083e45e4cb185fa2c3c5dcaac4d71e907066 100644 --- a/doc/develop.rst +++ b/doc/develop.rst @@ -20,6 +20,7 @@ .. You should have received a copy of the GNU Affero Public License along .. .. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. +.. _beat-core-local-development: =================== Local Development @@ -30,11 +31,11 @@ Go through the following sequence of commands: .. 
code-block:: sh - - $ git co https://gitlab.idiap.ch/bob/bob.admin + $ git clone https://gitlab.idiap.ch/bob/bob.admin $ #install miniconda (version 4.4 or above required) $ conda activate base $ cd beat.backend.python #cd into this package's sources - $ ../bob.admin/conda/conda-bootstrap.py --overwrite --python=2.7 beat-core-dev + $ ../bob.admin/conda/conda-bootstrap.py --overwrite --python=3.6 beat-core-dev $ conda activate beat-core-dev $ #n.b.: docker must be installed on your system (see next section) $ buildout -c develop.cfg @@ -67,17 +68,29 @@ execute algorithms or experiments. We use specific docker images to run user algorithms. Download the following base images before you try to run tests or experiments on your computer:: - $ docker pull docker.idiap.ch/beat/beat.env.system.python:1.1.2 - $ docker pull docker.idiap.ch/beat/beat.env.db.examples:1.1.1 - $ docker pull docker.idiap.ch/beat/beat.env.client:1.2.0 - $ docker pull docker.idiap.ch/beat/beat.env.cxx:1.0.2 + $ docker pull docker.idiap.ch/beat/beat.env.system.python:1.3.0 + $ docker pull docker.idiap.ch/beat/beat.env.db.examples:1.4.0 + $ docker pull docker.idiap.ch/beat/beat.env.client:2.0.0 + $ docker pull docker.idiap.ch/beat/beat.env.cxx:2.0.0 Optionally, also download the following images to be able to re-run experiments -downloaded from the BEAT platform (not required for unit testing):: +downloaded from the BEAT platform (not required for unit testing). These docker +images correspond to the Python environment available on the platform. 
Keep in +mind that at the moment you cannot use different environments to run each block +when you are using BEAT locally (meaning not using the Docker executor):: - $ docker pull docker.idiap.ch/beat/beat.env.python:0.0.4 - $ docker pull docker.idiap.ch/beat/beat.env.python:1.0.0 - $ docker pull docker.idiap.ch/beat/beat.env.db:1.2.2 + $ docker pull docker.idiap.ch/beat/beat.env.python:1.1.0 + $ docker pull docker.idiap.ch/beat/beat.env.python:2.0.0 + $ docker pull docker.idiap.ch/beat/beat.env.db:1.4.0 + +Before pulling these images, you should check the registry as there might have +been new release (i.e. rX versions). + +To run an experiment using docker you should specify the docker image when defining the experiment, then use the `--docker` flag when using `beat.cmdline`:: + + $ beat experiment run --docker + +You can find more information about running experiments locally using different executors in `here `_. Documentation @@ -90,6 +103,7 @@ To build the documentation, just do: $ ./bin/sphinx-build doc sphinx + Testing ------- @@ -103,18 +117,18 @@ use ``nose``: .. note:: - Some of the tests for our command-line toolkit require a running BEAT - platform web-server, with a compatible ``beat.core`` installed (preferably - the same). By default, these tests will be skipped. If you want to run - them, you must setup a development web server and set the environment - variable ``BEAT_CORE_TEST_PLATFORM`` to point to that address. For example:: + Some of the tests for our command-line toolkit require a running BEAT + platform web-server, with a compatible ``beat.core`` installed (preferably + the same). By default, these tests will be skipped. If you want to run + them, you must setup a development web server and set the environment + variable ``BEAT_CORE_TEST_PLATFORM`` to point to that address. 
For example:: - $ export BEAT_CORE_TEST_PLATFORM="http://example.com/platform/" - $ ./bin/nosetests -sv + $ export BEAT_CORE_TEST_PLATFORM="http://example.com/platform/" + $ ./bin/nosetests -sv - .. warning:: + .. warning:: - Do **NOT** run tests against a production web server. + Do **NOT** run tests against a production web server. If you want to skip slow tests (at least those pulling stuff from our servers) @@ -131,15 +145,13 @@ To measure the test coverage, do the following:: Our documentation is also interspersed with test units. You can run them using sphinx:: - $ ./bin/sphinx -b doctest doc sphinx - - -Other bits ----------- + $ ./bin/sphinx -b doctest doc sphinx +Other Bits ========== + Profiling -========== +--------- In order to profile the test code, try the following:: @@ -154,4 +166,4 @@ This will allow you to dump and print the profiling statistics as you may find fit. -.. _docker: https://www.docker.com/ +.. include:: links.rst diff --git a/doc/experiments.rst b/doc/experiments.rst deleted file mode 100644 index d392eda25fa865beca98fde38fb43b712b3eab19..0000000000000000000000000000000000000000 --- a/doc/experiments.rst +++ /dev/null @@ -1,200 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. 
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-experiments: - -============ -Experiments -============ - -An experiment is the reunion of algorithms, datasets, a toolchain and -parameters that allow the platform to schedule and run the prescribed recipe -to produce displayable results. Defining a BEAT experiment can be seen as -configuring the processing blocks of a toolchain, such as selecting which -database, algorithms and algorithm parameters to use. - -.. _beat-core-experiments-declaration: - -Declaration of an experiment ----------------------------- - -.. note:: - - One needs only to declare an experiment using those specifications when not - using the web interface (i.e. when doing local development or using the web - api). The web interface provides a user-friendly way to configure an - experiment. - -An experiment is declared in a JSON file, and must contain at least the following -fields: - -.. code-block:: javascript - - { - "datasets": [ - ], - "blocks": [ - ], - "analyzers": [ - ], - "globals": [ - ] - } - - -.. _beat-core-experiments-datasets: - -Declaration of the dataset(s) ------------------------------ - -The dataset inputs are defined by the toolchain. However, the toolchain does -not describe which data to plug in each dataset input. - -This is the role of the field `datasets` from an experiment. -For each dataset, an experiment must specify three attributes as follows: - -.. code-block:: javascript - - { - "datasets": [ - "templates": { - "set": "templates", - "protocol": "idiap", - "database": "atnt/1" - }, - ... - ], - ... - } - - -The key of an experiment dataset must correspond to the desired dataset name -from the toolchain. 
Then, three fields must be given: - -* `database`: the database name and version -* `protocol`: the protocol name -* `set`: the dataset name of this database to associate to this toolchain - dataset - - -.. _beat-core-experiments-blocks: - -Declaration of the block(s) ---------------------------- - -The blocks are defined by the toolchain. However, the toolchain does not -describe which algorithm to run in each processing block, and how each of these -algorithms are parametrized. - -This is the role of the field `blocks` from an experiment. -For each block, an experiment must specify four attributes as follows: - -.. code-block:: javascript - - { - "blocks": { - "linear_machine_training": { - "inputs": { - "image": "image" - }, - "parameters": {}, - "algorithm": "tutorial/pca/1", - "outputs": { - "subspace": "subspace" - } - }, - ... - }, - ... - } - -The key of an experiment block must correspond to the desired block from the -toolchain. Then, four fields must be given: - -* `algorithm`: the algorithm to use (author_name/algorithm_name/version) -* `inputs`: the list of inputs. The key is the algorithm input, while the - value is the corresponding toolchain input. -* `outputs`: the list of outputs. The key is the algorithm output, while the - value is the corresponding toolchain output. -* `parameters`: the algorithm parameters to use for this processing block - - -.. note:: - - When setting an algorithm in a processing block, this will also set the - dataformats of the outputs (and inputs) of this block. In particular, - this has an impact on all the inputs of blocks connected to those outputs, - which must have the same data formats (or be an extension of these data - formats). The platform automatically validate that the data formats of - consecutive blocks are compatible. - - -.. 
_beat-core-experiments-analyzers: - -Declaration of the analyzer(s) ------------------------------- - -Analyzers are similar to algorithms, except that they run on toolchain -endpoints. There configuration is very similar to the one of regular blocks, -except that they have no `outputs`: - -.. code-block:: javascript - - { - "analyzers": { - "analysis": { - "inputs": { - "scores": "scores" - }, - "algorithm": "tutorial/postperf/1" - } - }, - } - - -Global parameters ------------------ - -Each block and analyzer may rely on its own local parameters. However, several -blocks may rely on the exact same parameters. In this case, it is more -convenient to define those globally. - -For an experiment, this is achieved using the `globals` field in its JSON -declaration. For instance: - -.. code-block:: javascript - - { - "globals": { - "queue": "Default", - "environment": { - "version": "0.0.3", - "name": "Scientific Python 2.7" - }, - "tutorial/pca/1": { - "number-of-components": "5" - } - }, - ... - } - diff --git a/doc/index.rst b/doc/index.rst index cbc414dbf30ae5b28e92a1fd4ecd7165829fbc78..79ba91f5171b36f6ce2f3a1069c819e88e6bd1c6 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -27,20 +27,12 @@ Core BEAT components ====================== -This user guide contains information about BEAT core components, defining -experiments, toolchains and user algorithms among others. +This package provides the core components of BEAT ecosystem. These core components are the building blocks of BEAT experiments that are used by all the other BEAT packages. .. toctree:: introduction - dataformats - algorithms - libraries - toolchains - experiments - databases - io backend_api develop api diff --git a/doc/introduction.rst b/doc/introduction.rst index bdf95746ee08f9dbe363fa36d2faf88eca8a5cb9..b68b07825393bf0962b6339edcc06dce02ac4262 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -23,187 +23,14 @@ .. 
_beat-core-introduction: -============= -Introduction -============= +============== + Introduction +============== -The BEAT platform is a web-based system for certifying results for -software-based data-driven workflows that can be sub-divided functionally (into -processing blocks). The platform takes all burden of hosting data and software -away from users by providing a capable computing farm that handles both aspects -graciously. Data is kept sequestered inside the platform. The user provides the -description of data formats, algorithms, data flows (also known as toolchains) -and experimental details (parameters), which are mashed inside the platform to -produce beautiful results, easily exportable into computer graphics or tables -for scientific reports. +A typical BEAT experiment is composed of several building blocks: datasets that provide data to the system, algorithms that handle the functions introduced by the user, analyzers that are in charge of interpreting the output result and producing the appropriate results and figures, and toolchains that determine the data flow between the blocks from datasets to the final results. In addition, each block accepts specific data formats and the data is synchronized between blocks neatly, without users needing to interfere. These basic functionalities, introduced in `Getting Started with BEAT`_, are all defined and managed by ``beat.core``. For example, as explained in `Algorithms`_, algorithm objects should be derived from the class +``Algorithm`` when using Python; in the case of C++, they should be derived from ``IAlgorithmLegacy``, ``IAlgorithmSequential``, or ``IAlgorithmAutonomous`` depending on the algorithm type. All these parent classes are defined in the ``beat.core`` package. -It is intended as a fundamental building-block in `Reproducible Research`_, -allowing academic and industrial parties to prescribe system behavior and have -it reproducible through software, hardware and staff generations. 
Here are some -known applications: - -* Challenges and competitions on defined data, protocols and workflow - components; -* Study group exercises and exams; -* Support to publication submission; -* System and algorithm performance optimization; -* Reproduction of experiments through communities; -* Support for industry-academy relationship. - -This package, in particular, defines a set of core components useful for the -whole platform: the building blocks used by all other packages in the BEAT -software suite. These are: - -* **Data formats**: the specification of data which is transmitted between - blocks of a toolchain; -* **Libraries**: routines (source-code or binaries) that can be incorporated - into other libraries or user code on algorithms; -* **Algorithms**: the program (source-code or binaries) that defines the user - algorithm to be run within the blocks of a toolchain; -* **Databases** and **Datasets**: means to read raw-data from a disk and feed - into a toolchain, respecting a certain usage protocol; -* **Toolchain**: the definition of the data flow in an experiment; -* **Experiment**: the reunion of algorithms, datasets, a toolchain and - parameters that allow the platform to schedule and run the prescribed recipe - to produce displayable results. - - -.. _beat-core-introduction-example: - -A Simple Example ----------------- - -The next figure shows a representation of a very simple toolchain, composed of -only a few color-coded components: - -* To the left, the reader can identify two datasets, named ``set`` and ``set2`` - respectively. They emit data (of, at this point, an unspecified type) into - the following processing blocks; -* Following the datasets, two processing blocks named ``echo1`` and ``echo2`` - receive the input from the dataset and emit data into a third block, named - ``echo3``; -* The final component receives the inputs emitted from ``echo3`` and it is - called ``analysis``. 
Because this block has no output, it is considered a - final block, from which the BEAT platform expects to collect experiment - results (that, at this point, are also unspecified). - -.. Simple toolchain representation for the BEAT platform -.. graphviz:: img/toolchain-triangle.dot - -The toolchain only defines the very basic data flow and connections that must -be respected by experiments. It does not define what is the type of data that -is produced or consumed by any of the existing blocks, the algorithms or -databases and protocols to use. From the toolchain description, it is possible -to devise a possible execution order, by taking into consideration the imposed -data flow. In this simple example, the datasets called ``set`` and ``set2`` -may yield data in parallel, allowing the execution of blocks ``echo1`` and -``echo2``. Block ``echo3`` must come next, before the ``analysis`` block, which -comes by last. - -In typical problems that can be implemented in the BEAT platform, datasets are -composed of multiple instances of raw data. For example, these could be images -for an object recognition problem, speech sequences for a speech recognition -task or model data for biometric recognition tasks. Computing blocks must -process these data by looping on these atomic data samples. The color-coding in -the figure indicates this extra data-flow information: for each dataset in the -drawing, it indicates how blocks loop on their atomic data. For the proposed, -toolchain, we can observe that blocks ``echo1``, ``echo3`` and ``analysis`` -loop over the "raw" data samples from ``set``, while ``echo2`` loop over the -samples from ``set2``. - -The next figure shows a complete experimental setup for the above toolchain. -The input blocks use a given database, called ``simple/1`` (the name is -``simple`` and the version is ``1``), using one of its protocols called -``protocol``. Each block is set to a specific data set inside the -database/protocol combination. 
Both datasets on this database/protocol yield -objects of type ``beat/integer/1`` (a format called ``integer`` from user -``beat``, version ``1``), which are consumed by algorithms running on the next -blocks. The block ``echo1`` uses the algorithm ``user/integers_echo/1`` (an -algorithm called ``integers_echo`` from user ``user``, version ``1``) and -also yields ``beat/integer/1`` objects. The same is valid for the algorithm -running on block ``echo2``. - -The algorithm for block ``echo3`` cannot possibly be the same - it must deal -with 2 inputs, generated by blocks looping on different raw data. We'll be more -detailed about conceptual differences while writing algorithms which are not -synchronized with all of their inputs next. For this introduction, it suffices -you understand the organization of algorithms in an experiment is constrained -by its neighboring block requirements as well as the input and output -data flows determined for a given block. - -Block ``echo3`` yields elements to the algorithm on the ``analysis`` block, -called ``user/integers_echo_analyzer/1``, which produces a single result named -``out_data``, which is of type ``int32`` (that is, a signed integer with 32 -bits). Algorithms that do not communicate with other algorithms are typically -called ``analyzers``. They are set-up on the end of experiments so as to -produce quantifiable results you can use to measure the performance of your -experimental setup. - -.. Simple experiment representation for the BEAT platform -.. graphviz:: img/experiment-triangle.dot - - -.. _beat-core-introduction-design: - -Design ------- - -The next figure shows an UML representation of main BEAT components, showing -some of their interaction and interdependence. Experiments use algorithms, data -sets and a toolchain in order to define a complete runnable setup. Data sets -are grouped into protocols which are, in turn, grouped into databases. -Algorithms use data formats to defined input and output patterns. 
Most objects -are subject to versioning, possess a name and belong to a specific user. By -contracting those markers, it is possible to define unique identifiers for all -objects in the platform. In the example above, you can identify some examples. - -.. High-level component interaction in the BEAT platform core -.. graphviz:: - - digraph hierarchy { - graph [fontname="helvetica", compound=true, splines=polyline] - node [fontname="helvetica", shape=record, style=filled, fillcolor=gray95] - edge [fontname="helvetica"] - - subgraph "algorithm_cluster" { - 1[label = "{Dataformat|...|+user\n+name\n+version}"] - 2[label = "{Algorithm|...|+user\n+name\n+version\n+code\n+language}"] - 6[label = "{Library|...|+user\n+name\n+version\n+code\n+language}"] - } - subgraph "database_cluster" { - graph [label=datasets] - 3[label = "{Database|...|+name\n+version}"] - 4[label = "{Protocol|...|+template}"] - 5[label = "Set"] - } - subgraph "experiment_cluster" { - graph [label=experiments] - 7[label = "{Toolchain|+execution_order()|+user\n+name\n+version}"] - 8[label = "{Experiment|...|+user\n+label}"] - } - - 1->1 [label = "0..*", arrowhead=empty] - 2->1 [label = "1..*", arrowhead=empty] - 2->6 [label = "0..*", arrowhead=empty] - 6->6 [label = "0..*", arrowhead=empty] - 4->3 [label = "1..*", arrowhead=odiamond] - 5->4 [label = "1..*", arrowhead=odiamond] - 5->1 [label = "1..*", arrowhead=empty] - 8->7 [label = "1..1", arrowhead=empty] - 8->2 [label = "1..*", arrowhead=empty] - 8->5 [label = "1..*", arrowhead=empty] - - } - - -The BEAT platform provides a graphical user interface so that you can program -data formats, algorithms, toolchains and define experiments rather intuitively. -This package provides the core building blocks of the BEAT platform. For expert -users, we provide a command-line interface to the platform, allowing such -users to create, modify and dispose of such objects using their own private -editors. 
For developers and programmers, the rest of this guide details each of -those building blocks, their relationships and how to use such a command-line -interface to interact with the platform efficiently. +The rest of this document includes information about the backend api used to handle data through the BEAT ecosystem. For developers and advanced user there is information for local development of the package. .. include:: links.rst diff --git a/doc/io.rst b/doc/io.rst deleted file mode 100644 index 488f542f05c3a60159396e5dde906a62723cf4da..0000000000000000000000000000000000000000 --- a/doc/io.rst +++ /dev/null @@ -1,93 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _developerguide-io: - -=============== -Inputs/Outputs -=============== - -.. 
_developerguide-io-introduction: - -Introduction ------------- - -The requirements for the platform when reading/writing data are: - - * Ability to manage large and complex data - * Portability to allow the use of heterogeneous environments - -Based on our experience and on these requirements, we investigated -the use of HDF5. Unfortunately, HDF5 is not convenient to handle -structures such as arrays of variable-size elements, for instance, -array of strings. -Therefore, we decided to rely on our own binary format. - -.. _developerguide-io-strategy: - - -Binary Format ------------- - -Our binary format does *not* contain information about the format of the data -itself, and it is hence necessary to know this format a priori. This means that -the format cannot be inferred from the content of a file. - -We rely on the following fundamental C-style formats: - - * int8 - * int16 - * int32 - * int64 - * uint8 - * uint16 - * uint32 - * uint64 - * float32 - * float64 - * complex64 (first real value, and then imaginary value) - * complex128 (first real value, and then imaginary value) - * bool (written as a byte) - * string - -An element of such a basic format is written in the C-style way, using -little-endian byte ordering. - -Besides, dataformats always consist of arrays or dictionary of such fundamental -formats or compound formats. - -An array of elements is saved as follows. First, the shape of the array is -saved using an *uint64* value for each dimension. Next, the elements of the -arrays are saved in C-style order. - -A dictionary of elements is saved as follows. First, the keys are ordered -according to the lexicographic ordering. Then, the values associated to each of -these keys are saved following this ordering. - -The platform is data-driven and always processes chunks of data. 
Therefore, -data are always written by chunks, each chunk being preceded by a text-formatted -header indicating the start- and end- indices followed by the size (in bytes) of -the chunk. - -Considering the Python backend of the platform, this binary format has been -successfully implemented using the ``struct`` module. diff --git a/doc/libraries.rst b/doc/libraries.rst deleted file mode 100644 index 3bd225f3b37d00427e47a678d0fd575221febcd9..0000000000000000000000000000000000000000 --- a/doc/libraries.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-libraries: - -========== -Libraries -========== - -Algorithms are fundamental elements in the platform that formally describe how -to process data. In particular, they are always attached to a specific -processing block with a given set of inputs and outputs. 
When an algorithm -needs to be applied in a slightly different processing block, this may, hence, -lead to a lot of code duplication. Duplicate code is undesirable for a number -of reasons such as high maintenance cost. - -To address this problem, the platform defines the concept of **libraries**. -Libraries allow users to put code required by several different algorithms -into a common location. Once done, code from a library may be used by any -algorithm as long as the algorithm mentions its dependency to it in its -JSON declaration. In addition, a library may depend on another library. - - -Definition ----------- - -Similarly to algorithms, a library consists of two folds: - -* A ``JSON declaration`` indicating: - - - The language in which the library is written - - Library dependencies of this library - -.. code-block:: javascript - - { - "uses": { - "otherlib": "user/otherlibrary/1" - }, - "language": "python" - } - - -* ``Source code``. For the Python back-end, this may consist of any Python - function and classes, as long as dependencies are fulfilled. - -.. code-block:: python - - def simple_function(array): - return len([v for v in array if v != 0]) - - class MyLibraryClass: - - def __init__(self, multiplier=37): - self.multiplier = multiplier - - def function_from_my_library(value): - return value * self.multiplier - -The web client of the BEAT platform provides a graphical editor for algorithm, -which simplifies its `JSON`_ declaration definition. It also includes a simple -Python code editor. - - -Usage ------ - -To use a defined library in an algorithm or in another library, it is -sufficient to: - -* Add the library dependency into the `JSON`_ declaration of the algorithm - (or of the library). The name given as a key is the one used to import - the library, while the corresponding value is the fullname, that is - `author/name/version` of the library. - -.. code-block:: javascript - - { - ... - "uses": { - "mylib": "user/mylibrary/1" - }, - ... 
- } - -* Import the library and use its desired functionalities. - -.. code-block:: python - - import mylib - ... - array = [0, 1, 2, 3] - array_processed = mylib.simple_function(array) - - -.. include:: links.rst diff --git a/doc/links.rst b/doc/links.rst index 7cd86ba351807eaffd55d8a893d84bf02d71c998..1f95bde1d154c6944fbf00490bdede13ae4ce3a9 100644 --- a/doc/links.rst +++ b/doc/links.rst @@ -15,3 +15,5 @@ .. _python bindings: http://zeromq.org/bindings:python .. _markdown: http://daringfireball.net/projects/markdown/ .. _restructuredtext: http://docutils.sourceforge.net/rst.html +.. _Getting Started with BEAT: https://www.idiap.ch/software/beat/docs/beat/docs/master/beat/introduction.html +.. _Algorithms: https://www.idiap.ch/software/beat/docs/beat/docs/master/beat/algorithms.html diff --git a/doc/toolchains.rst b/doc/toolchains.rst deleted file mode 100644 index 08586892f9d7841cfc2816d2da006bb828b61689..0000000000000000000000000000000000000000 --- a/doc/toolchains.rst +++ /dev/null @@ -1,501 +0,0 @@ -.. vim: set fileencoding=utf-8 : - -.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. -.. Contact: beat.support@idiap.ch .. -.. .. -.. This file is part of the beat.core module of the BEAT platform. .. -.. .. -.. Commercial License Usage .. -.. Licensees holding valid commercial BEAT licenses may use this file in .. -.. accordance with the terms contained in a written agreement between you .. -.. and Idiap. For further information contact tto@idiap.ch .. -.. .. -.. Alternatively, this file may be used under the terms of the GNU Affero .. -.. Public License version 3 as published by the Free Software and appearing .. -.. in the file LICENSE.AGPL included in the packaging of this file. .. -.. The BEAT platform is distributed in the hope that it will be useful, but .. -.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. -.. or FITNESS FOR A PARTICULAR PURPOSE. .. -.. .. -.. 
You should have received a copy of the GNU Affero Public License along .. -.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. - - -.. _beat-core-toolchains: - -=========== -Toolchains -=========== - -A toolchain describes the workflow of a particular experiment. Its declaration -defines: - - * a collection of *processing blocks*, including for each of them: - - * a *name* (unique in the toolchain) - * a list of *inputs* - * a list of *outputs* - - * the *interconnections* between those blocks (from an output to an input) - * a list of *datasets*, that yield raw input data for the experiment - * a list of *result outputs*, that produce the results of the experiment - - -.. _beat-core-toolchains-declaration: - -Declaration of a toolchain --------------------------- - -.. note:: - - One needs only to declare a toolchain using those specifications when not - using the web interface (i.e. when doing local development or using the web - api). The web interface provides a user-friendly way to declare and modify - toolchains. - -A toolchain is declared in a JSON file, and must contain at least the following -fields: - -.. code-block:: javascript - - { - "datasets": [ - ], - "blocks": [ - ], - "connections": [ - ], - "analyzers": [ - ] - } - -Note that this toolchain is considered as a correct one by the platform (i.e. -it doesn't contain any error, and thus can be modified via the web interface), -but not as an executable one, as there is nothing to execute. - -For display purposes, the JSON file may contain an additional field called -`representation`, which provides insight on how to display the workflow in -a graphical way. - - -.. _beat-core-toolchains-datasets: - -Declaration of the datasets ---------------------------- - -Datasets are starting points of a toolchain. They provide raw input data for a -scientific experiment, and they have, hence, outputs but no input. 
Several -datasets are typically attached to a given a protocol of a database, each of -them having a particular role. For instance, a protocol for a classification -task may provide three distinct datasets, one for training a generic model, -one for enrolling class-specific models, and one for generating probe samples -that are compared against the enrolled models. - -To define the dataset, its name as well as its corresponding outputs have to -be defined in the JSON declaration of the toolchain. Considering the example -mentioned above, this would look like: - -.. code-block:: javascript - - { - ... - "datasets": [ - { - "outputs": [ - "image", - "eye_centers" - ], - "name": "train" - }, - { - "outputs": [ - "template_id", - "client_id", - "image", - "eye_centers" - ], - "name": "templates" - }, - { - "outputs": [ - "probe_id", - "client_id", - "template_ids", - "image", - "eye_centers" - ], - "name": "probes" - }, - ... - } - - -.. _beat-core-toolchains-blocks: - -Declaration of the processing blocks ------------------------------------- - -To define the processing blocks contained in a toolchain, just add some entries -into the ``blocks`` array, such as: - -.. code-block:: javascript - - { - ... - "blocks": [ - { - "inputs": [ - "image", - "eye_centers" - ], - "synchronized_channel": "train", - "name": "cropping_rgb_train", - "outputs": [ - "image_gray" - ] - } - ] - ... - } - -Here we defined a block named `cropping_rgb_train`, which expects two inputs, -`image` and `eye_centers`, and returns one output, `image_gray`. The -synchronization channel indicates against which dataset the outputs are -synchronized. However, the toolchain does specify neither the data format of -the inputs and outputs, nor the algorithm that is going to run inside the -block. This is performed in the experiment definition, which combines -dataformats, algorithms and a toolchain together. 
- -As with the datasets, to define more blocks just add more entries into the -``blocks`` array: - -.. code-block:: javascript - - { - ... - "blocks": [ - { - "inputs": [ - "image", - "eye_centers" - ], - "synchronized_channel": "train", - "name": "cropping_rgb_train", - "outputs": [ - "image_cropped" - ] - }, - { - "inputs": [ - "image_cropped" - ], - "synchronized_channel": "train", - "name": "feature_extraction_train", - "outputs": [ - "feature_vector" - ] - } - ] - ... - } - - -.. _beat-core-toolchains-connections: - -Declaration of the connections between the processing blocks ------------------------------------------------------------- - -To define a connection between two processing blocks (or one dataset and one -processing block), just add one entry into the ``connections`` array, with one -of the following forms: - -.. code-block:: javascript - - { - "from": "block1_name.output_name", - "to": "block2_name.input_name" - } - -or: - -.. code-block:: javascript - - { - "from": "dataset_name.output_name", - "to": "block_name.input_name" - } - -For example: - -.. code-block:: javascript - - { - ... - "connections": [{ - "from": "cropping_rgb_train.image_cropped", - "to": "features_extraction_train.image_cropped" - } - ], - "blocks": [ - { - "inputs": [ - "image", - "eye_centers" - ], - "synchronized_channel": "train", - "name": "cropping_rgb_train", - "outputs": [ - "image_cropped" - ] - }, - { - "inputs": [ - "image_cropped" - ], - "synchronized_channel": "train", - "name": "feature_extraction_train", - "outputs": [ - "feature_vector" - ] - } - ] - ... - } - - -Several important things to note: - - * The names of the connected output and input don't need to be the same. Use - whatever make sense in the context of each block - * An output can be connected to several inputs - * An input can only be connected to one output - * The names of the blocks and of the datasets must be unique in the - toolchain - - -.. 
_beat-core-toolchains-results: - -Declaration of the outputs to use as results --------------------------------------------- - -To declare that a particular processing block output produces the result of the -toolchain (or a part of it), just add one entry into the ``analyzers`` field, -with the following form: - -.. code-block:: javascript - - { - ... - "analyzers": [ - { - "inputs": [ - "scoring_dev_scores", - "scoring_test_scores" - ], - "synchronized_channel": "probes", - "name": "analysis" - } - ] - ... - } - -The field `inputs` lists the results, while the field `synchronized_channel` -indicates the dataset against which to automatically perform the loop as for -any regular toolchain block. - -The data written on those `inputs` will be used to display results and plots -on the web interface. - - -.. _beat-core-toolchains-example: - -Putting it all together: a complete example -------------------------------------------- - -.. _beat-core-toolchains-example-figure: -.. figure:: img/toolchain-example-2.* - - A complete toolchain that train and test an Eigenfaces system - -The following example describes the toolchain visible at -:numref:`beat-core-toolchains-example-figure`, a complete toolchain that: - - #. train an Eigenfaces face recognition system on one set of images (*train*) - #. enroll client-specific models on another set of images (*templates*) - #. test these models using samples from a third set of images (*probes*) - -.. note:: - - A toolchain is still not executable, since it contains no mention of the - algorithms that must be used in each processing block, as well as the - database to use. - - -.. 
code-block:: javascript - - { - "datasets": [ - { - "outputs": [ - "image" - ], - "name": "train" - }, - { - "outputs": [ - "template_id", - "client_id", - "image" - ], - "name": "templates" - }, - { - "outputs": [ - "probe_id", - "client_id", - "template_ids", - "image" - ], - "name": "probes" - } - ], - "blocks": [ - { - "inputs": [ - "image" - ], - "synchronized_channel": "train", - "name": "linear_machine_training", - "outputs": [ - "subspace" - ] - }, - { - "inputs": [ - "image", - "template_id", - "subspace" - ], - "synchronized_channel": "templates", - "name": "template_builder", - "outputs": [ - "projections" - ] - }, - { - "inputs": [ - "image", - "probe_id", - "subspace" - ], - "synchronized_channel": "probes", - "name": "probe_builder", - "outputs": [ - "projections" - ] - }, - { - "inputs": [ - "templates_client_id", - "templates_template_id", - "template_builder_projections", - "probes_client_id", - "probes_probe_id", - "probe_builder_projections", - "probes_template_ids" - ], - "synchronized_channel": "probes", - "name": "scoring", - "outputs": [ - "scores" - ] - } - ], - "analyzers": [ - { - "inputs": [ - "scores" - ], - "synchronized_channel": "probes", - "name": "analysis" - } - ], - "connections": [ - { - "to": "linear_machine_training.image", - "from": "train.image", - "channel": "train" - }, - { - "to": "template_builder.image", - "from": "templates.image", - "channel": "templates" - }, - { - "to": "template_builder.template_id", - "from": "templates.template_id", - "channel": "templates" - }, - { - "to": "template_builder.subspace", - "from": "linear_machine_training.subspace", - "channel": "train" - }, - { - "to": "probe_builder.image", - "from": "probes.image", - "channel": "probes" - }, - { - "to": "probe_builder.probe_id", - "from": "probes.probe_id", - "channel": "probes" - }, - { - "to": "probe_builder.subspace", - "from": "linear_machine_training.subspace", - "channel": "train" - }, - { - "to": "scoring.templates_client_id", - "from": 
"templates.client_id", - "channel": "templates" - }, - { - "to": "scoring.templates_template_id", - "from": "templates.template_id", - "channel": "templates" - }, - { - "to": "scoring.template_builder_projections", - "from": "template_builder.projections", - "channel": "templates" - }, - { - "to": "scoring.probes_client_id", - "from": "probes.client_id", - "channel": "probes" - }, - { - "to": "scoring.probes_probe_id", - "from": "probes.probe_id", - "channel": "probes" - }, - { - "to": "scoring.probe_builder_projections", - "from": "probe_builder.projections", - "channel": "probes" - }, - { - "to": "scoring.probes_template_ids", - "from": "probes.template_ids", - "channel": "probes" - }, - { - "to": "analysis.scores", - "from": "scoring.scores", - "channel": "probes" - } - ] - } -