Commit 24f787ef authored by Philip ABBET's avatar Philip ABBET
Browse files

Merge branch 'datasets' into 'master'

Refactoring and support of dataset providing in a container

See merge request !8
parents f0c66547 3a9bd3ae
......@@ -28,6 +28,18 @@
This package contains the source code for a python-based backend for the BEAT
platform.
It contains the minimum amount of code needed to run an algorithm or serve
data from a dataset. It is designed to be installed in a container.
The ``beat.core`` package extends the functionality of this one (for
instance, it adds thorough validation of each user contribution, whereas
``beat.backend.python`` assumes that an invalid contribution will never
reach the container).
For this reason (and to keep ``beat.backend.python`` as small as possible),
all the unit tests are located in ``beat.core``.
Installation
------------
......@@ -39,48 +51,11 @@ Really easy, with ``zc.buildout``::
These 2 commands should download and install all non-installed dependencies and
get you a fully operational test and development environment.
.. note::
If you are on the Idiap filesystem, you may use
``/idiap/project/beat/environments/staging/usr/bin/python`` to bootstrap this
package instead. It contains the same setup deployed at the final BEAT
machinery.
Documentation
-------------
To build the documentation, just do::
$ ./bin/sphinx-apidoc --separate -d 2 --output=doc/api beat/backend/python
$ ./bin/sphinx-apidoc --separate -d 2 --output=doc/api beat
$ ./bin/sphinx-build doc sphinx
Testing
-------
After installation, it is possible to run our suite of unit tests. To do so,
use ``nose``::
$ ./bin/nosetests -sv
If you want to skip slow tests (at least those pulling stuff from our servers)
or executing lengthy operations, just do::
$ ./bin/nosetests -sv -a '!slow'
To measure the test coverage, do the following::
$ ./bin/nosetests -sv --with-coverage --cover-package=beat.backend.python
To produce an HTML test coverage report in the directory ``./htmlcov``, do the
following::
$ ./bin/nosetests -sv --with-coverage --cover-package=beat.backend.python --cover-html --cover-html-dir=htmlcov
Our documentation is also interspersed with doctests. You can run them using
sphinx::
$ ./bin/sphinx -b doctest doc sphinx
......@@ -38,6 +38,34 @@ import simplejson
from . import dataformat
from . import library
from . import loader
from . import utils
class Storage(utils.CodeStorage):
    """Locates the files that make up an algorithm


    Parameters:

      prefix (str): Root directory of your installation.

      name (str): Identifier of the algorithm, written as
        ``<user>/<name>/<version>``.

      language (str, optional): Handed over unchanged to the
        ``utils.CodeStorage`` constructor.


    Raises:

      RuntimeError: if ``name`` does not hold exactly two ``/`` separators.

    """

    def __init__(self, prefix, name, language=None):

        # exactly three components are expected: user, name and version
        components = name.split('/')
        if len(components) != 3:
            raise RuntimeError("invalid algorithm name: `%s'" % name)

        self.username, self.name, self.version = components
        self.prefix = prefix
        self.fullname = name

        location = utils.hashed_or_simple(self.prefix, 'algorithms', name)
        super(Storage, self).__init__(location, language)
class Runner(object):
......@@ -160,6 +188,7 @@ class Runner(object):
return getattr(self.obj, key)
class Algorithm(object):
"""Algorithms represent runnable components within the platform.
......@@ -222,6 +251,9 @@ class Algorithm(object):
groups (list): A list containing dictionaries with inputs and outputs
belonging to the same synchronization group.
errors (list): A list containing errors found while loading this
algorithm.
data (dict): The original data for this algorithm, as loaded by our JSON
decoder.
......@@ -232,20 +264,34 @@ class Algorithm(object):
def __init__(self, prefix, name, dataformat_cache=None, library_cache=None):
self._name = None
self.storage = None
self.prefix = prefix
self.dataformats = {}
self.libraries = {}
self.groups = []
dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
library_cache = library_cache if library_cache is not None else {}
self.name = name
json_path = os.path.join(prefix, 'algorithms', name + '.json')
with open(json_path, 'rb') as f: self.data = simplejson.load(f)
self._load(name, dataformat_cache, library_cache)
self.code_path = os.path.join(prefix, 'algorithms', name + '.py')
def _load(self, data, dataformat_cache, library_cache):
"""Loads the algorithm"""
self._name = data
self.storage = Storage(self.prefix, data)
json_path = self.storage.json.path
if not self.storage.exists():
self.errors.append('Algorithm declaration file not found: %s' % json_path)
return
with open(json_path, 'rb') as f:
self.data = simplejson.load(f)
self.code_path = self.storage.code.path
self.groups = self.data['groups']
......@@ -375,6 +421,22 @@ class Algorithm(object):
library.Library(self.prefix, value, library_cache))
@property
def name(self):
"""Returns the name of this object
"""
return self._name or '__unnamed_algorithm__'
@name.setter
def name(self, value):
if self.data['language'] == 'unknown':
raise RuntimeError("algorithm has no programming language set")
self._name = value
self.storage = Storage(self.prefix, value, self.data['language'])
@property
def schema_version(self):
......@@ -382,6 +444,20 @@ class Algorithm(object):
return self.data.get('schema_version', 1)
@property
def language(self):
"""Returns the current language set for the executable code"""
return self.data['language']
@language.setter
def language(self, value):
"""Sets the current executable code programming language"""
if self.storage:
self.storage.language = value
self.data['language'] = value
def clean_parameter(self, parameter, value):
"""Checks if a given value against a declared parameter
......@@ -410,8 +486,8 @@ class Algorithm(object):
ValueError: If the parameter cannot be safe cast into the algorithm's
type. Alternatively, a ``ValueError`` may also be raised if a range or
choice was specified and the value does not obbey those settings
estipulated for the parameter
choice was specified and the value does not obey those settings
stipulated for the parameter
"""
......@@ -437,35 +513,72 @@ class Algorithm(object):
return retval
@property
def valid(self):
"""A boolean that indicates if this algorithm is valid or not"""
return not bool(self.errors)
@property
def uses(self):
return self.data.get('uses')
@uses.setter
def uses(self, value):
self.data['uses'] = value
return value
@property
def results(self):
return self.data.get('results')
@results.setter
def results(self, value):
self.data['results'] = value
return value
@property
def parameters(self):
return self.data.get('parameters')
@parameters.setter
def parameters(self, value):
self.data['parameters'] = value
return value
@property
def splittable(self):
return self.data.get('splittable', False)
@splittable.setter
def splittable(self, value):
self.data['splittable'] = value
return value
def uses_dict(self):
"""Returns the usage dictionary for all dependent modules"""
if self.data['language'] == 'unknown':
raise RuntimeError("algorithm has no programming language set")
if not self._name:
raise RuntimeError("algorithm has no name")
retval = {}
if self.uses is not None:
for name, value in self.uses.items():
retval[name] = dict(
path=self.libraries[value].code_path,
path=self.libraries[value].storage.code.path,
uses=self.libraries[value].uses_dict(),
)
......@@ -489,11 +602,24 @@ class Algorithm(object):
before using the ``process`` method.
"""
if not self._name:
exc = exc or RuntimeError
raise exc("algorithm has no name")
if self.data['language'] == 'unknown':
exc = exc or RuntimeError
raise exc("algorithm has no programming language set")
if not self.valid:
message = "cannot load code for invalid algorithm (%s)" % (self.name,)
exc = exc or RuntimeError
raise exc(message)
# loads the module only once through the lifetime of the algorithm object
try:
self.__module = getattr(self, 'module',
loader.load_module(self.name.replace(os.sep, '_'),
self.code_path, self.uses_dict()))
self.storage.code.path, self.uses_dict()))
except Exception as e:
if exc is not None:
type, value, traceback = sys.exc_info()
......@@ -504,6 +630,52 @@ class Algorithm(object):
return Runner(self.__module, klass, self, exc)
@property
def description(self):
"""The short description for this object"""
return self.data.get('description', None)
@description.setter
def description(self, value):
"""Sets the short description for this object"""
self.data['description'] = value
@property
def documentation(self):
"""The full-length description for this object"""
if not self._name:
raise RuntimeError("algorithm has no name")
if self.storage.doc.exists():
return self.storage.doc.load()
return None
@documentation.setter
def documentation(self, value):
"""Sets the full-length description for this object"""
if not self._name:
raise RuntimeError("algorithm has no name")
if hasattr(value, 'read'):
self.storage.doc.save(value.read())
else:
self.storage.doc.save(value)
def hash(self):
"""Returns the hexadecimal hash for the current algorithm"""
if not self._name:
raise RuntimeError("algorithm has no name")
return self.storage.hash()
def result_dataformat(self):
"""Generates, on-the-fly, the dataformat for the result readout"""
......
This diff is collapsed.
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
###############################################################################
# #
# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ #
# Contact: beat.support@idiap.ch #
# #
# This file is part of the beat.backend.python module of the BEAT platform. #
# #
# Commercial License Usage #
# Licensees holding valid commercial BEAT licenses may use this file in #
# accordance with the terms contained in a written agreement between you #
# and Idiap. For further information contact tto@idiap.ch #
# #
# Alternatively, this file may be used under the terms of the GNU Affero #
# Public License version 3 as published by the Free Software and appearing #
# in the file LICENSE.AGPL included in the packaging of this file. #
# The BEAT platform is distributed in the hope that it will be useful, but #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
# or FITNESS FOR A PARTICULAR PURPOSE. #
# #
# You should have received a copy of the GNU Affero Public License along #
# with the BEAT platform. If not, see http://www.gnu.org/licenses/. #
# #
###############################################################################
"""Validation of databases"""
import os
import sys
import six
import simplejson
from . import loader
from . import utils
from .dataformat import DataFormat
class Storage(utils.CodeStorage):
    """Locates the files that make up a database


    Parameters:

      prefix (str): Root directory of your installation.

      name (str): Identifier of the database, written as
        ``<name>/<version>``.


    Raises:

      RuntimeError: if ``name`` does not hold exactly one ``/`` separator.

    """

    def __init__(self, prefix, name):

        # exactly two components are expected: name and version
        components = name.split('/')
        if len(components) != 2:
            raise RuntimeError("invalid database name: `%s'" % name)

        self.name, self.version = components
        self.fullname = name

        location = os.path.join(prefix, 'databases', name)

        # views are coded in Python
        super(Storage, self).__init__(location, 'python')
class View(object):
    '''A special loader class for database views, with specialized methods


    Parameters:

      module (module): The preloaded module containing the database views as
        returned by :py:func:`beat.core.loader.load_module`.

      definition (dict): The declaration of this view. Its ``view`` entry
        names the class to fetch from ``module``; its optional ``outputs``
        and ``parameters`` entries are read by :py:meth:`prepare_outputs` and
        :py:meth:`setup` respectively.

      prefix (str, path): The prefix path for the current installation

      root_folder (str, path): The path pointing to the root folder of this
        database

      exc (class): The class to use as base exception when translating the
        exception from the user code. Read the documentation of
        :py:func:`run` for more details.

      *args: Constructor parameters for the database view. Normally, none.

      **kwargs: Constructor parameters for the database view. Normally, none.

    '''

    def __init__(self, module, definition, prefix, root_folder, exc=None,
                 *args, **kwargs):

        try:
            class_ = getattr(module, definition['view'])
        except Exception:
            if exc is not None:
                # translate the failure into the caller's exception type,
                # keeping the original traceback
                _, value, traceback = sys.exc_info()
                six.reraise(exc, exc(value), traceback)
            else:
                raise  # just re-raise the user exception

        self.obj = loader.run(class_, '__new__', exc, *args, **kwargs)
        self.ready = False
        self.prefix = prefix
        self.root_folder = root_folder
        self.definition = definition
        self.exc = exc or RuntimeError
        self.outputs = None

    def prepare_outputs(self):
        '''Prepares the outputs of the dataset

        One output is created for every entry in the ``outputs`` section of
        the view definition and stored in ``self.outputs``.
        '''

        from .outputs import Output, OutputList
        from .data import MemoryDataSink
        from .dataformat import DataFormat

        # create the stock outputs for this dataset, so data is dumped
        # on an in-memory sink
        self.outputs = OutputList()
        for out_name, out_format in self.definition.get('outputs', {}).items():
            data_sink = MemoryDataSink()
            data_sink.dataformat = DataFormat(self.prefix, out_format)
            data_sink.setup([])
            self.outputs.add(Output(out_name, data_sink, dataset_output=True))

    def setup(self, *args, **kwargs):
        '''Sets up the view

        ``root_folder``, ``parameters`` and ``outputs`` are filled in from
        this object when they are not given as keyword arguments. Outputs
        given explicitly replace ``self.outputs``.

        Returns:

          The (true) value returned by the ``setup`` method of the view.

        Raises:

          ``self.exc``: if the ``setup`` method of the view returns a false
            value.

        '''

        kwargs.setdefault('root_folder', self.root_folder)
        kwargs.setdefault('parameters', self.definition.get('parameters', {}))

        if 'outputs' not in kwargs:
            kwargs['outputs'] = self.outputs
        else:
            self.outputs = kwargs['outputs']  # record outputs nevertheless

        self.ready = loader.run(self.obj, 'setup', self.exc, *args, **kwargs)

        if not self.ready:
            raise self.exc("unknow setup failure")

        return self.ready

    def input_group(self, name='default', exclude_outputs=None):
        '''A memory-source input group matching the outputs from the view

        Parameters:

          name (str): The name given to the input group.

          exclude_outputs (list, optional): Names of the outputs for which no
            input must be created. Nothing is excluded by default.

        Raises:

          ``self.exc``: if the view was not successfully set up.

        '''

        if not self.ready:
            raise self.exc("database view not yet setup")

        # ``None`` replaces a shared mutable ``[]`` default
        if exclude_outputs is None:
            exclude_outputs = ()

        from .data import MemoryDataSource
        from .outputs import SynchronizationListener
        from .inputs import Input, InputGroup

        # Setup the inputs
        synchronization_listener = SynchronizationListener()
        input_group = InputGroup(
            name,
            synchronization_listener=synchronization_listener,
            restricted_access=False,
        )

        for output in self.outputs:
            if output.name in exclude_outputs:
                continue

            # each input reads from a source attached to the output's sink
            data_source = MemoryDataSource(self.done, next_callback=self.next)
            output.data_sink.data_sources.append(data_source)
            input_group.add(Input(output.name,
                                  output.data_sink.dataformat, data_source))

        return input_group

    def done(self, *args, **kwargs):
        '''Checks if the view is done

        Raises:

          ``self.exc``: if the view was not successfully set up.

        '''

        if not self.ready:
            raise self.exc("database view not yet setup")

        return loader.run(self.obj, 'done', self.exc, *args, **kwargs)

    def next(self, *args, **kwargs):
        '''Runs through the next data chunk

        Raises:

          ``self.exc``: if the view was not successfully set up.

        '''

        if not self.ready:
            raise self.exc("database view not yet setup")

        return loader.run(self.obj, 'next', self.exc, *args, **kwargs)

    def __getattr__(self, key):
        '''Returns an attribute of the view - only called at last resort'''

        # if ``obj`` itself is missing (the constructor failed or was
        # bypassed), looking it up here would recurse without end
        if key == 'obj':
            raise AttributeError(key)

        return getattr(self.obj, key)
class Database(object):
"""Databases define the start point of the dataflow in an experiment.
Parameters:
prefix (str): Establishes the prefix of your installation.
name (str): The fully qualified database name (e.g. ``db/1``)
dataformat_cache (dict, optional): A dictionary mapping dataformat names
to loaded dataformats. This parameter is optional and, if passed, may
greatly speed-up database loading times as dataformats that are already
loaded may be re-used. If you use this parameter, you must guarantee
that the cache is refreshed as appropriate in case the underlying
dataformats change.
Attributes:
name (str): The full, valid name of this database
data (dict): The original data for this database, as loaded by our JSON
decoder.
"""
def __init__(self, prefix, name, dataformat_cache=None):
self._name = None
self.prefix = prefix
self.dataformats = {} # preloaded dataformats
self.storage = None
self.errors = []
self.data = None
# if the user has not provided a cache, still use one for performance
dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
self._load(name, dataformat_cache)
def _load(self, data, dataformat_cache):
"""Loads the database"""
self._name = data
self.storage = Storage(self.prefix, self._name)
json_path = self.storage.json.path
if not self.storage.json.exists():
self.errors.append('Database declaration file not found: %s' % json_path)
return