Commit 24f787ef authored by Philip ABBET

Merge branch 'datasets' into 'master'

Refactoring and support of dataset providing in a container

See merge request !8
parents f0c66547 3a9bd3ae
......@@ -28,6 +28,18 @@
This package contains the source code for a python-based backend for the BEAT
platform.
It contains the minimum amount of code needed to run an algorithm or serve
data from a dataset. It is designed to be installed in a container.
The ``beat.core`` package extends the functionalities of this one (for
instance, it adds thorough validation of each user contribution, whereas
``beat.backend.python`` assumes that an invalid contribution will never
reach the container).
For this reason (and to keep ``beat.backend.python`` as small as possible),
all the unit tests are located in ``beat.core``.
Installation
------------
......@@ -39,48 +51,11 @@ Really easy, with ``zc.buildout``::
These 2 commands should download and install all non-installed dependencies and
get you a fully operational test and development environment.
.. note::
If you are on the Idiap filesystem, you may use
``/idiap/project/beat/environments/staging/usr/bin/python`` to bootstrap this
package instead. It contains the same setup deployed at the final BEAT
machinery.
Documentation
-------------
To build the documentation, just do::
$ ./bin/sphinx-apidoc --separate -d 2 --output=doc/api beat/backend/python
$ ./bin/sphinx-apidoc --separate -d 2 --output=doc/api beat
$ ./bin/sphinx-build doc sphinx
Testing
-------
After installation, it is possible to run our suite of unit tests. To do so,
use ``nose``::
$ ./bin/nosetests -sv
If you want to skip slow tests (at least those pulling stuff from our servers)
or executing lengthy operations, just do::
$ ./bin/nosetests -sv -a '!slow'
To measure the test coverage, do the following::
$ ./bin/nosetests -sv --with-coverage --cover-package=beat.backend.python
To produce an HTML test coverage report in the directory ``./htmlcov``, do the
following::
$ ./bin/nosetests -sv --with-coverage --cover-package=beat.backend.python --cover-html --cover-html-dir=htmlcov
Our documentation is also interspersed with doctests. You can run them using
Sphinx::
$ ./bin/sphinx -b doctest doc sphinx
......@@ -38,6 +38,34 @@ import simplejson
from . import dataformat
from . import library
from . import loader
from . import utils
class Storage(utils.CodeStorage):
"""Resolves paths for algorithms
Parameters:
prefix (str): Establishes the prefix of your installation.
name (str): The name of the algorithm object in the format
``<user>/<name>/<version>``.
"""
def __init__(self, prefix, name, language=None):
if name.count('/') != 2:
raise RuntimeError("invalid algorithm name: `%s'" % name)
self.username, self.name, self.version = name.split('/')
self.prefix = prefix
self.fullname = name
path = utils.hashed_or_simple(self.prefix, 'algorithms', name)
super(Storage, self).__init__(path, language)
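The constructor therefore only accepts canonical ``<user>/<name>/<version>``
triplets and splits them into their components before resolving the on-disk
path. A minimal sketch, with a hypothetical prefix and algorithm name::

    from beat.backend.python.algorithm import Storage

    storage = Storage('/path/to/prefix', 'jdoe/my_algorithm/1')
    assert (storage.username, storage.name, storage.version) == \
        ('jdoe', 'my_algorithm', '1')

    # anything that is not exactly a three-part name is rejected up-front
    try:
        Storage('/path/to/prefix', 'not-a-triplet')
    except RuntimeError as error:
        print(error)  # invalid algorithm name: `not-a-triplet'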
class Runner(object):
......@@ -160,6 +188,7 @@ class Runner(object):
return getattr(self.obj, key)
class Algorithm(object):
"""Algorithms represent runnable components within the platform.
......@@ -222,6 +251,9 @@ class Algorithm(object):
groups (dict): A list containing dictionaries with inputs and outputs
belonging to the same synchronization group.
errors (list): A list containing errors found while loading this
algorithm.
data (dict): The original data for this algorithm, as loaded by our JSON
decoder.
......@@ -232,20 +264,34 @@ class Algorithm(object):
def __init__(self, prefix, name, dataformat_cache=None, library_cache=None):
self._name = None
self.storage = None
self.prefix = prefix
self.dataformats = {}
self.libraries = {}
self.groups = []
dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
library_cache = library_cache if library_cache is not None else {}
self.name = name
json_path = os.path.join(prefix, 'algorithms', name + '.json')
with open(json_path, 'rb') as f: self.data = simplejson.load(f)
self._load(name, dataformat_cache, library_cache)
self.code_path = os.path.join(prefix, 'algorithms', name + '.py')
def _load(self, data, dataformat_cache, library_cache):
"""Loads the algorithm"""
self._name = data
self.storage = Storage(self.prefix, data)
json_path = self.storage.json.path
if not self.storage.exists():
self.errors.append('Algorithm declaration file not found: %s' % json_path)
return
with open(json_path, 'rb') as f:
self.data = simplejson.load(f)
self.code_path = self.storage.code.path
self.groups = self.data['groups']
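With the storage-based resolution above, a missing declaration no longer
raises at construction time: the problem is recorded in ``errors`` and the
object reports itself as invalid. A hedged sketch, using a hypothetical prefix
and algorithm name::

    from beat.backend.python.algorithm import Algorithm

    algorithm = Algorithm('/path/to/prefix', 'jdoe/does_not_exist/1')

    if not algorithm.valid:
        # e.g. ['Algorithm declaration file not found: ...']
        print(algorithm.errors)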
......@@ -375,6 +421,22 @@ class Algorithm(object):
library.Library(self.prefix, value, library_cache))
@property
def name(self):
"""Returns the name of this object
"""
return self._name or '__unnamed_algorithm__'
@name.setter
def name(self, value):
if self.data['language'] == 'unknown':
raise RuntimeError("algorithm has no programming language set")
self._name = value
self.storage = Storage(self.prefix, value, self.data['language'])
@property
def schema_version(self):
......@@ -382,6 +444,20 @@ class Algorithm(object):
return self.data.get('schema_version', 1)
@property
def language(self):
"""Returns the current language set for the executable code"""
return self.data['language']
@language.setter
def language(self, value):
"""Sets the current executable code programming language"""
if self.storage:
self.storage.language = value
self.data['language'] = value
def clean_parameter(self, parameter, value):
"""Checks if a given value against a declared parameter
......@@ -410,8 +486,8 @@ class Algorithm(object):
ValueError: If the parameter cannot be safe cast into the algorithm's
type. Alternatively, a ``ValueError`` may also be raised if a range or
choice was specified and the value does not obbey those settings
estipulated for the parameter
choice was specified and the value does not obey those settings
stipulated for the parameter
"""
......@@ -437,35 +513,72 @@ class Algorithm(object):
return retval
@property
def valid(self):
"""A boolean that indicates if this algorithm is valid or not"""
return not bool(self.errors)
@property
def uses(self):
return self.data.get('uses')
@uses.setter
def uses(self, value):
self.data['uses'] = value
return value
@property
def results(self):
return self.data.get('results')
@results.setter
def results(self, value):
self.data['results'] = value
return value
@property
def parameters(self):
return self.data.get('parameters')
@parameters.setter
def parameters(self, value):
self.data['parameters'] = value
return value
@property
def splittable(self):
return self.data.get('splittable', False)
@splittable.setter
def splittable(self, value):
self.data['splittable'] = value
return value
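These accessors are thin wrappers around the declaration dictionary: reading
goes through ``self.data.get()`` and writing updates ``self.data`` in place
(the ``return value`` statements in the setters are ignored by Python, since
property setters discard their return values). A small round-trip sketch,
assuming ``algorithm`` is a loaded ``Algorithm``::

    algorithm.splittable = True
    assert algorithm.data['splittable'] is True

    # hypothetical parameter declaration, for illustration only
    algorithm.parameters = {'threshold': {'type': 'float64', 'default': 0.5}}
    assert algorithm.parameters == algorithm.data['parameters']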
def uses_dict(self):
"""Returns the usage dictionary for all dependent modules"""
if self.data['language'] == 'unknown':
raise RuntimeError("algorithm has no programming language set")
if not self._name:
raise RuntimeError("algorithm has no name")
retval = {}
if self.uses is not None:
for name, value in self.uses.items():
retval[name] = dict(
path=self.libraries[value].code_path,
path=self.libraries[value].storage.code.path,
uses=self.libraries[value].uses_dict(),
)
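The returned mapping mirrors the library dependency tree: each used library
contributes the path to its code and, recursively, its own ``uses_dict()``.
For a hypothetical algorithm depending on a single library, the result would
look roughly like this (exact paths depend on the storage layout)::

    {
        'mylib': {
            'path': '/path/to/prefix/libraries/jdoe/mylib/1.py',
            'uses': {},
        },
    }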
......@@ -489,11 +602,24 @@ class Algorithm(object):
before using the ``process`` method.
"""
if not self._name:
exc = exc or RuntimeError
raise exc("algorithm has no name")
if self.data['language'] == 'unknown':
exc = exc or RuntimeError
raise exc("algorithm has no programming language set")
if not self.valid:
message = "cannot load code for invalid algorithm (%s)" % (self.name,)
exc = exc or RuntimeError
raise exc(message)
# loads the module only once through the lifetime of the algorithm object
try:
self.__module = getattr(self, 'module',
loader.load_module(self.name.replace(os.sep, '_'),
self.code_path, self.uses_dict()))
self.storage.code.path, self.uses_dict()))
except Exception as e:
if exc is not None:
type, value, traceback = sys.exc_info()
......@@ -504,6 +630,52 @@ class Algorithm(object):
return Runner(self.__module, klass, self, exc)
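In the full source the enclosing method is the algorithm's ``runner()``
factory (an assumption here, since its signature falls outside this hunk);
user code obtains a ``Runner`` from a valid, named algorithm and only then
executes it. A hedged usage sketch::

    from beat.backend.python.algorithm import Algorithm

    algorithm = Algorithm('/path/to/prefix', 'jdoe/my_algorithm/1')

    if algorithm.valid:
        runner = algorithm.runner()  # imports the user module only once
        # attribute access on the runner is forwarded to the user's class,
        # as shown by the __getattr__ snippet earlier in this diff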
@property
def description(self):
"""The short description for this object"""
return self.data.get('description', None)
@description.setter
def description(self, value):
"""Sets the short description for this object"""
self.data['description'] = value
@property
def documentation(self):
"""The full-length description for this object"""
if not self._name:
raise RuntimeError("algorithm has no name")
if self.storage.doc.exists():
return self.storage.doc.load()
return None
@documentation.setter
def documentation(self, value):
"""Sets the full-length description for this object"""
if not self._name:
raise RuntimeError("algorithm has no name")
if hasattr(value, 'read'):
self.storage.doc.save(value.read())
else:
self.storage.doc.save(value)
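The setter accepts either a plain string or a file-like object (anything with
a ``read()`` method); both end up in the documentation slot of the algorithm's
storage. A usage sketch, assuming a named algorithm that exists under the
prefix::

    algorithm.documentation = "A longer description of this algorithm."

    # file-like objects work too: their content is read and saved
    with open('doc.rst') as f:  # hypothetical documentation file
        algorithm.documentation = f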
def hash(self):
"""Returns the hexadecimal hash for the current algorithm"""
if not self._name:
raise RuntimeError("algorithm has no name")
return self.storage.hash()
def result_dataformat(self):
"""Generates, on-the-fly, the dataformat for the result readout"""
......
......@@ -36,11 +36,43 @@ import six
import numpy
import simplejson
from . import utils
from .baseformat import baseformat
class Storage(utils.Storage):
"""Resolves paths for dataformats
Parameters:
prefix (str): Establishes the prefix of your installation.
name (str): The name of the dataformat object in the format
``<user>/<name>/<version>``.
"""
def __init__(self, prefix, name):
if name.count('/') != 2:
raise RuntimeError("invalid dataformat name: `%s'" % name)
self.username, self.name, self.version = name.split('/')
self.fullname = name
path = utils.hashed_or_simple(prefix, 'dataformats', name)
super(Storage, self).__init__(path)
def hash(self):
"""The 64-character hash of the database declaration JSON"""
return super(Storage, self).hash('#description')
class DataFormat(object):
"""Data formats define the chunks of data that circulate at data formats.
"""Data formats define the chunks of data that circulate between blocks.
Parameters:
......@@ -67,6 +99,17 @@ class DataFormat(object):
name (str): The full, valid name of this dataformat
description (str): The short description string, loaded from the JSON
file if one was set.
documentation (str): The full-length docstring for this object.
storage (object): A simple object that provides information about file
paths for this dataformat
errors (list of str): A list containing errors found while loading this
dataformat.
data (dict): The original data for this dataformat, as loaded by our JSON
decoder.
......@@ -83,23 +126,44 @@ class DataFormat(object):
def __init__(self, prefix, data, parent=None, dataformat_cache=None):
self._name = None
self.storage = None
self.resolved = None
self.prefix = prefix
self.errors = []
self.data = None
self.resolved = None
self.referenced = {}
self.parent = parent
# if the user has not provided a cache, still use one for performance
dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
try:
self._load(data, dataformat_cache)
finally:
if self._name is not None: #registers it into the cache, even if failed
dataformat_cache[self._name] = self
def _load(self, data, dataformat_cache):
"""Loads the dataformat"""
if isinstance(data, dict):
self.name = 'analysis:result'
self._name = 'analysis:result'
self.data = data
else:
self.name = data
json_path = os.path.join(prefix, 'dataformats', data + '.json')
with open(json_path, 'rb') as f: self.data = simplejson.load(f)
self._name = data
self.storage = Storage(self.prefix, data)
json_path = self.storage.json.path
if not self.storage.exists():
self.errors.append('Dataformat declaration file not found: %s' % json_path)
return
dataformat_cache[self.name] = self #registers itself into the cache
with open(json_path, 'rb') as f:
self.data = simplejson.load(f)
dataformat_cache[self._name] = self #registers itself into the cache
self.resolved = copy.deepcopy(self.data)
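As with algorithms, a dataformat that cannot be found is not a hard error at
construction time: it is recorded in ``errors`` and the object reports itself
as invalid. Passing an explicit cache dictionary lets several loads share
already-parsed formats. A hedged sketch with hypothetical names::

    from beat.backend.python.dataformat import DataFormat

    cache = {}  # shared across loads to avoid re-parsing referenced formats
    fmt = DataFormat('/path/to/prefix', 'jdoe/missing_format/1',
                     dataformat_cache=cache)

    if not fmt.valid:
        print(fmt.errors)  # ['Dataformat declaration file not found: ...']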
......@@ -152,7 +216,7 @@ class DataFormat(object):
if '#extends' in self.resolved:
ext = self.data['#extends']
self.referenced[ext] = maybe_load_format(self.name, ext, dataformat_cache)
self.referenced[ext] = maybe_load_format(self._name, ext, dataformat_cache)
basetype = self.resolved['#extends']
tmp = self.resolved
self.resolved = basetype.resolved
......@@ -160,6 +224,23 @@ class DataFormat(object):
del self.resolved['#extends'] #avoids infinite recursion
@property
def name(self):
"""Returns the name of this object, either from the filename or composed
from the hierarchy it belongs.
"""
if self.parent and self._name is None:
return self.parent[0].name + '.' + self.parent[1] + '_type'
else:
return self._name or '__unnamed_dataformat__'
@name.setter
def name(self, value):
self._name = value
self.storage = Storage(self.prefix, value)
@property
def schema_version(self):
"""Returns the schema version"""
......@@ -245,6 +326,57 @@ class DataFormat(object):
)
@property
def valid(self):
return not bool(self.errors)
@property
def description(self):
"""The short description for this object"""
return self.data.get('#description', None)
@description.setter
def description(self, value):
"""Sets the short description for this object"""
self.data['#description'] = value
@property
def documentation(self):
"""The full-length description for this object"""
if not self._name:
raise RuntimeError("dataformat has no name")
if self.storage.doc.exists():
return self.storage.doc.load()
return None
@documentation.setter
def documentation(self, value):
"""Sets the full-length description for this object"""
if not self._name:
raise RuntimeError("dataformat has no name")
if hasattr(value, 'read'):
self.storage.doc.save(value.read())
else:
self.storage.doc.save(value)
def hash(self):
"""Returns the hexadecimal hash for its declaration"""
if not self._name:
raise RuntimeError("dataformat has no name")
return self.storage.hash()
def validate(self, data):
"""Validates a piece of data provided by the user
......@@ -292,7 +424,9 @@ class DataFormat(object):
"""
if other.extends:
if self.name == other.extends: return True
else: return self.isparent(other.referenced[other.extends])
if self.name == other.extends:
return True
else:
return self.isparent(other.referenced[other.extends])
return False
......@@ -102,11 +102,11 @@ class Executor(object):
for name, channel in self.data['inputs'].items():
group = self.input_list.group(channel)
if group is None:
group = inputs.InputGroup(channel, (channel == main_channel),
group = inputs.RemoteInputGroup(channel, (channel == main_channel),
socket=self.socket)
self.input_list.add(group)
thisformat = self.algorithm.dataformats[self.algorithm.input_map[name]]
group.add(inputs.Input(name, thisformat, self.socket))
group.add(inputs.RemoteInput(name, thisformat, self.socket))
logger.debug("Loaded input list with %d group(s) and %d input(s)",
self.input_list.nbGroups(), len(self.input_list))
......@@ -115,7 +115,7 @@ class Executor(object):
self.output_list = outputs.OutputList()
for name, channel in self.data['outputs'].items():
thisformat = self.algorithm.dataformats[self.algorithm.output_map[name]]
self.output_list.add(outputs.Output(name, thisformat, self.socket))
self.output_list.add(outputs.RemoteOutput(name, thisformat, self.socket))
logger.debug("Loaded output list with %d output(s)",
len(self.output_list))
......@@ -126,7 +126,7 @@ class Executor(object):
# Retrieve dataformats in the JSON of the algorithm
analysis_format = self.algorithm.result_dataformat()
analysis_format.name = 'analysis:' + self.algorithm.name
self.output_list.add(outputs.Output(name, analysis_format, self.socket))
self.output_list.add(outputs.RemoteOutput(name, analysis_format, self.socket))
logger.debug("Loaded output list for analyzer (1 single output)")
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
###############################################################################
# #
# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ #
# Contact: beat.support@idiap.ch #
# #
# This file is part of the beat.backend.python module of the BEAT platform. #
# #
# Commercial License Usage #
# Licensees holding valid commercial BEAT licenses may use this file in #
# accordance with the terms contained in a written agreement between you #
# and Idiap. For further information contact tto@idiap.ch #
# #
# Alternatively, this file may be used under the terms of the GNU Affero #
# Public License version 3 as published by the Free Software and appearing #
# in the file LICENSE.AGPL included in the packaging of this file. #
# The BEAT platform is distributed in the hope that it will be useful, but #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
# or FITNESS FOR A PARTICULAR PURPOSE. #
# #
# You should have received a copy of the GNU Affero Public License along #
# with the BEAT platform. If not, see http://www.gnu.org/licenses/. #
# #
###############################################################################
"""Various functions for hashing platform contributions and others"""
import hashlib
import simplejson
import collections
import copy
import six
import os
def _sha256(s):
"""A python2/3 replacement for :py:func:`haslib.sha256`"""
try:
if isinstance(s, str): s = six.u(s)
return hashlib.sha256(s.encode('utf8')).hexdigest()
except:
return hashlib.sha256(s).hexdigest()
def _stringify(dictionary):
names = sorted(dictionary.keys())
converted_dictionary = '{'
for name in names:
converted_dictionary += '"%s":%s,' % (name, str(dictionary[name]))
if len(converted_dictionary) > 1:
converted_dictionary = converted_dictionary[:-1]
converted_dictionary += '}'
return converted_dictionary
def toUserPath(username):
hash = _sha256(username)
return os.path.join(hash[0:2], hash[2:4], username)
def hash(dictionary_or_string):
if isinstance(dictionary_or_string, dict):
return _sha256(_stringify(dictionary_or_string))
else:
return _sha256(dictionary_or_string)
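Dictionaries are first flattened into a canonical string by ``_stringify``
(keys sorted, values rendered with ``str()``), so two logically equal
dictionaries hash identically, and ``toUserPath`` shards user directories by
the first hexadecimal digits of the username's hash. A small self-contained
check, assuming this file is importable as ``beat.backend.python.hash`` (the
file name is not visible in this diff)::

    import os
    from beat.backend.python.hash import hash, toUserPath

    d = {'b': 1, 'a': [1, 2]}
    # _stringify(d) == '{"a":[1, 2],"b":1}'  (keys sorted, values via str())
    assert hash(d) == hash({'a': [1, 2], 'b': 1})
    assert hash(d) != hash({'a': [1, 2], 'b': 2})

    p = toUserPath('jdoe')
    # something like 'xx/yy/jdoe', with xx/yy taken from sha256('jdoe')
    assert p.endswith('jdoe') and len(p.split(os.sep)[0]) == 2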
def hashJSON(contents, description):
"""Hashes the pre-loaded JSON object using :py:func:`hashlib.sha256`
Excludes description changes
"""
if description in contents:
contents = copy.deepcopy(contents) #temporary copy
del contents[description]
contents = simplejson.dumps(contents, sort_keys=True)
return hashlib.sha256(contents).hexdigest()
def hashJSONFile(path, description):
"""Hashes the JSON file contents using :py:func:`hashlib.sha256`
Excludes description changes
"""
try:
with open(path, 'rb') as f:
return hashJSON(simplejson.load(f,
object_pairs_hook=collections.OrderedDict), description) #preserve order
except simplejson.JSONDecodeError:
# falls back to normal file content hashing
return hashFileContents(path)
def hashFileContents(path):
"""Hashes the file contents using :py:func:`hashlib.sha256`."""
with open(path, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
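The practical effect of the description exclusion is that editing only the
description of a contribution does not change its hash. The following
self-contained sketch mirrors the logic of ``hashJSON``; note the explicit
``encode``, which ``hashJSON`` itself omits and therefore relies on Python 2
byte strings::

    import copy
    import hashlib
    import simplejson

    def sketch_hash(contents, description='#description'):
        contents = copy.deepcopy(contents)
        contents.pop(description, None)  # drop the description, if any
        serialized = simplejson.dumps(contents, sort_keys=True)
        return hashlib.sha256(serialized.encode('utf8')).hexdigest()

    a = {'#description': 'first version', 'value': 'int32'}
    b = {'#description': 'reworded documentation', 'value': 'int32'}
    assert sketch_hash(a) == sketch_hash(b)  # descriptions never matter
    assert sketch_hash(a) != sketch_hash({'value': 'float64'})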
......@@ -43,6 +43,224 @@ class Input: