Commit df6542a3 authored by Philip ABBET's avatar Philip ABBET

Merge branch 'datasets' into 'master'

Add support to serve databases from a docker container

See merge request !14
parents 1d47bffb 2612a203
Pipeline #8146 passed with stage
in 4 minutes and 34 seconds
......@@ -2,7 +2,7 @@ stages:
- build
variables:
PREFIX: /opt/beat.env.web-${CI_BUILD_REF_NAME}/usr
PREFIX: /opt/beat.env.web/usr
build:
stage: build
......
......@@ -63,6 +63,7 @@ In particular, this package controls memory and CPU utilisation of the
containers it launches. You must make sure to enable those functionalities on
your installation.
Docker Setup
============
......@@ -75,14 +76,16 @@ execute algorithms or experiments.
We use specific docker images to run user algorithms. Download the following
base images before you try to run tests or experiments on your computer::
$ docker pull beats/py27:system
$ docker pull debian:8.4
$ docker pull docker.idiap.ch/beat/beat.env.system.python:system
$ docker pull docker.idiap.ch/beat/beat.env.db.examples:1.0.0
Optionally, also download the following images to be able to re-run experiments
downloaded from the BEAT platform (not required for unit testing)::
$ docker pull beats/py27:0.0.4
$ docker pull beats/py27:0.1.0
$ docker pull docker.idiap.ch/beat/beat.env.python:0.0.4
$ docker pull docker.idiap.ch/beat/beat.env.python:0.1.0
$ docker pull docker.idiap.ch/beat/beat.env.cxx:1.0.1
$ docker pull docker.idiap.ch/beat/beat.env.db:1.0.0
Documentation
......@@ -139,7 +142,6 @@ sphinx::
Development
-----------
Indentation
===========
......@@ -148,8 +150,8 @@ example, to enforce compliance on a single file and edit it in place, do::
$ ./bin/autopep8 --indent-size=2 --in-place beat/core/utils.py
We normally use 2-space identattion. If ever, you can easily change the
identation to 4 spaces like this::
We normally use 2-space indentation. If ever, you can easily change the
indentation to 4 spaces like this::
$ ./bin/autopep8 --indent-size=4 --in-place beat/core/utils.py
......
This diff is collapsed.
This diff is collapsed.
......@@ -43,162 +43,13 @@ from . import hash
from . import utils
from . import prototypes
class Storage(utils.CodeStorage):
  """Resolves paths for databases

  Parameters:

    prefix (str): Establishes the prefix of your installation.

    name (str): The name of the database object in the format
      ``<name>/<version>``.

  """

  def __init__(self, prefix, name):

    # a valid database name has exactly two components: name and version
    parts = name.split('/')
    if len(parts) != 2:
      raise RuntimeError("invalid database name: `%s'" % name)

    self.name, self.version = parts
    self.fullname = name

    path = os.path.join(prefix, 'databases', name)
    super(Storage, self).__init__(path, 'python') #views are coded in Python
class View(object):
'''A special loader class for database views, with specialized methods
Parameters:
db_name (str): The full name of the database object for this view
module (module): The preloaded module containing the database views as
returned by :py:func:`beat.core.loader.load_module`.
prefix (str, path): The prefix path for the current installation
root_folder (str, path): The path pointing to the root folder of this
database
exc (class): The class to use as base exception when translating the
exception from the user code. Read the documentation of :py:func:`run`
for more details.
*args: Constructor parameters for the database view. Normally, none.
**kwargs: Constructor parameters for the database view. Normally, none.
'''
def __init__(self, module, definition, prefix, root_folder, exc=None,
*args, **kwargs):
try:
class_ = getattr(module, definition['view'])
except Exception as e:
if exc is not None:
type, value, traceback = sys.exc_info()
six.reraise(exc, exc(value), traceback)
else:
raise #just re-raise the user exception
self.obj = loader.run(class_, '__new__', exc, *args, **kwargs)
self.ready = False
self.prefix = prefix
self.root_folder = root_folder
self.definition = definition
self.exc = exc or RuntimeError
self.outputs = None
def prepare_outputs(self):
'''Prepares the outputs of the dataset'''
from .outputs import Output, OutputList
from .data import MemoryDataSink
from .dataformat import DataFormat
# create the stock outputs for this dataset, so data is dumped
# on a in-memory sink
self.outputs = OutputList()
for out_name, out_format in self.definition.get('outputs', {}).items():
data_sink = MemoryDataSink()
data_sink.dataformat = DataFormat(self.prefix, out_format)
data_sink.setup([])
self.outputs.add(Output(out_name, data_sink, dataset_output=True))
def setup(self, *args, **kwargs):
'''Sets up the view'''
kwargs.setdefault('root_folder', self.root_folder)
kwargs.setdefault('parameters', self.definition.get('parameters', {}))
if 'outputs' not in kwargs:
kwargs['outputs'] = self.outputs
else:
self.outputs = kwargs['outputs'] #record outputs nevertheless
self.ready = loader.run(self.obj, 'setup', self.exc, *args, **kwargs)
if not self.ready:
raise self.exc("unknow setup failure")
return self.ready
def input_group(self, name='default', exclude_outputs=[]):
'''A memory-source input group matching the outputs from the view'''
if not self.ready:
raise self.exc("database view not yet setup")
from .data import MemoryDataSource
from .outputs import SynchronizationListener
from .inputs import Input, InputGroup
# Setup the inputs
synchronization_listener = SynchronizationListener()
input_group = InputGroup(name,
synchronization_listener=synchronization_listener,
restricted_access=False)
for output in self.outputs:
if output.name in exclude_outputs: continue
data_source = MemoryDataSource(self.done, next_callback=self.next)
output.data_sink.data_sources.append(data_source)
input_group.add(Input(output.name,
output.data_sink.dataformat, data_source))
return input_group
def done(self, *args, **kwargs):
'''Checks if the view is done'''
if not self.ready:
raise self.exc("database view not yet setup")
return loader.run(self.obj, 'done', self.exc, *args, **kwargs)
from beat.backend.python.database import Storage
from beat.backend.python.database import View
from beat.backend.python.database import Database as BackendDatabase
def next(self, *args, **kwargs):
'''Runs through the next data chunk'''
if not self.ready:
raise self.exc("database view not yet setup")
return loader.run(self.obj, 'next', self.exc, *args, **kwargs)
  def __getattr__(self, key):
    '''Returns an attribute of the view - only called at last resort

    Delegates any attribute not found on this wrapper to the wrapped user
    view object (``self.obj``).
    '''
    return getattr(self.obj, key)
class Database(object):
class Database(BackendDatabase):
"""Databases define the start point of the dataflow in an experiment.
......@@ -240,20 +91,8 @@ class Database(object):
"""
def __init__(self, prefix, data, dataformat_cache=None):
super(Database, self).__init__(prefix, data, dataformat_cache)
self._name = None
self.storage = None
self.prefix = prefix
self.dataformats = {} # preloaded dataformats
self.errors = []
self.data = None
self.code = None
# if the user has not provided a cache, still use one for performance
dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
self._load(data, dataformat_cache)
def _load(self, data, dataformat_cache):
"""Loads the database"""
......@@ -353,111 +192,20 @@ class Database(object):
"unsupported by this version" % (_set['view'],)
)
@property
def name(self):
"""Returns the name of this object
"""
return self._name or '__unnamed_database__'
  @name.setter
  def name(self, value):
    # renaming the database also re-targets its storage to the new path
    self._name = value
    self.storage = Storage(self.prefix, value)
  @property
  def schema_version(self):
    """Returns the schema version"""
    # declarations without an explicit 'schema_version' default to 1
    return self.data.get('schema_version', 1)
@property
def protocols(self):
"""The declaration of all the protocols of the database"""
data = self.data['protocols']
return dict(zip([k['name'] for k in data], data))
  def protocol(self, name):
    """The declaration of a specific protocol in the database"""
    # raises KeyError if no protocol with that name exists
    return self.protocols[name]
@property
def protocol_names(self):
"""Names of protocols declared for this database"""
data = self.data['protocols']
return [k['name'] for k in data]
def sets(self, protocol):
"""The declaration of a specific set in the database protocol"""
data = self.protocol(protocol)['sets']
return dict(zip([k['name'] for k in data], data))
  def set(self, protocol, name):
    """The declaration of a specific set within a database protocol"""
    # NOTE: method name shadows the builtin ``set``, but it is part of the
    # public interface and cannot be renamed here
    return self.sets(protocol)[name]
def set_names(self, protocol):
"""The names of sets in a given protocol for this database"""
data = self.protocol(protocol)['sets']
return [k['name'] for k in data]
  @property
  def valid(self):
    """``True`` when loading produced no error messages"""
    return not bool(self.errors)
def view(self, protocol, name, exc=None):
"""Returns the database view, given the protocol and the set name
Parameters:
protocol (str): The name of the protocol where to retrieve the view from
name (str): The name of the set in the protocol where to retrieve the
view from
exc (class): If passed, must be a valid exception class that will be
used to report errors in the read-out of this database's view.
Returns:
The database view, which will be constructed, but not setup. You
**must** set it up before using methods ``done`` or ``next``.
"""
if not self._name:
exc = exc or RuntimeError
raise exc("database has no name")
if not self.valid:
message = "cannot load view for set `%s' of protocol `%s' " \
"from invalid database (%s)" % (protocol, name, self.name)
if exc: raise exc(message)
raise RuntimeError(message)
# loads the module only once through the lifetime of the database object
try:
if not hasattr(self, '_module'):
self._module = loader.load_module(self.name.replace(os.sep, '_'),
self.storage.code.path, {})
except Exception as e:
if exc is not None:
type, value, traceback = sys.exc_info()
six.reraise(exc, exc(value), traceback)
else:
raise #just re-raise the user exception
return View(self._module, self.set(protocol, name), self.prefix,
self.data['root_folder'], exc)
def hash_output(self, protocol, set, output):
"""Creates a unique hash the represents the output from the dataset
......@@ -497,6 +245,7 @@ class Database(object):
"""The short description for this object"""
return self.data.get('description', None)
@description.setter
def description(self, value):
"""Sets the short description for this object"""
......
......@@ -29,11 +29,9 @@
"""Validation and parsing for dataformats"""
import os
import re
import copy
import six
import numpy
import simplejson
from . import schema
......@@ -41,37 +39,13 @@ from . import prototypes
from . import utils
from .baseformat import baseformat
class Storage(utils.Storage):
"""Resolves paths for dataformats
from beat.backend.python.dataformat import Storage
from beat.backend.python.dataformat import DataFormat as BackendDataFormat
Parameters:
prefix (str): Establishes the prefix of your installation.
name (str): The name of the dataformat object in the format
``<user>/<name>/<version>``.
"""
def __init__(self, prefix, name):
if name.count('/') != 2:
raise RuntimeError("invalid dataformat name: `%s'" % name)
self.username, self.name, self.version = name.split('/')
self.fullname = name
path = utils.hashed_or_simple(prefix, 'dataformats', name)
super(Storage, self).__init__(path)
  def hash(self):
    """The 64-character hash of the dataformat declaration JSON"""
    # '#description' is passed to the base hasher — presumably so cosmetic
    # description edits don't change the hash; confirm in utils.Storage.hash
    return super(Storage, self).hash('#description')
class DataFormat(object):
"""Data formats define the chunks of data that circulate at data formats.
class DataFormat(BackendDataFormat):
"""Data formats define the chunks of data that circulate between blocks.
Parameters:
......@@ -127,25 +101,8 @@ class DataFormat(object):
"""
def __init__(self, prefix, data, parent=None, dataformat_cache=None):
super(DataFormat, self).__init__(prefix, data, parent, dataformat_cache)
self._name = None
self.storage = None
self.resolved = None
self.prefix = prefix
self.errors = []
self.data = None
self.resolved = None
self.referenced = {}
self.parent = parent
# if the user has not provided a cache, still use one for performance
dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
try:
self._load(data, dataformat_cache)
finally:
if self._name is not None: #registers it into the cache, even if failed
dataformat_cache[self._name] = self
def _load(self, data, dataformat_cache):
"""Loads the dataformat"""
......@@ -264,217 +221,13 @@ class DataFormat(object):
# all references are resolved at this point and the final model is built
# you can lookup the original data in ``self.data`` and the final model
# in ``self.resolved``.
if self.errors: self.errors = utils.uniq(self.errors)
@property
def name(self):
"""Returns the name of this object, either from the filename or composed
from the hierarchy it belongs.
"""
if self.parent and self._name is None:
return self.parent[0].name + '.' + self.parent[1] + '_type'
else:
return self._name or '__unnamed_dataformat__'
  @property
  def schema_version(self):
    """Returns the schema version"""
    # dataformat declarations keep metadata under '#'-prefixed keys
    return self.data.get('#schema_version', 1)
  @name.setter
  def name(self, value):
    # renaming the dataformat also re-targets its storage to the new path
    self._name = value
    self.storage = Storage(self.prefix, value)
  @property
  def extends(self):
    """If this dataformat extends another one, this is it, otherwise ``None``
    """
    # '#extends' appears to hold the full name of the parent dataformat
    # (compared against ``self.name`` in ``isparent``)
    return self.data.get('#extends')
  @property
  def type(self):
    """Returns a new type that can create instances of this dataformat.

    The new returned type provides a basis to construct new objects which
    represent the dataformat. It provides a simple JSON serializer and a
    for-screen representation.

    Example:

      To create an object respecting the data format from a JSON descriptor,
      use the following technique:

      .. code-block:: python

        ftype = dataformat(...).type
        json = simplejson.loads(...)
        newobj = ftype(**json) # instantiates the new object, checks format

      To dump the object into JSON, use the following technique:

      .. code-block:: python

        simplejson.dumps(newobj.as_dict(), indent=4)

      A string representation of the object uses the technique above to
      pretty-print the object contents to the screen.
    """

    if self.resolved is None:
      raise RuntimeError("Cannot prototype while not properly initialized")

    # class names may not contain '-' or '/'; normalize to underscores
    classname = re.sub(r'[-/]', '_', self.name)
    if not isinstance(classname, str): classname = str(classname)

    def init(self, **kwargs): baseformat.__init__(self, **kwargs)

    attributes = dict(
        __init__=init,
        _name=self.name,
        _format=self.resolved,
        )

    # create the converters for the class we're about to return
    for k, v in self.resolved.items():

      if isinstance(v, list): #it is an array
        # deep-copy so mutating the class attribute never touches
        # ``self.resolved``; the last element encodes the element type
        attributes[k] = copy.deepcopy(v)
        if isinstance(v[-1], DataFormat):
          attributes[k][-1] = v[-1].type
        else:
          if v[-1] in ('string', 'str'):
            attributes[k][-1] = str
          else:
            attributes[k][-1] = numpy.dtype(v[-1])

      elif isinstance(v, DataFormat): #it is another dataformat
        attributes[k] = v.type

      else: #it is a simple type
        if v in ('string', 'str'):
          attributes[k] = str
        else:
          attributes[k] = numpy.dtype(v)

    # dynamically build the new class, derived from ``baseformat``
    return type(
        classname,
        (baseformat,),
        attributes,
        )
  @property
  def valid(self):
    """``True`` when loading produced no error messages"""
    return not bool(self.errors)
def validate(self, data):
"""Validates a piece of data provided by the user
In order to validate, the data object must be complete and safe-castable to
this dataformat. For any other validation operation that would require
special settings, use instead the :py:meth:`type` method to generate a
valid type and use either ``from_dict``, ``unpack`` or ``unpack_from``
depending on your use-case.
Parameters:
data (dict, str, fd): This parameter represents the data to be validated.
It may be a dictionary with the JSON representation of a data blob or,
else, a binary blob (represented by either a string or a file
descriptor object) from which the data will be read. If problems occur,
an exception is raised.
Returns:
``None``: Raises if an error occurs.
"""
obj = self.type()
if isinstance(data, dict):
obj.from_dict(data, casting='safe', add_defaults=False)
elif isinstance(data, six.string_types):
obj.unpack(data)
else:
obj.unpack_from(data)
def isparent(self, other):
"""Tells if the other object extends self (directly or indirectly).
Parameters:
other (DataFormat): another object to check
Returns:
bool: ``True``, if ``other`` is a parent of ``self``. ``False``
otherwise.
"""
if other.extends:
if self.name == other.extends: return True
else: return self.isparent(other.referenced[other.extends])
return False
  @property
  def description(self):
    """The short description for this object"""
    # stored under the '#description' metadata key; ``None`` when absent
    return self.data.get('#description', None)
  @description.setter
  def description(self, value):
    """Sets the short description for this object"""
    self.data['#description'] = value
@property
def documentation(self):
"""The full-length description for this object"""
if not self._name:
raise RuntimeError("dataformat has no name")
if self.storage.doc.exists():
return self.storage.doc.load()
return None
@documentation.setter
def documentation(self, value):
"""Sets the full-length description for this object"""
if not self._name:
raise RuntimeError("dataformat has no name")
if hasattr(value, 'read'):
self.storage.doc.save(value.read())
else:
self.storage.doc.save(value)
  def hash(self):
    """Returns the hexadecimal hash for its declaration"""

    # anonymous (unsaved) dataformats have no storage and cannot be hashed
    if not self._name:
      raise RuntimeError("dataformat has no name")

    return self.storage.hash()
if self.errors:
self.errors = utils.uniq(self.errors)
def json_dumps(self, indent=4):
"""Dumps the JSON declaration of this object in a string
Parameters:
indent (int): The number of indentation spaces at every indentation level
......
This diff is collapsed.
......@@ -62,17 +62,18 @@ class Host(object):
self.kwargs = kwargs
self.environments = {}
self.db_environments = {}
def setup(self, raise_on_errors=True):
self.client = docker.Client(**self.kwargs)
self.environments = self._discover_environments(raise_on_errors)
(self.environments, self.db_environments) = self._discover_environments(raise_on_errors)
def __contains__(self, key):
return key in self.environments
return (key in self.environments) or (key in self.db_environments)
def __str__(self):
......@@ -84,13 +85,29 @@ class Host(object):
attrs = self.environments[key]
if attrs['tag'] is not None: return attrs['tag']
if attrs['tag'] is not None:
return attrs['tag']
return attrs['short_id']
def teardown(self):
def db2docker(self, db_names):
'''Returns a nice docker image name given a database name'''
def _all_in(db_names, databases):
return len([ x for x in db_names if x in databases ]) == len(db_names)
attrs = [ x for x in self.db_environments.values() if _all_in(db_names, x['databases']) ][0]
for container in self.containers: self.rm(container)
if attrs['tag'] is not None:
return attrs['tag']
return attrs['short_id']