diff --git a/.gitignore b/.gitignore index ce80e6f8f365712edd442d822f9786e82b61eec2..f4b226d84c0ab51b8f937e6041dcabf247d32fa2 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,12 @@ opsnr.stt .DS_Store src/ html/ +.beat/ +algorithms/ +cache/ +databases/ +dataformats/ +experiments/ +libraries/ +toolchains/ +.noseids diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 775f882f4a76e654a4630582d6dd8ffacc68f3a9..c005677d1d95f25fb4dd0a1654a1e0ae8cae8a06 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,7 @@ build: - ./bin/buildout - ./bin/python ${PREFIX}/bin/coverage run --source=${CI_PROJECT_NAME} ${PREFIX}/bin/nosetests -sv ${CI_PROJECT_NAME} - ./bin/python ${PREFIX}/bin/coverage report - - ./bin/python ${PREFIX}/bin/sphinx-apidoc --separate -d 2 --output=doc/api ${CI_PROJECT_NAMESPACE} + - ./bin/python ${PREFIX}/bin/sphinx-apidoc --separate -d 2 --output=doc/api beat - ./bin/python ${PREFIX}/bin/sphinx-build doc html tags: - docker-build diff --git a/beat/cmdline/config.py b/beat/cmdline/config.py index 3ceeb40f4a858a367a668f8b9d0b0346d103ee85..034d3d445652d85fda9e1780d31ac3e644e8d376 100644 --- a/beat/cmdline/config.py +++ b/beat/cmdline/config.py @@ -191,14 +191,15 @@ class Configuration(object): if not os.path.exists(c): return with open(c, 'rt') as f: user_data = simplejson.load(f) - for k in DEFAULTS: - if k in user_data: self.__data[k] = user_data[k] + for k in user_data: + if self._is_valid_key(k): self.__data[k] = user_data[k] except simplejson.JSONDecodeError: - print("WARNING: invalid state file at `%s' - removing and " \ - "re-starting..." % c) - from beat.core.utils import safe_rmfile - safe_rmfile(c) + raise + # print("WARNING: invalid state file at `%s' - removing and " \ + # "re-starting..." % c) + # from beat.core.utils import safe_rmfile + # safe_rmfile(c) @property @@ -210,16 +211,23 @@ class Configuration(object): return self.__data['cache'] + @property + def database_paths(self): + '''A dict of paths for databases''' + + return dict((k, self.__data[k]) for k in self.__data if self.is_database_key(k)) + + def set(self, key, value): '''Sets or resets a field in the configuration''' - if key not in DEFAULTS: + if not self._is_valid_key(key): print("ERROR: don't know about parameter `%s'" % key) sys.exit(1) if value is not None: self.__data[key] = value - else: + elif key in DEFAULTS: self.__data[key] = DEFAULTS[key] self.save() @@ -249,6 +257,11 @@ class Configuration(object): with os.fdopen(os.open(c, os.O_WRONLY | os.O_CREAT, 0600), 'wt') as f: simplejson.dump(self.__data, f, indent=4) + def _is_valid_key(self, key): + return key in DEFAULTS or self.is_database_key(key) + + def is_database_key(self, key): + return key.startswith('database/') def __str__(self): @@ -264,6 +277,10 @@ class Configuration(object): value = self.__data[key] value = "`%s'" % value if value is not None else '' retval.append(" * %-15s: %s" % (key, value)) + for key in sorted([k for k in self.__data if self.is_database_key(k)]): + value = self.__data[key] + value = "`%s'" % value if value is not None else '' + retval.append(" * %-15s: %s" % (key, value)) for key in sorted([k for k in DOC if k.startswith('color')]): value = self.__data[key] color, on_color, attrs = colorlog_to_termcolor(value) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index 5167f6bb960baa312cccf99e8d514e27eda0169e..f8e41cc6319569e45f039ecbb9878947de3ec5a7 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -27,7 +27,7 @@ """Usage: - %(prog)s experiments run 
[--force] + %(prog)s experiments run [--force] [(--docker|--local)] %(prog)s experiments caches [--list | --delete | --checksum] %(prog)s experiments list [--remote] %(prog)s experiments check []... @@ -66,6 +66,8 @@ Options: --help Display this screen --path= Use path to write files to disk (instead of the current directory) + --local Uses the local executor to execute the experiment on the local machine (default). + --docker Uses the docker executor to execute the experiment using docker containers. """ @@ -80,13 +82,14 @@ import simplejson from . import common from beat.core.experiment import Experiment -from beat.core.execution import Executor +from beat.core.execution import Executor as DockerExecutor +from .local_execution import Executor as LocalExecutor from beat.core.utils import NumpyJSONEncoder from beat.core.data import CachedDataSource, load_data_index from beat.core.dock import Host -def run_experiment(configuration, name, force): +def run_experiment(configuration, name, force, use_docker, use_local): '''Run experiments locally''' def load_result(executor): @@ -160,10 +163,11 @@ def run_experiment(configuration, name, force): scheduled = experiment.setup() - # load existing environments - host = Host() - host.setup(raise_on_errors=False) - environments = host.environments + if use_docker: + # load existing environments + host = Host() + host.setup(raise_on_errors=False) + environments = host.environments # can we execute it? results = [] @@ -172,17 +176,24 @@ def run_experiment(configuration, name, force): # checks and sets-up executable executable = None #use the default - env = value['configuration']['environment'] - search_key = '%s (%s)' % (env['name'], env['version']) - if search_key not in environments: - logger.error("Cannot execute block `%s' on environment `%s': " \ - "environment was not found' - please install it", - key, search_key) - return 1 + if use_docker: + env = value['configuration']['environment'] + search_key = '%s (%s)' % (env['name'], env['version']) + if search_key not in environments: + logger.error("Cannot execute block `%s' on environment `%s': " \ + "environment was not found' - please install it", + key, search_key) + return 1 + + if use_docker: + executor = DockerExecutor(configuration.path, value['configuration'], + configuration.cache, dataformat_cache, database_cache, + algorithm_cache, library_cache) + else: + executor = LocalExecutor(configuration.path, value['configuration'], + configuration.cache, dataformat_cache, database_cache, + algorithm_cache, library_cache, configuration.database_paths) - executor = Executor(configuration.path, value['configuration'], - configuration.cache, dataformat_cache, database_cache, - algorithm_cache, library_cache) if not executor.valid: logger.error("Failed to load the execution information for `%s':", key) @@ -201,44 +212,50 @@ def run_experiment(configuration, name, force): logger.extra(" -> using fallback (default) environment") with executor: - result = executor.process(host) - - if result['status'] != 0: - logger.error("Block did not execute properly - outputs were reset") - logger.error(" Standard output:\n%s", reindent(result['stdout'], 4)) - logger.error(" Standard error:\n%s", reindent(result['stderr'], 4)) - logger.error(" Captured user error:\n%s", - reindent(result['user_error'], 4)) - logger.error(" Captured system error:\n%s", - reindent(result['system_error'], 4)) - return 1 + if use_docker: + result = executor.process(host) + else: + result = executor.process() + + if use_docker: + if 
result['status'] != 0: + logger.error("Block did not execute properly - outputs were reset") + logger.error(" Standard output:\n%s", reindent(result['stdout'], 4)) + logger.error(" Standard error:\n%s", reindent(result['stderr'], 4)) + logger.error(" Captured user error:\n%s", + reindent(result['user_error'], 4)) + logger.error(" Captured system error:\n%s", + reindent(result['system_error'], 4)) + print(" Environment: %s" % 'default environment') + return 1 + else: + stats = result['statistics'] + logger.extra(" CPU time (user, system, total, percent): %s, %s, %s, %d%%", + simplify_time(stats['cpu']['user']), + simplify_time(stats['cpu']['system']), + simplify_time(stats['cpu']['total']), + 100. * (stats['cpu']['user'] + stats['cpu']['system']) / stats['cpu']['total'], + ) + logger.extra(" Memory usage: %s", + simplify_size(stats['memory']['rss'])) + logger.extra(" Cached input read: %s, %s", + simplify_time(stats['data']['time']['read']), + simplify_size(stats['data']['volume']['read'])) + logger.extra(" Cached output write: %s, %s", + simplify_time(stats['data']['time']['write']), + simplify_size(stats['data']['volume']['write'])) + logger.extra(" Communication time: %s (%d%%)", + simplify_time(stats['data']['network']['wait_time']), + 100. * stats['data']['network']['wait_time'] / stats['cpu']['total']) + else: + logger.extra(" Environment: %s" % 'local environment') - logger.extra(" Environment: %s" % 'default environment') if executor.analysis: data = load_result(executor) r = reindent(simplejson.dumps(data.as_dict(), indent=2, cls=NumpyJSONEncoder), 2) logger.info(" Results:\n%s", r) - stats = result['statistics'] - logger.extra(" CPU time (user, system, total, percent): %s, %s, %s, %d%%", - simplify_time(stats['cpu']['user']), - simplify_time(stats['cpu']['system']), - simplify_time(stats['cpu']['total']), - 100. * (stats['cpu']['user'] + stats['cpu']['system']) / stats['cpu']['total'], - ) - logger.extra(" Memory usage: %s", - simplify_size(stats['memory']['rss'])) - logger.extra(" Cached input read: %s, %s", - simplify_time(stats['data']['time']['read']), - simplify_size(stats['data']['volume']['read'])) - logger.extra(" Cached output write: %s, %s", - simplify_time(stats['data']['time']['write']), - simplify_size(stats['data']['volume']['write'])) - logger.extra(" Communication time: %s (%d%%)", - simplify_time(stats['data']['network']['wait_time']), - 100. 
* stats['data']['network']['wait_time'] / stats['cpu']['total']) - logger.extra(" Outputs produced:") if executor.analysis: logger.extra(" * %s", executor.data['result']['path']) @@ -246,6 +263,7 @@ def run_experiment(configuration, name, force): for name, details in executor.data['outputs'].items(): logger.extra(" * %s", details['path']) + return 0 @@ -374,7 +392,7 @@ def pull(webapi, prefix, names, force, indentation, format_cache): def process(args): if args['run']: - return run_experiment(args['config'], args[''][0], args['--force']) + return run_experiment(args['config'], args[''][0], args['--force'], args['--docker'], args['--local']) if args['caches']: return caches(args['config'], args[''][0], args['--list'], diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py new file mode 100755 index 0000000000000000000000000000000000000000..470e6da59c3f07386a9af606a5843b239f4ebd9d --- /dev/null +++ b/beat/cmdline/local_execution.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : + +############################################################################### +# # +# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ # +# Contact: beat.support@idiap.ch # +# # +# This file is part of the beat.core module of the BEAT platform. # +# # +# Commercial License Usage # +# Licensees holding valid commercial BEAT licenses may use this file in # +# accordance with the terms contained in a written agreement between you # +# and Idiap. For further information contact tto@idiap.ch # +# # +# Alternatively, this file may be used under the terms of the GNU Affero # +# Public License version 3 as published by the Free Software and appearing # +# in the file LICENSE.AGPL included in the packaging of this file. # +# The BEAT platform is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # +# or FITNESS FOR A PARTICULAR PURPOSE. # +# # +# You should have received a copy of the GNU Affero Public License along # +# with the BEAT platform. If not, see http://www.gnu.org/licenses/. # +# # +############################################################################### + + +'''Execution utilities''' + +import os +import sys +import glob +import errno +import tempfile +import subprocess +import zmq.green as zmq +import time +import collections + +import logging +logger = logging.getLogger(__name__) + +import simplejson + +from beat.core import schema +from beat.core import database +from beat.core import algorithm +from beat.core import inputs +from beat.core import outputs +from beat.core import data +from beat.core import stats + + +class Executor(object): + """Executors runs the code given an execution block information, externally + + + Parameters: + + prefix (str): Establishes the prefix of your installation. + + data (dict, str): The piece of data representing the block to be executed. + It must validate against the schema defined for execution blocks. If a + string is passed, it is supposed to be a fully qualified absolute path to + a JSON file containing the block execution information. + + cache (str, optional): If your cache is not located under + ``/cache``, then specify a full path here. It will be used + instead. + + dataformat_cache (dict, optional): A dictionary mapping dataformat names to + loaded dataformats. This parameter is optional and, if passed, may + greatly speed-up database loading times as dataformats that are already + loaded may be re-used. 
If you use this parameter, you must guarantee that + the cache is refreshed as appropriate in case the underlying dataformats + change. + + database_cache (dict, optional): A dictionary mapping database names to + loaded databases. This parameter is optional and, if passed, may + greatly speed-up database loading times as databases that are already + loaded may be re-used. If you use this parameter, you must guarantee that + the cache is refreshed as appropriate in case the underlying databases + change. + + algorithm_cache (dict, optional): A dictionary mapping algorithm names to + loaded algorithms. This parameter is optional and, if passed, may + greatly speed-up database loading times as algorithms that are already + loaded may be re-used. If you use this parameter, you must guarantee that + the cache is refreshed as appropriate in case the underlying algorithms + change. + + library_cache (dict, optional): A dictionary mapping library names to + loaded libraries. This parameter is optional and, if passed, may greatly + speed-up library loading times as libraries that are already loaded may + be re-used. If you use this parameter, you must guarantee that the cache + is refreshed as appropriate in case the underlying libraries change. + + + Attributes: + + cache (str): The path to the cache currently being used + + errors (list): A list containing errors found while loading this execution + block. + + data (dict): The original data for this executor, as loaded by our JSON + decoder. + + algorithm (beat.core.algorithm.Algorithm): An object representing the + algorithm to be run. + + databases (dict): A dictionary in which keys are strings with database + names and values are :py:class:`database.Database`, representing the + databases required for running this block. The dictionary may be empty + in case all inputs are taken from the file cache. + + views (dict): A dictionary in which the keys are tuples pointing to the + ``(, , )`` and the value is a setup view + for that particular combination of details. The dictionary may be empty + in case all inputs are taken from the file cache. + + input_list (beat.core.inputs.InputList): A list of inputs that will be + served to the algorithm. + + output_list (beat.core.outputs.OutputList): A list of outputs that the + algorithm will produce. + + data_sources (list): A list with all data-sources created by our execution + loader. + + data_sinks (list): A list with all data-sinks created by our execution + loader. These are useful for clean-up actions in case of problems. + + custom_root_folders (dict): A dictionary where the keys are database + identifiers (`/`) and the values are paths to the + given database's files. These values will override the value found + in the database's metadata. 
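+
+  Example:
+
+    A minimal usage sketch (the database name/version and the paths below are
+    hypothetical; the keys mirror the ``database/...`` options understood by
+    ``beat config`` and looked up in ``_prepare_inputs``)::
+
+      executor = Executor('/path/to/prefix', 'block.json',
+          custom_root_folders={'database/atnt/3': '/data/atnt_db'})
+      with executor:
+          executor.process()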
+ + """ + + def __init__(self, prefix, data, cache=None, dataformat_cache=None, + database_cache=None, algorithm_cache=None, library_cache=None, custom_root_folders=None): + + self.prefix = prefix + self.cache = cache or os.path.join(self.prefix, 'cache') + + # check cache - halt if required + if not os.path.exists(self.cache): + raise IOError("Cache path `%s' does not exist" % self.cache) + + # some attributes + self.algorithm = None + self.databases = {} + self.views = {} + self.input_list = None + self.output_list = None + self.data_sinks = [] + self.data_sources = [] + + # runs validation if required + self.errors = [] + self.data = data + + if custom_root_folders is not None and not isinstance(custom_root_folders, collections.Mapping): + raise TypeError("The custom root folders must be in dictionary format") + + self.custom_root_folders = custom_root_folders + + # temporary caches, if the user has not set them, for performance + database_cache = database_cache if database_cache is not None else {} + dataformat_cache = dataformat_cache if dataformat_cache is not None else {} + algorithm_cache = algorithm_cache if algorithm_cache is not None else {} + library_cache = library_cache if library_cache is not None else {} + + self._load(data, dataformat_cache, algorithm_cache, database_cache, + library_cache) + + + def _load(self, data, dataformat_cache, algorithm_cache, database_cache, + library_cache): + """Loads the block execution information""" + + # reset + self.data = None + self.errors = [] + self.algorithm = None + self.databases = {} + self.views = {} + self.input_list = None + self.output_list = None + self.data_sinks = [] + self.data_sources = [] + self.db_address = None + + if not isinstance(data, dict): #user has passed a file pointer + if not os.path.exists(data): + self.errors.append('File not found: %s' % data) + return + + # this runs basic validation, including JSON loading if required + self.data, self.errors = schema.validate('execution', data) + if self.errors: return #don't proceed with the rest of validation + + # at this point, the execution information is loaded, must now go on and + # load the algorithm code. + if self.data['algorithm'] in algorithm_cache: #reuse + self.algorithm = algorithm_cache[self.data['algorithm']] + else: #load it, use dataformat cache if possible + self.algorithm = algorithm.Algorithm(self.prefix, + self.data['algorithm'], dataformat_cache, library_cache) + algorithm_cache[self.algorithm.name] = self.algorithm + + if not self.algorithm.valid: + self.errors += self.algorithm.errors + return #don't proceed if algorithm is bogus! 
+ + # load databases (if any is required) + for name, details in self.data['inputs'].items(): + if 'database' in details: + + if details['database'] not in self.databases: + + if details['database'] in database_cache: #reuse + db = database_cache[details['database']] + else: #load it + db = database.Database(self.prefix, details['database'], + dataformat_cache) + database_cache[db.name] = db + + self.databases[details['database']] = db + + if not db.valid: + self.errors += db.errors + + + def __enter__(self): + """Prepares inputs and outputs for the processing task + + Raises: + + IOError: in case something cannot be properly setup + + """ + + self._prepare_inputs() + self._prepare_outputs() + + return self + + + def __exit__(self, exc_type, exc_value, traceback): + """Closes all sinks and disconnects inputs and outputs + """ + + for sink in self.data_sinks: + # we save the output only if no valid error has been thrown + # n.b.: a system exit will raise SystemExit which is not an Exception + if not isinstance(exc_type, Exception): sink.close() + sink.reset() + + + self.input_list = None + self.output_list = None + self.data_sinks = [] + self.data_sources = [] + + + def _prepare_inputs(self): + """Prepares all input required by the execution.""" + + self.input_list = inputs.InputList() + + # This is used for parallelization purposes + start_index, end_index = self.data.get('range', (None, None)) + + for name, details in self.data['inputs'].items(): + + if 'database' in details: #it is a dataset input + # create the remote input + db = self.databases[details['database']] + configName = "database/%s" % db.name + if self.custom_root_folders is not None and configName in self.custom_root_folders: + db.data['root_folder'] = self.custom_root_folders[configName] + + dataformat_name = db.set(details['protocol'], details['set'])['outputs'][details['output']] + + # Get the relevant data for the requested view + view_key = (details['database'], details['protocol'], details['set']) + # create the view + v = db.view(view_key[1], view_key[2]) + # setup + v.prepare_outputs() + v.setup() + #v.obj.outputs = v.outputs + # Use the database view as an in-memory data source + v_data_source = data.MemoryDataSource(v.done, next_callback=v.next) + v_output = v.outputs[details['output']] + # Output the data from the view + v_output.data_sink.data_sources.append(v_data_source) + # Create a new local input + input = inputs.Input(name, db.dataformats[dataformat_name], v_data_source) + #input = inputs.RemoteInput(name, db.dataformats[dataformat_name], self.db_socket) + + # Synchronization bits + group = self.input_list.group(details['channel']) + if group is None: + group = inputs.InputGroup( + details['channel'], + synchronization_listener=outputs.SynchronizationListener(), + restricted_access=(details['channel'] == self.data['channel']) + ) + self.input_list.add(group) + + group.add(input) + + else: + + data_source = data.CachedDataSource() + self.data_sources.append(data_source) + if details['channel'] == self.data['channel']: #synchronized + status = data_source.setup( + filename=os.path.join(self.cache, details['path'] + '.data'), + prefix=self.prefix, + force_start_index=start_index, + force_end_index=end_index, + ) + else: + status = data_source.setup( + filename=os.path.join(self.cache, details['path'] + '.data'), + prefix=self.prefix, + ) + + if not status: + raise IOError("cannot load cache file `%s'" % details['path']) + + input = inputs.Input(name, self.algorithm.input_map[name], data_source) + + # 
Synchronization bits + group = self.input_list.group(details['channel']) + if group is None: + group = inputs.InputGroup( + details['channel'], + synchronization_listener=outputs.SynchronizationListener(), + restricted_access=(details['channel'] == self.data['channel']) + ) + self.input_list.add(group) + + group.add(input) + + + def _prepare_outputs(self): + """Prepares all output required by the execution.""" + + self.output_list = outputs.OutputList() + + # This is used for parallelization purposes + start_index, end_index = self.data.get('range', (None, None)) + + if 'outputs' in self.data: #it is a normal block (not analyzer) + + for name, details in self.data['outputs'].items(): + + path = os.path.join(self.cache, details['path'] + '.data') + dirname = os.path.dirname(path) + # Make sure that the directory exists while taking care of race + # conditions. see: http://stackoverflow.com/questions/273192/check-if-a-directory-exists-and-create-it-if-necessary + try: + if (len(dirname) > 0): + os.makedirs(dirname) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + data_sink = data.CachedDataSink() + self.data_sinks.append(data_sink) + status = data_sink.setup( + filename=path, + dataformat=self.algorithm.dataformats[self.algorithm.output_map[name]], + encoding='binary', + max_size=0, #in bytes, for individual file chunks + ) + if not status: + raise IOError("cannot create cache sink `%s'" % details['path']) + + input_group = self.input_list.group(details['channel']) + if (input_group is None) or not hasattr(input_group, 'synchronization_listener'): + synchronization_listener = None + else: + synchronization_listener = input_group.synchronization_listener + + self.output_list.add(outputs.Output(name, data_sink, + synchronization_listener=synchronization_listener, + force_start_index=start_index or 0) + ) + + else: #it is an analyzer + + name = 'result' + details = self.data[name] + path = os.path.join(self.cache, details['path'] + '.data') + dirname = os.path.dirname(path) + # Make sure that the directory exists while taking care of race + # conditions. see: http://stackoverflow.com/questions/273192/check-if-a-directory-exists-and-create-it-if-necessary + try: + if (len(dirname) > 0): + os.makedirs(dirname) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + data_sink = data.CachedDataSink() + self.data_sinks.append(data_sink) + status = data_sink.setup( + filename=path, + dataformat=self.algorithm.result_dataformat(), + encoding='binary', + ) + if not status: + raise IOError("cannot create cache sink `%s'" % details['path']) + + self.output_list.add(outputs.Output(name, data_sink, + force_start_index=start_index or 0)) + + + def process(self, virtual_memory_in_megabytes=0, + max_cpu_percent=0, timeout_in_minutes=0, daemon=0): + """Executes the user algorithm code using an external program. + + If ``executable`` is set, then execute the process using an external + program, else, uses the python application living by the side of this + installation (if one is found). + + The execution interface follows the backend API as described in our + documentation. + + We use green subprocesses this implementation. Each co-process is linked + to us via 2 uni-directional pipes which work as datain and dataout + end-points. The parent process (i.e. the current one) establishes the + connection to the child and then can pass/receive commands, data and logs. 
+ + Usage of the data pipes (datain, dataout) is **synchronous** - you send a + command and block for an answer. The co-process is normally controlled by + the current process, except for data requests, which are user-code driven. + The nature of our problem does not require an *asynchronous* implementation + which, in turn, would require a much more complex set of dependencies (on + asyncio or Twisted for example). + + + Parameters: + + virtual_memory_in_megabytes (int, Optional): The amount of virtual memory + (in Megabytes) available for the job. If set to zero, no limit will be + applied. + + max_cpu_percent (int, Optional): The maximum amount of CPU usage allowed + in a system. This number must be an integer number between 0 and + ``100*number_of_cores`` in your system. For instance, if your system + has 2 cores, this number can go between 0 and 200. If it is <= 0, then + we don't track CPU usage. + + timeout_in_minutes (int): The number of minutes to wait for the user + process to execute. After this amount of time, the user process is + killed with :py:attr:`signal.SIGKILL`. If set to zero, no timeout will + be applied. + + daemon (int): If this variable is set, then we don't really start the + user process, but just kick out 0MQ server, print the command-line and + sleep for that many seconds. You're supposed to start the client by + hand then and debug it. + + + Returns: + + dict: A dictionary which is JSON formattable containing the summary of + this block execution. + + """ + + if not self.valid: + raise RuntimeError("execution information is bogus:\n * %s" % \ + '\n * '.join(self.errors)) + + self.runner = self.algorithm.runner() + retval = self.runner.setup(self.data['parameters']) + + if not self.input_list or not self.output_list: + raise RuntimeError("I/O for execution block has not yet been set up") + + using_output = self.output_list[0] if self.analysis else self.output_list + + _start = time.time() + + while self.input_list.hasMoreData(): + main_group = self.input_list.main_group + main_group.restricted_access = False + main_group.next() + main_group.restricted_access = True + if not self.runner.process(self.input_list, using_output): return False + + missing_data_outputs = [x for x in self.output_list if x.isDataMissing()] + + proc_time = time.time() - _start + + if missing_data_outputs: + raise RuntimeError("Missing data on the following output(s): %s" % \ + ', '.join([x.name for x in missing_data_outputs])) + + # some local information + logger.debug("Total processing time was %.3f seconds" , proc_time) + + #with agent.Agent(virtual_memory_in_megabytes, max_cpu_percent) as runner: + # self.agent = runner + # #synchronous call - always returns after a certain timeout + # retval = runner.run(self, host, timeout_in_minutes=timeout_in_minutes, + # daemon=daemon, db_address=self.db_address) + + # #adds I/O statistics from the current executor, if its complete already + # #otherwise, it means the running process went bananas, ignore it ;-) + # if 'statistics' in retval: + # if 'data' in retval['statistics']: + # retval['statistics']['data'].update(self.io_statistics) + # else: + # logger.warn("cannot find 'data' entry on returned stats, " \ + # "therefore not appending I/O info either") + + #return retval + + + @property + def valid(self): + """A boolean that indicates if this executor is valid or not""" + + return not bool(self.errors) + + + @property + def analysis(self): + """A boolean that indicates if the current block is an analysis block""" + return 'result' in 
self.data + + + @property + def outputs_exist(self): + """Returns ``True`` if outputs this block is supposed to produce exists.""" + + if self.analysis: + path = os.path.join(self.cache, self.data['result']['path']) + '*' + if not glob.glob(path): return False + + else: + for name, details in self.data['outputs'].items(): + path = os.path.join(self.cache, details['path']) + '*' + if not glob.glob(path): return False + + # if you get to this point all outputs already exist + return True + + + @property + def io_statistics(self): + """Summarize current I/O statistics looking at data sources and sinks + + Returns: + + dict: A dictionary summarizing current I/O statistics, read from our + sinks, sources, inputs and outputs. + """ + + is_analyzer = 'outputs' not in self.data + return stats.io_statistics(self.data_sources, self.input_list, self.data_sinks, self.output_list, self.data, is_analyzer) + + + def __str__(self): + + return simplejson.dumps(self.data, indent=4) + + + def write(self, path): + """Writes contents to precise filesystem location""" + + with open(path, 'wt') as f: f.write(str(self)) + + + def dump_runner_configuration(self, directory): + """Exports contents useful for a backend runner to run the algorithm""" + + data = { + 'algorithm': self.data['algorithm'], + 'parameters': self.data['parameters'], + } + + data['inputs'] = \ + dict([(k, v['channel']) for k,v in self.data['inputs'].items()]) + + if 'outputs' in self.data: + data['outputs'] = \ + dict([(k, v['channel']) for k,v in self.data['outputs'].items()]) + else: + data['result'] = self.data['channel'] + + data['channel'] = self.data['channel'] + + with open(os.path.join(directory, 'configuration.json'), 'wb') as f: + simplejson.dump(data, f, indent=2) + + tmp_prefix = os.path.join(directory, 'prefix') + if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) + + self.algorithm.export(tmp_prefix) + + + def dump_databases_provider_configuration(self, directory): + """Exports contents useful for a backend runner to run the algorithm""" + + with open(os.path.join(directory, 'configuration.json'), 'wb') as f: + simplejson.dump(self.data, f, indent=2) + + tmp_prefix = os.path.join(directory, 'prefix') + if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) + + for db in self.databases.values(): + db.export(tmp_prefix) + + + def kill(self): + """Stops the user process by force - to be called from signal handlers""" + + if self.agent is not None: + self.agent.kill() + return True + return False diff --git a/beat/cmdline/test/test_config.py b/beat/cmdline/test/test_config.py index 2f633cd372729d240fe1abaee88ca002120c241a..9cd42b5af294c7558f46c172a3069e0ec0dbf6d0 100644 --- a/beat/cmdline/test/test_config.py +++ b/beat/cmdline/test/test_config.py @@ -31,6 +31,7 @@ import os import nose.tools +from nose.tools import assert_raises import simplejson from . 
import tmp_prefix @@ -83,6 +84,41 @@ def test_set_token(): assert contents['token'] == token_value +@nose.tools.with_setup(teardown=cleanup) +def test_set_atnt_db(): + db_config = 'database/atnt' + db_path = './atnt_db' + nose.tools.eq_(call('config', 'set', db_config, db_path), 0) + config = os.path.join(tmp_prefix, '.beat', 'config.json') + assert os.path.exists(config) + with open(config, 'rt') as f: contents = simplejson.load(f) + assert contents[db_config] == db_path + + +@nose.tools.with_setup(teardown=cleanup) +def test_set_get_atnt_db(): + db_config = 'database/atnt' + db_path = './atnt_db' + nose.tools.eq_(call('config', 'set', db_config, db_path), 0) + nose.tools.eq_(call('config', 'get', db_config), 0) + + +@nose.tools.with_setup(teardown=cleanup) +def test_set_bad_config_key(): + db_config = 'fail' + with assert_raises(SystemExit) as c: + call('config', 'set', db_config, db_config) + + assert c.exception.code == 1 + + +@nose.tools.with_setup(teardown=cleanup) +@nose.tools.raises(KeyError) +def test_get_bad_config_key(): + db_config = 'fail' + nose.tools.eq_(call('config', 'get', db_config), 1) + + @nose.tools.with_setup(teardown=cleanup) def test_get_token(): nose.tools.eq_(call('config', 'get', 'token'), 0) diff --git a/beat/cmdline/test/test_experiments.py b/beat/cmdline/test/test_experiments.py index cd3266197c23c48f6430c523997c00af7fc4ea5b..92f2be6f439124984dfd37d45f651f6ea12f2dee 100644 --- a/beat/cmdline/test/test_experiments.py +++ b/beat/cmdline/test/test_experiments.py @@ -183,20 +183,42 @@ def test_run_double_triangle_1(): @slow @nose.tools.with_setup(teardown=cleanup) -def test_run_single_error_1(): +@nose.tools.raises(NameError) +def test_run_single_error_1_local(): + # When running locally, the module with the error is loaded + # inside the currently running process and will raise a NameError. obj = 'user/user/single/1/single_error' - nose.tools.eq_(call('run', obj, cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--local', cache=tmp_prefix), 1) @slow @nose.tools.with_setup(teardown=cleanup) -def test_run_single_error_twice(): +def test_run_single_error_1_docker(): + # When running on docker, the module is loaded in the docker + # container and the local process will return '1'. + obj = 'user/user/single/1/single_error' + nose.tools.eq_(call('run', obj, '--docker', cache=tmp_prefix), 1) + +@slow +@nose.tools.with_setup(teardown=cleanup) +@nose.tools.raises(NameError) +def test_run_single_error_twice_local(): + # This one makes sure our output reset is working properly. Both tries should + # give out the same error. + obj = 'user/user/single/1/single_error' + nose.tools.eq_(call('run', obj, '--local', cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--local', cache=tmp_prefix), 1) + + +@slow +@nose.tools.with_setup(teardown=cleanup) +def test_run_single_error_twice_docker(): # This one makes sure our output reset is working properly. Both tries should # give out the same error. obj = 'user/user/single/1/single_error' - nose.tools.eq_(call('run', obj, cache=tmp_prefix), 1) - nose.tools.eq_(call('run', obj, cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--docker', cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--docker', cache=tmp_prefix), 1) @nose.tools.with_setup(teardown=cleanup) diff --git a/buildout.cfg b/buildout.cfg index e25f6932ac5db441efb2c6289a479335147ae7bf..6fcdc47bc81ea45597bfaf8665fd37e26d8a3622 100644 --- a/buildout.cfg +++ b/buildout.cfg @@ -10,6 +10,7 @@ develop = . 
newest = false eggs = beat.cmdline beat.backend.python + beat.core ipdb [sources] diff --git a/doc/conf.py b/doc/conf.py index e39168955941722c29a663c18662eddd28f80c26..1d657faf2e8b70afe883747af27bf4c1cc482e2b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -46,7 +46,7 @@ needs_sphinx = '1.3' extensions = [ 'sphinx.ext.todo', 'sphinx.ext.coverage', - 'sphinx.ext.pngmath', + 'sphinx.ext.imgmath', 'sphinx.ext.ifconfig', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', @@ -69,7 +69,7 @@ autosummary_generate = True # If we are on OSX, the 'dvipng' path maybe different dvipng_osx = '/opt/local/libexec/texlive/binaries/dvipng' -if os.path.exists(dvipng_osx): pngmath_dvipng = dvipng_osx +if os.path.exists(dvipng_osx): imgmath_dvipng = dvipng_osx # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/doc/configuration.rst b/doc/configuration.rst index 8b403bf47934220e72e8c5c1ceadfecdc58e10bd..e7ea78c3a87f6d064a05cad8c2424f78e565aade 100644 --- a/doc/configuration.rst +++ b/doc/configuration.rst @@ -72,6 +72,24 @@ flag: ... +When running an experiment via the ``beat`` application using the local +executor (the default executor, also behind the ``--local`` flag), ``beat`` +will look into your configuration for any options set by the user that follow +the format ``database//``. ``beat`` expects that this +option points to a string representing the path to the root folder of the +actual database files for the given database. + +For example, the AT&T "Database of Faces" is available on the BEAT platform +as the "atnt" database. The third version of the "atnt" database would be +referenced as "atnt/3". The object "atnt/3" has a root folder defined on +the BEAT platform already, and changing this locally would mean creating a +new version of the database. +Instead, you may override that path by setting the configuration option +``database/atnt/3`` to your local path to the database files. +Assuming your username is "user" and you extracted the database files to +``~/Downloads/atnt_db``, you can set ``database/atnt/3`` to +``/home/user/Downloads/atnt_db``, and ``beat`` will find the database files. + You may explore different configuration options with the ``--help`` flag of ``beat config``: diff --git a/doc/experiments.rst b/doc/experiments.rst index f5f566111ad487856bd37a1eb9febce896a09d0f..4ce1bfd747124f27c7d0ec94d1fc3cdd1c8f2fe9 100644 --- a/doc/experiments.rst +++ b/doc/experiments.rst @@ -21,7 +21,7 @@ .. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. -.. _beat-core-experiments-cmdline: +.. _beat-cmdline-experiments: Experiments ----------- @@ -42,325 +42,60 @@ The commands available for experiments are: :cwd: .. -.. _beat-core-experiments-running: +.. _beat-cmdline-experiments-running: How to run an experiment? ......................... -The ``run_toolchain.py`` script can be used to perform the experiment defined -in a toolchain. It is the ideal way to debug an algorithm, since this script -doesn't try to do any advanced trick like the Scheduler (multi-processing, -optimizations, sandboxing, ...). - -For example, we execute a simple toolchain with two processing blocks (found in -``src/beat.core/beat/core/test/toolchains/integers_addition2.json``): - -.. code-block:: sh - - $ ./bin/run_toolchain.py --prefix=src/beat.core/beat/core/test/ integers_addition2 - Processing block 'addition1'... 
- Algorithm: sum - Inputs: - - a (single_integer): beat/src/beat.core/beat/core/test/databases/integers/output1.data - - b (single_integer): beat/src/beat.core/beat/core/test/databases/integers/output2.data - Outputs: - - sum (single_integer): beat/src/beat.core/beat/core/test/cache/addition1/sum.data - - Processing block 'addition2'... - Algorithm: sum - Inputs: - - a (single_integer): beat/src/beat.core/beat/core/test/cache/addition1/sum.data - - b (single_integer): beat/src/beat.core/beat/core/test/databases/integers/output3.data - Outputs: - - sum (single_integer): beat/src/beat.core/beat/core/test/cache/addition2/sum.data - - DONE - - Results available at: - - addition2.sum: beat/src/beat.core/beat/core/test/cache/addition2/sum.data +The command ``beat experiments run `` can be used to run the experiment +defined in an experiment definition file. It is the ideal way to debug an +experiment, since by default ``beat`` will use the local executor, which provides +a simple environment with PDB support without advanced features +(multi-processing, optimizations, sandboxing, multiple environments, etc.). Here, the ``--prefix`` option is used to tell the scripts where all our data -formats, toolchains and algorithms are located, and ``integers_addition2`` is -the name of the toolchain we want to check (note that we don't add the -``.json`` extension, as this is the name of the toolchain, not the filename!). +formats, toolchains and algorithms are located. This option can be set +in your configuration file (see ``beat config``). -This script displays for each block the files containing the data to use as +This command displays for each block the files containing the data to use as input, and the files generated by the outputs of the block. -By default, files are generated in binary format, but you can force them to be -in a more readable JSON format with the ``--json`` flag: - -.. code-block:: sh - - $ ./bin/run_toolchain.py --prefix=src/beat.core/beat/core/test/ --json integers_addition2 - The default behavior is to not regenerate data files already present in the cache. You can force the script to not take the content of the cache into -account with the ``--force`` flag: - -.. code-block:: sh - - $ ./bin/run_toolchain.py --prefix=src/beat.core/beat/core/test/ --force integers_addition2 - - -.. _beat-core-experiments-displaydata: +account with the ``--force`` flag. -How to examine the content of a data file? -.......................................... - -The ``display_data.py`` script can be used to examine the content of a data -file generated by the execution of a toolchain. - -For example, we look at the content of one of the data file used by the tests -of beat.core (found in -``src/beat.core/beat/core/test/data/single_integer.data``): - -.. 
code-block:: sh - - $ ./bin/display_data.py --prefix=src/beat.core/beat/core/test data/single_integer_delayed.data - Data format: single_integer - ---------------------------------------------- - Indexes: 0-1 - { - "value": 0 - } - ---------------------------------------------- - Indexes: 2-3 - { - "value": 1 - } - ---------------------------------------------- - Indexes: 4-5 - { - "value": 2 - } - ---------------------------------------------- - Indexes: 6-7 - { - "value": 3 - } - ---------------------------------------------- - Indexes: 8-9 - { - "value": 4 - } - ---------------------------------------------- - Indexes: 10-11 - { - "value": 5 - } - ---------------------------------------------- - Indexes: 12-13 - { - "value": 6 - } - ---------------------------------------------- - Indexes: 14-15 - { - "value": 7 - } - ---------------------------------------------- - Indexes: 16-17 - { - "value": 8 - } - ---------------------------------------------- - Indexes: 18-19 - { - "value": 9 - } - -The script tells us that the data correspond to the data format -``single_integer``, and displays each entry (with the indexes it correspond to) -in a JSON representation. +Executors +========= +"Executors" are modules that execute each block in an experiment. On the BEAT +platform, there is only the one executor, which executes the experiment using +Docker containers with advanced scheduling and security features. When +developing using ``beat.cmdline``, however, you have the option of using either +the BEAT platform's executor, behind the ``--docker`` flag, or the "local" +executor, provided in this project. The local executor, as explained above, is +much simpler, aimed at providing a smooth development experience. However, +there are two important tradeoffs: -.. _beat-core-experiments-example: +- Lower performance for non-trivial experiments, as it runs everything + synchronously in one process on the CPU. +- No multiple environments, as the Python environment that built + ``beat.cmdline`` is used. This means that many BEAT experiments that + rely on different/multiple environments will not work. -Putting it all together: a complete example -........................................... +If you want to use the local executor, pay attention to the python environment +used to call `buildout` in your copy of ``beat.cmdline``. The suggested way +to use Bob libraries while developing on the local executor is to use install +``zc.buildout`` in a Python2.7 conda environment with Bob installed. Using +the ``buildout`` command from the environment will make the entire environment +available to ``beat.cmdline`` even when the environment is not active. -.. _beat-core-experiments-example-figure: -.. figure:: img/toolchain-example.* - A complete toolchain that train and test a face detector +.. _beat-cmdline-experiments-displaydata: -The following example describes the toolchain visible at :num:`figure -#beat-core-toolchains-example-figure`, a complete toolchain that: - - #. train a face detector on one set of images (*beat_face_dataset_train*) - #. validate it on another set of images (*beat_face_dataset_validation*) - #. test it on a third set of images (*beat_face_dataset_test*) - -.. note:: - - This toolchain is still not considered as an executable one by the platform, - since it contains no mention of the algorithms that must be used in each - processing block. +How to examine the content of a data file? +.......................................... -.. 
code-block:: json +The ``beat cache`` collection of commands interact with the cache: - { - "databases": [ { - "name": "beat_face_dataset_train", - "outputs": { - "images": "image/rgb", - "faces": "coordinates_list" - } - }, - { - "name": "beat_face_dataset_validation", - "outputs": { - "images": "image/rgb", - "faces": "coordinates_list" - } - }, - { - "name": "beat_face_dataset_test", - "outputs": { - "images": "image/rgb", - "faces": "coordinates_list" - } - } - ], - "blocks": [{ - "name": "features_extractor_train", - "inputs": { - "images": "images/rgb" - }, - "outputs": { - "features": "array/float" - } - }, - { - "name": "face_model_builder", - "inputs": { - "features": "array/float", - "faces": "coordinates_list" - }, - "outputs": { - "model": "face_model" - } - }, - { - "name": "features_extractor_validation", - "inputs": { - "images": "images/rgb" - }, - "outputs": { - "features": "array/float" - } - }, - { - "name": "face_detector_validation", - "inputs": { - "model": "face_model", - "features": "array/float" - }, - "outputs": { - "faces": "coordinates_list" - } - }, - { - "name": "thresholder", - "inputs": { - "detected_faces": "coordinates_list", - "labelled_faces": "coordinates_list" - }, - "outputs": { - "threshold": "float" - } - }, - { - "name": "features_extractor_test", - "inputs": { - "images": "images/rgb" - }, - "outputs": { - "features": "array/float" - } - }, - { - "name": "face_detector_test", - "inputs": { - "model": "face_model", - "features": "array/float" - }, - "outputs": { - "faces": "coordinates_list" - } - }, - { - "name": "evaluator", - "inputs": { - "threshold": "float", - "detected_faces": "coordinates_list", - "labelled_faces": "coordinates_list" - }, - "outputs": { - "score": "float" - } - } - ], - "connections": [{ - "from": "beat_face_dataset_train.images", - "to": "features_extractor_train.images" - }, - { - "from": "features_extractor_train.features", - "to": "face_model_builder.features" - }, - { - "from": "beat_face_dataset_train.faces", - "to": "face_model_builder.faces" - }, - { - "from": "beat_face_dataset_validation.images", - "to": "features_extractor_validation.images" - }, - { - "from": "face_model_builder.model", - "to": "face_detector_validation.model" - }, - { - "from": "features_extractor_validation.features", - "to": "face_detector_validation.features" - }, - { - "from": "face_detector_validation.faces", - "to": "thresholder.detected_faces" - }, - { - "from": "beat_face_dataset_validation.faces", - "to": "thresholder.labelled_faces" - }, - { - "from": "beat_face_dataset_test.images", - "to": "features_extractor_test.images" - }, - { - "from": "features_extractor_test.features", - "to": "face_detector_test.features" - }, - { - "from": "face_model_builder.model", - "to": "face_detector_test.model" - }, - { - "from": "thresholder.threshold", - "to": "evaluator.threshold" - }, - { - "from": "face_detector_test.faces", - "to": "evaluator.detected_faces" - }, - { - "from": "beat_face_dataset_test.faces", - "to": "evaluator.labelled_faces" - } - ], - "results": [ - "thresholder.threshold", - "evaluator.score" - ] - } +.. command-output:: ./bin/beat cache --help + :cwd: .. diff --git a/doc/index.rst b/doc/index.rst index fe66d2a278884d092aea9ce6e917b6068a549ea1..b678dcb787352353242907675e5e3aa9aa32cfd6 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -27,11 +27,11 @@ BEAT Command-line Client .. 
_beat-cmdline-introduction: -This package implements a Python-based client for BEAT's web service or locally +This package provides a Python-based client for BEAT's web service or locally installed repositories. It allows users to list, validate, edit, download and -upload objects from remote BEAT instances. It dubs as a Python-client API for -packages required to implement more advanced functionality than already -available on the currently avaiable tool, called `beat`, as we shall see next. +upload objects from remote BEAT instances, as well as running BEAT experiments +locally. It also doubles as a Python-client API for packages that need to +implement more advanced functionality than this client (`beat`) provides. .. toctree:: @@ -43,7 +43,7 @@ available on the currently avaiable tool, called `beat`, as we shall see next. toolchains experiments databases - api/beat.cmdline + walkthrough Indices and tables ================== diff --git a/doc/introduction.rst b/doc/introduction.rst index 7e614b2426474a09be49cb001e7504652e9c1d7d..01d4f632427b17d7566f44cd45c1067e428dbf6a 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -44,19 +44,19 @@ typical directory structure in a prefix directory: :cwd: .. Each of the subdirectories in the prefix keeps only objects of a given type. -For example, the ``dataformats`` subdirectory keeps only data format objects +For example, the ``dataformats`` subdirectory keeps only data format objects, and so on. Inside each subdirectory, the user will find an organization that resembles the naming convention of objects in the BEAT platform. For example, you'd be able to find the data format ``my_dataformat``, belonging to user ``user``, version ``1``, under the directory ``/dataformats/user/my_dataformat/1``. Objects are described by a JSON -file, an option full-length description in reStructuredText format and, +file, an optional full-length description in reStructuredText format and, depending on the object type, a program file containing user routines programmed in one of the supported languages. The ``beat`` command-line utility bridges user interaction with a remote BEAT web platform and locally available objects in a seamless way. The program is -normally available on your work environment: +normally available in the Idiap work environment: .. command-output:: ./bin/beat --help :cwd: .. diff --git a/doc/toolchains.rst b/doc/toolchains.rst index 86399208998c8c8793bf43c39ac3446327b39a2b..2674c84274ba85f62af8328707e86ee6e6620bb9 100644 --- a/doc/toolchains.rst +++ b/doc/toolchains.rst @@ -30,7 +30,7 @@ The commands available for toolchains are: :cwd: .. -.. _beat-core-toolchains-checkscript: +.. _beat-cmdline-toolchains-checkscript: How to check that a toolchain is correctly declared? .................................................... diff --git a/doc/walkthrough.rst b/doc/walkthrough.rst new file mode 100644 index 0000000000000000000000000000000000000000..398a73fd99fa96a19bbf82d63e89f4b30006aad3 --- /dev/null +++ b/doc/walkthrough.rst @@ -0,0 +1,101 @@ +.. vim: set fileencoding=utf-8 : + +.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. +.. Contact: beat.support@idiap.ch .. +.. .. +.. This file is part of the beat.cmdline module of the BEAT platform. .. +.. .. +.. Commercial License Usage .. +.. Licensees holding valid commercial BEAT licenses may use this file in .. +.. accordance with the terms contained in a written agreement between you .. +.. and Idiap. For further information contact tto@idiap.ch .. +.. .. +.. 
Alternatively, this file may be used under the terms of the GNU Affero ..
+.. Public License version 3 as published by the Free Software and appearing ..
+.. in the file LICENSE.AGPL included in the packaging of this file. ..
+.. The BEAT platform is distributed in the hope that it will be useful, but ..
+.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ..
+.. or FITNESS FOR A PARTICULAR PURPOSE. ..
+.. ..
+.. You should have received a copy of the GNU Affero Public License along ..
+.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. ..
+
+
+.. _beat-cmdline-walkthrough:
+
+Walkthrough
+-----------
+
+This page gives an overview of setting up an environment to develop BEAT
+experiments locally. The topics covered are the following:
+
+- Preparing a proper Python environment
+- Bootstrapping ``beat.cmdline``
+- Running the `BEAT web platform `_
+  locally
+- Detailing an example workflow for BEAT development
+
+Tips for preparing a proper environment
+.......................................
+
+- Using a `Conda `_ environment is recommended.
+- ``beat.cmdline`` requires Python 2.7 and will not work on Python 3.x.
+- It uses `Buildout `_ (commonly known as `zc.buildout`) to
+  bootstrap. This package is also available on Conda.
+- The Python 2.7 environment used for bootstrapping will also be the
+  environment used to execute BEAT code when using the local executor.
+- While you can bootstrap and use ``beat.cmdline`` without having a local
+  BEAT instance, ``beat`` can't yet create BEAT objects (experiments, etc.)
+  from scratch. We recommend you instead modify existing BEAT objects.
+- Running experiments using the Docker executor requires Docker to be
+  set up locally, along with the required Docker images.
+
+Bootstrapping ``beat.cmdline``
+..............................
+
+Thanks to Buildout, setup is simple:
+
+- Clone the ``beat.cmdline`` repository.
+- Using a compatible environment (see above), run ``buildout`` in the project's
+  root directory. Assuming this step is successful, the ``beat`` tool is now
+  functional.
+- To build this documentation locally, use the ``sphinx-build`` tool in
+  ``bin/``: ``./bin/sphinx-build doc/ sphinx/`` to output to ``sphinx/``.
+
+Running the BEAT Platform locally
+.................................
+
+- Build
+  `the BEAT web environment `_
+  locally.
+- Build ``beat.web`` using the Python environment built from ``beat.env.web``.
+- Install the example environment in ``beat.web`` via ``./bin/django install``.
+- Run ``./bin/django runserver`` and go to ``localhost:8000/`` to see if it
+  works. The default example login is ``user`` with password ``user``.
+
+An example workflow
+...................
+
+First, have ``beat.web`` running locally. To interact with the local instance
+through ``beat.cmdline``, set the ``platform`` config option through ``beat``
+to ``http://localhost:8000/``:
+``beat config set platform 'http://localhost:8000/'``.
+
+Pull example experiments from the platform via the ``beat`` tool:
+``beat exp pull user/single/1``. Run the example experiment using the
+local executor to make sure everything works:
+``beat exp run user/single/1/single``.
+
+Fork the ``single`` experiment, using the same ``user/single/1`` toolchain:
+``beat exp fork user/single/1/single user/single/1/single_test``.
+Run the new experiment to make sure the fork works:
+``beat exp run user/single/1/single_test``.
+Feel free to edit the ``single_test`` JSON file to change the experiment.
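+
+Put together, a typical local-executor session might look like the following
+sketch (it assumes the fork name used above and a local platform instance at
+``localhost:8000``):
+
+.. code-block:: sh
+
+   $ beat config set platform 'http://localhost:8000/'
+   $ beat exp pull user/single/1
+   $ beat exp run user/single/1/single
+   $ beat exp fork user/single/1/single user/single/1/single_test
+   $ beat exp run user/single/1/single_test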
+
+Likewise, you can fork other BEAT objects (algorithms, databases, toolchains,
+etc.).
+
+To see if your new BEAT objects will run on the BEAT platform, try running
+them with the Docker executor (which will use the BEAT Docker images it can
+find): ``beat exp run --docker user/single/1/single_test``.
+
+If it works, you can push it to your local BEAT platform instance:
+``beat exp push``.
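+
+In command form, this final check and upload would be (the experiment name
+assumes the fork created earlier):
+
+.. code-block:: sh
+
+   $ beat exp run --docker user/single/1/single_test
+   $ beat exp push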