From dd82a333fe3df1d2b1525f8dc9111e440ce80962 Mon Sep 17 00:00:00 2001 From: Jaden Date: Fri, 4 Aug 2017 14:58:00 +0200 Subject: [PATCH 01/27] ignore beat cmdline caches & local data --- .gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index ce80e6f..8b117ea 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,11 @@ opsnr.stt .DS_Store src/ html/ +.beat/ +algorithms/ +cache/ +databases/ +dataformats/ +experiments/ +libraries/ +toolchains/ -- GitLab From 0c85a5f9ee72eeb86ea9b1ca6fa9d7708ed4afd5 Mon Sep 17 00:00:00 2001 From: Jaden Date: Fri, 4 Aug 2017 14:58:40 +0200 Subject: [PATCH 02/27] got beat.core Executor working in beat.cmdline package --- beat/cmdline/experiments.py | 3 +- beat/cmdline/local_execution.py | 595 ++++++++++++++++++++++++++++++++ 2 files changed, 597 insertions(+), 1 deletion(-) create mode 100755 beat/cmdline/local_execution.py diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index 5167f6b..ab065f7 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -80,7 +80,8 @@ import simplejson from . import common from beat.core.experiment import Experiment -from beat.core.execution import Executor +#from beat.core.execution import Executor +from .local_execution import Executor from beat.core.utils import NumpyJSONEncoder from beat.core.data import CachedDataSource, load_data_index from beat.core.dock import Host diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py new file mode 100755 index 0000000..da47dc6 --- /dev/null +++ b/beat/cmdline/local_execution.py @@ -0,0 +1,595 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : + +############################################################################### +# # +# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ # +# Contact: beat.support@idiap.ch # +# # +# This file is part of the beat.core module of the BEAT platform. # +# # +# Commercial License Usage # +# Licensees holding valid commercial BEAT licenses may use this file in # +# accordance with the terms contained in a written agreement between you # +# and Idiap. For further information contact tto@idiap.ch # +# # +# Alternatively, this file may be used under the terms of the GNU Affero # +# Public License version 3 as published by the Free Software and appearing # +# in the file LICENSE.AGPL included in the packaging of this file. # +# The BEAT platform is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # +# or FITNESS FOR A PARTICULAR PURPOSE. # +# # +# You should have received a copy of the GNU Affero Public License along # +# with the BEAT platform. If not, see http://www.gnu.org/licenses/. # +# # +############################################################################### + + +'''Execution utilities''' + +import os +import sys +import glob +import errno +import tempfile +import subprocess +import zmq.green as zmq + +import logging +logger = logging.getLogger(__name__) + +import simplejson + +from beat.core import schema +from beat.core import database +from beat.core import algorithm +from beat.core import inputs +from beat.core import outputs +from beat.core import data +from beat.core import stats +from beat.core import agent +from beat.core import dock + + +class Executor(object): + """Executors runs the code given an execution block information, externally + + + Parameters: + + prefix (str): Establishes the prefix of your installation. 
+
+    data (dict, str): The piece of data representing the block to be
+      executed. It must validate against the schema defined for execution
+      blocks. If a string is passed, it is supposed to be a fully qualified
+      absolute path to a JSON file containing the block execution
+      information.
+
+    cache (str, optional): If your cache is not located under
+      ``<prefix>/cache``, then specify a full path here. It will be used
+      instead.
+
+    dataformat_cache (dict, optional): A dictionary mapping dataformat names
+      to loaded dataformats. This parameter is optional and, if passed, may
+      greatly speed-up database loading times as dataformats that are already
+      loaded may be re-used. If you use this parameter, you must guarantee
+      that the cache is refreshed as appropriate in case the underlying
+      dataformats change.
+
+    database_cache (dict, optional): A dictionary mapping database names to
+      loaded databases. This parameter is optional and, if passed, may
+      greatly speed-up database loading times as databases that are already
+      loaded may be re-used. If you use this parameter, you must guarantee
+      that the cache is refreshed as appropriate in case the underlying
+      databases change.
+
+    algorithm_cache (dict, optional): A dictionary mapping algorithm names to
+      loaded algorithms. This parameter is optional and, if passed, may
+      greatly speed-up algorithm loading times as algorithms that are already
+      loaded may be re-used. If you use this parameter, you must guarantee
+      that the cache is refreshed as appropriate in case the underlying
+      algorithms change.
+
+    library_cache (dict, optional): A dictionary mapping library names to
+      loaded libraries. This parameter is optional and, if passed, may
+      greatly speed-up library loading times as libraries that are already
+      loaded may be re-used. If you use this parameter, you must guarantee
+      that the cache is refreshed as appropriate in case the underlying
+      libraries change.
+
+
+  Attributes:
+
+    cache (str): The path to the cache currently being used.
+
+    errors (list): A list containing errors found while loading this
+      execution block.
+
+    data (dict): The original data for this executor, as loaded by our JSON
+      decoder.
+
+    algorithm (beat.core.algorithm.Algorithm): An object representing the
+      algorithm to be run.
+
+    databases (dict): A dictionary in which keys are strings with database
+      names and values are :py:class:`database.Database`, representing the
+      databases required for running this block. The dictionary may be empty
+      in case all inputs are taken from the file cache.
+
+    views (dict): A dictionary in which the keys are tuples pointing to the
+      ``(<database-name>, <protocol>, <set>)`` and the value is a setup view
+      for that particular combination of details. The dictionary may be
+      empty in case all inputs are taken from the file cache.
+
+    input_list (beat.core.inputs.InputList): A list of inputs that will be
+      served to the algorithm.
+
+    output_list (beat.core.outputs.OutputList): A list of outputs that the
+      algorithm will produce.
+
+    data_sources (list): A list with all data-sources created by our
+      execution loader.
+
+    data_sinks (list): A list with all data-sinks created by our execution
+      loader. These are useful for clean-up actions in case of problems.
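+
+
+  Example:
+
+    A minimal usage sketch, mirroring how ``beat/cmdline/experiments.py``
+    drives this class (the prefix and block-data paths below are
+    hypothetical):
+
+    .. code-block:: python
+
+       from beat.core.dock import Host
+       from beat.cmdline.local_execution import Executor
+
+       host = Host()
+       host.setup(raise_on_errors=False)
+
+       executor = Executor('/path/to/prefix', '/path/to/block.json')
+       with executor:
+         result = executor.process(host)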
+ + """ + + def __init__(self, prefix, data, cache=None, dataformat_cache=None, + database_cache=None, algorithm_cache=None, library_cache=None): + + self.prefix = prefix + self.cache = cache or os.path.join(self.prefix, 'cache') + + # check cache - halt if required + if not os.path.exists(self.cache): + raise IOError("Cache path `%s' does not exist" % self.cache) + + # some attributes + self.algorithm = None + self.databases = {} + self.views = {} + self.input_list = None + self.output_list = None + self.data_sinks = [] + self.data_sources = [] + + # runs validation if required + self.errors = [] + self.data = data + + # temporary caches, if the user has not set them, for performance + database_cache = database_cache if database_cache is not None else {} + dataformat_cache = dataformat_cache if dataformat_cache is not None else {} + algorithm_cache = algorithm_cache if algorithm_cache is not None else {} + library_cache = library_cache if library_cache is not None else {} + + self._load(data, dataformat_cache, algorithm_cache, database_cache, + library_cache) + + self.agent = None + + + def _load(self, data, dataformat_cache, algorithm_cache, database_cache, + library_cache): + """Loads the block execution information""" + + # reset + self.data = None + self.errors = [] + self.algorithm = None + self.databases = {} + self.views = {} + self.input_list = None + self.output_list = None + self.data_sinks = [] + self.data_sources = [] + self.db_address = None + + if not isinstance(data, dict): #user has passed a file pointer + if not os.path.exists(data): + self.errors.append('File not found: %s' % data) + return + + # this runs basic validation, including JSON loading if required + self.data, self.errors = schema.validate('execution', data) + if self.errors: return #don't proceed with the rest of validation + + # at this point, the execution information is loaded, must now go on and + # load the algorithm code. + if self.data['algorithm'] in algorithm_cache: #reuse + self.algorithm = algorithm_cache[self.data['algorithm']] + else: #load it, use dataformat cache if possible + self.algorithm = algorithm.Algorithm(self.prefix, + self.data['algorithm'], dataformat_cache, library_cache) + algorithm_cache[self.algorithm.name] = self.algorithm + + if not self.algorithm.valid: + self.errors += self.algorithm.errors + return #don't proceed if algorithm is bogus! 
+ + # load databases (if any is required) + for name, details in self.data['inputs'].items(): + if 'database' in details: + + if details['database'] not in self.databases: + + if details['database'] in database_cache: #reuse + db = database_cache[details['database']] + else: #load it + db = database.Database(self.prefix, details['database'], + dataformat_cache) + database_cache[db.name] = db + + self.databases[details['database']] = db + + if not db.valid: + self.errors += db.errors + + + def __enter__(self): + """Prepares inputs and outputs for the processing task + + Raises: + + IOError: in case something cannot be properly setup + + """ + + if len(self.databases) > 0: + host = dock.Host() + self.context = zmq.Context() + self.db_socket = self.context.socket(zmq.PAIR) + self.db_address = 'tcp://' + host.ip + port = self.db_socket.bind_to_random_port(self.db_address) + self.db_address += ':%d' % port + + self._prepare_inputs() + self._prepare_outputs() + + self.agent = None + + return self + + + def __exit__(self, exc_type, exc_value, traceback): + """Closes all sinks and disconnects inputs and outputs + """ + + for sink in self.data_sinks: + # we save the output only if no valid error has been thrown + # n.b.: a system exit will raise SystemExit which is not an Exception + if not isinstance(exc_type, Exception): sink.close() + sink.reset() + + + self.input_list = None + self.output_list = None + self.data_sinks = [] + self.data_sources = [] + self.agent = None + + + def _prepare_inputs(self): + """Prepares all input required by the execution.""" + + self.input_list = inputs.InputList() + + # This is used for parallelization purposes + start_index, end_index = self.data.get('range', (None, None)) + + for name, details in self.data['inputs'].items(): + + if 'database' in details: #it is a dataset input + + # create the remote input + db = self.databases[details['database']] + + dataformat_name = db.set(details['protocol'], details['set'])['outputs'][details['output']] + input = inputs.RemoteInput(name, db.dataformats[dataformat_name], self.db_socket) + + # Synchronization bits + group = self.input_list.group(details['channel']) + if group is None: + group = inputs.InputGroup( + details['channel'], + synchronization_listener=outputs.SynchronizationListener(), + restricted_access=(details['channel'] == self.data['channel']) + ) + self.input_list.add(group) + + group.add(input) + + else: + + data_source = data.CachedDataSource() + self.data_sources.append(data_source) + if details['channel'] == self.data['channel']: #synchronized + status = data_source.setup( + filename=os.path.join(self.cache, details['path'] + '.data'), + prefix=self.prefix, + force_start_index=start_index, + force_end_index=end_index, + ) + else: + status = data_source.setup( + filename=os.path.join(self.cache, details['path'] + '.data'), + prefix=self.prefix, + ) + + if not status: + raise IOError("cannot load cache file `%s'" % details['path']) + + input = inputs.Input(name, self.algorithm.input_map[name], data_source) + + # Synchronization bits + group = self.input_list.group(details['channel']) + if group is None: + group = inputs.InputGroup( + details['channel'], + synchronization_listener=outputs.SynchronizationListener(), + restricted_access=(details['channel'] == self.data['channel']) + ) + self.input_list.add(group) + + group.add(input) + + + def _prepare_outputs(self): + """Prepares all output required by the execution.""" + + self.output_list = outputs.OutputList() + + # This is used for parallelization purposes 
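+    # (an optional 'range' entry in the block data, when present, restricts
+    # which data indices this executor will process)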
+ start_index, end_index = self.data.get('range', (None, None)) + + if 'outputs' in self.data: #it is a normal block (not analyzer) + + for name, details in self.data['outputs'].items(): + + path = os.path.join(self.cache, details['path'] + '.data') + dirname = os.path.dirname(path) + # Make sure that the directory exists while taking care of race + # conditions. see: http://stackoverflow.com/questions/273192/check-if-a-directory-exists-and-create-it-if-necessary + try: + if (len(dirname) > 0): + os.makedirs(dirname) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + data_sink = data.CachedDataSink() + self.data_sinks.append(data_sink) + status = data_sink.setup( + filename=path, + dataformat=self.algorithm.dataformats[self.algorithm.output_map[name]], + encoding='binary', + max_size=0, #in bytes, for individual file chunks + ) + if not status: + raise IOError("cannot create cache sink `%s'" % details['path']) + + input_group = self.input_list.group(details['channel']) + if (input_group is None) or not hasattr(input_group, 'synchronization_listener'): + synchronization_listener = None + else: + synchronization_listener = input_group.synchronization_listener + + self.output_list.add(outputs.Output(name, data_sink, + synchronization_listener=synchronization_listener, + force_start_index=start_index or 0) + ) + + else: #it is an analyzer + + name = 'result' + details = self.data[name] + path = os.path.join(self.cache, details['path'] + '.data') + dirname = os.path.dirname(path) + # Make sure that the directory exists while taking care of race + # conditions. see: http://stackoverflow.com/questions/273192/check-if-a-directory-exists-and-create-it-if-necessary + try: + if (len(dirname) > 0): + os.makedirs(dirname) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + data_sink = data.CachedDataSink() + self.data_sinks.append(data_sink) + status = data_sink.setup( + filename=path, + dataformat=self.algorithm.result_dataformat(), + encoding='binary', + ) + if not status: + raise IOError("cannot create cache sink `%s'" % details['path']) + + self.output_list.add(outputs.Output(name, data_sink, + force_start_index=start_index or 0)) + + + def process(self, host, virtual_memory_in_megabytes=0, + max_cpu_percent=0, timeout_in_minutes=0, daemon=0): + """Executes the user algorithm code using an external program. + + If ``executable`` is set, then execute the process using an external + program, else, uses the python application living by the side of this + installation (if one is found). + + The execution interface follows the backend API as described in our + documentation. + + We use green subprocesses this implementation. Each co-process is linked + to us via 2 uni-directional pipes which work as datain and dataout + end-points. The parent process (i.e. the current one) establishes the + connection to the child and then can pass/receive commands, data and logs. + + Usage of the data pipes (datain, dataout) is **synchronous** - you send a + command and block for an answer. The co-process is normally controlled by + the current process, except for data requests, which are user-code driven. + The nature of our problem does not require an *asynchronous* implementation + which, in turn, would require a much more complex set of dependencies (on + asyncio or Twisted for example). + + + Parameters: + + host (:py:class:Host): A configured docker host that will execute the + user process. 
If the host does not have access to the required + environment, an exception will be raised. + + virtual_memory_in_megabytes (int, Optional): The amount of virtual memory + (in Megabytes) available for the job. If set to zero, no limit will be + applied. + + max_cpu_percent (int, Optional): The maximum amount of CPU usage allowed + in a system. This number must be an integer number between 0 and + ``100*number_of_cores`` in your system. For instance, if your system + has 2 cores, this number can go between 0 and 200. If it is <= 0, then + we don't track CPU usage. + + timeout_in_minutes (int): The number of minutes to wait for the user + process to execute. After this amount of time, the user process is + killed with :py:attr:`signal.SIGKILL`. If set to zero, no timeout will + be applied. + + daemon (int): If this variable is set, then we don't really start the + user process, but just kick out 0MQ server, print the command-line and + sleep for that many seconds. You're supposed to start the client by + hand then and debug it. + + + Returns: + + dict: A dictionary which is JSON formattable containing the summary of + this block execution. + + """ + + if not self.valid: + raise RuntimeError("execution information is bogus:\n * %s" % \ + '\n * '.join(self.errors)) + + with agent.Agent(virtual_memory_in_megabytes, max_cpu_percent) as runner: + + self.agent = runner + + #synchronous call - always returns after a certain timeout + retval = runner.run(self, host, timeout_in_minutes=timeout_in_minutes, + daemon=daemon, db_address=self.db_address) + + #adds I/O statistics from the current executor, if its complete already + #otherwise, it means the running process went bananas, ignore it ;-) + if 'statistics' in retval: + if 'data' in retval['statistics']: + retval['statistics']['data'].update(self.io_statistics) + else: + logger.warn("cannot find 'data' entry on returned stats, " \ + "therefore not appending I/O info either") + + return retval + + + @property + def valid(self): + """A boolean that indicates if this executor is valid or not""" + + return not bool(self.errors) + + + @property + def analysis(self): + """A boolean that indicates if the current block is an analysis block""" + return 'result' in self.data + + + @property + def outputs_exist(self): + """Returns ``True`` if outputs this block is supposed to produce exists.""" + + if self.analysis: + path = os.path.join(self.cache, self.data['result']['path']) + '*' + if not glob.glob(path): return False + + else: + for name, details in self.data['outputs'].items(): + path = os.path.join(self.cache, details['path']) + '*' + if not glob.glob(path): return False + + # if you get to this point all outputs already exist + return True + + + @property + def io_statistics(self): + """Summarize current I/O statistics looking at data sources and sinks + + Returns: + + dict: A dictionary summarizing current I/O statistics, read from our + sinks, sources, inputs and outputs. 
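+
+      A sketch of the typical layout of this dictionary (illustrative only;
+      the exact keys are produced by
+      :py:func:`beat.core.stats.io_statistics` and are the ones consumed by
+      ``beat/cmdline/experiments.py``):
+
+      .. code-block:: python
+
+         {
+           'volume':  {'read': ..., 'write': ...},  # bytes
+           'time':    {'read': ..., 'write': ...},  # seconds
+           'network': {'wait_time': ...},           # seconds
+         }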
+ """ + + is_analyzer = 'outputs' not in self.data + return stats.io_statistics(self.data_sources, self.input_list, self.data_sinks, self.output_list, self.data, is_analyzer) + + + def __str__(self): + + return simplejson.dumps(self.data, indent=4) + + + def write(self, path): + """Writes contents to precise filesystem location""" + + with open(path, 'wt') as f: f.write(str(self)) + + + def dump_runner_configuration(self, directory): + """Exports contents useful for a backend runner to run the algorithm""" + + data = { + 'algorithm': self.data['algorithm'], + 'parameters': self.data['parameters'], + } + + data['inputs'] = \ + dict([(k, v['channel']) for k,v in self.data['inputs'].items()]) + + if 'outputs' in self.data: + data['outputs'] = \ + dict([(k, v['channel']) for k,v in self.data['outputs'].items()]) + else: + data['result'] = self.data['channel'] + + data['channel'] = self.data['channel'] + + with open(os.path.join(directory, 'configuration.json'), 'wb') as f: + simplejson.dump(data, f, indent=2) + + tmp_prefix = os.path.join(directory, 'prefix') + if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) + + self.algorithm.export(tmp_prefix) + + + def dump_databases_provider_configuration(self, directory): + """Exports contents useful for a backend runner to run the algorithm""" + + with open(os.path.join(directory, 'configuration.json'), 'wb') as f: + simplejson.dump(self.data, f, indent=2) + + tmp_prefix = os.path.join(directory, 'prefix') + if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) + + for db in self.databases.values(): + db.export(tmp_prefix) + + + def kill(self): + """Stops the user process by force - to be called from signal handlers""" + + if self.agent is not None: + self.agent.kill() + return True + return False -- GitLab From 97b6ceb092ee664e04ab99a58fbf575b16e8bf31 Mon Sep 17 00:00:00 2001 From: Jaden Date: Mon, 7 Aug 2017 16:18:52 +0200 Subject: [PATCH 03/27] local execution of the sample database works --- beat/cmdline/local_execution.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index da47dc6..abe4761 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -285,7 +285,24 @@ class Executor(object): db = self.databases[details['database']] dataformat_name = db.set(details['protocol'], details['set'])['outputs'][details['output']] - input = inputs.RemoteInput(name, db.dataformats[dataformat_name], self.db_socket) + + # Get the relevant data for the requested view + view_key = (details['database'], details['protocol'], details['set']) + # create the view + v = db.view(view_key[1], view_key[2]) + # setup + v.setup() + v.prepare_outputs() + # TODO: this step should probably be integrated into View creation itself + v.obj.outputs = v.outputs + # Use the database view as an in-memory data source + v_data_source = data.MemoryDataSource(v.done, next_callback=v.next) + v_output = v.outputs[details['output']] + # Output the data from the view + v_output.data_sink.data_sources.append(v_data_source) + # Create a new local input + input = inputs.Input(name, db.dataformats[dataformat_name], v_data_source) + #input = inputs.RemoteInput(name, db.dataformats[dataformat_name], self.db_socket) # Synchronization bits group = self.input_list.group(details['channel']) -- GitLab From 473c5f358bd727ea37baf75ad199c2c1bc4829cd Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 16 Aug 2017 11:11:47 +0200 Subject: [PATCH 04/27] using local 
repos for beat.* deps --- buildout.cfg | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/buildout.cfg b/buildout.cfg index e25f693..2a070d9 100644 --- a/buildout.cfg +++ b/buildout.cfg @@ -6,15 +6,16 @@ parts = scripts extensions = mr.developer auto-checkout = * -develop = . +develop = . ../beat.backend.python ../beat.core newest = false eggs = beat.cmdline beat.backend.python + beat.core ipdb [sources] -beat.core = git https://gitlab.idiap.ch/beat/beat.core -beat.backend.python = git https://gitlab.idiap.ch/beat/beat.backend.python +#beat.core = git https://gitlab.idiap.ch/beat/beat.core +#beat.backend.python = git https://gitlab.idiap.ch/beat/beat.backend.python [scripts] recipe = bob.buildout:scripts -- GitLab From 0a6317a501840dcb5f731d3fbbebaf2e0ae7d652 Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 16 Aug 2017 12:15:17 +0200 Subject: [PATCH 05/27] entire experiment running locally! --- beat/cmdline/experiments.py | 70 ++++++++++++++++----------------- beat/cmdline/local_execution.py | 57 ++++++++++++++++++++------- 2 files changed, 78 insertions(+), 49 deletions(-) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index ab065f7..d760d04 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -204,48 +204,48 @@ def run_experiment(configuration, name, force): with executor: result = executor.process(host) - if result['status'] != 0: - logger.error("Block did not execute properly - outputs were reset") - logger.error(" Standard output:\n%s", reindent(result['stdout'], 4)) - logger.error(" Standard error:\n%s", reindent(result['stderr'], 4)) - logger.error(" Captured user error:\n%s", - reindent(result['user_error'], 4)) - logger.error(" Captured system error:\n%s", - reindent(result['system_error'], 4)) - return 1 - - logger.extra(" Environment: %s" % 'default environment') + #if result['status'] != 0: + # logger.error("Block did not execute properly - outputs were reset") + # logger.error(" Standard output:\n%s", reindent(result['stdout'], 4)) + # logger.error(" Standard error:\n%s", reindent(result['stderr'], 4)) + # logger.error(" Captured user error:\n%s", + # reindent(result['user_error'], 4)) + # logger.error(" Captured system error:\n%s", + # reindent(result['system_error'], 4)) + # return 1 + + print(" Environment: %s" % 'default environment') if executor.analysis: data = load_result(executor) r = reindent(simplejson.dumps(data.as_dict(), indent=2, cls=NumpyJSONEncoder), 2) - logger.info(" Results:\n%s", r) - - stats = result['statistics'] - logger.extra(" CPU time (user, system, total, percent): %s, %s, %s, %d%%", - simplify_time(stats['cpu']['user']), - simplify_time(stats['cpu']['system']), - simplify_time(stats['cpu']['total']), - 100. * (stats['cpu']['user'] + stats['cpu']['system']) / stats['cpu']['total'], - ) - logger.extra(" Memory usage: %s", - simplify_size(stats['memory']['rss'])) - logger.extra(" Cached input read: %s, %s", - simplify_time(stats['data']['time']['read']), - simplify_size(stats['data']['volume']['read'])) - logger.extra(" Cached output write: %s, %s", - simplify_time(stats['data']['time']['write']), - simplify_size(stats['data']['volume']['write'])) - logger.extra(" Communication time: %s (%d%%)", - simplify_time(stats['data']['network']['wait_time']), - 100. 
* stats['data']['network']['wait_time'] / stats['cpu']['total']) - - logger.extra(" Outputs produced:") + print(" Results:\n%s" % r) + + #stats = result['statistics'] + #logger.extra(" CPU time (user, system, total, percent): %s, %s, %s, %d%%", + # simplify_time(stats['cpu']['user']), + # simplify_time(stats['cpu']['system']), + # simplify_time(stats['cpu']['total']), + # 100. * (stats['cpu']['user'] + stats['cpu']['system']) / stats['cpu']['total'], + # ) + #logger.extra(" Memory usage: %s", + # simplify_size(stats['memory']['rss'])) + #logger.extra(" Cached input read: %s, %s", + # simplify_time(stats['data']['time']['read']), + # simplify_size(stats['data']['volume']['read'])) + #logger.extra(" Cached output write: %s, %s", + # simplify_time(stats['data']['time']['write']), + # simplify_size(stats['data']['volume']['write'])) + #logger.extra(" Communication time: %s (%d%%)", + # simplify_time(stats['data']['network']['wait_time']), + # 100. * stats['data']['network']['wait_time'] / stats['cpu']['total']) + + print(" Outputs produced:") if executor.analysis: - logger.extra(" * %s", executor.data['result']['path']) + print(" * %s" % executor.data['result']['path']) else: for name, details in executor.data['outputs'].items(): - logger.extra(" * %s", details['path']) + print(" * %s" % details['path']) return 0 diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index abe4761..19c14dc 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -35,6 +35,7 @@ import errno import tempfile import subprocess import zmq.green as zmq +import time import logging logger = logging.getLogger(__name__) @@ -487,24 +488,52 @@ class Executor(object): raise RuntimeError("execution information is bogus:\n * %s" % \ '\n * '.join(self.errors)) - with agent.Agent(virtual_memory_in_megabytes, max_cpu_percent) as runner: + import ipdb; ipdb.set_trace() - self.agent = runner + self.runner = self.algorithm.runner() + retval = self.runner.setup(self.data['parameters']) - #synchronous call - always returns after a certain timeout - retval = runner.run(self, host, timeout_in_minutes=timeout_in_minutes, - daemon=daemon, db_address=self.db_address) + if not self.input_list or not self.output_list: + raise RuntimeError("I/O for execution block has not yet been set up") - #adds I/O statistics from the current executor, if its complete already - #otherwise, it means the running process went bananas, ignore it ;-) - if 'statistics' in retval: - if 'data' in retval['statistics']: - retval['statistics']['data'].update(self.io_statistics) - else: - logger.warn("cannot find 'data' entry on returned stats, " \ - "therefore not appending I/O info either") + using_output = self.output_list[0] if self.analysis else self.output_list + + _start = time.time() + + while self.input_list.hasMoreData(): + main_group = self.input_list.main_group + main_group.restricted_access = False + main_group.next() + main_group.restricted_access = True + if not self.runner.process(self.input_list, using_output): return False + + missing_data_outputs = [x for x in self.output_list if x.isDataMissing()] + + proc_time = time.time() - _start + + if missing_data_outputs: + raise RuntimeError("Missing data on the following output(s): %s" % \ + ', '.join([x.name for x in missing_data_outputs])) + + # some local information + logger.debug("Total processing time was %.3f seconds" , proc_time) + + #with agent.Agent(virtual_memory_in_megabytes, max_cpu_percent) as runner: + # self.agent = runner + # 
#synchronous call - always returns after a certain timeout + # retval = runner.run(self, host, timeout_in_minutes=timeout_in_minutes, + # daemon=daemon, db_address=self.db_address) + + # #adds I/O statistics from the current executor, if its complete already + # #otherwise, it means the running process went bananas, ignore it ;-) + # if 'statistics' in retval: + # if 'data' in retval['statistics']: + # retval['statistics']['data'].update(self.io_statistics) + # else: + # logger.warn("cannot find 'data' entry on returned stats, " \ + # "therefore not appending I/O info either") - return retval + #return retval @property -- GitLab From 15b8c9b667adb7afd7f159164291e5c7a4e8c726 Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 16 Aug 2017 12:25:08 +0200 Subject: [PATCH 06/27] rm docker/Host references from "exp run" codepaths --- beat/cmdline/experiments.py | 18 ++---------------- beat/cmdline/local_execution.py | 21 +-------------------- 2 files changed, 3 insertions(+), 36 deletions(-) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index d760d04..a29f673 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -84,7 +84,6 @@ from beat.core.experiment import Experiment from .local_execution import Executor from beat.core.utils import NumpyJSONEncoder from beat.core.data import CachedDataSource, load_data_index -from beat.core.dock import Host def run_experiment(configuration, name, force): @@ -161,11 +160,6 @@ def run_experiment(configuration, name, force): scheduled = experiment.setup() - # load existing environments - host = Host() - host.setup(raise_on_errors=False) - environments = host.environments - # can we execute it? results = [] for key, value in scheduled.items(): @@ -173,14 +167,6 @@ def run_experiment(configuration, name, force): # checks and sets-up executable executable = None #use the default - env = value['configuration']['environment'] - search_key = '%s (%s)' % (env['name'], env['version']) - if search_key not in environments: - logger.error("Cannot execute block `%s' on environment `%s': " \ - "environment was not found' - please install it", - key, search_key) - return 1 - executor = Executor(configuration.path, value['configuration'], configuration.cache, dataformat_cache, database_cache, algorithm_cache, library_cache) @@ -202,7 +188,7 @@ def run_experiment(configuration, name, force): logger.extra(" -> using fallback (default) environment") with executor: - result = executor.process(host) + result = executor.process() #if result['status'] != 0: # logger.error("Block did not execute properly - outputs were reset") @@ -214,7 +200,7 @@ def run_experiment(configuration, name, force): # reindent(result['system_error'], 4)) # return 1 - print(" Environment: %s" % 'default environment') + print(" Environment: %s" % 'local environment') if executor.analysis: data = load_result(executor) r = reindent(simplejson.dumps(data.as_dict(), indent=2, diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index 19c14dc..7f18073 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -49,8 +49,6 @@ from beat.core import inputs from beat.core import outputs from beat.core import data from beat.core import stats -from beat.core import agent -from beat.core import dock class Executor(object): @@ -167,8 +165,6 @@ class Executor(object): self._load(data, dataformat_cache, algorithm_cache, database_cache, library_cache) - self.agent = None - def _load(self, data, dataformat_cache, algorithm_cache, 
database_cache, library_cache): @@ -236,19 +232,9 @@ class Executor(object): """ - if len(self.databases) > 0: - host = dock.Host() - self.context = zmq.Context() - self.db_socket = self.context.socket(zmq.PAIR) - self.db_address = 'tcp://' + host.ip - port = self.db_socket.bind_to_random_port(self.db_address) - self.db_address += ':%d' % port - self._prepare_inputs() self._prepare_outputs() - self.agent = None - return self @@ -267,7 +253,6 @@ class Executor(object): self.output_list = None self.data_sinks = [] self.data_sources = [] - self.agent = None def _prepare_inputs(self): @@ -426,7 +411,7 @@ class Executor(object): force_start_index=start_index or 0)) - def process(self, host, virtual_memory_in_megabytes=0, + def process(self, virtual_memory_in_megabytes=0, max_cpu_percent=0, timeout_in_minutes=0, daemon=0): """Executes the user algorithm code using an external program. @@ -452,10 +437,6 @@ class Executor(object): Parameters: - host (:py:class:Host): A configured docker host that will execute the - user process. If the host does not have access to the required - environment, an exception will be raised. - virtual_memory_in_megabytes (int, Optional): The amount of virtual memory (in Megabytes) available for the job. If set to zero, no limit will be applied. -- GitLab From 3e701e1c396f8ea17063d9b6ee36d7185c1f4e35 Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 16 Aug 2017 17:46:12 +0200 Subject: [PATCH 07/27] should actually call prepare_outputs before calling setup for view --- beat/cmdline/local_execution.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index 7f18073..7c16a17 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -277,10 +277,9 @@ class Executor(object): # create the view v = db.view(view_key[1], view_key[2]) # setup - v.setup() v.prepare_outputs() - # TODO: this step should probably be integrated into View creation itself - v.obj.outputs = v.outputs + v.setup() + #v.obj.outputs = v.outputs # Use the database view as an in-memory data source v_data_source = data.MemoryDataSource(v.done, next_callback=v.next) v_output = v.outputs[details['output']] -- GitLab From ce7c8d4d3bee4d040a5662718e06428ec388e5a1 Mon Sep 17 00:00:00 2001 From: Jaden Date: Thu, 17 Aug 2017 11:52:53 +0200 Subject: [PATCH 08/27] let user use either local or docker executor to run an exp --- beat/cmdline/experiments.py | 110 +++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 40 deletions(-) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index a29f673..5851281 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -27,7 +27,7 @@ """Usage: - %(prog)s experiments run [--force] + %(prog)s experiments run [--force] [(--docker|--local)] %(prog)s experiments caches [--list | --delete | --checksum] %(prog)s experiments list [--remote] %(prog)s experiments check []... @@ -66,6 +66,8 @@ Options: --help Display this screen --path= Use path to write files to disk (instead of the current directory) + --local Uses the local executor to execute the experiment on the local machine (default). + --docker Uses the docker executor to execute the experiment using docker containers. """ @@ -80,13 +82,14 @@ import simplejson from . 
import common from beat.core.experiment import Experiment -#from beat.core.execution import Executor -from .local_execution import Executor +from beat.core.execution import Executor as DockerExecutor +from .local_execution import Executor as LocalExecutor from beat.core.utils import NumpyJSONEncoder from beat.core.data import CachedDataSource, load_data_index +from beat.core.dock import Host -def run_experiment(configuration, name, force): +def run_experiment(configuration, name, force, use_docker, use_local): '''Run experiments locally''' def load_result(executor): @@ -160,6 +163,12 @@ def run_experiment(configuration, name, force): scheduled = experiment.setup() + if use_docker: + # load existing environments + host = Host() + host.setup(raise_on_errors=False) + environments = host.environments + # can we execute it? results = [] for key, value in scheduled.items(): @@ -167,9 +176,24 @@ def run_experiment(configuration, name, force): # checks and sets-up executable executable = None #use the default - executor = Executor(configuration.path, value['configuration'], - configuration.cache, dataformat_cache, database_cache, - algorithm_cache, library_cache) + if use_docker: + env = value['configuration']['environment'] + search_key = '%s (%s)' % (env['name'], env['version']) + if search_key not in environments: + logger.error("Cannot execute block `%s' on environment `%s': " \ + "environment was not found' - please install it", + key, search_key) + return 1 + + if use_docker: + executor = DockerExecutor(configuration.path, value['configuration'], + configuration.cache, dataformat_cache, database_cache, + algorithm_cache, library_cache) + else: + executor = LocalExecutor(configuration.path, value['configuration'], + configuration.cache, dataformat_cache, database_cache, + algorithm_cache, library_cache) + if not executor.valid: logger.error("Failed to load the execution information for `%s':", key) @@ -188,44 +212,50 @@ def run_experiment(configuration, name, force): logger.extra(" -> using fallback (default) environment") with executor: - result = executor.process() - - #if result['status'] != 0: - # logger.error("Block did not execute properly - outputs were reset") - # logger.error(" Standard output:\n%s", reindent(result['stdout'], 4)) - # logger.error(" Standard error:\n%s", reindent(result['stderr'], 4)) - # logger.error(" Captured user error:\n%s", - # reindent(result['user_error'], 4)) - # logger.error(" Captured system error:\n%s", - # reindent(result['system_error'], 4)) - # return 1 - - print(" Environment: %s" % 'local environment') + if use_docker: + result = executor.process(host) + else: + result = executor.process() + + if use_docker: + if result['status'] != 0: + logger.error("Block did not execute properly - outputs were reset") + logger.error(" Standard output:\n%s", reindent(result['stdout'], 4)) + logger.error(" Standard error:\n%s", reindent(result['stderr'], 4)) + logger.error(" Captured user error:\n%s", + reindent(result['user_error'], 4)) + logger.error(" Captured system error:\n%s", + reindent(result['system_error'], 4)) + print(" Environment: %s" % 'default environment') + return 1 + else: + stats = result['statistics'] + logger.extra(" CPU time (user, system, total, percent): %s, %s, %s, %d%%", + simplify_time(stats['cpu']['user']), + simplify_time(stats['cpu']['system']), + simplify_time(stats['cpu']['total']), + 100. 
* (stats['cpu']['user'] + stats['cpu']['system']) / stats['cpu']['total'], + ) + logger.extra(" Memory usage: %s", + simplify_size(stats['memory']['rss'])) + logger.extra(" Cached input read: %s, %s", + simplify_time(stats['data']['time']['read']), + simplify_size(stats['data']['volume']['read'])) + logger.extra(" Cached output write: %s, %s", + simplify_time(stats['data']['time']['write']), + simplify_size(stats['data']['volume']['write'])) + logger.extra(" Communication time: %s (%d%%)", + simplify_time(stats['data']['network']['wait_time']), + 100. * stats['data']['network']['wait_time'] / stats['cpu']['total']) + else: + print(" Environment: %s" % 'local environment') + if executor.analysis: data = load_result(executor) r = reindent(simplejson.dumps(data.as_dict(), indent=2, cls=NumpyJSONEncoder), 2) print(" Results:\n%s" % r) - #stats = result['statistics'] - #logger.extra(" CPU time (user, system, total, percent): %s, %s, %s, %d%%", - # simplify_time(stats['cpu']['user']), - # simplify_time(stats['cpu']['system']), - # simplify_time(stats['cpu']['total']), - # 100. * (stats['cpu']['user'] + stats['cpu']['system']) / stats['cpu']['total'], - # ) - #logger.extra(" Memory usage: %s", - # simplify_size(stats['memory']['rss'])) - #logger.extra(" Cached input read: %s, %s", - # simplify_time(stats['data']['time']['read']), - # simplify_size(stats['data']['volume']['read'])) - #logger.extra(" Cached output write: %s, %s", - # simplify_time(stats['data']['time']['write']), - # simplify_size(stats['data']['volume']['write'])) - #logger.extra(" Communication time: %s (%d%%)", - # simplify_time(stats['data']['network']['wait_time']), - # 100. * stats['data']['network']['wait_time'] / stats['cpu']['total']) - print(" Outputs produced:") if executor.analysis: print(" * %s" % executor.data['result']['path']) @@ -361,7 +391,7 @@ def pull(webapi, prefix, names, force, indentation, format_cache): def process(args): if args['run']: - return run_experiment(args['config'], args[''][0], args['--force']) + return run_experiment(args['config'], args[''][0], args['--force'], args['--docker'], args['--local']) if args['caches']: return caches(args['config'], args[''][0], args['--list'], -- GitLab From 0f9e36f3e2146348563c45d18726c692711fc9f2 Mon Sep 17 00:00:00 2001 From: Jaden Date: Thu, 17 Aug 2017 15:37:53 +0200 Subject: [PATCH 09/27] converted exp run tests to test both executors --- beat/cmdline/local_execution.py | 2 +- beat/cmdline/test/test_experiments.py | 32 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index 7c16a17..454c4a0 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -468,7 +468,7 @@ class Executor(object): raise RuntimeError("execution information is bogus:\n * %s" % \ '\n * '.join(self.errors)) - import ipdb; ipdb.set_trace() + #import ipdb; ipdb.set_trace() self.runner = self.algorithm.runner() retval = self.runner.setup(self.data['parameters']) diff --git a/beat/cmdline/test/test_experiments.py b/beat/cmdline/test/test_experiments.py index cd32661..92f2be6 100644 --- a/beat/cmdline/test/test_experiments.py +++ b/beat/cmdline/test/test_experiments.py @@ -183,20 +183,42 @@ def test_run_double_triangle_1(): @slow @nose.tools.with_setup(teardown=cleanup) -def test_run_single_error_1(): +@nose.tools.raises(NameError) +def test_run_single_error_1_local(): + # When running locally, the module with the error is loaded + # inside the currently running 
process and will raise a NameError. obj = 'user/user/single/1/single_error' - nose.tools.eq_(call('run', obj, cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--local', cache=tmp_prefix), 1) @slow @nose.tools.with_setup(teardown=cleanup) -def test_run_single_error_twice(): +def test_run_single_error_1_docker(): + # When running on docker, the module is loaded in the docker + # container and the local process will return '1'. + obj = 'user/user/single/1/single_error' + nose.tools.eq_(call('run', obj, '--docker', cache=tmp_prefix), 1) + +@slow +@nose.tools.with_setup(teardown=cleanup) +@nose.tools.raises(NameError) +def test_run_single_error_twice_local(): + # This one makes sure our output reset is working properly. Both tries should + # give out the same error. + obj = 'user/user/single/1/single_error' + nose.tools.eq_(call('run', obj, '--local', cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--local', cache=tmp_prefix), 1) + + +@slow +@nose.tools.with_setup(teardown=cleanup) +def test_run_single_error_twice_docker(): # This one makes sure our output reset is working properly. Both tries should # give out the same error. obj = 'user/user/single/1/single_error' - nose.tools.eq_(call('run', obj, cache=tmp_prefix), 1) - nose.tools.eq_(call('run', obj, cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--docker', cache=tmp_prefix), 1) + nose.tools.eq_(call('run', obj, '--docker', cache=tmp_prefix), 1) @nose.tools.with_setup(teardown=cleanup) -- GitLab From 89cb7498cd71069c17e8457d6a9f5e07d3d23f8a Mon Sep 17 00:00:00 2001 From: Jaden Date: Fri, 18 Aug 2017 12:07:47 +0200 Subject: [PATCH 10/27] allow override db root folder locally --- beat/cmdline/config.py | 33 +++++++++++++++++++++++++-------- beat/cmdline/experiments.py | 2 +- beat/cmdline/local_execution.py | 18 +++++++++++++++++- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/beat/cmdline/config.py b/beat/cmdline/config.py index 3ceeb40..034d3d4 100644 --- a/beat/cmdline/config.py +++ b/beat/cmdline/config.py @@ -191,14 +191,15 @@ class Configuration(object): if not os.path.exists(c): return with open(c, 'rt') as f: user_data = simplejson.load(f) - for k in DEFAULTS: - if k in user_data: self.__data[k] = user_data[k] + for k in user_data: + if self._is_valid_key(k): self.__data[k] = user_data[k] except simplejson.JSONDecodeError: - print("WARNING: invalid state file at `%s' - removing and " \ - "re-starting..." % c) - from beat.core.utils import safe_rmfile - safe_rmfile(c) + raise + # print("WARNING: invalid state file at `%s' - removing and " \ + # "re-starting..." 
% c) + # from beat.core.utils import safe_rmfile + # safe_rmfile(c) @property @@ -210,16 +211,23 @@ class Configuration(object): return self.__data['cache'] + @property + def database_paths(self): + '''A dict of paths for databases''' + + return dict((k, self.__data[k]) for k in self.__data if self.is_database_key(k)) + + def set(self, key, value): '''Sets or resets a field in the configuration''' - if key not in DEFAULTS: + if not self._is_valid_key(key): print("ERROR: don't know about parameter `%s'" % key) sys.exit(1) if value is not None: self.__data[key] = value - else: + elif key in DEFAULTS: self.__data[key] = DEFAULTS[key] self.save() @@ -249,6 +257,11 @@ class Configuration(object): with os.fdopen(os.open(c, os.O_WRONLY | os.O_CREAT, 0600), 'wt') as f: simplejson.dump(self.__data, f, indent=4) + def _is_valid_key(self, key): + return key in DEFAULTS or self.is_database_key(key) + + def is_database_key(self, key): + return key.startswith('database/') def __str__(self): @@ -264,6 +277,10 @@ class Configuration(object): value = self.__data[key] value = "`%s'" % value if value is not None else '' retval.append(" * %-15s: %s" % (key, value)) + for key in sorted([k for k in self.__data if self.is_database_key(k)]): + value = self.__data[key] + value = "`%s'" % value if value is not None else '' + retval.append(" * %-15s: %s" % (key, value)) for key in sorted([k for k in DOC if k.startswith('color')]): value = self.__data[key] color, on_color, attrs = colorlog_to_termcolor(value) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index 5851281..57d8a2b 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -192,7 +192,7 @@ def run_experiment(configuration, name, force, use_docker, use_local): else: executor = LocalExecutor(configuration.path, value['configuration'], configuration.cache, dataformat_cache, database_cache, - algorithm_cache, library_cache) + algorithm_cache, library_cache, configuration.database_paths) if not executor.valid: diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index 454c4a0..117dc33 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -36,6 +36,7 @@ import tempfile import subprocess import zmq.green as zmq import time +import collections import logging logger = logging.getLogger(__name__) @@ -131,10 +132,15 @@ class Executor(object): data_sinks (list): A list with all data-sinks created by our execution loader. These are useful for clean-up actions in case of problems. + custom_root_folders (dict): A dictionary where the keys are database + identifiers (`/`) and the values are paths to the + given database's files. These values will override the value found + in the database's metadata. 
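+
+      A sketch of the expected format (keys follow the configuration
+      convention ``database/<name>/<version>``; the path below is
+      hypothetical, taken from the documentation in this series):
+
+      .. code-block:: python
+
+         {'database/atnt/3': '/home/user/Downloads/atnt_db'}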
+ """ def __init__(self, prefix, data, cache=None, dataformat_cache=None, - database_cache=None, algorithm_cache=None, library_cache=None): + database_cache=None, algorithm_cache=None, library_cache=None, custom_root_folders=None): self.prefix = prefix self.cache = cache or os.path.join(self.prefix, 'cache') @@ -156,6 +162,11 @@ class Executor(object): self.errors = [] self.data = data + if custom_root_folders is not None and not isinstance(custom_root_folders, collections.Mapping): + raise TypeError("The custom root folders must be in dictionary format") + + self.custom_root_folders = custom_root_folders + # temporary caches, if the user has not set them, for performance database_cache = database_cache if database_cache is not None else {} dataformat_cache = dataformat_cache if dataformat_cache is not None else {} @@ -267,8 +278,13 @@ class Executor(object): if 'database' in details: #it is a dataset input + import ipdb; ipdb.set_trace() + # create the remote input db = self.databases[details['database']] + configName = "database/%s" % db.name + if self.custom_root_folders is not None and configName in self.custom_root_folders: + db.data['root_folder'] = self.custom_root_folders[configName] dataformat_name = db.set(details['protocol'], details['set'])['outputs'][details['output']] -- GitLab From 75f141a9c76ea0cc3e21f5dd4680494c1fc6e73c Mon Sep 17 00:00:00 2001 From: Jaden Date: Mon, 21 Aug 2017 15:11:49 +0200 Subject: [PATCH 11/27] ignore nosetest test id tracking file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8b117ea..f4b226d 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ dataformats/ experiments/ libraries/ toolchains/ +.noseids -- GitLab From 5bf2fdca2c184e5daa2162b415a12489e277b072 Mon Sep 17 00:00:00 2001 From: Jaden Date: Mon, 21 Aug 2017 15:12:07 +0200 Subject: [PATCH 12/27] rm ipdb lines from local_execution --- beat/cmdline/local_execution.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/beat/cmdline/local_execution.py b/beat/cmdline/local_execution.py index 117dc33..470e6da 100755 --- a/beat/cmdline/local_execution.py +++ b/beat/cmdline/local_execution.py @@ -277,9 +277,6 @@ class Executor(object): for name, details in self.data['inputs'].items(): if 'database' in details: #it is a dataset input - - import ipdb; ipdb.set_trace() - # create the remote input db = self.databases[details['database']] configName = "database/%s" % db.name @@ -484,8 +481,6 @@ class Executor(object): raise RuntimeError("execution information is bogus:\n * %s" % \ '\n * '.join(self.errors)) - #import ipdb; ipdb.set_trace() - self.runner = self.algorithm.runner() retval = self.runner.setup(self.data['parameters']) -- GitLab From 1f93a5a925c82d8e2c4d47311ed4801cac1ff8a5 Mon Sep 17 00:00:00 2001 From: Jaden Date: Mon, 21 Aug 2017 15:13:13 +0200 Subject: [PATCH 13/27] add tests for changes to config to support database paths --- beat/cmdline/test/test_config.py | 35 ++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/beat/cmdline/test/test_config.py b/beat/cmdline/test/test_config.py index 2f633cd..108e396 100644 --- a/beat/cmdline/test/test_config.py +++ b/beat/cmdline/test/test_config.py @@ -83,6 +83,41 @@ def test_set_token(): assert contents['token'] == token_value +@nose.tools.with_setup(teardown=cleanup) +def test_set_atnt_db(): + db_config = 'database/atnt' + db_path = './atnt_db' + nose.tools.eq_(call('config', 'set', db_config, db_path), 0) + config = os.path.join(tmp_prefix, '.beat', 'config.json') + 
assert os.path.exists(config) + with open(config, 'rt') as f: contents = simplejson.load(f) + assert contents[db_config] == db_path + + +@nose.tools.with_setup(teardown=cleanup) +def test_set_get_atnt_db(): + db_config = 'database/atnt' + db_path = './atnt_db' + nose.tools.eq_(call('config', 'set', db_config, db_path), 0) + nose.tools.eq_(call('config', 'get', db_config), 0) + + +@nose.tools.with_setup(teardown=cleanup) +def test_set_bad_config_key(): + db_config = 'fail' + with assert_raises(SystemExit) as c: + call('config', 'set', db_config, db_config) + + assert c.exception.code == 1 + + +@nose.tools.with_setup(teardown=cleanup) +@nose.tools.raises(KeyError) +def test_get_bad_config_key(): + db_config = 'fail' + nose.tools.eq_(call('config', 'get', db_config), 1) + + @nose.tools.with_setup(teardown=cleanup) def test_get_token(): nose.tools.eq_(call('config', 'get', 'token'), 0) -- GitLab From e52f929618c98239a0b2820b2c4b5ced123c2573 Mon Sep 17 00:00:00 2001 From: Jaden Date: Tue, 22 Aug 2017 14:16:40 +0200 Subject: [PATCH 14/27] use logger instead of print --- beat/cmdline/experiments.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index 57d8a2b..253bd1f 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -254,14 +254,15 @@ def run_experiment(configuration, name, force, use_docker, use_local): data = load_result(executor) r = reindent(simplejson.dumps(data.as_dict(), indent=2, cls=NumpyJSONEncoder), 2) - print(" Results:\n%s" % r) + logger.info(" Results:\n%s", r) - print(" Outputs produced:") + logger.extra(" Outputs produced:") if executor.analysis: - print(" * %s" % executor.data['result']['path']) + logger.extra(" * %s", executor.data['result']['path']) else: for name, details in executor.data['outputs'].items(): - print(" * %s" % details['path']) + logger.extra(" * %s", details['path']) + return 0 -- GitLab From 79822398e93ab3f03709bf28847d24186bdb2391 Mon Sep 17 00:00:00 2001 From: Jaden Date: Tue, 22 Aug 2017 14:16:52 +0200 Subject: [PATCH 15/27] revert using local versions --- buildout.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildout.cfg b/buildout.cfg index 2a070d9..6fcdc47 100644 --- a/buildout.cfg +++ b/buildout.cfg @@ -6,7 +6,7 @@ parts = scripts extensions = mr.developer auto-checkout = * -develop = . ../beat.backend.python ../beat.core +develop = . 
newest = false eggs = beat.cmdline beat.backend.python @@ -14,8 +14,8 @@ eggs = beat.cmdline ipdb [sources] -#beat.core = git https://gitlab.idiap.ch/beat/beat.core -#beat.backend.python = git https://gitlab.idiap.ch/beat/beat.backend.python +beat.core = git https://gitlab.idiap.ch/beat/beat.core +beat.backend.python = git https://gitlab.idiap.ch/beat/beat.backend.python [scripts] recipe = bob.buildout:scripts -- GitLab From 8930f4ac7e80a0493444c7cdf826200d61e8005f Mon Sep 17 00:00:00 2001 From: Jaden Date: Tue, 22 Aug 2017 15:06:22 +0200 Subject: [PATCH 16/27] use imgmath since pmgmath is deprecated --- doc/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index e391689..1d657fa 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -46,7 +46,7 @@ needs_sphinx = '1.3' extensions = [ 'sphinx.ext.todo', 'sphinx.ext.coverage', - 'sphinx.ext.pngmath', + 'sphinx.ext.imgmath', 'sphinx.ext.ifconfig', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', @@ -69,7 +69,7 @@ autosummary_generate = True # If we are on OSX, the 'dvipng' path maybe different dvipng_osx = '/opt/local/libexec/texlive/binaries/dvipng' -if os.path.exists(dvipng_osx): pngmath_dvipng = dvipng_osx +if os.path.exists(dvipng_osx): imgmath_dvipng = dvipng_osx # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -- GitLab From e2c6759be15ba9caf52e4afe935c1fc5115deaa9 Mon Sep 17 00:00:00 2001 From: Jaden Date: Tue, 22 Aug 2017 15:07:00 +0200 Subject: [PATCH 17/27] clean up some of the doc --- doc/index.rst | 8 ++++---- doc/introduction.rst | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index fe66d2a..da671f8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -27,11 +27,11 @@ BEAT Command-line Client .. _beat-cmdline-introduction: -This package implements a Python-based client for BEAT's web service or locally +This package provides a Python-based client for BEAT's web service or locally installed repositories. It allows users to list, validate, edit, download and -upload objects from remote BEAT instances. It dubs as a Python-client API for -packages required to implement more advanced functionality than already -available on the currently avaiable tool, called `beat`, as we shall see next. +upload objects from remote BEAT instances, as well as running BEAT experiments +locally. It also doubles as a Python-client API for packages that need to +implement more advanced functionality than this client (`beat`) provides. .. toctree:: diff --git a/doc/introduction.rst b/doc/introduction.rst index 7e614b2..01d4f63 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -44,19 +44,19 @@ typical directory structure in a prefix directory: :cwd: .. Each of the subdirectories in the prefix keeps only objects of a given type. -For example, the ``dataformats`` subdirectory keeps only data format objects +For example, the ``dataformats`` subdirectory keeps only data format objects, and so on. Inside each subdirectory, the user will find an organization that resembles the naming convention of objects in the BEAT platform. For example, you'd be able to find the data format ``my_dataformat``, belonging to user ``user``, version ``1``, under the directory ``/dataformats/user/my_dataformat/1``. 
Objects are described by a JSON
-file, an option full-length description in reStructuredText format and,
+file, an optional full-length description in reStructuredText format and,
 depending on the object type, a program file containing user routines
 programmed in one of the supported languages.
 
 The ``beat`` command-line utility bridges user interaction with a remote BEAT
 web platform and locally available objects in a seamless way. The program is
-normally available on your work environment:
+normally available in the Idiap work environment:
 
 .. command-output:: ./bin/beat --help
    :cwd: ..
-- 
GitLab


From fae6e45005e6513f59a3438a2c32cfc2539f2241 Mon Sep 17 00:00:00 2001
From: Jaden
Date: Tue, 22 Aug 2017 15:07:37 +0200
Subject: [PATCH 18/27] add blurb in configuration doc page for overriding
 database locations

---
 doc/configuration.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/doc/configuration.rst b/doc/configuration.rst
index 8b403bf..e7ea78c 100644
--- a/doc/configuration.rst
+++ b/doc/configuration.rst
@@ -72,6 +72,24 @@ flag:
    ...
 
 
+When running an experiment via the ``beat`` application using the local
+executor (the default executor, also behind the ``--local`` flag), ``beat``
+will look into your configuration for any options set by the user that follow
+the format ``database/<db name>/<db version>``. ``beat`` expects that this
+option points to a string representing the path to the root folder of the
+actual database files for the given database.
+
+For example, the AT&T "Database of Faces" is available on the BEAT platform
+as the "atnt" database. The third version of the "atnt" database would be
+referenced as "atnt/3". The object "atnt/3" has a root folder defined on
+the BEAT platform already, and changing this locally would mean creating a
+new version of the database.
+Instead, you may override that path by setting the configuration option
+``database/atnt/3`` to your local path to the database files.
+Assuming your username is "user" and you extracted the database files to
+``~/Downloads/atnt_db``, you can set ``database/atnt/3`` to
+``/home/user/Downloads/atnt_db``, and ``beat`` will find the database files.
+
 You may explore different configuration options with the ``--help`` flag of
 ``beat config``:
-- 
GitLab


From 505d7d665c18504eb30f355a460467a8bb622141 Mon Sep 17 00:00:00 2001
From: Jaden
Date: Tue, 22 Aug 2017 17:05:07 +0200
Subject: [PATCH 19/27] invalid toc file ref

---
 doc/index.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/index.rst b/doc/index.rst
index da671f8..267a8b9 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -43,7 +43,6 @@ implement more advanced functionality than this client (`beat`) provides.
    toolchains
    experiments
    databases
-   api/beat.cmdline
 
 Indices and tables
 ==================
-- 
GitLab


From 826288fc735a5384f1f42af0b89ecc222b649cd5 Mon Sep 17 00:00:00 2001
From: Jaden
Date: Tue, 22 Aug 2017 17:05:40 +0200
Subject: [PATCH 20/27] rewrite exp docs and add section on executors

---
 doc/experiments.rst | 339 +++++---------------------------------------
 1 file changed, 37 insertions(+), 302 deletions(-)

diff --git a/doc/experiments.rst b/doc/experiments.rst
index f5f5661..f852d5f 100644
--- a/doc/experiments.rst
+++ b/doc/experiments.rst
@@ -47,60 +47,47 @@ The commands available for experiments are:
 
 How to run an experiment?
 .........................
 
-The ``run_toolchain.py`` script can be used to perform the experiment defined
-in a toolchain. 
It is the ideal way to debug an algorithm, since this script
-doesn't try to do any advanced trick like the Scheduler (multi-processing,
-optimizations, sandboxing, ...).
-
-For example, we execute a simple toolchain with two processing blocks (found in
-``src/beat.core/beat/core/test/toolchains/integers_addition2.json``):
-
-.. code-block:: sh
-
-   $ ./bin/run_toolchain.py --prefix=src/beat.core/beat/core/test/ integers_addition2
-   Processing block 'addition1'...
-   Algorithm: sum
-   Inputs:
-     - a (single_integer): beat/src/beat.core/beat/core/test/databases/integers/output1.data
-     - b (single_integer): beat/src/beat.core/beat/core/test/databases/integers/output2.data
-   Outputs:
-     - sum (single_integer): beat/src/beat.core/beat/core/test/cache/addition1/sum.data
-
-   Processing block 'addition2'...
-   Algorithm: sum
-   Inputs:
-     - a (single_integer): beat/src/beat.core/beat/core/test/cache/addition1/sum.data
-     - b (single_integer): beat/src/beat.core/beat/core/test/databases/integers/output3.data
-   Outputs:
-     - sum (single_integer): beat/src/beat.core/beat/core/test/cache/addition2/sum.data
-
-   DONE
-
-   Results available at:
-     - addition2.sum: beat/src/beat.core/beat/core/test/cache/addition2/sum.data
+The command ``beat experiments run <name>`` can be used to run the experiment
+defined in an experiment definition file. It is the ideal way to debug an
+experiment, since by default ``beat`` will use the local executor, which
+provides a simple environment with PDB support but without advanced features
+(multi-processing, optimizations, sandboxing, multiple environments, etc.).
 
 Here, the ``--prefix`` option is used to tell the scripts where all our data
-formats, toolchains and algorithms are located, and ``integers_addition2`` is
-the name of the toolchain we want to check (note that we don't add the
-``.json`` extension, as this is the name of the toolchain, not the filename!).
+formats, toolchains and algorithms are located. This option can be set
+in your configuration file (see ``beat config``).
 
-This script displays for each block the files containing the data to use as
+This command displays for each block the files containing the data to use as
 input, and the files generated by the outputs of the block.
 
-By default, files are generated in binary format, but you can force them to be
-in a more readable JSON format with the ``--json`` flag:
-
-.. code-block:: sh
-
-   $ ./bin/run_toolchain.py --prefix=src/beat.core/beat/core/test/ --json integers_addition2
-
 The default behavior is to not regenerate data files already present in the
 cache. You can force the script to not take the content of the cache into
-account with the ``--force`` flag:
-
-.. code-block:: sh
-
-   $ ./bin/run_toolchain.py --prefix=src/beat.core/beat/core/test/ --force integers_addition2
+account with the ``--force`` flag.
+
+Executors
+=========
+
+"Executors" are modules that execute each block in an experiment. On the BEAT
+platform, there is only one executor, which executes the experiment using
+Docker containers with advanced scheduling and security features. When
+developing using ``beat.cmdline``, however, you have the option of using either
+the BEAT platform's executor, behind the ``--docker`` flag, or the "local"
+executor, provided in this project. The local executor, as explained above, is
+much simpler, aimed at providing a smooth development experience. However,
+there are two important tradeoffs:
+
+- Lower performance for non-trivial experiments, as it runs everything
+  synchronously in one process on the CPU. 
+- No multiple environments, as the Python environment that built
+  ``beat.cmdline`` is used. This means that many BEAT experiments that
+  rely on different/multiple environments will not work.
+
+If you want to use the local executor, pay attention to the Python environment
+used to call ``buildout`` in your copy of ``beat.cmdline``. The suggested way
+to use Bob libraries while developing on the local executor is to install
+``zc.buildout`` in a Python 2.7 Conda environment with Bob installed. Using
+the ``buildout`` command from the environment will make the entire environment
+available to ``beat.cmdline`` even when the environment is not active.
 
 
 .. _beat-core-experiments-displaydata:
 
 How to examine the content of a data file?
 ..........................................
 
-The ``display_data.py`` script can be used to examine the content of a data
-file generated by the execution of a toolchain.
-
-For example, we look at the content of one of the data file used by the tests
-of beat.core (found in
-``src/beat.core/beat/core/test/data/single_integer.data``):
-
-.. code-block:: sh
+The ``beat cache`` collection of commands interacts with the cache:
 
-   $ ./bin/display_data.py --prefix=src/beat.core/beat/core/test data/single_integer_delayed.data
-   Data format: single_integer
-   ----------------------------------------------
-   Indexes: 0-1
-   {
-     "value": 0
-   }
-   ----------------------------------------------
-   Indexes: 2-3
-   {
-     "value": 1
-   }
-   ----------------------------------------------
-   Indexes: 4-5
-   {
-     "value": 2
-   }
-   ----------------------------------------------
-   Indexes: 6-7
-   {
-     "value": 3
-   }
-   ----------------------------------------------
-   Indexes: 8-9
-   {
-     "value": 4
-   }
-   ----------------------------------------------
-   Indexes: 10-11
-   {
-     "value": 5
-   }
-   ----------------------------------------------
-   Indexes: 12-13
-   {
-     "value": 6
-   }
-   ----------------------------------------------
-   Indexes: 14-15
-   {
-     "value": 7
-   }
-   ----------------------------------------------
-   Indexes: 16-17
-   {
-     "value": 8
-   }
-   ----------------------------------------------
-   Indexes: 18-19
-   {
-     "value": 9
-   }
-
-The script tells us that the data correspond to the data format
-``single_integer``, and displays each entry (with the indexes it correspond to)
-in a JSON representation.
-
-
-.. _beat-core-experiments-example:
-
-Putting it all together: a complete example
-...........................................
-
-.. _beat-core-experiments-example-figure:
-.. figure:: img/toolchain-example.*
-
-   A complete toolchain that train and test a face detector
-
-The following example describes the toolchain visible at :num:`figure
-#beat-core-toolchains-example-figure`, a complete toolchain that:
-
- #. train a face detector on one set of images (*beat_face_dataset_train*)
- #. validate it on another set of images (*beat_face_dataset_validation*)
- #. test it on a third set of images (*beat_face_dataset_test*)
-
-.. note::
-
-   This toolchain is still not considered as an executable one by the platform,
-   since it contains no mention of the algorithms that must be used in each
-   processing block.
-
-.. 
code-block:: json - - { - "databases": [ { - "name": "beat_face_dataset_train", - "outputs": { - "images": "image/rgb", - "faces": "coordinates_list" - } - }, - { - "name": "beat_face_dataset_validation", - "outputs": { - "images": "image/rgb", - "faces": "coordinates_list" - } - }, - { - "name": "beat_face_dataset_test", - "outputs": { - "images": "image/rgb", - "faces": "coordinates_list" - } - } - ], - "blocks": [{ - "name": "features_extractor_train", - "inputs": { - "images": "images/rgb" - }, - "outputs": { - "features": "array/float" - } - }, - { - "name": "face_model_builder", - "inputs": { - "features": "array/float", - "faces": "coordinates_list" - }, - "outputs": { - "model": "face_model" - } - }, - { - "name": "features_extractor_validation", - "inputs": { - "images": "images/rgb" - }, - "outputs": { - "features": "array/float" - } - }, - { - "name": "face_detector_validation", - "inputs": { - "model": "face_model", - "features": "array/float" - }, - "outputs": { - "faces": "coordinates_list" - } - }, - { - "name": "thresholder", - "inputs": { - "detected_faces": "coordinates_list", - "labelled_faces": "coordinates_list" - }, - "outputs": { - "threshold": "float" - } - }, - { - "name": "features_extractor_test", - "inputs": { - "images": "images/rgb" - }, - "outputs": { - "features": "array/float" - } - }, - { - "name": "face_detector_test", - "inputs": { - "model": "face_model", - "features": "array/float" - }, - "outputs": { - "faces": "coordinates_list" - } - }, - { - "name": "evaluator", - "inputs": { - "threshold": "float", - "detected_faces": "coordinates_list", - "labelled_faces": "coordinates_list" - }, - "outputs": { - "score": "float" - } - } - ], - "connections": [{ - "from": "beat_face_dataset_train.images", - "to": "features_extractor_train.images" - }, - { - "from": "features_extractor_train.features", - "to": "face_model_builder.features" - }, - { - "from": "beat_face_dataset_train.faces", - "to": "face_model_builder.faces" - }, - { - "from": "beat_face_dataset_validation.images", - "to": "features_extractor_validation.images" - }, - { - "from": "face_model_builder.model", - "to": "face_detector_validation.model" - }, - { - "from": "features_extractor_validation.features", - "to": "face_detector_validation.features" - }, - { - "from": "face_detector_validation.faces", - "to": "thresholder.detected_faces" - }, - { - "from": "beat_face_dataset_validation.faces", - "to": "thresholder.labelled_faces" - }, - { - "from": "beat_face_dataset_test.images", - "to": "features_extractor_test.images" - }, - { - "from": "features_extractor_test.features", - "to": "face_detector_test.features" - }, - { - "from": "face_model_builder.model", - "to": "face_detector_test.model" - }, - { - "from": "thresholder.threshold", - "to": "evaluator.threshold" - }, - { - "from": "face_detector_test.faces", - "to": "evaluator.detected_faces" - }, - { - "from": "beat_face_dataset_test.faces", - "to": "evaluator.labelled_faces" - } - ], - "results": [ - "thresholder.threshold", - "evaluator.score" - ] - } +.. command-output:: ./bin/beat cache --help + :cwd: .. -- GitLab From da470a36c2b747324fc6e9f00d789c4c5c672760 Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 23 Aug 2017 10:13:48 +0200 Subject: [PATCH 21/27] fix pkg name --- doc/experiments.rst | 6 +++--- doc/toolchains.rst | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/experiments.rst b/doc/experiments.rst index f852d5f..71966d3 100644 --- a/doc/experiments.rst +++ b/doc/experiments.rst @@ -21,7 +21,7 @@ .. 
with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. -.. _beat-core-experiments-cmdline: +.. _beat-cmdline-experiments-cmdline: Experiments ----------- @@ -42,7 +42,7 @@ The commands available for experiments are: :cwd: .. -.. _beat-core-experiments-running: +.. _beat-cmdline-experiments-running: How to run an experiment? ......................... @@ -90,7 +90,7 @@ the ``buildout`` command from the environment will make the entire environment available to ``beat.cmdline`` even when the environment is not active. -.. _beat-core-experiments-displaydata: +.. _beat-cmdline-experiments-displaydata: How to examine the content of a data file? .......................................... diff --git a/doc/toolchains.rst b/doc/toolchains.rst index 8639920..2674c84 100644 --- a/doc/toolchains.rst +++ b/doc/toolchains.rst @@ -30,7 +30,7 @@ The commands available for toolchains are: :cwd: .. -.. _beat-core-toolchains-checkscript: +.. _beat-cmdline-toolchains-checkscript: How to check that a toolchain is correctly declared? .................................................... -- GitLab From eecb0dd855bae2ca47e62ece0c9c19ad6f7048aa Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 23 Aug 2017 10:14:28 +0200 Subject: [PATCH 22/27] simplify link name --- doc/experiments.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/experiments.rst b/doc/experiments.rst index 71966d3..4ce1bfd 100644 --- a/doc/experiments.rst +++ b/doc/experiments.rst @@ -21,7 +21,7 @@ .. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. -.. _beat-cmdline-experiments-cmdline: +.. _beat-cmdline-experiments: Experiments ----------- -- GitLab From 6a0ebdc187d466cf0117ad764dfa60cf590545f4 Mon Sep 17 00:00:00 2001 From: Jaden Date: Wed, 23 Aug 2017 10:46:42 +0200 Subject: [PATCH 23/27] parts of walkthrough complete --- doc/index.rst | 1 + doc/walkthrough.rst | 66 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 doc/walkthrough.rst diff --git a/doc/index.rst b/doc/index.rst index 267a8b9..b678dcb 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -43,6 +43,7 @@ implement more advanced functionality than this client (`beat`) provides. toolchains experiments databases + walkthrough Indices and tables ================== diff --git a/doc/walkthrough.rst b/doc/walkthrough.rst new file mode 100644 index 0000000..a57361b --- /dev/null +++ b/doc/walkthrough.rst @@ -0,0 +1,66 @@ +.. vim: set fileencoding=utf-8 : + +.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ .. +.. Contact: beat.support@idiap.ch .. +.. .. +.. This file is part of the beat.cmdline module of the BEAT platform. .. +.. .. +.. Commercial License Usage .. +.. Licensees holding valid commercial BEAT licenses may use this file in .. +.. accordance with the terms contained in a written agreement between you .. +.. and Idiap. For further information contact tto@idiap.ch .. +.. .. +.. Alternatively, this file may be used under the terms of the GNU Affero .. +.. Public License version 3 as published by the Free Software and appearing .. +.. in the file LICENSE.AGPL included in the packaging of this file. .. +.. The BEAT platform is distributed in the hope that it will be useful, but .. +.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. +.. or FITNESS FOR A PARTICULAR PURPOSE. .. +.. .. +.. You should have received a copy of the GNU Affero Public License along .. +.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. 
+
+
+.. _beat-cmdline-walkthrough:
+
+Walkthrough
+-----------
+
+This page gives an overview of setting up an environment to develop BEAT
+experiments locally. The topics covered are the following:
+
+- Preparing a proper Python environment
+- Bootstrapping ``beat.cmdline``
+- Running the `BEAT Platform `_ locally
+- Introducing a possible workflow for BEAT development
+
+Tips for preparing a proper environment
+.......................................
+
+- Using a `Conda `_ environment is recommended.
+- ``beat.cmdline`` requires Python 2.7 and will not work on Python 3.x.
+- It uses `Buildout `_ (commonly known as ``zc.buildout``) to
+  bootstrap. This package is also available on Conda.
+- The Python 2.7 environment used for bootstrapping will also be the
+  environment used to execute BEAT code when using the local executor.
+- While you can bootstrap and use ``beat.cmdline`` without having a local
+  BEAT instance, ``beat`` can't yet create BEAT objects (experiments, etc.)
+  from scratch. We recommend you instead modify existing BEAT objects.
+
+Bootstrapping ``beat.cmdline``
+..............................
+
+Thanks to Buildout, setup is simple:
+
+- Clone the ``beat.cmdline`` repository.
+- Using a compatible environment (see above), run ``buildout`` in the project's
+  root directory. Assuming this step is successful, the ``beat`` tool is now
+  functional.
+- To build this documentation locally, use the ``sphinx-build`` tool in
+  ``bin/``: ``./bin/sphinx-build doc/ sphinx/`` to output to ``sphinx/``.
+
+Running the BEAT Platform locally
+.................................
To interact with the local instance +through ``beat.cmdline``, set the ``platform`` config option through ``beat`` +to ``http://localhost:8000/``: ``beat set platform 'http://localhost:8080/'``. + +Pull example experiments from the platform via the ``beat`` tool: +``beat exp pull user/single/1``. Run the example experiment using the +local executor: ``beat exp run user/single/1/single`` to make sure one works. + +Fork the ``single`` experiment, using the same ``user/single/1`` toolchain: +``beat exp fork user/single/1/single user/single/1/single_test``. +Run the new experiment to make sure the fork works: +``beat exp run user/single/1/single_test``. +Feel free to edit the ``single_test`` JSON file to change the experiment. + +Likewise, you can fork other BEAT objects (algorithms, databases, toolchains, +etc.). + +To see if your new BEAT objects will run on the BEAT platform, try to run +using the Docker executor (which will use the BEAT docker images it can find): +``beat exp run --docker user/single/1/single_test``. + +If it works, you can push it to your BEAT platform to have it on your BEAT +platform instance: ``beat exp push``. -- GitLab From cc1d581f7f30e0fb56000ea26593bbfc749d52a0 Mon Sep 17 00:00:00 2001 From: Jaden Date: Thu, 24 Aug 2017 16:23:24 +0200 Subject: [PATCH 25/27] log local env dont print it --- beat/cmdline/experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beat/cmdline/experiments.py b/beat/cmdline/experiments.py index 253bd1f..f8e41cc 100755 --- a/beat/cmdline/experiments.py +++ b/beat/cmdline/experiments.py @@ -248,7 +248,7 @@ def run_experiment(configuration, name, force, use_docker, use_local): simplify_time(stats['data']['network']['wait_time']), 100. * stats['data']['network']['wait_time'] / stats['cpu']['total']) else: - print(" Environment: %s" % 'local environment') + logger.extra(" Environment: %s" % 'local environment') if executor.analysis: data = load_result(executor) -- GitLab From 06eb1b73a07cb3b2dc2ac27036e6461b348d40c7 Mon Sep 17 00:00:00 2001 From: Jaden Date: Thu, 24 Aug 2017 16:23:55 +0200 Subject: [PATCH 26/27] import specific nose function to appease the CI --- beat/cmdline/test/test_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/beat/cmdline/test/test_config.py b/beat/cmdline/test/test_config.py index 108e396..9cd42b5 100644 --- a/beat/cmdline/test/test_config.py +++ b/beat/cmdline/test/test_config.py @@ -31,6 +31,7 @@ import os import nose.tools +from nose.tools import assert_raises import simplejson from . import tmp_prefix -- GitLab From af5070cdc579938c54b80d6409210578b6b0046b Mon Sep 17 00:00:00 2001 From: Jaden Date: Thu, 24 Aug 2017 16:33:44 +0200 Subject: [PATCH 27/27] fix ci yml to always document beat/ folder not CI_PROJ_NMSPC --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 775f882..c005677 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,7 @@ build: - ./bin/buildout - ./bin/python ${PREFIX}/bin/coverage run --source=${CI_PROJECT_NAME} ${PREFIX}/bin/nosetests -sv ${CI_PROJECT_NAME} - ./bin/python ${PREFIX}/bin/coverage report - - ./bin/python ${PREFIX}/bin/sphinx-apidoc --separate -d 2 --output=doc/api ${CI_PROJECT_NAMESPACE} + - ./bin/python ${PREFIX}/bin/sphinx-apidoc --separate -d 2 --output=doc/api beat - ./bin/python ${PREFIX}/bin/sphinx-build doc html tags: - docker-build -- GitLab