#!/usr/bin/env python
# vim: set fileencoding=utf-8 :

###############################################################################
#                                                                             #
# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/          #
# Contact: beat.support@idiap.ch                                             #
#                                                                             #
# This file is part of the beat.core module of the BEAT platform.            #
#                                                                             #
# Commercial License Usage                                                   #
# Licensees holding valid commercial BEAT licenses may use this file in      #
# accordance with the terms contained in a written agreement between you     #
# and Idiap. For further information contact tto@idiap.ch                    #
#                                                                             #
# Alternatively, this file may be used under the terms of the GNU Affero     #
# Public License version 3 as published by the Free Software Foundation and  #
# appearing in the file LICENSE.AGPL included in the packaging of this file. #
# The BEAT platform is distributed in the hope that it will be useful, but   #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
# or FITNESS FOR A PARTICULAR PURPOSE.                                       #
#                                                                             #
# You should have received a copy of the GNU Affero Public License along     #
# with the BEAT platform. If not, see http://www.gnu.org/licenses/.          #
#                                                                             #
###############################################################################


"""
====
base
====

Execution utilities
"""

import os
import sys
import glob
import collections
import logging

import simplejson

from .. import schema
from .. import database
from .. import algorithm
from .. import stats

from beat.backend.python.helpers import convert_experiment_configuration_to_container

logger = logging.getLogger(__name__)


class BaseExecutor(object):
    """Executors run the code described by an execution block's information

    Parameters:

      prefix (str): Establishes the prefix of your installation.

      data (dict, str): The piece of data representing the block to be
        executed. It must validate against the schema defined for execution
        blocks. If a string is passed, it is supposed to be a fully qualified
        absolute path to a JSON file containing the block execution
        information.

      cache (:py:class:`str`, Optional): If your cache is not located under
        ``/cache``, then specify a full path here. It will be used instead.

      dataformat_cache (:py:class:`dict`, Optional): A dictionary mapping
        dataformat names to loaded dataformats. This parameter is optional
        and, if passed, may greatly speed-up dataformat loading times as
        dataformats that are already loaded may be re-used. If you use this
        parameter, you must guarantee that the cache is refreshed as
        appropriate in case the underlying dataformats change.

      database_cache (:py:class:`dict`, Optional): A dictionary mapping
        database names to loaded databases. This parameter is optional and,
        if passed, may greatly speed-up database loading times as databases
        that are already loaded may be re-used. If you use this parameter,
        you must guarantee that the cache is refreshed as appropriate in
        case the underlying databases change.

      algorithm_cache (:py:class:`dict`, Optional): A dictionary mapping
        algorithm names to loaded algorithms. This parameter is optional
        and, if passed, may greatly speed-up algorithm loading times as
        algorithms that are already loaded may be re-used. If you use this
        parameter, you must guarantee that the cache is refreshed as
        appropriate in case the underlying algorithms change.

      library_cache (:py:class:`dict`, Optional): A dictionary mapping
        library names to loaded libraries. This parameter is optional and,
        if passed, may greatly speed-up library loading times as libraries
        that are already loaded may be re-used. If you use this parameter,
        you must guarantee that the cache is refreshed as appropriate in
        case the underlying libraries change.

      custom_root_folders (:py:class:`dict`, Optional): A dictionary mapping
        database identifiers (``database/<name>``) to custom root folders,
        overriding the root folder declared by those databases.
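
    Example:

      A minimal construction sketch; all paths below are hypothetical and
      the shared cache dictionaries are optional:

      .. code-block:: python

         # Sharing these dictionaries across several executors avoids
         # re-loading dataformats, databases and algorithms from disk
         dataformat_cache = {}
         database_cache = {}
         algorithm_cache = {}

         executor = BaseExecutor('/path/to/prefix', '/path/to/block.json',
                                 dataformat_cache=dataformat_cache,
                                 database_cache=database_cache,
                                 algorithm_cache=algorithm_cache)

         if not executor.valid:
             raise RuntimeError(', '.join(executor.errors))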

    Attributes:

      cache (str): The path to the cache currently being used.

      errors (list): A list containing errors found while loading this
        execution block.

      data (dict): The original data for this executor, as loaded by our
        JSON decoder.

      algorithm (beat.core.algorithm.Algorithm): An object representing the
        algorithm to be run.

      databases (dict): A dictionary in which keys are strings with database
        names and values are :py:class:`.database.Database`, representing
        the databases required for running this block. The dictionary may be
        empty in case all inputs are taken from the file cache.

      views (dict): A dictionary in which the keys are tuples pointing to
        the ``(<database-name>, <protocol>, <set>)`` and the value is a
        setup view for that particular combination of details. The
        dictionary may be empty in case all inputs are taken from the file
        cache.

      input_list (beat.core.inputs.InputList): A list of inputs that will be
        served to the algorithm.

      output_list (beat.core.outputs.OutputList): A list of outputs that the
        algorithm will produce.

      data_sources (list): A list with all data-sources created by our
        execution loader.

      data_sinks (list): A list with all data-sinks created by our execution
        loader. These are useful for clean-up actions in case of problems.

    """

    def __init__(self, prefix, data, cache=None, dataformat_cache=None,
                 database_cache=None, algorithm_cache=None,
                 library_cache=None, custom_root_folders=None):

        # Initialisations
        self.prefix = prefix
        self.cache = cache or os.path.join(self.prefix, 'cache')
        self.algorithm = None
        self.databases = {}
        self.input_list = None
        self.data_loaders = None
        self.output_list = None
        self.data_sinks = []
        self.errors = []
        self.data = data

        # Check that the cache path exists
        if not os.path.exists(self.cache):
            raise IOError("Cache path `%s' does not exist" % self.cache)

        # Check the custom root folders
        if custom_root_folders is not None:
            if not isinstance(custom_root_folders, collections.Mapping):
                raise TypeError("The custom root folders must be provided "
                                "as a dictionary")
        else:
            custom_root_folders = {}

        # Temporary caches, if the user has not set them, for performance
        database_cache = database_cache if database_cache is not None else {}
        dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
        algorithm_cache = algorithm_cache if algorithm_cache is not None else {}
        library_cache = library_cache if library_cache is not None else {}

        # Basic validation of the data declaration, including JSON loading
        # if required
        if not isinstance(data, dict):
            if not os.path.exists(data):
                self.errors.append('File not found: %s' % data)
                return

        self.data, self.errors = schema.validate('execution', data)
        if self.errors:
            return

        # Load the algorithm (using the algorithm cache if possible)
        if self.data['algorithm'] in algorithm_cache:
            self.algorithm = algorithm_cache[self.data['algorithm']]
        else:
            self.algorithm = algorithm.Algorithm(self.prefix,
                                                 self.data['algorithm'],
                                                 dataformat_cache,
                                                 library_cache)
            algorithm_cache[self.algorithm.name] = self.algorithm

        if not self.algorithm.valid:
            self.errors += self.algorithm.errors
            return

        # Check that the mapping is coherent
        if len(self.data['inputs']) != len(self.algorithm.input_map):
            self.errors.append("The number of inputs doesn't match the "
                               "algorithm's input map")

        if 'outputs' in self.data and \
                (len(self.data['outputs']) != len(self.algorithm.output_map)):
            self.errors.append("The number of outputs doesn't match the "
                               "algorithm's output map")

        for name in self.data['inputs']:
            if name not in self.algorithm.input_map:
                self.errors.append("The input '%s' doesn't exist in the "
                                   "algorithm" % name)

        if 'outputs' in self.data:
            for name in self.data['outputs']:
                if name not in self.algorithm.output_map:
                    self.errors.append("The output '%s' doesn't exist in "
                                       "the algorithm" % name)

        if self.errors:
            return

        # Load the databases (if any is required)
        for name, details in self.data['inputs'].items():
            if 'database' in details:
                if details['database'] not in self.databases:
                    if details['database'] in database_cache:
                        db = database_cache[details['database']]
                    else:
                        db = database.Database(self.prefix,
                                               details['database'],
                                               dataformat_cache)

                        # Apply any custom root folder override for this
                        # database before caching it
                        db_key = "database/%s" % db.name
                        if db_key in custom_root_folders:
                            db.data['root_folder'] = custom_root_folders[db_key]

                        database_cache[db.name] = db

                    self.databases[db.name] = db

                    if not db.valid:
                        self.errors += db.errors
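
    # For reference, an execution declaration handled by this class contains
    # (at least) an "algorithm" name, an "inputs" mapping and, for
    # non-analysis blocks, an "outputs" mapping. The sketch below is
    # illustrative only; the authoritative definition is the ``execution``
    # schema validated above, and all values shown are hypothetical:
    #
    #   {
    #     "algorithm": "user/algorithm/1",
    #     "inputs": {
    #       "in": {"database": "user/db/1", "path": "ab/cd/ef"}
    #     },
    #     "outputs": {
    #       "out": {"path": "gh/ij/kl"}
    #     }
    #   }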

    def __enter__(self):
        """Prepares inputs and outputs for the processing task

        Raises:

          IOError: in case something cannot be properly setup

        """

        logger.info("Start the execution of '%s'", self.algorithm.name)

        # self._prepare_inputs()
        # self._prepare_outputs()

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes all sinks and disconnects inputs and outputs"""

        for sink in self.data_sinks:
            # We save the output only if no valid error has been thrown.
            # Note: ``exc_type`` is the exception *class* (or ``None``), so
            # ``issubclass`` is required here; a system exit raises
            # ``SystemExit``, which is not an ``Exception``
            if exc_type is None or not issubclass(exc_type, Exception):
                sink.close()

        self.input_list = None
        self.data_loaders = []
        self.output_list = None
        self.data_sinks = []

    def _prepare_inputs(self):
        """Prepares all inputs required by the execution"""

        raise NotImplementedError()

    def _prepare_outputs(self):
        """Prepares all outputs required by the execution"""

        raise NotImplementedError()

    def process(self, virtual_memory_in_megabytes=0, max_cpu_percent=0,
                timeout_in_minutes=0):
        """Executes the user algorithm code

        Parameters:

          virtual_memory_in_megabytes (:py:class:`int`, Optional): The
            amount of virtual memory (in Megabytes) available for the job.
            If set to zero, no limit will be applied.

          max_cpu_percent (:py:class:`int`, Optional): The maximum amount of
            CPU usage allowed in a system. This number must be an integer
            between 0 and ``100 * number_of_cores`` in your system. For
            instance, if your system has 2 cores, this number can go between
            0 and 200. If it is <= 0, then we don't track CPU usage.

          timeout_in_minutes (:py:class:`int`, Optional): The number of
            minutes to wait for the user process to execute. After this
            amount of time, the user process is killed with
            ``signal.SIGKILL``. If set to zero, no timeout will be applied.

        Returns:

          dict: A JSON-formattable dictionary containing the summary of this
            block execution.

        """

        raise NotImplementedError()
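
    # Concrete executors derive from this class and provide the actual I/O
    # setup and algorithm execution. A purely illustrative skeleton follows;
    # the class name and method bodies are hypothetical, not part of this
    # module:
    #
    #   class MyExecutor(BaseExecutor):
    #
    #       def _prepare_inputs(self):
    #           ...  # build the data sources feeding ``self.input_list``
    #
    #       def _prepare_outputs(self):
    #           ...  # build the data sinks behind ``self.output_list``
    #
    #       def process(self, virtual_memory_in_megabytes=0,
    #                   max_cpu_percent=0, timeout_in_minutes=0):
    #           ...  # run the algorithm, then return a JSON-formattable
    #                # summary (see ``stats.io_statistics`` for the I/O part)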
""" raise NotImplementedError() @property def valid(self): """A boolean that indicates if this executor is valid or not""" return not bool(self.errors) @property def analysis(self): """A boolean that indicates if the current block is an analysis block""" return 'result' in self.data @property def outputs_exist(self): """Returns ``True`` if outputs this block is supposed to produce exists.""" if self.analysis: path = os.path.join(self.cache, self.data['result']['path']) + '*' if not glob.glob(path): return False else: for name, details in self.data['outputs'].items(): path = os.path.join(self.cache, details['path']) + '*' if not glob.glob(path): return False # if you get to this point all outputs already exist return True @property def io_statistics(self): """Summarize current I/O statistics looking at data sources and sinks, inputs and outputs Returns: dict: A dictionary summarizing current I/O statistics """ return stats.io_statistics(self.data, self.input_list, self.output_list) def __str__(self): return simplejson.dumps(self.data, indent=4) def write(self, path): """Writes contents to precise filesystem location""" with open(path, 'wt') as f: f.write(str(self)) def dump_runner_configuration(self, directory): """Exports contents useful for a backend runner to run the algorithm""" data = convert_experiment_configuration_to_container(self.data) with open(os.path.join(directory, 'configuration.json'), 'wb') as f: json_data = simplejson.dumps(data, indent=2) f.write(json_data.encode('utf-8')) tmp_prefix = os.path.join(directory, 'prefix') if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) self.algorithm.export(tmp_prefix) def dump_databases_provider_configuration(self, directory): """Exports contents useful for a backend runner to run the algorithm""" with open(os.path.join(directory, 'configuration.json'), 'wb') as f: json_data = simplejson.dumps(self.data, indent=2) f.write(json_data.encode('utf-8')) tmp_prefix = os.path.join(directory, 'prefix') if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) for db in self.databases.values(): db.export(tmp_prefix)