diff --git a/beat/web/backend/admin.py b/beat/web/backend/admin.py index 3a113b9e26426d0d6b92954b814dfe005fcad630..87d912ed05e2fdb116e42d40c7dbfe75c0552698 100755 --- a/beat/web/backend/admin.py +++ b/beat/web/backend/admin.py @@ -201,10 +201,10 @@ class JobSplitInline(admin.TabularInline): class Job(admin.ModelAdmin): - list_display = ('id', 'status', 'runnable_date', 'block', 'splits') + list_display = ('id', 'key', 'runnable_date', 'start_date', 'block', 'splits') search_fields = ['block__name', 'block__experiment__name'] - list_display_links = ('id', 'block') - ordering = ('runnable_date', 'id') + list_display_links = ('id', 'block', 'key') + ordering = ('runnable_date', 'start_date', 'id') inlines = [JobSplitInline] # to avoid very slow loading of cached files diff --git a/beat/web/backend/helpers.py b/beat/web/backend/helpers.py index 118c4d3d4f370aa1dec43bda4cf5de80f6aba86a..0e6ecdb2d50bc21c85a178b422819585268eb9b4 100755 --- a/beat/web/backend/helpers.py +++ b/beat/web/backend/helpers.py @@ -25,17 +25,30 @@ # # ############################################################################### -from django.db import transaction +from django.conf import settings +from django.db.models import Count +from django.db.models import Q +import logging +logger = logging.getLogger(__name__) + +import os +import glob +import simplejson from datetime import datetime from ..experiments.models import Experiment from ..experiments.models import Block from ..experiments.models import CachedFile +from .models import Queue from .models import Job +from .models import JobSplit +from .models import Worker +from .models import Result + +import beat.core.hash -@transaction.atomic def schedule_experiment(experiment): '''Schedules the experiment for execution at the backend @@ -84,30 +97,15 @@ def schedule_experiment(experiment): if must_skip: block.status = Block.DONE - block.start_date = datetime.now() - block.end_date = block.start_date + block.creation_date = datetime.now() + block.start_date = block.creation_date + block.end_date = block.creation_date block.save() else: - # search for other jobs with similar outputs that have no children yet - # do this carefully, as other experiments may be scheduled at the same - # time, invalidating our "parent" choice - parent = Job.objects.filter(block__outputs__in=block.outputs.all(), - child=None).first() - - if parent is not None: #(candidate only) try to lock it - while True: - parent = Job.objects.select_for_update().get(pk=parent.pk) - if parent.child_ is not None: #was taken meanwhile, retry - parent = parent.child - continue - job = Job(block=block, parent=parent) - break - else: - job = Job(block=block) - - job.save() - + Job.objects.create_job(block) + block.creation_date = datetime.now() + block.save() already_done = False @@ -125,3 +123,788 @@ def schedule_experiment(experiment): #---------------------------------------------------------- +def cancel_experiment(experiment): + '''Cancel the execution of the experiment on the backend + + Cancelling an experiment only means marking the experiment as 'cancelling'. + + This function is expected to be called on the web server. The Scheduler + is tasked to notice the newly-cancelled experiment and does what it takes. 
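+ + For orientation, a sketch of the intended flow (the function names are + those defined later in this module): after this call returns, + ``experiment.status`` is ``Experiment.CANCELLING``; the scheduler loop + then picks the experiment up via ``process_newly_cancelled_experiments()``, + which calls ``cancel_all_blocks()`` and, once nothing remains to cancel, + ``update_experiment()``.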
+ ''' + + # Lock the experiment, so nobody else can modify it + experiment = Experiment.objects.select_for_update().get(pk=experiment.pk) + + # Can't cancel an experiment not started or already finished + if experiment.status not in [Experiment.SCHEDULED, Experiment.RUNNING]: + return + + # Mark the experiment as cancelling + experiment.status = Experiment.CANCELLING + experiment.save() + + +#---------------------------------------------------------- + + +def split_new_jobs(): + '''Retrieve all the jobs not yet split, and create the appropriate splits''' + + def _process(candidate_jobs): + additional_jobs = [] + + # Iterate through all the candidate jobs + for job in candidate_jobs: + + # Check that the files weren't generated since the scheduling of the job + must_skip = all([cached_file.status == CachedFile.CACHED + for cached_file in job.block.outputs.all()]) + + if must_skip: + job.block.status = Block.DONE + job.block.start_date = datetime.now() + job.block.end_date = job.block.start_date + job.block.save() + + additional_jobs.extend(update_dependent_jobs(job)) + if len(additional_jobs) == 0: + update_experiment(job.block.experiment) + + job.delete() + continue + + # Check that the job isn't a mirror of a currently running one + nb_existing_splits = JobSplit.objects.filter( + ~Q(status=JobSplit.QUEUED) | Q(worker__isnull=False), job__key=job.key, + job__runnable_date__isnull=False, job__mirror=False, + ).count() + + if nb_existing_splits > 0: + job.mirror = True + job.save() + continue + + # Create the splits + JobSplit.objects.create_splits(job) + + return additional_jobs + + + # First retrieve all the candidate jobs from the database and process them; + # if that processing marks any other jobs as candidates, process those too, + # recursively, until no candidate job is left + candidate_jobs = Job.objects.annotate(nb_splits=Count('splits')).filter( + runnable_date__isnull=False, mirror=False, nb_splits=0) + + while len(candidate_jobs) > 0: + candidate_jobs = _process(candidate_jobs) + + +#---------------------------------------------------------- + + +def process_newly_cancelled_experiments(): + '''Retrieve all the experiments that must be cancelled, and do it''' + + # Retrieve all the experiments marked as cancelling + cancelling_experiments = Experiment.objects.filter(status=Experiment.CANCELLING) + + splits_to_cancel = [] + + for experiment in cancelling_experiments: + # Only process experiments that still have pending or processing blocks + if experiment.blocks.filter(Q(status=Block.PENDING) | Q(status=Block.PROCESSING)).count() == 0: + continue + + new_splits_to_cancel = cancel_all_blocks(experiment) + + if len(new_splits_to_cancel) == 0: + update_experiment(experiment) + + splits_to_cancel.extend(new_splits_to_cancel) + + return splits_to_cancel + + +#---------------------------------------------------------- + + +def is_cache_complete(path, nb_expected_blocks, cache=settings.CACHE_ROOT): + '''Check that an entry of the cache is complete + + Due to the distributed nature of the platform, with volumes shared by + several different machines, a (hopefully) small delay might occur between + the writing of a file in the cache on a processing node and its availability + on the current machine. + + This function checks that all the necessary files are there, and complete.
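+ + For illustration, using the naming scheme implied by the code below (the + exact chunking is hypothetical): a complete entry for ``path`` covering + 100 blocks, written in two chunks, would consist of ``<path>.0.49.index``, + ``<path>.0.49.index.checksum``, ``<path>.0.49.data``, + ``<path>.0.49.data.checksum``, plus the same four files for the ``50.99`` + range, with one line per block in each index file.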
+ ''' + + def _extract_indices_from_filename(filename): + parts = filename.split('.') + return (int(parts[-3]), int(parts[-2]), filename) + + + def _verify_checksum(filename): + checksum_file = filename + '.checksum' + + try: + with open(checksum_file, 'rt') as f: + recorded = f.read().strip() + + actual = beat.core.hash.hashFileContents(filename) + except Exception: + return False + + return (actual == recorded) + + + # Retrieve all the index files + abs_path = os.path.join(cache, path) + + index_files = glob.glob(abs_path + '*.index') + index_files = sorted([ _extract_indices_from_filename(x) for x in index_files ]) + + # Check that no index file is missing (the ranges must be contiguous) + if len(index_files) > 1: + for i in range(1, len(index_files)): + if index_files[i][0] != index_files[i-1][1] + 1: + return False + + # Sum the number of blocks represented by each index file + nb_blocks = 0 + for start, end, index_file in index_files: + + # Check that the file is complete + if not _verify_checksum(index_file): + return False + + # Check that the data file is complete + data_file = index_file.replace('.index', '.data') + if not _verify_checksum(data_file): + return False + + # Retrieve the number of blocks from the file + with open(index_file, 'rt') as f: + lines = f.readlines() + nb_blocks += len(lines) + + return (nb_blocks == nb_expected_blocks) + + +#---------------------------------------------------------- + + +def assign_splits_to_workers(): + '''Assign existing job splits to available workers from the appropriate queues''' + + # Retrieve the queues in a sensible order: most cores per slot first, + # then lowest per-user slot limit + queues = Queue.objects.order_by('-cores_per_slot', 'max_slots_per_user') + + # Retrieve the candidate splits on each queue + candidate_splits_per_queue = [ (q, retrieve_candidate_splits_for_queue(q)) for q in queues ] + candidate_splits_per_queue = [ x for x in candidate_splits_per_queue if x[1] ] + + if not candidate_splits_per_queue: + return [] + + logger.debug('Considering splits: %s', candidate_splits_per_queue) + + # Build a "white list" of available workers + whitelist = dict([ (worker, worker.available_cores()) + for worker in Worker.objects.filter(active=True) ]) + + logger.debug('Worker availability: %s', whitelist) + + + # Process the candidates of each queue + assigned_splits = [] + + for queue, candidate_splits in candidate_splits_per_queue: + + candidate_workers = queue.worker_availability() + required_cores = queue.cores_per_slot + + for candidate_split in candidate_splits: + + # Check that the job wasn't marked as a mirror during a previous + # iteration + candidate_split.job.refresh_from_db() + if candidate_split.job.mirror: + continue + + # Search for an available worker + for candidate_worker in candidate_workers: + + # Check that there are enough available cores on the worker + available_cores = whitelist.get(candidate_worker, 0) + if available_cores < required_cores: + continue + + logger.debug("Assigning `%s' to worker `%s'", + candidate_split, candidate_worker) + + assign_split_to_worker(candidate_split, candidate_worker) + assigned_splits.append(candidate_split) + + mark_similar_jobs_as_mirror(candidate_split.job) + + whitelist[candidate_worker] -= required_cores + logger.debug("`%s' cores available: %d", candidate_worker, whitelist[candidate_worker]) + break + + return JobSplit.objects.filter(id__in=[ x.id for x in assigned_splits ]) + + +#---------------------------------------------------------- + + +def get_configuration_for_split(split): + '''Retrieve the configuration to be used to execute the provided job split + on a worker node
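+ + A minimal sketch of the returned dictionary (keys as used by the code + below, values hypothetical): the deserialized ``block.command`` JSON, + possibly extended with ``datasets_uid`` (e.g. ``1000``), + ``datasets_root_path`` (e.g. ``'/datasets'``) and, for partial splits, + ``range`` (e.g. ``[0, 99]``).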
+ ''' + + # Retrieve the block configuration + configuration = simplejson.loads(str(split.job.block.command)) + + # (If necessary) Add the information needed to access the database files + if settings.DATASETS_UID is not None: + configuration['datasets_uid'] = settings.DATASETS_UID + + if settings.DATASETS_ROOT_PATH is not None: + configuration['datasets_root_path'] = settings.DATASETS_ROOT_PATH + + # (If necessary) Add the range of indices to process + if (split.start_index is not None) and (split.end_index is not None): + configuration['range'] = [split.start_index, split.end_index] + + return configuration + + +#---------------------------------------------------------- + + +def on_split_started(split): + '''Must be called each time a split job is started''' + + now = datetime.now() + + # Mark the split job as running + split.status = JobSplit.PROCESSING + split.start_date = now + split.save() + + # (If necessary) Mark the job and block as running + split.job.refresh_from_db() + if split.job.start_date is None: + split.job.start_date = now + split.job.save() + + split.job.block.status = Block.PROCESSING + split.job.block.start_date = now + split.job.block.save() + + # (If necessary) Mark the experiment as running + split.job.block.experiment.refresh_from_db() + if split.job.block.experiment.status == Experiment.SCHEDULED: + split.job.block.experiment.status = Experiment.RUNNING + split.job.block.experiment.start_date = now + split.job.block.experiment.save() + + + # Mark the mirror jobs and their blocks as running + mirror_jobs = Job.objects.filter(key=split.job.key, mirror=True) + for mirror_job in mirror_jobs: + mirror_job.start_date = now + mirror_job.save() + + mirror_job.block.status = Block.PROCESSING + mirror_job.block.start_date = now + mirror_job.block.save() + + # (If necessary) Mark the experiment as running + if mirror_job.block.experiment.status == Experiment.SCHEDULED: + mirror_job.block.experiment.status = Experiment.RUNNING + mirror_job.block.experiment.start_date = now + mirror_job.block.experiment.save() + + +#---------------------------------------------------------- + + +def on_split_done(split, result): + '''Must be called each time a split job is successfully completed''' + + result = Result( + status = result['status'], + stdout = result['stdout'], + stderr = result['stderr'], + usrerr = result['user_error'], + _stats = simplejson.dumps(result['statistics'], indent=2), + ) + result.save() + + split.status = JobSplit.COMPLETED + split.end_date = datetime.now() + split.result = result + split.worker = None + split.save() + + update_job(split.job) + + +#---------------------------------------------------------- + + +def on_split_fail(split, result): + '''Must be called each time a split job fails''' + + if isinstance(result, dict): + result = Result( + status = result['status'], + stdout = result['stdout'], + stderr = result['stderr'], + usrerr = result['user_error'], + _stats = simplejson.dumps(result['statistics'], indent=2), + ) + else: + result = Result( + status = 1, + stdout = '', + stderr = result, + usrerr = '', + ) + + result.save() + + split.status = JobSplit.FAILED + split.end_date = datetime.now() + split.result = result + split.worker = None + split.save() + + return update_job(split.job) + + +#---------------------------------------------------------- + + +def on_split_cancelled(split): + '''Must be called each time a split job is successfully cancelled''' + + split.status = JobSplit.CANCELLED + split.end_date = datetime.now() +
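# Detaching the split from its worker frees the corresponding cores + # for new assignments (see Worker.available_cores()) +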
split.worker = None + split.save() + + return update_job(split.job) + + +#---------------------------------------------------------- + + +def retrieve_candidate_splits_for_queue(queue): + '''Retrieve the splits assigned to the given queue that could be considered + for execution + ''' + + # Retrieve the pending splits assigned to the queue, from oldest to newest + splits = JobSplit.objects.filter(job__block__queue=queue, status=JobSplit.QUEUED, + worker__isnull=True + ).order_by('job__runnable_date') + + + # Retrieve the list of users that submitted those jobs + users = set(splits.values_list('job__block__experiment__author', flat=True)) + + + # Determine how many slots each user is already using on the queue + user_current_slots = [ JobSplit.objects.filter(job__block__experiment__author=k, + job__block__queue=queue, + status=JobSplit.PROCESSING).count() + for k in users ] + + + # Determine how many slots each user is still afforded on the queue + allowance = [ queue.max_slots_per_user - k for k in user_current_slots ] + allowance = dict(zip(users, allowance)) + + + # Limit runnable splits so each user stays within the allowed number of slots + candidates = [] + for split in splits: + author = split.job.block.experiment.author.id + if allowance[author] > 0: + candidates.append(split) + allowance[author] -= 1 + + + # Return the list of candidate splits + return candidates + + +#---------------------------------------------------------- + + +def assign_split_to_worker(split, worker): + '''Schedules the split to be executed on a given worker''' + + split = JobSplit.objects.select_for_update().get(pk=split.pk) + worker = Worker.objects.select_for_update().get(pk=worker.pk) + + split.worker = worker + split.save() + + logger.info("Job split %s scheduled at `%s' was assigned to `%s'", + split, split.job.block.queue, worker) + + +#---------------------------------------------------------- + + +def mark_similar_jobs_as_mirror(job): + '''Mark all similar jobs as mirror, and delete their job splits''' + + similar_jobs = Job.objects.select_for_update().filter(key=job.key).exclude(pk=job.pk) + + for similar_job in similar_jobs: + similar_job.mirror = True + similar_job.save() + + for split in similar_job.splits.all(): + split.delete() + + logger.info("Job `%s' is now marked as a mirror of `%s'", similar_job, job) + + +#---------------------------------------------------------- + + +def update_job(job): + + def _collect_results(splits): + cached_files_infos = dict( + cpu_time = 0.0, + max_memory = 0, + stdout = '', + stderr = '', + error_report = '', + data_read_size = 0, + data_written_size = 0, + data_read_nb_blocks = 0, + data_written_nb_blocks = 0, + data_read_time = 0.0, + data_written_time = 0.0, + queuing_time = 0.0, + linear_execution_time = 0.0, + speed_up_real = 1.0, + speed_up_maximal = 1.0, + ) + + split_durations = [] + + for split in splits: + split_durations.append((split.end_date - split.start_date).total_seconds()) + + statistics = split.result.stats + + cached_files_infos['cpu_time'] += statistics.cpu['user'] + statistics.cpu['system'] + cached_files_infos['max_memory'] += statistics.memory['rss'] + + header = '' + if split.start_index is not None: + header = 'Split #%d (from indices %d to %d):' % ( + split.split_index, split.start_index, split.end_index) + header += '\n' + ('=' * len(header)) + '\n' + + stdout = split.result.stdout if split.result.stdout != '\n' else '' + stderr = split.result.stderr if split.result.stderr != '\n' else '' + + if stdout != '': + cached_files_infos['stdout']
+= header + stdout + '\n' + + if stderr != '': + cached_files_infos['stderr'] += header + stderr + '\n' + + if split.result.usrerr != '': + cached_files_infos['error_report'] += header + split.result.usrerr + '\n' + + if 'volume' in statistics.data: + cached_files_infos['data_read_size'] += statistics.data['volume'].get('read', 0) + cached_files_infos['data_written_size'] += statistics.data['volume'].get('write', 0) + + if 'blocks' in statistics.data: + cached_files_infos['data_read_nb_blocks'] += statistics.data['blocks'].get('read', 0) + cached_files_infos['data_written_nb_blocks'] += statistics.data['blocks'].get('write', 0) + + if 'time' in statistics.data: + cached_files_infos['data_read_time'] += statistics.data['time'].get('read', 0) + cached_files_infos['data_written_time'] += statistics.data['time'].get('write', 0) + + job = splits[0].job + + cached_files_infos['queuing_time'] = (job.start_date - job.runnable_date).total_seconds() + cached_files_infos['linear_execution_time'] = sum(split_durations) + + if job.block.required_slots > 1: + cached_files_infos['speed_up_real'] = float(cached_files_infos['linear_execution_time']) / \ + (job.end_date - job.start_date).total_seconds() + cached_files_infos['speed_up_maximal'] = float(cached_files_infos['linear_execution_time']) / \ + max(split_durations) + + return cached_files_infos + + + splits_to_cancel = [] + + + # If any split failed, cancel the remaining queued and running splits + if job.splits.filter(status=JobSplit.FAILED).count() > 0: + + # Mark queued splits of the same job as cancelled + for split in job.splits.filter(status=JobSplit.QUEUED): + split.status = JobSplit.CANCELLED + split.start_date = datetime.now() + split.end_date = split.start_date + split.save() + + # Cancel running splits + splits_to_cancel = list(job.splits.filter(status=JobSplit.PROCESSING).all()) + for split in splits_to_cancel: + split.status = JobSplit.CANCELLING + split.save() + + + # Check that all the splits are done + if job.splits.filter(Q(status=JobSplit.QUEUED) | Q(status=JobSplit.PROCESSING) | Q(status=JobSplit.CANCELLING)).count() > 0: + return splits_to_cancel + + + # Save the end date + job.end_date = job.splits.order_by('-end_date')[0].end_date + job.save() + + + # Did the job fail? + if job.splits.filter(status=JobSplit.FAILED).count() > 0: + + # (If necessary) Update the cached files + splits = job.splits.filter(Q(status=JobSplit.FAILED) | Q(status=JobSplit.COMPLETED)) + cached_files_infos = _collect_results(splits) + job.block.outputs.update(**cached_files_infos) + + for cached_file in job.block.outputs.all(): + cached_file.update(Block.FAILED) + + # Update the block + job.block.status = Block.FAILED + job.block.end_date = job.end_date + job.block.save() + + # Cancel all the remaining blocks of the experiment + splits_to_cancel.extend(cancel_all_blocks(job.block.experiment)) + + # Update the experiment + update_experiment(job.block.experiment) + + # Mark the blocks of the mirror jobs as failed + mirror_jobs = Job.objects.filter(key=job.key, mirror=True) + for mirror_job in mirror_jobs: + mirror_job.end_date = job.end_date + mirror_job.save() + + mirror_job.block.status = Block.FAILED + mirror_job.block.end_date = job.end_date + mirror_job.block.save() + + # Cancel all the remaining blocks of the experiment + splits_to_cancel.extend(cancel_all_blocks(mirror_job.block.experiment)) + + # Update the experiment + update_experiment(mirror_job.block.experiment) + + mirror_jobs.delete() + + # Delete the job + job.delete() + + + # Did the job succeed?
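+ # (i.e. every split reached COMPLETED: failures were handled by the + # branch above, cancellations by the branch below)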
+ elif job.splits.exclude(status=JobSplit.COMPLETED).count() == 0: + + # Update the cached files + cached_files_infos = _collect_results(job.splits.all()) + job.block.outputs.update(**cached_files_infos) + + for cached_file in job.block.outputs.all(): + cached_file.update(Block.DONE) + + # Update the block + job.block.status = Block.DONE + job.block.end_date = job.end_date + job.block.save() + + # Update the dependent jobs + additional_jobs = update_dependent_jobs(job) + + # (If necessary) Update the experiment + if len(additional_jobs) == 0: + update_experiment(job.block.experiment) + + # Mark the blocks of the mirror jobs as completed + mirror_jobs = Job.objects.filter(key=job.key, mirror=True) + for mirror_job in mirror_jobs: + mirror_job.block.status = Block.DONE + mirror_job.block.end_date = job.end_date + mirror_job.block.save() + + # Update the dependent jobs + additional_jobs = update_dependent_jobs(mirror_job) + + # (If necessary) Update the experiment + if len(additional_jobs) == 0: + update_experiment(mirror_job.block.experiment) + + mirror_jobs.delete() + + # Delete the job + job.delete() + + + # Was the job cancelled? + elif job.splits.filter(status=JobSplit.CANCELLED).count() > 0: + + for cached_file in job.block.outputs.all(): + cached_file.update(Block.CANCELLED) + + # Update the block + job.block.status = Block.CANCELLED + job.block.end_date = job.end_date + job.block.save() + + # Update the experiment + update_experiment(job.block.experiment) + + # Delete the job + job.delete() + + + return splits_to_cancel + + +#---------------------------------------------------------- + + +def update_dependent_jobs(job): + '''Mark the dependent jobs of the provided one as runnable + + Intended to be called after a job is done + ''' + + updated_jobs = [] + + for dependent_block in job.block.dependents.all(): + if dependent_block.is_runnable(): + dependent_block.job.runnable_date = datetime.now() + dependent_block.job.save() + updated_jobs.append(dependent_block.job) + + return updated_jobs + + +#---------------------------------------------------------- + + +def cancel_all_blocks(experiment): + '''Mark all the blocks of the provided experiment as cancelled + + Intended to be called after a job has failed + ''' + + splits_to_cancel = [] + + + # Retrieve all the blocks to cancel + blocks_to_cancel = experiment.blocks.filter(Q(status=Block.PROCESSING) | Q(status=Block.PENDING)) \ + .exclude(job__mirror=True) + + for block in blocks_to_cancel: + + # If a mirror job exists, reassign any existing split + mirror_jobs = Job.objects.filter(key=block.job.key, mirror=True) + if len(mirror_jobs) > 0: + mirror_job = mirror_jobs[0] + mirror_job.mirror = False + mirror_job.save() + + for split in block.job.splits.all(): + split.job = mirror_job + split.save() + + else: + # Queued splits: mark them as cancelled + for split in block.job.splits.filter(status=JobSplit.QUEUED): + split.status = JobSplit.CANCELLED + split.start_date = datetime.now() + split.end_date = split.start_date + split.save() + + # Processing splits: cancel them + for split in block.job.splits.filter(status=JobSplit.PROCESSING): + split.status = JobSplit.CANCELLING + split.save() + splits_to_cancel.append(split) + + # (If possible) Mark the block as cancelled + if block.job.splits.filter(status=JobSplit.CANCELLING).count() == 0: + block.status = Block.CANCELLED + block.end_date = datetime.now() + if block.start_date is None: + block.start_date = block.end_date + block.save() + + block.job.delete() + + + # Retrieve all the
mirror blocks + mirror_blocks_to_cancel = experiment.blocks.filter(Q(status=Block.PROCESSING) | Q(status=Block.PENDING)) \ + .filter(job__mirror=True) + + for block in mirror_blocks_to_cancel: + block.status = Block.CANCELLED + block.end_date = datetime.now() + if block.start_date is None: + block.start_date = block.end_date + block.save() + + block.job.delete() + + + return splits_to_cancel + + +#---------------------------------------------------------- + + +def update_experiment(experiment): + experiment = Experiment.objects.select_for_update().get(pk=experiment.pk) + + # Experiment done? + if experiment.blocks.exclude(status=Block.DONE).count() == 0: + experiment.status = Experiment.DONE + experiment.end_date = experiment.blocks.order_by('-end_date')[0].end_date + experiment.save() + + # Experiment failed? + elif experiment.blocks.filter(status=Block.FAILED).count() > 0: + if experiment.blocks.filter(status=Block.PROCESSING).count() == 0: + experiment.status = Experiment.FAILED + experiment.end_date = experiment.blocks.order_by('-end_date')[0].end_date + experiment.save() + + # Experiment cancelled? + elif experiment.blocks.filter(status=Block.CANCELLED).count() > 0: + if experiment.blocks.filter(status=Block.PROCESSING).count() == 0: + experiment.status = Experiment.PENDING + experiment.end_date = experiment.blocks.order_by('-end_date')[0].end_date + experiment.save() diff --git a/beat/web/backend/migrations/0005_job_modifications.py b/beat/web/backend/migrations/0005_job_modifications.py new file mode 100644 index 0000000000000000000000000000000000000000..f9cfb4f38c62d5544118258a66846905dd182fd5 --- /dev/null +++ b/beat/web/backend/migrations/0005_job_modifications.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.13 on 2017-09-29 16:54 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('backend', '0004_environmentlanguage'), + ] + + operations = [ + migrations.RemoveField( + model_name='job', + name='parent', + ), + migrations.RemoveField( + model_name='job', + name='split_errors', + ), + migrations.AddField( + model_name='job', + name='key', + field=models.CharField(default='', max_length=64), + preserve_default=False, + ), + migrations.AddField( + model_name='job', + name='mirror', + field=models.BooleanField(default=False), + ), + migrations.RemoveField( + model_name='job', + name='status', + ), + migrations.RemoveField( + model_name='jobsplit', + name='cache_errors', + ), + migrations.RemoveField( + model_name='jobsplit', + name='process_id', + ), + migrations.AlterField( + model_name='jobsplit', + name='status', + field=models.CharField(choices=[(b'N', b'Queued'), (b'P', b'Processing'), (b'C', b'Completed'), (b'F', b'Failed'), (b'L', b'Cancelled'), (b'K', b'Cancelling')], default=b'N', max_length=1), + ), + ] diff --git a/beat/web/backend/models/job.py b/beat/web/backend/models/job.py index cf699cd6907cf4ae29bb110b36e0f854de06cb45..590b9b5a072c21a7bb09d54c12530876c1c93b33 100755 --- a/beat/web/backend/models/job.py +++ b/beat/web/backend/models/job.py @@ -25,10 +25,7 @@ # # ############################################################################### -import os -import time import datetime -import traceback import logging logger = logging.getLogger(__name__) @@ -37,15 +34,10 @@ import simplejson from django.db import utils from django.db import models -from django.db import transaction from django.conf import settings -import beat.core.stats import 
beat.core.data -import beat.core.execution -from beat.core.dock import Host - -from ...statistics.utils import updateStatistics +import beat.core.hash from .result import Result @@ -53,13 +45,29 @@ from .result import Result #---------------------------------------------------------- -def _merge_strings(s): - if len(s) == 1: return s[0] - s = [k.strip() for k in s] - if any(s): - return '\n'.join(['Process %d: %s' % (i,k) for i,k in enumerate(s)]) - else: - return '' +class JobManager(models.Manager): + + def create_job(self, block): + + # Compute the key of the job + hashes = [ x.hash for x in block.outputs.order_by('hash') ] + key = beat.core.hash.hash(''.join(hashes)) + + # Determine if the job can be run or is dependent on others + runnable_date = None + if block.is_runnable(): + runnable_date = datetime.datetime.now() + + # Create the job + job = self.model( + block=block, + key=key, + runnable_date=runnable_date + ) + + job.save() + + return job #---------------------------------------------------------- @@ -68,31 +76,11 @@ def _merge_strings(s): class Job(models.Model): '''Class describing the execution of a Job on the backend''' - QUEUED = 'N' #Block.PENDING - PROCESSING = 'P' #Block.PROCESSING - COMPLETED = 'C' #Block.COMPLETED - FAILED = 'F' #Block.FAILED - SKIPPED = 'S' - CANCELLED = 'L' #Block.CANCELLED - CANCEL = 'K' #Job was asked to be killed - - STATUS = ( - (QUEUED, 'Queued'), - (PROCESSING, 'Processing'), - (COMPLETED, 'Completed'), - (FAILED, 'Failed'), - (SKIPPED, 'Skipped'), - (CANCELLED, 'Cancelled'), - (CANCEL, 'Cancel'), - ) - block = models.OneToOneField('experiments.Block', null=True, - on_delete=models.CASCADE, related_name='job') - - status = models.CharField(max_length=1, choices=STATUS, default=QUEUED) + on_delete=models.CASCADE, related_name='job') result = models.OneToOneField(Result, null=True, on_delete=models.CASCADE, - related_name='job') + related_name='job') runnable_date = models.DateTimeField(null=True, blank=True) @@ -100,339 +88,81 @@ class Job(models.Model): end_date = models.DateTimeField(null=True, blank=True) - parent = models.OneToOneField('self', related_name='child', null=True, - on_delete=models.SET_NULL) - - split_errors = models.PositiveIntegerField(default=0) - + key = models.CharField(max_length=64) - def _get_child(self): - return self.child if hasattr(self, 'child') else None + mirror = models.BooleanField(default=False) - def _set_child(self, val): - val.parent = self - val.save() - child_ = property(_get_child, _set_child) + objects = JobManager() def __str__(self): - - return "Job(%s, %s, splits=%d, status=%s, cores=%d)" % \ - (self.block.name, self.block.experiment.name, - self.block.required_slots, self.status, - self.block.queue.cores_per_slot) - - - def done(self): - '''Says whether the job has finished or not''' - - return self.status in (Job.COMPLETED, Job.SKIPPED, Job.FAILED, - Job.CANCELLED) + return "Job(%s, %s, key=%s, mirror=%s, splits=%d, cores=%d)" % \ + (self.block.name, self.block.experiment.name, + self.key, str(self.mirror), + self.block.required_slots, + self.block.queue.cores_per_slot) +#---------------------------------------------------------- - def _copy(self, other): - '''Copy state from another block''' - - self.refresh_from_db() - if self.done(): return +class JobSplitManager(models.Manager): - self.start_date = other.start_date - self.end_date = other.end_date - self.status = other.status + def create_splits(self, job): - # update status of parent jobs - self.save() - self._cascade_updates() - 
self.block._update_state(None) + # If no splitting is required, only create one split + if job.block.required_slots == 1: + split = JobSplit(job=job, split_index=0) + split.save() + return [split] - def _make_runnable(self): - '''Tries to make this job runnable - if it is cached, we skip it''' + # Retrieve the list of synchronized inputs + configuration = simplejson.loads(job.block.command) + inputs = [ entry for name, entry in configuration['inputs'].items() + if entry['channel'] == configuration['channel'] ] - # lock self - avoids concurrent update from scheduler/worker subsystem - self_ = Job.objects.select_for_update().get(pk=self.pk) - # checks for the presence of output caches - if they exist and - # checksum, skip and update related blocks - if all([k.exists() for k in self.block.outputs.all()]): - if all([k.check_checksums() for k in self.block.outputs.all()]): - self.status = Job.SKIPPED - self.split_errors = 0 - self.end_date = datetime.datetime.now() - self.save() - self._cascade_updates() #to similar blocks - self.block._update_state() - return - else: - logger.warning("Trying to make block `%s' runnable, but " \ - "indexes do not checksum - waiting...", self.block) - self.split_errors += 1 - self.save() - return + # Load the list of indices for each input + indices = [ beat.core.data.load_data_index(settings.CACHE_ROOT, x['path']) + for x in inputs ] - # else, flag it as runnable - self.runnable_date = datetime.datetime.now() - # runs index splitting once, for all created splits - self._split_indices() + # Attempt to split the indices + nb_splits = job.block.required_slots + while nb_splits > 1: + # Determine N splits using the possible indices for split: + split_indices = beat.core.data.foundSplitRanges(indices, nb_splits) - def _split_indices(self): - '''Runs the index splitting machinery once for all associated splits''' + if len(split_indices) > 0: + break - # no index spliting is required - if self.block.required_slots == 1: - self.save() - s = JobSplit(job=self, split_index=0) - s.save() - return + nb_splits -= 1 - indices = [] + if nb_splits != job.block.required_slots: + message = "The processing of the block `%s' of experiment `%s' " \ + "was split into %d instead of the requested %d" % \ + (job.block.name, job.block.experiment.fullname(), + nb_splits, job.block.required_slots) + logger.warning(message) - conf = simplejson.loads(self.block.command) - try: + # Create the necessary splits and assign the ranges + splits = [] + for i, indices in enumerate(split_indices): + split = JobSplit( + job=job, + split_index=i, + start_index=indices[0], + end_index=indices[1], + ) + split.save() - # For all synchronized inputs with the current block, append the - # list of generated object indices. This is necessary for an - # informed decision on where to split the processing - sync = [conf['inputs'][i] for i in conf['inputs'] if conf['inputs'][i]['channel']==conf['channel']] - for i in sync: - indices.append(beat.core.data.load_data_index( - settings.CACHE_ROOT, str(i['path']))) + splits.append(split) - # Determine N splits using the possible indices for split: - indices = beat.core.data.foundSplitRanges(indices, - self.block.required_slots) - - self.split_errors = 0 - self.save() - - if len(indices) == 0: - message = "Index splitting for block `%s' of experiment " \ - "`%s' could not be completed: not splittable!"
% \ - (self.block.name, self.block.experiment.fullname()) - logger.error(message) - self._cancel(usrerr=settings.DEFAULT_USER_ERROR) - - # if you get to this point, the splitting has succeeded, - # create the necessary splits and assign the ranges - for i, s in enumerate(indices): - split_indices = indices[i] - s = JobSplit( - job=self, - split_index=i, - start_index=split_indices[0], - end_index=split_indices[1], - ) - s.save() - - except Exception as e: - - self.split_errors += 1 - self.save() - - if self.split_errors > settings.MAXIMUM_SPLIT_ERRORS: #stop - message = "Index splitting for block `%s' of experiment " \ - "`%s' could not be completed due to an index split " \ - "error: %s" % (self.block.name, - self.block.experiment.fullname(), - traceback.format_exc()) - logger.error(message) - self._cancel(usrerr=settings.DEFAULT_USER_ERROR) - - - def _cascade_updates(self): - '''Cascade updates to children before I'm deleted. - ''' - - if hasattr(self, 'child'): - if self.status == Job.CANCELLED: - if self.parent: #I have a parent, so must give to child - parent = self.parent - self.parent = None - self.child.parent = parent - else: #child is the new parent - child = self.child - self.child.parent = None - # does this unblock the child to run? - if child.block.is_runnable(): child._make_runnable() - - else: - self.child._copy(self) - - if self.parent and self.status == Job.CANCELLED: - self.parent = None - - - def _update_state(self): - '''Update self state based on associated job states - - This method is not part of the Job's public API. It is supposed to be - called by children splits or itself. - ''' - - # lock - self_ = Job.objects.select_for_update().get(pk=self.pk) - - if self_.done(): return - - # If this process has a parent, then don't try to get split - # statuses - if not self.parent: - - split_statuses = self.splits.values_list('status', flat=True) - - if self.start_date is None: - qs = self.splits.filter(start_date__isnull=False).\ - order_by('start_date') - if qs: - self.start_date = qs.first().start_date - else: - self.start_date = datetime.datetime.now() - - # Process main status and status from job results - if Job.FAILED in split_statuses: - self.status = Job.FAILED - - elif Job.CANCELLED in split_statuses: - self.status = Job.CANCELLED - - elif (Job.PROCESSING in split_statuses) or \ - (Job.QUEUED in split_statuses and \ - Job.COMPLETED in split_statuses) or \ - (Job.CANCEL in split_statuses): - self.status = Job.PROCESSING - - elif all([s == Job.SKIPPED for s in split_statuses]): - self.status = Job.SKIPPED - - elif Job.QUEUED not in split_statuses: - self.status = Job.COMPLETED - - else: - self.status = Job.QUEUED - - # if required, erase dangling files, update own results - timings = None - if self.done() and self.status != Job.CANCELLED: - # compute final timings and update parent block - if self.status != Job.SKIPPED: - diff_timings = self._merge_results() - # delete all splits w/o results (still queued) - self.splits.filter(result__isnull=True).delete() - self.end_date = self.splits.order_by('-end_date').\ - first().end_date - updateStatistics(self.result.stats, self.end_date) - Result.objects.filter(split__in=self.splits.all()).delete() - seqtime = sum(diff_timings) - if self.start_date is None: - queuing = 0 - else: - queuing = (self.start_date - \ - self.runnable_date).total_seconds() - if not seqtime: - speed_up_real = 1.0 - speed_up_maximal = 1.0 - else: - speed_up_real = float(seqtime) / \ - (self.end_date - self.start_date).total_seconds() - 
speed_up_maximal = float(seqtime) / max(diff_timings) - timings = dict( - queuing = queuing, - linear_execution = seqtime, - speed_up_real = speed_up_real, - speed_up_maximal = speed_up_maximal, - ) - self.runnable_date = None - self.erase_dangling_files() - - # updates the dependents and child state - self.save() - - self._cascade_updates() - self.block._update_state(timings) - - - def erase_dangling_files(self): - '''Erase dangling files produced by this job in case of errors''' - - l = [] - failed = self.status in (Job.FAILED, Job.CANCELLED) - - if failed: - for o in self.block.outputs.all(): l += o.files() - - for f in l: - logger.info("Erasing output file `%s' because Job `%s' failed", f, - self) - os.remove(f) - - - def _cancel(self, usrerr=None): - '''Cancel the execution of this job - - As a consequence: delete all associated jobs, mark end_date and set - cancelled state. This method should only be called by the owning Block. - ''' - - # lock - self_ = Job.objects.select_for_update().get(pk=self.pk) - - if self_.done(): return - - logger.info("Marking job `%s' as 'cancelled'", self) - self.runnable_date = None - self.start_date = None - if self.splits.count(): - for s in self.splits.all(): s._cancel() - else: - self.status = Job.CANCELLED - if usrerr: - r = Result(status=1, usrerr=usrerr) - r.save() - self.result = r - self.save() - self.block._update_state() - self._cascade_updates() - - - def _merge_results(self): - '''Merge results from jobs, if any exist''' - - # update results - job_results = Result.objects.filter(pk__in=self.splits.filter(result__isnull=False).values_list('result', flat=True)) - - diff_timings = [(k[0]-k[1]).total_seconds() for k in \ - self.splits.filter(end_date__isnull=False, - start_date__isnull=False).values_list('end_date', 'start_date')] - - status = sum([k.status for k in job_results]) - stdout = _merge_strings([k.stdout for k in job_results]) - stderr = _merge_strings([k.stderr for k in job_results]) - usrerr = _merge_strings([k.usrerr for k in job_results]) - - # merge beat.core statistics - if job_results: - stats = job_results[0].stats - for k in job_results[1:]: stats += k.stats - stats = stats - else: - stats = beat.core.stats.Statistics() - - cancelled = any([k.cancelled for k in job_results]) - timed_out = any([k.timed_out for k in job_results]) - - r = Result(status=status, stdout=stdout, stderr=stderr, usrerr=usrerr, - timed_out=timed_out, cancelled=cancelled) - r.stats = stats - r.save() - self.result = r - - return diff_timings + return splits #---------------------------------------------------------- @@ -441,11 +171,28 @@ class Job(models.Model): class JobSplit(models.Model): '''Class describing a part of job of an experiment''' + QUEUED = 'N' + PROCESSING = 'P' + COMPLETED = 'C' + FAILED = 'F' + CANCELLED = 'L' + CANCELLING = 'K' + + STATUS = ( + (QUEUED, 'Queued'), + (PROCESSING, 'Processing'), + (COMPLETED, 'Completed'), + (FAILED, 'Failed'), + (CANCELLED, 'Cancelled'), + (CANCELLING, 'Cancelling'), + ) + + worker = models.ForeignKey('Worker', null=True, on_delete=models.SET_NULL, - related_name='splits') + related_name='splits') job = models.ForeignKey(Job, null=True, on_delete=models.CASCADE, - related_name='splits') + related_name='splits') split_index = models.PositiveIntegerField() @@ -453,22 +200,17 @@ class JobSplit(models.Model): end_index = models.PositiveIntegerField(null=True) - cache_errors = models.PositiveIntegerField(default=0) - - status = models.CharField(max_length=1, choices=Job.STATUS, - default=Job.QUEUED) + 
status = models.CharField(max_length=1, choices=STATUS, default=QUEUED) result = models.OneToOneField(Result, null=True, on_delete=models.CASCADE, - related_name='split') + related_name='split') start_date = models.DateTimeField(null=True) end_date = models.DateTimeField(null=True) - process_id = models.PositiveIntegerField(null=True) - - host = None + objects = JobSplitManager() class Meta: @@ -476,7 +218,6 @@ class JobSplit(models.Model): def __str__(self): - return "JobSplit(%s, index=%d, state=%s)%s" % \ (self.job, self.split_index, self.status, ('@%s' % self.worker) if self.worker else '') @@ -486,351 +227,7 @@ class JobSplit(models.Model): '''Says whether the job has finished or not''' return self.status in ( - Job.COMPLETED, - Job.SKIPPED, - Job.FAILED, - Job.CANCELLED, - ) - - - @transaction.atomic - def schedule(self, worker): - '''Schedules this split to be executed on a given worker - - Parameters: - - worker (:py:class:Worker): The worker this job split was actually - submitted to, if there is one. - - ''' - - from .worker import Worker - - # lock self - avoids concurrent update from scheduler/worker - # subsystem - self_ = JobSplit.objects.select_for_update().get(pk=self.pk) - worker_ = Worker.objects.select_for_update().get(pk=worker.pk) - - self.worker = worker - self.save() - - logger.info("Job split %s scheduled at `%s' was assigned to `%s'", - self, self.job.block.queue, self.worker) - - - def signal_io_error(self): - '''Marks the split as having an IOError (cache sync issues, likely) - ''' - - tries = 0 - - while True: - - try: - - with transaction.atomic(): - - # lock self - avoids concurrent update from - # scheduler/worker subsystem - self_ = JobSplit.objects.select_for_update().get(pk=self.pk) - - if self_.start_date is not None: return - - self.cache_errors += 1 - self.status = Job.QUEUED - self.save() - - break - - except utils.OperationalError: - tries += 1 - if tries > settings.MAXIMUM_SPLIT_SAVE_RETRIES: - raise - else: - logger.info("Database error caught starting `%s': retrying " \ - "in 1 second (%d/%d)...", self, tries, - settings.MAXIMUM_SPLIT_SAVE_RETRIES) - # wait a second and retry - time.sleep(1) - - - def start(self): - '''Marks the job as started, acknowledging scheduler assignment - - Once this function is called, a second call no longer alters anything. - ''' - - tries = 0 - - while True: - - try: - - with transaction.atomic(): - - # lock self - avoids concurrent update from - # scheduler/worker subsystem - self_ = JobSplit.objects.select_for_update().get(pk=self.pk) - - if self_.start_date is not None: return - - self.start_date = datetime.datetime.now() - self.process_id = os.getpid() - - self.status = Job.PROCESSING - - self.save() - - logger.info("Job split `%s' was just started.", self) - - self.job._update_state() - - break - - except utils.OperationalError: - tries += 1 - if tries > settings.MAXIMUM_SPLIT_SAVE_RETRIES: - raise - else: - logger.info("Database error caught starting `%s': retrying " \ - "in 1 second (%d/%d)...", self, tries, - settings.MAXIMUM_SPLIT_SAVE_RETRIES) - # wait a second and retry - time.sleep(1) - - - def _cancel(self): - '''Marks this job as cancelled.''' - - # If this split is running, then wait - if self.status == Job.PROCESSING: - with transaction.atomic(): - # lock self - avoids concurrent update from scheduler/worker - # subsystem - self_ = JobSplit.objects.select_for_update().get(pk=self.pk) - self.status = Job.CANCEL - self.save() - - logger.info("Job split `%s' is currently processing. 
Waiting " \ - "for worker to cancel split remotely.", self) - - else: #just end it - self.end(None, Job.CANCELLED) - - - def end(self, result, status=None): - '''Marks the job as finished on the state database - - Disassociates the worker from this job. Once this function is called, a - second call no longer alters anything. - - - Parameters: - - result (:py:class:`Result`): The result of the task - - status (str): One of the possible (single character) Job statuses, in - case ``result`` is not provided. Notice that, if ``result`` is - provided, this variable is **ignored** and the state - (``Job.COMPLETED`` or ``Job.FAILED``) is filled in from - ``result.status``. A ``result.status`` of 0 (zero) indicates a - successful task (set job status to ``Job.COMPLETED``), whereas if the - status is different than zero, the job status is set to - ``Job.FAILED``. - - ''' - - tries = 0 - - while True: - - try: - - with transaction.atomic(): - - # lock self - avoids concurrent update from - # scheduler/worker subsystem - self_ = JobSplit.objects.select_for_update().get(pk=self.pk) - - if self_.done(): return - - if status: - logger.info("Marking job split `%s' as '%s'", self, - status) - - if self.start_date is None: - self.start_date = datetime.datetime.now() - self.end_date = datetime.datetime.now() - self.worker = None #frees worker slot - - if result: - # special condition to handle cancelled jobs - # which should be marked as cancelled, unless - # they have finished before I could act on them - # in those, cases, preserve the result as the - # caches are already setup - if result.status != 0 and self_.status == Job.CANCEL: - self.status = Job.CANCELLED - if result.id is not None: result.delete() - else: - self.status = Job.COMPLETED if \ - result.status == 0 else Job.FAILED - if result.status in (-15, 15, -9, 9): - # job received a term/kill signal - if not result.usrerr: - result.usrerr = 'User process was ' \ - 'terminated by an external agent' - if result.id is None: result.save() - self.result = result - - else: - self.status = status - - self.save() - - logger.info("Job split `%s' finished executing", self) - - self.job._update_state() - - break - - except utils.OperationalError: - tries += 1 - if tries > settings.MAXIMUM_SPLIT_SAVE_RETRIES: - raise - else: - logger.info("Database error caught ending `%s': retrying " \ - "in 1 second (%d/%d)...", self, tries, - settings.MAXIMUM_SPLIT_SAVE_RETRIES) - # wait a second and retry - time.sleep(1) - - - def try_end(self, result): - '''Tries to end the split - ignores if the split was deleted''' - - try: - self.refresh_from_db() - except JobSplit.DoesNotExist: - logger.warn("Job split(pk=%d) does not exist. Likely cancelled, " \ - "so ignoring result `%s'", self.pk, result) - self.end(result) - - - def process(self, cache=settings.CACHE_ROOT, docker_images_cache=None): - '''Process assigned job splits using beat.core - - This task executes the user algorithm on a subprocess. It also serves - the data to the user process so it works like an I/O daemon. - - If ``required_slots == 1``, then this job takes care of the whole data - set. Otherwise, it takes care of a subset of the input data that is - synchronised with this block, determined by ``split_index``. - - - Parameters: - - cache (str, Optional): The path leading to the root of the cache to - use for this run. If not set, use the global default at - ``settings.CACHE_ROOT``. 
- - ''' - - logger.info("Starting to process split `%s' (pid=%d)...", self, - os.getpid()) - - self.executor = None - - config = simplejson.loads(str(self.job.block.command)) - - if settings.DATASETS_UID is not None: - config['datasets_uid'] = settings.DATASETS_UID - - if settings.DATASETS_ROOT_PATH is not None: - config['datasets_root_path'] = settings.DATASETS_ROOT_PATH - - # setup range if necessary - if self.job.block.required_slots > 1: - - if (self.start_index) is None or (self.end_index is None): - message = "The split %d/%d (pid=%d) running on worker `%s' " \ - "for block `%s' of experiment `%s' could not " \ - "be completed: indexes are missing!" % \ - (self.split_index+1, self.job.block.required_slots, - os.getpid(), self.worker, self.job.block.name, - self.job.block.experiment.fullname()) - logger.error(message) - self.try_end(Result(status=1, - usrerr=settings.DEFAULT_USER_ERROR)) - - config['range'] = [self.start_index, self.end_index] - - # For reference, this bit of code should match (or be very similar) to - # the one at beat.cmdline.experiments:run_experiment() - - try: - - if JobSplit.host is None: - JobSplit.host = Host(images_cache=docker_images_cache, - raise_on_errors=not(getattr(settings, 'TEST_CONFIGURATION', False))) - - self.executor = beat.core.execution.DockerExecutor( - JobSplit.host, settings.PREFIX, config, cache - ) - - if not self.executor.valid: - err = '' - for e in self.executor.errors: err += ' * %s\n' % e - message = "Failed to load execution information for split " \ - "%d/%d running at worker `%s', for block `%s' of " \ - "experiment `%s': %s" % (self.split_index+1, - self.job.block.required_slots, - self.worker, self.job.block.name, - self.job.block.experiment.fullname(), err) - raise RuntimeError(message) - - queue = self.job.block.queue - - logger.info("Running `%s' on worker request", - self.executor.algorithm.name) - - # n.b.: with executor may crash on the database view setup - with self.executor: - self.start() - result = self.executor.process( - virtual_memory_in_megabytes=queue.memory_limit, - max_cpu_percent=int(100*float(queue.cores_per_slot)), #allows for 150% - timeout_in_minutes=queue.time_limit - ) - - self.try_end(Result( - status=result['status'], - stdout=result['stdout'], - stderr=result['stderr'], - usrerr=result['user_error'], - _stats=simplejson.dumps(result['statistics'], indent=2), - )) - logger.info("Split `%s' (pid=%d) ended gracefully", self, - os.getpid()) - - except IOError: - logger.warn("Split `%s' (pid=%d) execution raised an IOError: %s", - self, os.getpid(), traceback.format_exc()) - self.signal_io_error() - if self.cache_errors > settings.MAXIMUM_IO_ERRORS: - logger.info("Split `%s' reached the maximum number of IO " \ - "errors (%d > %d). Force failing this split." 
% \ - (self, self.cache_errors, settings.MAXIMUM_IO_ERRORS)) - logger.error(traceback.format_exc()) - self.try_end(Result(status=1, - usrerr=settings.DEFAULT_USER_ERROR)) - else: - logger.info("Split `%s' will be retried (%d/%d)", - self, self.cache_errors, settings.MAXIMUM_IO_ERRORS) - - except Exception: - logger.error("Split `%s' (pid=%d) ended with an error: %s", - self, os.getpid(), traceback.format_exc()) - self.try_end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR)) - - self.executor = None + JobSplit.COMPLETED, + JobSplit.FAILED, + JobSplit.CANCELLED, + ) diff --git a/beat/web/backend/models/queue.py b/beat/web/backend/models/queue.py index ea52698c85477111604318244380ebc38a354f54..d87ae84b3f072d20870936579a2cf444f38557a5 100755 --- a/beat/web/backend/models/queue.py +++ b/beat/web/backend/models/queue.py @@ -120,7 +120,7 @@ class Queue(models.Model): from ..models import Job running = JobSplit.objects.filter(job__block__in=self.blocks.all(), - status=Job.PROCESSING).count() + status=JobSplit.PROCESSING).count() return max(self.number_of_slots() - running, 0) diff --git a/beat/web/backend/models/worker.py b/beat/web/backend/models/worker.py index b63241ff2959f13a9fb6b6cac5a28eb1be52ce29..405b40b23c5721bfdb25849f9646a3553528af5c 100755 --- a/beat/web/backend/models/worker.py +++ b/beat/web/backend/models/worker.py @@ -128,7 +128,7 @@ class Worker(models.Model): def current_load(self): '''Calculates the number of cores being used currently''' - return sum([j.job.block.queue.cores_per_slot for j in self.splits.filter(status=Job.PROCESSING)]) + return sum([j.job.block.queue.cores_per_slot for j in self.splits.filter(status=JobSplit.PROCESSING)]) def available_cores(self): @@ -137,18 +137,18 @@ class Worker(models.Model): return max(self.cores - self.load(), 0) - def deactivate(self, reason): - '''Deactivates the current worker for a reason, that is registered''' - - self.info = reason - self.active = False - - - def activate(self, reason=None): - '''Reactivates the worker, deletes any associated information''' - - self.info = reason - self.active = True + # def deactivate(self, reason): + # '''Deactivates the current worker for a reason, that is registered''' + # + # self.info = reason + # self.active = False + # + # + # def activate(self, reason=None): + # '''Reactivates the worker, deletes any associated information''' + # + # self.info = reason + # self.active = True def as_dict(self): @@ -157,220 +157,220 @@ class Worker(models.Model): return dict(cores=self.cores, memory=self.memory) - def check_environments(self, environments): - '''Checks that this worker has access to all environments it needs - - This method will check if the found set of environments (in the - dictionary ``environments``) contains, at least, one environment for - each environment object this worker is supposed to be able to execute - user algorithms for. - - - Parameters: - - environments (dict): A dictionary of environments found by using - :py:func:`utils.find_environments` in which, keys represent the - natural keys of Django database environments. 
- - - Returns: - - list: A list of missing environments this worker can be assigned to - work with, but where not found - - list: A list of unused environments this worker cannot be assigned to - work with, but where nevertheless found - - ''' - - slots = Slot.objects.filter(worker=self) - queues = Queue.objects.filter(slots__in=slots) - wishlist = Environment.objects.filter(queues__in=queues, active=True) - wishlist = wishlist.order_by('id').distinct() - - required = [k.fullname() for k in wishlist] - missing = [k for k in required if k not in environments] - unused = [k for k in environments if k not in required] - - return missing, unused - - - def update_state(self): - '''Updates state on the database based on current machine readings''' - - # check I have at least all cores and memory I'm supposed to have - cores = psutil.cpu_count() - ram = psutil.virtual_memory().total / (1024 * 1024) - self.info = '' - - if cores < self.cores: - logger.warn("Worker `%s' only has %d cores which is less then " \ - "the value declared on the database - it's not a problem, " \ - "but note this self may get overloaded", self, cores) - self.info += 'only %d cores;' % cores - - if ram < self.memory: - logger.warn("Worker `%s' only has %d Mb of RAM which is less " \ - "then the value declared on the database - it's not a " \ - "problem, but note this self may get overloaded", self, - ram) - self.info += 'only %d Mb of RAM;' % ram - - with transaction.atomic(): - self_ = Worker.objects.select_for_update().get(pk=self.pk) #lock - - # update process and memory usage - self.used_cores = int(psutil.cpu_percent()) - self.used_memory = int(psutil.virtual_memory().percent) - - # save current self state - self.active = True - self.update = False - self.save() - - - def terminate(self): - '''Cleanly terminates a particular worker at the database - - .. note:: - - This method does not destroy running or assigned processes that may - be running or assigned to this worker. This is implemented in this - way to allow for a clean replacement of the worker program w/o an - interruption of the backend service. - - ''' - - from ..models import JobSplit - from ..models import Job - - # disables worker, so no more splits can be assigned to it - with transaction.atomic(): - self_ = Worker.objects.select_for_update().get(pk=self.pk) - self_.active = False - self_.used_cores = 0 - self_.used_memory = 0 - self_.info = 'Worker deactivated by system administrator' - self_.save() - - # cancel job splits which should be cancelled anyways - for j in JobSplit.objects.filter(worker=self, status=Job.CANCEL, - end_date__isnull=True, process_id__isnull=False): - if psutil.pid_exists(j.process_id): - os.kill(j.process_id, signal.SIGTERM) - - # cleans-up zombie processes that may linger - _cleanup_zombies() - - - def shutdown(self): - '''Removes all running/assigned jobs from the queue, shuts down - - This method should be used with care as it may potentially cancel all - assigned splits for the current worker. 
- - ''' - - from ..models import JobSplit - from ..models import Job - - self.terminate() - - message = 'Cancelled on forced worker shutdown (maintenance)' \ - ' - you may retry submitting your experiment shortly' - - # cancel job splits which were not yet started - for j in JobSplit.objects.filter(worker=self, status=Job.QUEUED, - start_date__isnull=True, process_id__isnull=True): - j.end(Result(status=1, usrerr=message)) - - # cancel job splits which are running - for j in JobSplit.objects.filter(worker=self, status=Job.PROCESSING, - end_date__isnull=True, process_id__isnull=False): - j._cancel() - - - - def work(self, environments, process): - '''Launches user code on isolated processes - - This function is supposed to be called asynchronously, by a - scheduled agent, every few seconds. It examines job splits assigned - to the current host and launches an individual process to handle - these splits. The process is started locally and the process ID - stored with the split. - - Job split cancelling is executed by setting the split state as - ``CANCEL`` and waiting for this function to handle it. - - - Parameters: - - environments (dict): A dictionary containing installed - environments, their description and execute-file paths. - - process (str): The path to the ``process.py`` program to use for - running the user code on isolated processes. - - ''' - - from ..models import JobSplit - from ..models import Job - - # refresh state from database and update state if required - self.refresh_from_db() - if self.update: self.update_state() - - # cancel job splits by killing associated processes - for j in JobSplit.objects.filter(worker=self, status=Job.CANCEL, - end_date__isnull=True): - if j.process_id is not None and psutil.pid_exists(j.process_id): - os.kill(j.process_id, signal.SIGTERM) - else: # process went away without any apparent reason - with transaction.atomic(): - message = "Split %d/%d running at worker `%s' for " \ - "block `%s' of experiment `%s' finished before " \ - "even starting. Force-cancelling job split at " \ - "database..." % (j.split_index+1, - j.job.block.required_slots, - self, - j.job.block.name, - j.job.block.experiment.fullname(), - ) - logger.error(message) - j.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR)) - - # cmdline base argument - cmdline = [process] - if settings.DEBUG: - cmdline += ['-vv'] - else: - cmdline += ['-v'] - - # start newly assigned job splits - with transaction.atomic(): - splits = JobSplit.objects.select_for_update().filter(worker=self, - status=Job.QUEUED, start_date__isnull=True, - process_id__isnull=True) - for split in splits: - # if we get to this point, then we launch the user process - # -> see settings.WORKER_DETACH_CHILDREN for more info - kwargs = dict() - if settings.WORKER_DETACH_CHILDREN: - kwargs['preexec_fn'] = os.setpgrp - subprocess.Popen(cmdline + [str(split.pk)], **kwargs) - split.status = Job.PROCESSING #avoids re-running - split.save() - - # cleans-up zombie processes that may linger - _cleanup_zombies() - - - def __enter__(self): - self.update_state() - return self - - - def __exit__(self, *exc): - self.terminate() - return False #propagate exceptions + # def check_environments(self, environments): + # '''Checks that this worker has access to all environments it needs + # + # This method will check if the found set of environments (in the + # dictionary ``environments``) contains, at least, one environment for + # each environment object this worker is supposed to be able to execute + # user algorithms for. 
+ # + # + # Parameters: + # + # environments (dict): A dictionary of environments found by using + # :py:func:`utils.find_environments` in which, keys represent the + # natural keys of Django database environments. + # + # + # Returns: + # + # list: A list of missing environments this worker can be assigned to + # work with, but where not found + # + # list: A list of unused environments this worker cannot be assigned to + # work with, but where nevertheless found + # + # ''' + # + # slots = Slot.objects.filter(worker=self) + # queues = Queue.objects.filter(slots__in=slots) + # wishlist = Environment.objects.filter(queues__in=queues, active=True) + # wishlist = wishlist.order_by('id').distinct() + # + # required = [k.fullname() for k in wishlist] + # missing = [k for k in required if k not in environments] + # unused = [k for k in environments if k not in required] + # + # return missing, unused + # + # + # def update_state(self): + # '''Updates state on the database based on current machine readings''' + # + # # check I have at least all cores and memory I'm supposed to have + # cores = psutil.cpu_count() + # ram = psutil.virtual_memory().total / (1024 * 1024) + # self.info = '' + # + # if cores < self.cores: + # logger.warn("Worker `%s' only has %d cores which is less then " \ + # "the value declared on the database - it's not a problem, " \ + # "but note this self may get overloaded", self, cores) + # self.info += 'only %d cores;' % cores + # + # if ram < self.memory: + # logger.warn("Worker `%s' only has %d Mb of RAM which is less " \ + # "then the value declared on the database - it's not a " \ + # "problem, but note this self may get overloaded", self, + # ram) + # self.info += 'only %d Mb of RAM;' % ram + # + # with transaction.atomic(): + # self_ = Worker.objects.select_for_update().get(pk=self.pk) #lock + # + # # update process and memory usage + # self.used_cores = int(psutil.cpu_percent()) + # self.used_memory = int(psutil.virtual_memory().percent) + # + # # save current self state + # self.active = True + # self.update = False + # self.save() + # + # + # def terminate(self): + # '''Cleanly terminates a particular worker at the database + # + # .. note:: + # + # This method does not destroy running or assigned processes that may + # be running or assigned to this worker. This is implemented in this + # way to allow for a clean replacement of the worker program w/o an + # interruption of the backend service. + # + # ''' + # + # from ..models import JobSplit + # from ..models import Job + # + # # disables worker, so no more splits can be assigned to it + # with transaction.atomic(): + # self_ = Worker.objects.select_for_update().get(pk=self.pk) + # self_.active = False + # self_.used_cores = 0 + # self_.used_memory = 0 + # self_.info = 'Worker deactivated by system administrator' + # self_.save() + # + # # cancel job splits which should be cancelled anyways + # for j in JobSplit.objects.filter(worker=self, status=JobSplit.CANCELLING, + # end_date__isnull=True, process_id__isnull=False): + # if psutil.pid_exists(j.process_id): + # os.kill(j.process_id, signal.SIGTERM) + # + # # cleans-up zombie processes that may linger + # _cleanup_zombies() + # + # + # def shutdown(self): + # '''Removes all running/assigned jobs from the queue, shuts down + # + # This method should be used with care as it may potentially cancel all + # assigned splits for the current worker. 
+ # + # ''' + # + # from ..models import JobSplit + # from ..models import Job + # + # self.terminate() + # + # message = 'Cancelled on forced worker shutdown (maintenance)' \ + # ' - you may retry submitting your experiment shortly' + # + # # cancel job splits which were not yet started + # for j in JobSplit.objects.filter(worker=self, status=JobSplit.QUEUED, + # start_date__isnull=True, process_id__isnull=True): + # j.end(Result(status=1, usrerr=message)) + # + # # cancel job splits which are running + # for j in JobSplit.objects.filter(worker=self, status=JobSplit.PROCESSING, + # end_date__isnull=True, process_id__isnull=False): + # j._cancel() + # + # + # + # def work(self, environments, process): + # '''Launches user code on isolated processes + # + # This function is supposed to be called asynchronously, by a + # scheduled agent, every few seconds. It examines job splits assigned + # to the current host and launches an individual process to handle + # these splits. The process is started locally and the process ID + # stored with the split. + # + # Job split cancelling is executed by setting the split state as + # ``CANCEL`` and waiting for this function to handle it. + # + # + # Parameters: + # + # environments (dict): A dictionary containing installed + # environments, their description and execute-file paths. + # + # process (str): The path to the ``process.py`` program to use for + # running the user code on isolated processes. + # + # ''' + # + # from ..models import JobSplit + # from ..models import Job + # + # # refresh state from database and update state if required + # self.refresh_from_db() + # if self.update: self.update_state() + # + # # cancel job splits by killing associated processes + # for j in JobSplit.objects.filter(worker=self, status=JobSplit.CANCELLING, + # end_date__isnull=True): + # if j.process_id is not None and psutil.pid_exists(j.process_id): + # os.kill(j.process_id, signal.SIGTERM) + # else: # process went away without any apparent reason + # with transaction.atomic(): + # message = "Split %d/%d running at worker `%s' for " \ + # "block `%s' of experiment `%s' finished before " \ + # "even starting. Force-cancelling job split at " \ + # "database..." 
% (j.split_index+1, + # j.job.block.required_slots, + # self, + # j.job.block.name, + # j.job.block.experiment.fullname(), + # ) + # logger.error(message) + # j.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR)) + # + # # cmdline base argument + # cmdline = [process] + # if settings.DEBUG: + # cmdline += ['-vv'] + # else: + # cmdline += ['-v'] + # + # # start newly assigned job splits + # with transaction.atomic(): + # splits = JobSplit.objects.select_for_update().filter(worker=self, + # status=JobSplit.QUEUED, start_date__isnull=True, + # process_id__isnull=True) + # for split in splits: + # # if we get to this point, then we launch the user process + # # -> see settings.WORKER_DETACH_CHILDREN for more info + # kwargs = dict() + # if settings.WORKER_DETACH_CHILDREN: + # kwargs['preexec_fn'] = os.setpgrp + # subprocess.Popen(cmdline + [str(split.pk)], **kwargs) + # split.status = JobSplit.PROCESSING #avoids re-running + # split.save() + # + # # cleans-up zombie processes that may linger + # _cleanup_zombies() + # + # + # def __enter__(self): + # self.update_state() + # return self + # + # + # def __exit__(self, *exc): + # self.terminate() + # return False #propagate exceptions diff --git a/beat/web/backend/schedule.py b/beat/web/backend/schedule.py deleted file mode 100755 index 06eb7c1405457b8d2b7ca65fa1da884769893790..0000000000000000000000000000000000000000 --- a/beat/web/backend/schedule.py +++ /dev/null @@ -1,244 +0,0 @@ -#!/usr/bin/env python -# vim: set fileencoding=utf-8 : - -############################################################################### -# # -# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ # -# Contact: beat.support@idiap.ch # -# # -# This file is part of the beat.web module of the BEAT platform. # -# # -# Commercial License Usage # -# Licensees holding valid commercial BEAT licenses may use this file in # -# accordance with the terms contained in a written agreement between you # -# and Idiap. For further information contact tto@idiap.ch # -# # -# Alternatively, this file may be used under the terms of the GNU Affero # -# Public License version 3 as published by the Free Software and appearing # -# in the file LICENSE.AGPL included in the packaging of this file. # -# The BEAT platform is distributed in the hope that it will be useful, but # -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # -# or FITNESS FOR A PARTICULAR PURPOSE. # -# # -# You should have received a copy of the GNU Affero Public License along # -# with the BEAT platform. If not, see http://www.gnu.org/licenses/. # -# # -############################################################################### - -'''Scheduling functions and utilities''' - -import logging -logger = logging.getLogger(__name__) - -from django.db import transaction - -from .models import Job, JobSplit, Queue, Worker - - -def _select_splits_for_queue(queue): - '''Returns a list of job splits that can run now, at a certain queue - - Here is the work done: - - 1. Find the queue availability. This is a bit tricky as queues are only - allowed to consume a limited (configurable) number of slots in each - worker, per user - - 2. Calculate runnable job splits - - 3. TODO: Calculates the list of job splits that can potentially run now - (for which there is space in the current queue being analyzed), taking - into consideration the relative use for every user and giving more - priority to user with less load - - 4. 
Return such a list clipping it so that the number of job splits returned - does not exceed the queue availability - - The work is done inside the "queue" domain: it does not take into - consideration other job splits that may also have slots shared on one of - the machines this queue also has slots on. - - ''' - - splits = JobSplit.objects.filter(job__block__queue=queue, - worker__isnull=True, job__split_errors=0, - status=Job.QUEUED).order_by('job__runnable_date') - - # lists of all users currently running - users = set(splits.values_list('job__block__experiment__author', flat=True)) - - # number of splits (== slots) running on this queue for each user - user_slots = [JobSplit.objects.filter(job__block__experiment__author=k, - job__block__queue=queue, status=Job.PROCESSING).count() for k in users] - - allowance = [queue.max_slots_per_user - k for k in user_slots] - allowance = dict(zip(users, allowance)) - - # limit runnable splits so we reach a maximum of allowed user slots - splits_to_consider = [] - for s in splits: - author = s.job.block.experiment.author.id - if allowance[author] > 0: - splits_to_consider.append(s) - allowance[author] -= 1 - - # TODO: Sort splits taking into consideration current user load - - # now we have a list of job splits, within each users' allowance, we could - # submit, clip it by using the total number of available slots on the queue - # the list is ordered having oldest job splits first - return splits_to_consider[:queue.availability()] - - -def schedule(): - '''Schedules job splits that can run now, respecting user/queue usage - - The priorities are: - - 1. job splits that require more cores to run (more resource intense) - 2. job splits that are older (runnable since a long time) - - Reasoning: Jobs which require more cores are prioritary over job splits - that don't require that many resources. The reason for this is practical: - nodes associated to queues with more cores maybe used by queues that - requires less slots in case no job splits requiring more cores are present, - which will happen most of the time if there are more slots available with - less resources. This will not work if either priorities are not established - for the use of each node or you only have 1 machine hosting all queues. In - these cases, you may face a situation in which job splits requesting more - cores block the execution of job splits requesting less cores because of - the lack of farm avalability. - - For example, a good setup could be like this (note priorities in this case - are not useful, so they are all set to 0): - - Queue A (1 core/4Gb RAM/60 minutes/1 job per user): - - Host 1 (1 core/4Gb): 1 slot (priority 0) - - Host 2 (2 cores/8Gb): 2 slots (priority 0) - - Queue B (2 cores/8Gb RAM/120 minutes/1 job per user): - - Host 2 (2 cores/8Gb): 1 slot (priority 0) - - In this way: job splits for Queue B have priority of job splits for Queue - A. - - If no job splits are scheduled requiring 2 cores, then we have 3 slots - available to run simpler job splits. In case a job with 2 cores pops-up, - then we must make sure 1-core job splits running at ``Host 2`` are freed so - to run that job. To do that, we must first check on more resource intensive - job splits and then order those by age, processing one after the other. 
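
In pseudo-code, the ordering policy described in the docstring above boils
down to the sketch below (helper names such as `runnable_splits` are
illustrative stand-ins, not part of this module's actual API):

    # Sketch: yield job splits in the priority order described above:
    # queues requiring more cores per slot first, then, within a queue,
    # the splits that have been runnable for the longest time.
    def splits_in_scheduling_order(queues):
        for queue in sorted(queues, key=lambda q: -q.cores_per_slot):
            for split in sorted(queue.runnable_splits(),
                                key=lambda s: s.job.runnable_date):
                yield split
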
- - You may further setup a queue for "special" users that can run more job - splits in parallel, like this: - - Queue A (1 core/4Gb RAM/60 minutes/1 job per user): - - Host 1 (1 core/4Gb): 1 slot (priority 0) - - Host 2 (2 cores/8Gb): 2 slots (priority 0) - - Queue A* (1 core/4Gb RAM/60 minutes/1 job per user): - - Host 1 (1 core/4Gb): 1 slot (priority 0) - - Host 2 (2 cores/8Gb): 2 slots (priority 1) - - Queue B (2 cores/8Gb RAM/120 minutes/1 job per user): - - Host 2 (2 cores/8Gb): 1 slot (priority 2) - - Queue A* may be setup so that some "special" users can run more job splits - than others, having the same configuration as Queue A, except for the - priority on Host 2, which is higher. Users that have access to Queue A or - Queue A* can use Host 1 on a "first come first served" basis (as they have - the same priority). Host 2 is primarily used for 2-core job splits. If job - splits requiring a single core appears to run on Host 2 (from either Queue - A or A*), then job splits from Queue A* have priority. - - Here is a summary of the work done: - - 1. Use :py:func:`_select_splits_for_queue` to figure out which splits can - actually run on a queue basis, respecting user limits - - 2. For those splits, prioritize execution by: number of cores required per - job split and then, by age. - - 3. Remove splits, based on the order, which have the same output - - 4. Attribute more resource intensive job splits to the slots that are free. - If no slots are free, then virtually block free slots on hosts that can - process job splits requiring more resources. This will guarantee the - remaining cores will be freed in the future and this resource-intensive - job splits will be able to run at a certain point. - - 5. For each job split on the list, check the queue virtual availability - (taking into consideration blocking as executed on step 3) and leave the - job split on the list if there is a free slot, otherwise remove it. 
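
Step 1 of this summary is the per-user clipping implemented by
`_select_splits_for_queue` above; schematically, it amounts to the minimal
sketch below (same variable names as the function above, with `allowance`
assumed to map each user id to the number of slots still available to that
user on this queue):

    # Sketch: keep the oldest runnable splits that still fit within each
    # user's remaining slot allowance; everything else waits for the next
    # scheduling round.
    def clip_to_allowance(splits, allowance):
        kept = []
        for s in splits:
            author = s.job.block.experiment.author.id
            if allowance.get(author, 0) > 0:
                kept.append(s)
                allowance[author] -= 1
        return kept
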
- - - Returns: - - list: The list of splits assigned at this scheduling iteration - - ''' - - for j in Job.objects.filter(status=Job.QUEUED, runnable_date__isnull=True): - if j.block.is_runnable(): - j._make_runnable() - - - # updates jobs with split errors, cancel experiments if problems occur - for j in Job.objects.filter(split_errors__gt=0): - j._split_indices() - - # get queues in a good order - sorted_queues = Queue.objects.order_by('-cores_per_slot', - 'max_slots_per_user') - - splits_to_consider = \ - [_select_splits_for_queue(q) for q in sorted_queues] - - if not any(splits_to_consider): return [] - - logger.debug('Considering splits: %s', splits_to_consider) - - # decides which split to run considering the 'on-the-fly' availability - - # workers that can run job splits - whitelist = {} - for worker in Worker.objects.filter(active=True): - availability = worker.available_cores() - if availability <= 0: continue - whitelist[worker] = availability - - assigned_splits = [] - - logger.debug('Worker availability: %s', whitelist) - - for batch in splits_to_consider: - - if not batch: continue #empty list - candidates = batch[0].job.block.queue.worker_availability() - - for split in batch: - - assigned = False - required_cores = split.job.block.queue.cores_per_slot - - for c in candidates: - avail = whitelist.get(c, 0) - if not avail: continue #should not use this worker - if avail >= required_cores: - logger.debug("Assigning `%s' to worker `%s'", split, c) - split.schedule(c) #assign job split to worker - assigned_splits.append(split) - whitelist[c] -= required_cores - logger.debug("`%s' cores available: %d", c, whitelist[c]) - assigned = True - break - - if not assigned and required_cores > 1: - #blacklist the most promising worker, otherwise will never be - #picked - c0 = candidates[0] - if c0 in whitelist: - logger.info("Could not assign `%s' so blacklisting `%s'", - split, c0) - del whitelist[c0] - - return assigned_splits diff --git a/beat/web/backend/state.py b/beat/web/backend/state.py old mode 100644 new mode 100755 index 0b37ea6843cb8524abaa7ddb09fa7e82cd126cfa..bfd69e657a7f4ea5ff8c342cfb10fe6f543d9c05 --- a/beat/web/backend/state.py +++ b/beat/web/backend/state.py @@ -64,12 +64,11 @@ def jobs(): return dict( total=JobSplit.objects.count(), - running=JobSplit.objects.filter(status=Job.PROCESSING).count(), - queued=JobSplit.objects.filter(status=Job.QUEUED).count(), - cancelled=JobSplit.objects.filter(status=Job.CANCELLED).count(), - skipped=JobSplit.objects.filter(status=Job.SKIPPED).count(), - completed=JobSplit.objects.filter(status=Job.COMPLETED).count(), - failed=JobSplit.objects.filter(status=Job.FAILED).count(), + running=JobSplit.objects.filter(status=JobSplit.PROCESSING).count(), + queued=JobSplit.objects.filter(status=JobSplit.QUEUED).count(), + cancelled=JobSplit.objects.filter(status=JobSplit.CANCELLED).count(), + completed=JobSplit.objects.filter(status=JobSplit.COMPLETED).count(), + failed=JobSplit.objects.filter(status=JobSplit.FAILED).count(), ) diff --git a/beat/web/backend/templatetags/backend_tags.py b/beat/web/backend/templatetags/backend_tags.py old mode 100644 new mode 100755 index b3be2c7db7a7938274d55aafde560b40a57a7243..8d05e57fb538394c8728860caf05ad3c0e62fd36 --- a/beat/web/backend/templatetags/backend_tags.py +++ b/beat/web/backend/templatetags/backend_tags.py @@ -29,7 +29,7 @@ from django import template from django.contrib.auth.models import User -from ..models import Job +from ..models import JobSplit register = template.Library() @@ -89,5 +89,5 @@ 
def visible_queues(context, object):
 def count_job_splits(xp, status=None):
     """Returns job splits for an experiment in a certain state"""
     if status == 'A':
-        return xp.job_splits(status=Job.QUEUED).filter(worker__isnull=False).count()
+        return xp.job_splits(status=JobSplit.QUEUED).filter(worker__isnull=False).count()
     return xp.job_splits(status=status).count()
diff --git a/beat/web/backend/tests.py b/beat/web/backend/tests.py
deleted file mode 100755
index eb52eebaa33280882b13a3b516bcfe7ca8a75fd8..0000000000000000000000000000000000000000
--- a/beat/web/backend/tests.py
+++ /dev/null
@@ -1,2835 +0,0 @@
-#!/usr/bin/env python
-# vim: set fileencoding=utf-8 :
-
-###############################################################################
-#                                                                             #
-# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/           #
-# Contact: beat.support@idiap.ch                                              #
-#                                                                             #
-# This file is part of the beat.web module of the BEAT platform.              #
-#                                                                             #
-# Commercial License Usage                                                    #
-# Licensees holding valid commercial BEAT licenses may use this file in      #
-# accordance with the terms contained in a written agreement between you     #
-# and Idiap. For further information contact tto@idiap.ch                     #
-#                                                                             #
-# Alternatively, this file may be used under the terms of the GNU Affero     #
-# Public License version 3 as published by the Free Software and appearing   #
-# in the file LICENSE.AGPL included in the packaging of this file.            #
-# The BEAT platform is distributed in the hope that it will be useful, but   #
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
-# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
-#                                                                             #
-# You should have received a copy of the GNU Affero Public License along     #
-# with the BEAT platform. If not, see http://www.gnu.org/licenses/.           #
-#                                                                             #
-###############################################################################
-
-import os
-import sys
-import time
-import shutil
-import tempfile
-import collections
-import time
-
-from django.conf import settings
-from django.core.urlresolvers import reverse
-from django.core import management
-from django.contrib.auth.models import User, Group
-from django.test import TestCase, TransactionTestCase
-
-from guardian.shortcuts import get_perms
-
-from ..common.testutils import BaseTestCase as APITestCase, tearDownModule
-from ..experiments.models import Experiment, Block
-from ..algorithms.models import Algorithm
-from ..utils.management.commands import install
-from ..statistics.models import HourlyStatistics
-
-from .models import Queue, Worker, Slot, Environment, Job, JobSplit, Result
-from .utils import cleanup_cache, dump_backend, setup_backend
-from .management.commands import qsetup
-from .schedule import schedule
-
-
-def _sleep(tries, condition):
-    """For some reason, time.sleep is not reliable on this test unit.
Use this""" - - seconds = 1.0 #between tries - for i in range(tries): - if condition(): return - slept = 0 - while slept < seconds: - start = time.time() - time.sleep(seconds - slept) - slept += time.time() - start - - -# Example configuration with 3 queues with an increasing amount of resources -# running on the same host -QUEUES_WITHOUT_PRIORITY = { - "queues": collections.OrderedDict([ - ("q1", { - "memory-limit": 4*1024, - "time-limit": 180, #3 hours - "cores-per-slot": 1, - "max-slots-per-user": 4, - "environments": ['Python 2.7 (1.1.0)'], - "groups": [ - "Default", - ], - "slots": { - "node1": { - "quantity": 4, - "priority": 0 - } - } - } - ), - ("q2", { - "memory-limit": 8*1024, - "time-limit": 360, #6 hours - "cores-per-slot": 2, - "max-slots-per-user": 2, - "environments": ['Python 2.7 (1.1.0)'], - "groups": [ - "Default", - ], - "slots": { - "node1": { - "quantity": 2, - "priority": 0 - }, - } - } - ), - ("q4", { - "memory-limit": 16*1024, - "time-limit": 720, #12 hours - "cores-per-slot": 4, - "max-slots-per-user": 1, - "environments": ['Python 2.7 (1.1.0)'], - "groups": [ - "Default", - ], - "slots": { - "node1": { - "quantity": 1, - "priority": 0 - }, - } - } - ) - ]), - "workers": { - "node1": { - "cores": 4, - "memory": 16*1024, - } - }, - "environments": { - 'Python 2.7 (1.1.0)': { - "name": 'Python 2.7', - "version": '1.1.0', - "short_description": "Test", - "description": "Test environment", - "languages": "python", - }, - }, - } - -# Example configuration with 3 queues sharing slots on 2 hosts -PRIORITY_QUEUES = { - "queues": collections.OrderedDict([ - ("q1", { - "memory-limit": 4*1024, - "time-limit": 180, #3 hours - "cores-per-slot": 1, - "max-slots-per-user": 2, - "environments": ['Python 2.7 (1.1.0)'], - "groups": [ - "Default", - ], - "slots": { - "node1": { - "quantity": 4, - "priority": 5 - }, - "node2": { - "quantity": 4, - "priority": 0 - }, - } - }, - ), - ("q2", { - "memory-limit": 8*1024, - "time-limit": 360, #6 hours - "cores-per-slot": 2, - "max-slots-per-user": 1, - "environments": ['Python 2.7 (1.1.0)'], - "groups": [ - "Default", - ], - "slots": { - "node1": { - "quantity": 2, - "priority": 0 - }, - "node2": { - "quantity": 2, - "priority": 10 - } - } - }, - ), - ("q1_special", { - "memory-limit": 4*1024, - "time-limit": 180, #3 hours - "cores-per-slot": 1, - "max-slots-per-user": 8, - "environments": ['Python 2.7 (1.1.0)'], - "groups": [ - "Default", - ], - "slots": { - "node1": { - "quantity": 4, - "priority": 0 - }, - "node2": { - "quantity": 4, - "priority": 5 - } - } - } - ), - ]), - "workers": collections.OrderedDict([ - ("node1", { - "cores": 4, - "memory": 32*1024, - } - ), - ("node2", { - "cores": 4, - "memory": 16*1024, - } - ) - ]), - "environments": { - 'Python 2.7 (1.1.0)': { - "name": 'Python 2.7', - "version": '1.1.0', - "short_description": "Test", - "description": "Test environment", - "languages": "python", - }, - }, - } - - -class CancelAllExperimentsAPI(APITestCase): - - def setUp(self): - self.url = reverse('backend:cancel-experiments') - - - def test_no_access_for_anonymous_user(self): - response = self.client.get(self.url) - self.checkResponse(response, 302) #redirects to login page - - - def test_no_access_for_non_superuser(self): - User.objects.create_user('johndoe', 'johndoe@test.org', '1234') - self.client.login(username='johndoe', password='1234') - response = self.client.get(self.url) - self.checkResponse(response, 403) - - -class CacheCleanUp(TestCase): - - - def setUp(self): - self.cache = tempfile.mkdtemp(prefix='beat_') 
- - - def tearDown(self): - shutil.rmtree(self.cache) - - - def touch(self, f, times=None): - """Replicates the `touch' command-line utility""" - with open(f, 'a'): os.utime(f, times) - - - def J(self, *args): - return os.path.join(*((self.cache,) + args)) - - - def prepare_cleanup_full(self): - - # creates a temporary directory structure - os.makedirs(self.J('a', 'b', 'c')) - os.makedirs(self.J('a', 'c', 'd')) - os.makedirs(self.J('a', 'c', 'e')) - self.touch(self.J('a', 'b', 'c', 'd.json')) - self.touch(self.J('a', 'c', 'd', 'e.json')) - - - def check_cleanup_full(self): - - assert not os.listdir(self.cache) - - - def test_cache_cleanup_full(self): - - self.prepare_cleanup_full() - cleanup_cache(self.cache, delete=True) - self.check_cleanup_full() - - - def test_cmd_cleanup_full(self): - - self.prepare_cleanup_full() - management.call_command('cleanup_cache', path=self.cache, - verbosity=0, delete=True) - self.check_cleanup_full() - - - def prepare_cleanup_aged(self): - - two_min_ago = time.time() - 60*2 - - # creates a temporary directory structure - os.makedirs(self.J('a', 'b', 'c')) - os.makedirs(self.J('a', 'c', 'd')) - os.makedirs(self.J('a', 'c', 'e')) - self.touch(self.J('a', 'b', 'c', 'd.json'), (two_min_ago, two_min_ago)) - self.touch(self.J('a', 'c', 'd', 'e.json')) - - - def check_cleanup_aged(self): - - assert os.path.exists(self.J('a', 'c', 'd', 'e.json')) - assert not os.path.exists(self.J('a', 'b', 'c')) - assert not os.path.exists(self.J('a', 'b', 'c', 'd.json')) - assert not os.path.exists(self.J('a', 'b', 'e')) - - - def test_cache_cleanup_aged(self): - - self.prepare_cleanup_aged() - cleanup_cache(self.cache, age_in_minutes=2, delete=True) - self.check_cleanup_aged() - - - def test_cmd_cleanup_aged(self): - - self.prepare_cleanup_aged() - management.call_command('cleanup_cache', path=self.cache, - verbosity=0, olderthan=2, delete=True) - self.check_cleanup_aged() - - - def prepare_cleanup_lock(self): - - two_min_ago = time.time() - 60*2 - ten_min_ago = time.time() - 60*10 - - # creates a temporary directory structure - os.makedirs(self.J('a', 'b', 'c')) - os.makedirs(self.J('a', 'c', 'd')) - os.makedirs(self.J('a', 'c', 'e')) - self.touch(self.J('a', 'b', 'c', 'd.json'), (two_min_ago, two_min_ago)) - self.touch(self.J('a', 'c', 'd', 'e.json'), (ten_min_ago, ten_min_ago)) - - self.touch(self.J('a', 'c', 'd', 'e.lock')) #create a lock - - - def check_cleanup_lock(self): - - assert os.path.exists(self.J('a', 'c', 'd', 'e.json')) - assert not os.path.exists(self.J('a', 'b', 'c')) - assert not os.path.exists(self.J('a', 'b', 'c', 'd.json')) - assert not os.path.exists(self.J('a', 'b', 'e')) - - - def test_cache_cleanup_lock(self): - - self.prepare_cleanup_lock() - cleanup_cache(self.cache, delete=True) - self.check_cleanup_lock() - - - def test_cmd_cleanup_lock(self): - - self.prepare_cleanup_lock() - management.call_command('cleanup_cache', path=self.cache, - verbosity=0, delete=True) - self.check_cleanup_lock() - - -class BaseBackendTestCase(TestCase): - - - @classmethod - def setUpTestData(cls): - install.create_sites() - system_user, plot_user, user = install.create_users('user', 'user') - install.add_group('Default') - - setup_backend(qsetup.DEFAULT_CONFIGURATION) - - Worker.objects.update(active=True) - env = Environment.objects.get(name='Python 2.7') - queue = Queue.objects.first() - - template_data = dict( - system_user = system_user, - plot_user = plot_user, - user = user, - private = False, - queue = queue.name, - environment = dict(name=env.name, 
version=env.version), - ) - prefix = os.path.join( - os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))), - 'src', - 'beat.examples', - ) - install.install_contributions(prefix, 'system', template_data) - install.install_contributions(prefix, 'test', template_data) - - - def check_single(self, xp): - '''Checks user/user/single/1/single''' - - self.assertEqual(xp.blocks.count(), 2) - - b0 = xp.blocks.all()[0] - - self.assertEqual(b0.name, 'echo') - self.assertEqual(b0.status, Block.PENDING) - self.assertEqual(b0.algorithm, - Algorithm.objects.get(name='integers_echo')) - self.assertEqual(b0.dependencies.count(), 0) - self.assertEqual(b0.dependents.count(), 1) - self.assertEqual(b0.job.status, Job.QUEUED) - self.assertEqual(b0.job.parent, None) - self.assertEqual(b0.job.child_, None) - self.assertEqual(b0.queue.name, 'queue') - self.assertEqual(b0.environment.name, 'Python 2.7') - self.assertEqual(b0.required_slots, 1) - self.assertEqual(b0.inputs.count(), 1) - self.assertEqual(b0.outputs.count(), 1) - self.assertEqual(b0.job.splits.count(), 0) #not scheduled yet - - assert not b0.done() - - b1 = xp.blocks.all()[1] - - self.assertEqual(b1.name, 'analysis') - self.assertEqual(b1.status, Block.PENDING) - self.assertEqual(b1.algorithm, - Algorithm.objects.get(name='integers_echo_analyzer')) - self.assertEqual(b1.dependencies.count(), 1) - self.assertEqual(b1.dependents.count(), 0) - self.assertEqual(b1.job.status, Job.QUEUED) - self.assertEqual(b1.job.parent, None) - self.assertEqual(b1.job.child_, None) - self.assertEqual(b1.queue.name, 'queue') - self.assertEqual(b1.environment.name, 'Python 2.7') - self.assertEqual(b1.required_slots, 1) - self.assertEqual(b1.inputs.count(), 1) - self.assertEqual(b1.outputs.count(), 1) - self.assertEqual(b1.job.splits.count(), 0) #not scheduled yet - - assert not b1.done() - - - -class BackendSetup(BaseBackendTestCase): - - - def check_default_config(self): - - # checks all is there - self.assertEqual(dump_backend(), qsetup.DEFAULT_CONFIGURATION) - - worker = Worker.objects.get() - queue = Queue.objects.get() - Worker.objects.update(active=True) - - self.assertEqual(worker.available_cores(), qsetup.CORES) - self.assertEqual(list(worker.slots.values_list('id', flat=True)), - list(queue.slots.values_list('id', flat=True))) - - # worker has no job splits assigned to it - self.assertEqual(worker.splits.count(), 0) - - self.assertEqual(queue.availability(), qsetup.CORES) - self.assertEqual(queue.number_of_slots(), qsetup.CORES) - self.assertEqual(queue.worker_availability(), [worker]) - - # checks the single slot and priority - slot = queue.slots.get() - self.assertEqual(slot.quantity, qsetup.CORES) - self.assertEqual(slot.priority, 0) - self.assertEqual(slot.worker, worker) - - # checks no orphan slots exist - self.assertEqual(Slot.objects.filter(queue=None).count(), 0) - self.assertEqual(Slot.objects.filter(worker=None).count(), 0) - - - def test_setup(self): - - self.check_default_config() - - - def test_cmd_reset(self): - - # installs the default configuration command - management.call_command('qsetup', verbosity=0, reset=True) - self.check_default_config() - - - def check_noprior_config(self): - - qs = Queue.objects.all() - - self.assertEqual(qs.count(), 3) - - q1, q2, q3 = qs.order_by('name') - - self.assertEqual(q1.name, 'q1') - self.assertEqual(q2.name, 'q2') - self.assertEqual(q3.name, 'q4') - self.assertEqual(q1.splits().count(), 0) - self.assertEqual(q2.splits().count(), 0) - self.assertEqual(q3.splits().count(), 0) - - 
self.assertEqual(q1.number_of_slots(), 4) - self.assertEqual(q2.number_of_slots(), 2) - self.assertEqual(q3.number_of_slots(), 1) - self.assertEqual(q1.availability(), 4) - self.assertEqual(q2.availability(), 2) - self.assertEqual(q3.availability(), 1) - self.assertEqual(q1.environments.count(), 1) - self.assertEqual(q2.environments.count(), 1) - self.assertEqual(q3.environments.count(), 1) - - self.assertEqual(q1.environments.first(), q2.environments.first()) - self.assertEqual(q2.environments.first(), q3.environments.first()) - - env = q1.environments.first() - - self.assertEqual(env.name, 'Python 2.7') - self.assertEqual(env.version, '1.1.0') - - self.assertEqual(q1.slots.count(), 1) - self.assertEqual(q2.slots.count(), 1) - self.assertEqual(q3.slots.count(), 1) - - slot1 = q1.slots.first() - slot2 = q2.slots.first() - slot3 = q3.slots.first() - - self.assertEqual(slot1.quantity, 4) - self.assertEqual(slot1.priority, 0) - self.assertEqual(slot1.queue, q1) - self.assertEqual(slot2.quantity, 2) - self.assertEqual(slot2.priority, 0) - self.assertEqual(slot2.queue, q2) - self.assertEqual(slot3.quantity, 1) - self.assertEqual(slot3.priority, 0) - self.assertEqual(slot3.queue, q3) - - worker1 = slot1.worker - worker2 = slot2.worker - worker3 = slot3.worker - - self.assertEqual(worker1, worker2) - self.assertEqual(worker2, worker3) - - self.assertEqual(worker1.name, 'node1') - self.assertEqual(list(worker1.splits.all()), []) - self.assertEqual(worker1.memory, 16*1024) - self.assertEqual(worker1.cores, 4) - self.assertEqual(worker1.available_cores(), 4) - - self.assertEqual(worker1.slots.count(), 3) - - self.assertEqual(set(worker1.slots.all()), - set(list(q1.slots.all()) + list(q2.slots.all()) + \ - list(q3.slots.all()))) - - avail1 = q1.worker_availability() - self.assertEqual(avail1, [worker1]) - - avail2 = q2.worker_availability() - self.assertEqual(avail2, [worker1]) - - avail3 = q3.worker_availability() - self.assertEqual(avail2, [worker1]) - - # checks no orphan slots exist - self.assertEqual(Slot.objects.filter(queue=None).count(), 0) - self.assertEqual(Slot.objects.filter(worker=None).count(), 0) - - - def test_reconfigure_noprior(self): - - setup_backend(QUEUES_WITHOUT_PRIORITY) - Worker.objects.update(active=True) - self.check_noprior_config() - - - def test_reconfigure_fail_qenv_used(self): - - fullname = 'user/user/single/1/single' - xp = Experiment.objects.get(name=fullname.split(os.sep)[-1]) - - # schedules the experiment and check it - xp.schedule() - self.check_single(xp) - - try: - setup_backend(QUEUES_WITHOUT_PRIORITY) - except RuntimeError as e: - assert str(e).find('on the following queue/environment combinations') != -1 - else: - assert False, 'Queue re-configuration worked with q/env in use' - - - def check_prior_config(self): - - qs = Queue.objects.all() - - self.assertEqual(qs.count(), 3) - - q1, q1_special, q2 = qs.order_by('name') - - self.assertEqual(q1.name, 'q1') - self.assertEqual(q2.name, 'q2') - self.assertEqual(q1_special.name, 'q1_special') - self.assertEqual(q1.splits().count(), 0) - self.assertEqual(q2.splits().count(), 0) - self.assertEqual(q1_special.splits().count(), 0) - - self.assertEqual(q1.number_of_slots(), 8) - self.assertEqual(q2.number_of_slots(), 4) - self.assertEqual(q1_special.number_of_slots(), 8) - self.assertEqual(q1.availability(), 8) - self.assertEqual(q2.availability(), 4) - self.assertEqual(q1_special.availability(), 8) - self.assertEqual(q1.environments.count(), 1) - self.assertEqual(q2.environments.count(), 1) - 
self.assertEqual(q1_special.environments.count(), 1) - - self.assertEqual(q1.environments.first(), q2.environments.first()) - self.assertEqual(q2.environments.first(), - q1_special.environments.first()) - - env = q1.environments.first() - - self.assertEqual(env.name, 'Python 2.7') - self.assertEqual(env.version, '1.1.0') - - self.assertEqual(q1.slots.count(), 2) - self.assertEqual(q1_special.slots.count(), 2) - self.assertEqual(q2.slots.count(), 2) - - slot11, slot12 = q1.slots.all() - slot1_special1, slot1_special2 = q1_special.slots.all() - slot21, slot22 = q2.slots.all() - - self.assertEqual(slot11.quantity, 4) - self.assertEqual(slot11.priority, 5) - self.assertEqual(slot12.quantity, 4) - self.assertEqual(slot12.priority, 0) - self.assertEqual(slot11.queue, q1) - self.assertEqual(slot12.queue, q1) - - self.assertEqual(slot21.quantity, 2) - self.assertEqual(slot21.priority, 0) - self.assertEqual(slot22.quantity, 2) - self.assertEqual(slot22.priority, 10) - self.assertEqual(slot21.queue, q2) - self.assertEqual(slot22.queue, q2) - - self.assertEqual(slot1_special1.quantity, 4) - self.assertEqual(slot1_special1.priority, 0) - self.assertEqual(slot1_special2.quantity, 4) - self.assertEqual(slot1_special2.priority, 5) - self.assertEqual(slot1_special1.queue, q1_special) - self.assertEqual(slot1_special2.queue, q1_special) - - worker1 = slot11.worker - worker2 = slot12.worker - worker21 = slot21.worker - worker22 = slot22.worker - worker1_special1 = slot1_special1.worker - worker1_special2 = slot1_special2.worker - - self.assertEqual(worker1, worker21) - self.assertEqual(worker1, worker1_special1) - self.assertEqual(worker2, worker22) - self.assertEqual(worker2, worker1_special2) - - self.assertEqual(worker1.name, 'node1') - self.assertEqual(worker1.splits.count(), 0) - self.assertEqual(worker1.memory, 32*1024) - self.assertEqual(worker1.cores, 4) - self.assertEqual(worker1.available_cores(), 4) - - self.assertEqual(worker2.name, 'node2') - self.assertEqual(worker2.splits.count(), 0) - self.assertEqual(worker2.memory, 16*1024) - self.assertEqual(worker2.cores, 4) - self.assertEqual(worker2.available_cores(), 4) - - self.assertEqual(worker1.slots.count(), 3) - self.assertEqual(worker2.slots.count(), 3) - - avail1 = q1.worker_availability() - self.assertEqual(avail1, [worker1, worker2]) - - avail2 = q2.worker_availability() - self.assertEqual(avail2, [worker2, worker1]) - - avail1_special = q1_special.worker_availability() - self.assertEqual(avail1_special, [worker2, worker1]) - - # checks no orphan slots exist - self.assertEqual(Slot.objects.filter(queue=None).count(), 0) - self.assertEqual(Slot.objects.filter(worker=None).count(), 0) - - - def test_reconfigure_priors(self): - - setup_backend(PRIORITY_QUEUES) - Worker.objects.update(active=True) - self.check_prior_config() - - - - -class Scheduling(BaseBackendTestCase): - - - def check_stats_success(self, split): - - assert abs(split.job.block.speed_up_real() - 1.0) < 0.1 - assert abs(split.job.block.speed_up_maximal() - 1.0) < 0.1 - assert split.job.block.linear_execution_time() > 0.0 - assert split.job.block.queuing_time() > 0.0 - assert split.job.block.stdout() is None - assert split.job.block.stderr() is None - assert split.job.block.error_report() is None - - - def test_success(self): - - # tests a simple successful experiment scheduling and execution - - current_stats = HourlyStatistics.objects.count() - - fullname = 'user/user/single/1/single' - xp = Experiment.objects.get(name=fullname.split(os.sep)[-1]) - - # schedules the experiment 
and check it - xp.schedule() - self.check_single(xp) - assigned_splits = schedule() - - # schedules the first runnable block - assert xp.blocks.first().job.runnable_date is not None - assert xp.blocks.last().job.runnable_date is None - - worker = Worker.objects.get() - - self.assertEqual(len(assigned_splits), 1) - split = assigned_splits[0] - self.assertEqual(split.job.block.experiment, xp) - self.assertEqual(split.job.block.name, 'echo') - self.assertEqual(split.worker, worker) - self.assertEqual(worker.name, qsetup.HOSTNAME) - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # simulate job start on worker - split.start() - self.assertEqual(split.job.status, Job.PROCESSING) - self.assertEqual(split.job.block.status, Block.PROCESSING) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # no job can be run right now - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - # simulate end job signal - split.end(Result(status=0)) - self.assertEqual(split.job.status, Job.COMPLETED) - self.assertEqual(split.job.block.status, Block.DONE) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - # checks the number of statistics objects has increased by 1 - self.assertTrue(HourlyStatistics.objects.count() > current_stats) - - self.check_stats_success(split) - - # assert we have no database traces after the block is done - self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0) - self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0) - self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0) - - self.assertEqual(worker.available_cores(), qsetup.CORES) - - # since this job was successful, the next one should be ready to run - - # schedules the last block of the experiment - assert xp.blocks.last().job.runnable_date is not None - assigned_splits = schedule() - - self.assertEqual(len(assigned_splits), 1) - split = assigned_splits[0] - self.assertEqual(split.job.block.experiment, xp) - self.assertEqual(split.job.block.name, 'analysis') - self.assertEqual(split.worker, worker) - self.assertEqual(worker.name, qsetup.HOSTNAME) - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # simulate job start on worker - split.start() - self.assertEqual(split.job.status, Job.PROCESSING) - self.assertEqual(split.job.block.status, Block.PROCESSING) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # no job can be run right now - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - # simulate end job signal - split.end(Result(status=0)) - - # checks the number of statistics objects has increased by 1 - self.assertTrue(HourlyStatistics.objects.count() > current_stats) - - self.assertEqual(split.job.status, Job.COMPLETED) - self.assertEqual(split.job.block.status, Block.DONE) - self.assertEqual(split.job.block.experiment.status, Experiment.DONE) - - self.check_stats_success(split) - - # assert we have no database traces after the last block is done - self.assertEqual(Job.objects.count(), 0) - self.assertEqual(JobSplit.objects.count(), 0) - self.assertEqual(Result.objects.count(), 0) - - self.assertEqual(worker.available_cores(), qsetup.CORES) - - - def test_does_not_reassign(self): - - # tests if the scheduling routine never re-assigns splits which are - # already assigned. 
- - fullname = 'user/user/single/1/single' - xp = Experiment.objects.get(name=fullname.split(os.sep)[-1]) - - # schedules the experiment and check it - xp.schedule() - self.check_single(xp) - assigned_splits = schedule() - - self.assertEqual(len(assigned_splits), 1) - - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - - def test_worker_activation(self): - - # tests that scheduling depends on worker activation - - fullname = 'user/user/single/1/single' - xp = Experiment.objects.get(name=fullname.split(os.sep)[-1]) - - # de-activates worker - Worker.objects.update(active=False) - - # schedules the experiment and check it - xp.schedule() - self.check_single(xp) - - # no job can be run right now - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - # re-activate the worker, show it now schedules fine - Worker.objects.update(active=True) - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 1) - - # the rest would continue like with test_success - - - def test_fails_on_first_block(self): - - # tests that, if we fail on the first block, experiment fails, all - # stops as foreseen - - current_stats = HourlyStatistics.objects.count() - - fullname = 'user/user/single/1/single' - xp = Experiment.objects.get(name=fullname.split(os.sep)[-1]) - - # schedules the experiment and check it - xp.schedule() - self.check_single(xp) - - # schedules the first runnable block - assert xp.blocks.first().job.runnable_date is None - assert xp.blocks.last().job.runnable_date is None - - assigned_splits = schedule() - - assert xp.blocks.first().job.runnable_date is not None - assert xp.blocks.last().job.runnable_date is None - - worker = Worker.objects.get() - - self.assertEqual(len(assigned_splits), 1) - split = assigned_splits[0] - self.assertEqual(split.job.block.experiment, xp) - self.assertEqual(split.job.block.name, 'echo') - self.assertEqual(split.worker, worker) - self.assertEqual(worker.name, qsetup.HOSTNAME) - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # simulate job start on worker - split.start() - self.assertEqual(split.job.status, Job.PROCESSING) - self.assertEqual(split.job.block.status, Block.PROCESSING) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # no job can be run right now - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - # simulate end job signal, faiulre - split.end(Result(status=1)) - self.assertEqual(split.job.status, Job.FAILED) - self.assertEqual(split.job.block.status, Block.FAILED) - split.job.block.experiment.refresh_from_db() - self.assertEqual(split.job.block.experiment.status, Experiment.FAILED) - - # checks the number of statistics objects has increased by 1 - self.assertTrue(HourlyStatistics.objects.count() > current_stats) - - # assert we have no database traces after the last block is done - self.assertEqual(Job.objects.count(), 0) - self.assertEqual(JobSplit.objects.count(), 0) - self.assertEqual(Result.objects.count(), 0) - - self.assertEqual(worker.available_cores(), qsetup.CORES) - - - def test_fails_on_last_block(self): - - # tests a simple successful experiment scheduling and execution - - current_stats = HourlyStatistics.objects.count() - - fullname = 'user/user/single/1/single' - xp = Experiment.objects.get(name=fullname.split(os.sep)[-1]) - - # schedules the experiment and check it - xp.schedule() - self.check_single(xp) - - # schedules the first runnable block 
- assert xp.blocks.first().job.runnable_date is None - assert xp.blocks.last().job.runnable_date is None - - assigned_splits = schedule() - - assert xp.blocks.first().job.runnable_date is not None - assert xp.blocks.last().job.runnable_date is None - - worker = Worker.objects.get() - - self.assertEqual(len(assigned_splits), 1) - split = assigned_splits[0] - self.assertEqual(split.job.block.experiment, xp) - self.assertEqual(split.job.block.name, 'echo') - self.assertEqual(split.worker, worker) - self.assertEqual(worker.name, qsetup.HOSTNAME) - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # simulate job start on worker - split.start() - self.assertEqual(split.job.status, Job.PROCESSING) - self.assertEqual(split.job.block.status, Block.PROCESSING) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # no job can be run right now - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - # simulate end job signal - split.end(Result(status=0)) - self.assertEqual(split.job.status, Job.COMPLETED) - self.assertEqual(split.job.block.status, Block.DONE) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - # checks the number of statistics objects has increased by 1 - self.assertTrue(HourlyStatistics.objects.count() > current_stats) - - self.check_stats_success(split) - - # assert we have no database traces after the last block is done - self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0) - self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0) - self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0) - - self.assertEqual(worker.available_cores(), qsetup.CORES) - - # since this job was successful, the next one should be ready to run - - # schedules the last block of the experiment - assert xp.blocks.last().job.runnable_date is not None - assigned_splits = schedule() - - self.assertEqual(len(assigned_splits), 1) - split = assigned_splits[0] - self.assertEqual(split.job.block.experiment, xp) - self.assertEqual(split.job.block.name, 'analysis') - self.assertEqual(split.worker, worker) - self.assertEqual(worker.name, qsetup.HOSTNAME) - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # simulate job start on worker - split.start() - self.assertEqual(split.job.status, Job.PROCESSING) - self.assertEqual(split.job.block.status, Block.PROCESSING) - self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING) - - self.assertEqual(worker.available_cores(), qsetup.CORES-1) - - # no job can be run right now - assigned_splits = schedule() - self.assertEqual(len(assigned_splits), 0) - - # simulate end job signal - split.end(Result(status=1)) - - # checks the number of statistics objects has increased by 1 - self.assertTrue(HourlyStatistics.objects.count() > current_stats) - - self.assertEqual(split.job.status, Job.FAILED) - self.assertEqual(split.job.block.status, Block.FAILED) - self.assertEqual(split.job.block.experiment.status, Experiment.FAILED) - - # assert we have no database traces after the last block is done - self.assertEqual(Job.objects.count(), 0) - self.assertEqual(JobSplit.objects.count(), 0) - self.assertEqual(Result.objects.count(), 0) - - self.assertEqual(worker.available_cores(), qsetup.CORES) - - - def test_cancel_before_starting(self): - - # tests experiment cancellation before the experiment is started - - current_stats = HourlyStatistics.objects.count() - - fullname = 
'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-        self.check_single(xp)
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        xp.cancel()
-
-        self.assertEqual(
-            [str(k) for k in xp.blocks.values_list('status', flat=True)],
-            [Block.CANCELLED, Block.CANCELLED]
-        )
-        xp.refresh_from_db()
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        worker = Worker.objects.get()
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_cancel_after_success(self):
-
-        # tests experiment cancellation while the experiment is running
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-        self.check_single(xp)
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal
-        split.end(Result(status=0))
-        self.assertEqual(split.job.status, Job.COMPLETED)
-        self.assertEqual(split.job.block.status, Block.DONE)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(split)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since this job was successful, the next one should be ready to run
-
-        # schedules the last block of the experiment
-        assert xp.blocks.last().job.runnable_date is not None
-        xp.cancel()
-
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status', flat=True)],
-            [Block.DONE, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        worker = Worker.objects.get()
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_cancel_while_running(self):
-
-        # tests experiment cancellation while a block is running
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-        self.check_single(xp)
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        xp.cancel()
-
-        # simulate worker cancelling
-        split.refresh_from_db()
-        self.assertEqual(split.status, Job.CANCEL)
-        split.end(None, Job.CANCELLED)
-
-        xp.refresh_from_db()
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status', flat=True)],
-            [Block.CANCELLED, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        worker = Worker.objects.get()
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_cancel_after_failure(self):
-
-        # tests that, if we fail on the first block, the experiment fails and
-        # a cancellation that comes after that is a NOOP
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-        self.check_single(xp)
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal, failure
-        split.end(Result(status=1))
-        self.assertEqual(split.job.status, Job.FAILED)
-        self.assertEqual(split.job.block.status, Block.FAILED)
-        split.job.block.experiment.refresh_from_db()
-        self.assertEqual(split.job.block.experiment.status, Experiment.FAILED)
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        xp.cancel()
-        self.assertEqual(split.job.block.experiment.status, Experiment.FAILED)
-
-
-    def test_blocking_success(self):
-
-        # tests that two experiments that are similar can be scheduled at the
-        # same time and we'll optimise correctly and only run one of them. The
-        # other is updated as the blocking experiment is executed.
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        xpc = xp.fork(name='single_copy')
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xpc.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # checks the jobs are connected one to the other across experiments
-        self.assertEqual(xp.blocks.first().job.child.block.experiment, xpc)
-        self.assertEqual(xp.blocks.last().job.child.block.experiment, xpc)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.PROCESSING)
-        self.assertEqual(split.job.child.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.child.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal
-        split.end(Result(status=0))
-        self.assertEqual(split.job.status, Job.COMPLETED)
-        self.assertEqual(split.job.block.status, Block.DONE)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.COMPLETED)
-        self.assertEqual(split.job.child.block.status, Block.DONE)
-        self.assertEqual(split.job.child.block.experiment.status,
-                         Experiment.RUNNING)
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(split)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(
-            Job.objects.filter(block=split.job.child.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job.child).count(),
-                         0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since this job was successful, the next one should be ready to run
-
-        # schedules the last block of the experiment
-        assert xp.blocks.last().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.PROCESSING)
-        self.assertEqual(split.job.child.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.child.block.experiment.status,
-                         Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal
-        split.end(Result(status=0))
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.assertEqual(split.job.status, Job.COMPLETED)
-        self.assertEqual(split.job.block.status, Block.DONE)
-        self.assertEqual(split.job.block.experiment.status, Experiment.DONE)
-        self.assertEqual(split.job.child.status, Job.COMPLETED)
-        self.assertEqual(split.job.child.block.status, Block.DONE)
-        self.assertEqual(split.job.child.block.experiment.status,
-                         Experiment.DONE)
-
-        self.check_stats_success(split)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_blocking_failure(self):
-
-        # tests that two experiments that are similar can be scheduled at the
-        # same time and we'll optimise correctly and only run one of them. If
-        # the blocking experiment fails, so does the blocked one too.
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        xpc = xp.fork(name='single_copy')
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xpc.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # checks the jobs are connected one to the other across experiments
-        self.assertEqual(xp.blocks.first().job.child.block.experiment, xpc)
-        self.assertEqual(xp.blocks.last().job.child.block.experiment, xpc)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.PROCESSING)
-        self.assertEqual(split.job.child.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.child.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal
-        split.end(Result(status=1))
-        self.assertEqual(split.job.status, Job.FAILED)
-        self.assertEqual(split.job.block.status, Block.FAILED)
-        split.job.block.experiment.refresh_from_db()
-        self.assertEqual(split.job.block.experiment.status, Experiment.FAILED)
-        self.assertEqual(split.job.child.status, Job.FAILED)
-        self.assertEqual(split.job.child.block.status, Block.FAILED)
-        split.job.child.block.experiment.refresh_from_db()
-        self.assertEqual(split.job.child.block.experiment.status,
-                         Experiment.FAILED)
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_blocking_cancel_after_success(self):
-
-        # tests that two experiments that are similar can be scheduled at the
-        # same time and we'll optimise correctly and only run one of them. If
-        # the first experiment is cancelled, then the second one proceeds
-        # normally.
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        xpc = xp.fork(name='single_copy')
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xpc.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # checks the jobs are connected one to the other across experiments
-        self.assertEqual(xp.blocks.first().job.child.block.experiment, xpc)
-        self.assertEqual(xp.blocks.last().job.child.block.experiment, xpc)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.PROCESSING)
-        self.assertEqual(split.job.child.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.child.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal
-        split.end(Result(status=0))
-        self.assertEqual(split.job.status, Job.COMPLETED)
-        self.assertEqual(split.job.block.status, Block.DONE)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.COMPLETED)
-        self.assertEqual(split.job.child.block.status, Block.DONE)
-        self.assertEqual(split.job.child.block.experiment.status,
-                         Experiment.RUNNING)
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(split)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(
-            Job.objects.filter(block=split.job.child.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job.child).count(),
-                         0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # cancels the blocking experiment - the blocked one must continue
-        xp.cancel()
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status', flat=True)],
-            [Block.DONE, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.filter(block__in=xp.blocks.all()).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job__block__in=xp.blocks.all()).count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since the first job was successful, the second block of the
-        # previously blocked experiment must be ready to run
-
-        # schedules the last block of the experiment
-        assert xpc.blocks.last().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xpc)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # the rest would continue normally
-
-
-    def test_blocking_cancel_while_running(self):
-
-        # tests that two experiments that are similar can be scheduled at the
-        # same time and we'll optimise correctly and only run one of them. If
-        # the first experiment is cancelled while one of the blocks is
-        # running, then the second one proceeds normally.
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        xpc = xp.fork(name='single_copy')
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xpc.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # checks the jobs are connected one to the other across experiments
-        self.assertEqual(xp.blocks.first().job.child.block.experiment, xpc)
-        self.assertEqual(xp.blocks.last().job.child.block.experiment, xpc)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.PROCESSING)
-        self.assertEqual(split.job.child.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.child.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # cancels the blocking experiment - the blocked one must continue
-        xp.cancel()
-
-        # simulate worker cancelling
-        split.refresh_from_db()
-        self.assertEqual(split.status, Job.CANCEL)
-        split.end(None, Job.CANCELLED)
-
-        xp.refresh_from_db()
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status', flat=True)],
-            [Block.CANCELLED, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-        # assert we have no database traces after the last block is cancelled
-        self.assertEqual(Job.objects.filter(block__in=xp.blocks.all()).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job__block__in=xp.blocks.all()).count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since the first job was successful, the second block of the
-        # previously blocked experiment must be ready to run
-
-        # schedules the last block of the experiment
-        assert xpc.blocks.first().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        assigned_splits = JobSplit.objects.filter(worker__isnull=False)
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits.first()
-        self.assertEqual(split.job.block.experiment, xpc)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # the rest would continue normally
-
-
-    def test_blocking_cancel_blocked(self):
-
-        # tests that two experiments that are similar can be scheduled at the
-        # same time and we'll optimise correctly and only run one of them. If
-        # the blocked experiment is cancelled, this does not affect the
-        # running experiment.
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        xpc = xp.fork(name='single_copy')
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xpc.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-        assert xpc.blocks.first().job.runnable_date is None
-        assert xpc.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # checks the jobs are connected one to the other across experiments
-        self.assertEqual(xp.blocks.first().job.child.block.experiment, xpc)
-        self.assertEqual(xp.blocks.last().job.child.block.experiment, xpc)
-
-        # simulate job start on worker
-        split.start()
-        self.assertEqual(split.job.status, Job.PROCESSING)
-        self.assertEqual(split.job.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.PROCESSING)
-        self.assertEqual(split.job.child.block.status, Block.PROCESSING)
-        self.assertEqual(split.job.child.block.experiment.status, Experiment.RUNNING)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # no job can be run right now
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # simulate end job signal
-        split.end(Result(status=0))
-        self.assertEqual(split.job.status, Job.COMPLETED)
-        self.assertEqual(split.job.block.status, Block.DONE)
-        self.assertEqual(split.job.block.experiment.status, Experiment.RUNNING)
-        self.assertEqual(split.job.child.status, Job.COMPLETED)
-        self.assertEqual(split.job.child.block.status, Block.DONE)
-        self.assertEqual(split.job.child.block.experiment.status,
-                         Experiment.RUNNING)
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(split)
-
-        # cancels the blocked experiment - the blocking one must continue
-        xpc.cancel()
-        self.assertEqual(
-            [str(k) for k in xpc.blocks.order_by('id').values_list('status',
-                flat=True)],
-            [Block.DONE, Block.CANCELLED]
-        )
-        self.assertEqual(xpc.status, Experiment.FAILED)
-
-        # assert we have no database traces after the last cancel
-        self.assertEqual(Job.objects.filter(block__in=xpc.blocks.all()).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job__block__in=xpc.blocks.all()).count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since the first job was successful, the second block of the
-        # running experiment must be ready to run
-
-        # schedules the last block of the experiment
-        assert xp.blocks.last().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        assigned_splits = JobSplit.objects.filter(worker__isnull=False)
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits.first()
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # the rest would continue normally
-
-
-    def test_schedule_without_queue(self):
-
-        # tests that an experiment with a queue that disappeared is correctly
-        # aborted
-        setup_backend(QUEUES_WITHOUT_PRIORITY)
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        self.assertRaises(RuntimeError, xp.schedule)
-
-
-    def test_split_no_index(self):
-
-        # tests a simple experiment with splitting and shows it can fail
-        # gracefully
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single_large'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        worker = Worker.objects.get()
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        schedule()
-
-        xp.refresh_from_db()
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-        self.assertEqual(xp.blocks.first().status, Block.CANCELLED)
-        assert xp.blocks.first().error_report().find(settings.DEFAULT_USER_ERROR) == 0
-        self.assertEqual(xp.blocks.last().status, Block.CANCELLED)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.filter(block__in=xp.blocks.all()).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job__block__in=xp.blocks.all()).count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_schedules_two_jobs(self):
-
-        # tests a simple scheduling activity in which two jobs of the same
-        # experiment must be scheduled concurrently, provided there is enough
-        # space
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/triangle/1/triangle'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        worker = Worker.objects.get()
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xp.refresh_from_db()
-        self.assertEqual(xp.status, Experiment.SCHEDULED)
-
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 2)
-        self.assertEqual(assigned_splits[0].job.block.experiment, xp)
-        self.assertEqual(assigned_splits[1].job.block.experiment, xp)
-        self.assertNotEqual(assigned_splits[0], assigned_splits[1])
-
-
-    def test_cancel_concurrent_job(self):
-
-        # tests a simple scheduling activity in which two jobs of the same
-        # experiment must be scheduled concurrently, provided there is enough
-        # space. Then, fails one of them and waits for the experiment to fail
-        # completely. Processing jobs must be cancelled.
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/triangle/1/triangle'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        worker = Worker.objects.get()
-
-        # schedules the experiment and check it
-        xp.schedule()
-        xp.refresh_from_db()
-        self.assertEqual(xp.status, Experiment.SCHEDULED)
-
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 2)
-        self.assertEqual(assigned_splits[0].job.block.experiment, xp)
-        self.assertEqual(assigned_splits[1].job.block.experiment, xp)
-        self.assertNotEqual(assigned_splits[0], assigned_splits[1])
-
-
-        # simulate job start on worker
-        assigned_splits[0].start()
-        assigned_splits[1].start()
-
-        # now fail one of the jobs, the end result is the experiment fails
-        assigned_splits[1].end(Result(status=15)) #simulated sigterm sent
-        self.assertEqual(assigned_splits[1].job.status, Job.FAILED)
-
-        # cancels the job which is marked for cancelling, checks the final
-        # experiment state is as expected (this is the worker job)
-        self.assertEqual(assigned_splits[0].job.splits.first().status,
-                         Job.CANCEL)
-        assigned_splits[0].job.splits.first().end(None, Job.CANCELLED)
-
-        xp.refresh_from_db()
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status',
-                flat=True)],
-            [Block.CANCELLED, Block.FAILED, Block.CANCELLED, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-
-
-class SchedulingPriority(BaseBackendTestCase):
-
-
-    def set_globals(self, xp, queue, environment):
-        '''Sets the global queue of the experiment'''
-
-        decl = xp.declaration
-        decl['globals']['queue'] = queue.name
-        decl['globals']['environment']['name'] = environment.name
-        decl['globals']['environment']['version'] = environment.version
-        xp.declaration = decl
-        xp.save() #reloads all blocks
-
-
-    def reset_slots(self, xp):
-        '''Only use one slot in all blocks'''
-
-        decl = xp.declaration
-        for b in decl['blocks']:
-            if 'nb_slots' in decl['blocks'][b]:
-                del decl['blocks'][b]['nb_slots']
-        xp.declaration = decl
-        xp.save() #reloads all blocks
-
-
-    def test_priority_multicore(self):
-
-        # tests that in a heterogeneous backend setup, priority is correctly
-        # given to jobs that require more cores.
-
-        setup_backend(QUEUES_WITHOUT_PRIORITY)
-        Worker.objects.update(active=True)
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        fullname = 'user/user/single/1/single_add'
-        xp_add = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        fullname = 'user/user/single/1/single_add2'
-        xp_add2 = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        fullname = 'user/user/single/1/single_large'
-        xp_large = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        q1 = Queue.objects.get(name='q1')
-        q2 = Queue.objects.get(name='q2')
-        env = Environment.objects.get(name='Python 2.7')
-
-        # reset queue and environment to new backend configuration
-        self.set_globals(xp, q1, env)
-        self.set_globals(xp_add, q1, env)
-        self.set_globals(xp_add2, q1, env)
-        self.set_globals(xp_large, q2, env) #notice different queue
-        self.reset_slots(xp_large) #one slot per block only
-
-        xp.schedule()
-        xp_add.schedule()
-        xp_add2.schedule()
-        xp_large.schedule()
-
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 3)
-
-        self.assertEqual(assigned_splits[0].job.block.experiment, xp_large)
-        # then, the scheduling order is respected
-        self.assertEqual(assigned_splits[1].job.block.experiment, xp)
-        self.assertEqual(assigned_splits[2].job.block.experiment, xp_add)
-        # notice that the last experiment is not assigned
-
-
-    def test_priority_multicore_delayed(self):
-
-        # tests that in a heterogeneous backend setup, priority is correctly
-        # given to jobs that require more cores. In this test, specifically,
-        # we verify that, if the farm is taken, new jobs that require more
-        # resources will block other possible jobs from running if they cannot
-        # run, even if free cores are available.
-
-        setup_backend(QUEUES_WITHOUT_PRIORITY)
-        Worker.objects.update(active=True)
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        fullname = 'user/user/single/1/single_add'
-        xp_add = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        fullname = 'user/user/single/1/single_add2'
-        xp_add2 = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        fullname = 'user/user/single/1/single_large'
-        xp_large = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        q1 = Queue.objects.get(name='q1')
-        q4 = Queue.objects.get(name='q4')
-        env = Environment.objects.get(name='Python 2.7')
-
-        # reset queue and environment to new backend configuration
-        self.set_globals(xp, q1, env)
-        self.set_globals(xp_add, q1, env)
-        self.set_globals(xp_large, q4, env) #notice different queue
-        self.reset_slots(xp_large) #one slot per block only
-
-        xp.schedule()
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 1)
-        self.assertEqual(assigned_splits[0].job.block.experiment, xp)
-        split = assigned_splits[0]
-
-        xp_large.schedule() #will now block anything else from running
-        xp_add.schedule()
-
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 0)
-
-        # start/end the xp block and schedule again
-        split.start()
-        split.end(Result(0))
-
-        # now, the job with more cores should be scheduled first
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 1)
-        self.assertEqual(assigned_splits[0].job.block.experiment, xp_large)
-
-
-    def test_priorities(self):
-
-        # tests that in a heterogeneous backend setup, priority is given to
-        # different computers based on their priority settings
-
-        setup_backend(PRIORITY_QUEUES)
-        Worker.objects.update(active=True)
-
-        q1 = Queue.objects.get(name='q1')
-        env = Environment.objects.get(name='Python 2.7')
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        self.set_globals(xp, q1, env)
-
-        q1_special = Queue.objects.get(name='q1_special')
-        fullname = 'user/user/single/1/single_add'
-        xp_add = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        self.set_globals(xp_add, q1_special, env)
-
-        q2 = Queue.objects.get(name='q2')
-        fullname = 'user/user/single/1/single_large'
-        xp_large = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-        self.set_globals(xp_large, q2, env) #notice different queue
-        self.reset_slots(xp_large) #one slot per block only
-
-        node1 = Worker.objects.get(name='node1')
-        node2 = Worker.objects.get(name='node2')
-
-        # verify that xp_large has priority; the other jobs correspond to
-        # q1/q1_special
-        xp.schedule()
-        xp_add.schedule()
-        xp_large.schedule()
-
-        assigned_splits = schedule()
-        self.assertEqual(len(assigned_splits), 3)
-
-        self.assertEqual(assigned_splits[0].job.block.experiment, xp_large)
-        self.assertEqual(assigned_splits[0].job.block.name, 'echo')
-        self.assertEqual(assigned_splits[0].job.splits.first().worker, node2)
-
-        self.assertEqual(assigned_splits[1].job.block.experiment, xp)
-        self.assertEqual(assigned_splits[1].job.block.name, 'echo')
-        self.assertEqual(assigned_splits[1].job.splits.first().worker, node1)
-
-        self.assertEqual(assigned_splits[2].job.block.experiment, xp_add)
-        self.assertEqual(assigned_splits[2].job.block.name, 'echo')
-        self.assertEqual(assigned_splits[2].job.splits.first().worker, node2)
-
-
-class Working(BaseBackendTestCase):
-
-
-    def setUp(self):
-
-        from . import utils
-        self.process = utils.resolve_process_path()
-        self.environments = utils.find_environments(None)
-
-        if not os.path.exists(settings.CACHE_ROOT):
-            os.makedirs(settings.CACHE_ROOT)
-
-
-    def tearDown(self):
-        if os.path.exists(settings.CACHE_ROOT):
-            shutil.rmtree(settings.CACHE_ROOT)
-
-
-    def check_stats_success(self, block):
-
-        assert abs(block.speed_up_real() - 1.0) < 0.1
-        assert abs(block.speed_up_maximal() - 1.0) < 0.1
-        assert block.linear_execution_time() > 0.0
-        assert block.queuing_time() > 0.0
-        assert block.stdout() == ''
-        assert block.stderr() == ''
-        assert block.error_report() == ''
-
-
-    def test_success(self):
-
-        # tests an experiment can actually be run
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # at this point, job should have been successful
-        xp.refresh_from_db()
-        block = xp.blocks.first()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(xp.status, Experiment.RUNNING)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since this job was successful, the next one should be ready to run
-
-        # schedules the last block of the experiment
-        assert xp.blocks.last().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        xp.refresh_from_db()
-        block = xp.blocks.last()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(xp.status, Experiment.DONE)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_failure(self):
-
-        # tests an experiment can fail and we can handle it fine
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single_error'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # at this point, job should have failed
-        xp.refresh_from_db()
-        block = xp.blocks.first()
-        self.assertEqual(block.status, Block.FAILED)
-        self.assertEqual(block.experiment.status, Experiment.FAILED)
-
-        # all caches have not been generated
-        assert all([not k.exists() for k in block.outputs.all()])
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        assert abs(block.speed_up_real() - 1.0) < 0.1
-        assert abs(block.speed_up_maximal() - 1.0) < 0.1
-        assert block.linear_execution_time() > 0.0
-        assert block.queuing_time() > 0.0
-        assert block.stdout() == ''
-        assert block.error_report().find('Error') != -1
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_skip(self):
-
-        # tests an experiment can actually be completely skipped if all files
-        # are already cached
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # at this point, job should have been successful
-        xp.refresh_from_db()
-        block = xp.blocks.first()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(block.experiment.status, Experiment.RUNNING)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since this job was successful, the next one should be ready to run
-
-        # schedules the last block of the experiment
-        assert xp.blocks.last().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        xp.refresh_from_db()
-        block = xp.blocks.last()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(block.experiment.status, Experiment.DONE)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # now we fork and re-run the same experiment
-        xpc = xp.fork(name='single_copy')
-
-        # schedules the experiment (it should immediately load from the db)
-        xpc.schedule()
-
-        xpc.refresh_from_db()
-        self.assertEqual(xpc.status, Experiment.DONE)
-
-
-    def test_does_not_skip(self):
-
-        # tests an experiment can actually be partially skipped if some files
-        # are ready
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # at this point, job should have been successful
-        xp.refresh_from_db()
-        block = xp.blocks.first()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(block.experiment.status, Experiment.RUNNING)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # now we cancel the experiment
-        xp.cancel()
-
-        # we fork it and re-run it - only the last block will run again
-        xpc = xp.fork(name='single_copy')
-        xpc.schedule()
-
-        # schedules the first runnable block
-        assert not hasattr(xpc.blocks.first(), 'job')
-        assert xpc.blocks.first().status == Block.DONE
-
-        # since this job was successful, the next one should be ready to run
-
-        # schedules the last block of the experiment
-        assert xpc.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xpc.blocks.last().job.runnable_date is not None
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xpc)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        xpc.refresh_from_db()
-        block = xpc.blocks.last()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(block.experiment.status, Experiment.DONE)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # asserts the old experiment is still in a failed state
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status',
-                flat=True)],
-            [Block.DONE, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
-
-
-    def test_partially_blocking(self):
-
-        # tests an experiment can actually be partially skipped if some files
-        # are ready on the cache - blocking occurs as foreseeable on blocks to
-        # run
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (blocking)
-        split.process()
-
-        # at this point, job should have been successful
-        xp.refresh_from_db()
-        block = xp.blocks.first()
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(block.experiment.status, Experiment.RUNNING)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in block.outputs.all()])
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        self.check_stats_success(block)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # now we fork and wait for the second experiment to hook into the
-        # last block
-        xpc = xp.fork(name='single_copy')
-        xpc.schedule()
-
-        schedule()
-
-        self.assertEqual([k.status for k in xpc.blocks.all()],
-                         [Block.DONE, Block.PENDING])
-        assert xpc.blocks.last().job.parent == xp.blocks.last().job
-
-
-class WorkingExternally(TransactionTestCase):
-
-
-    def setUp(self):
-
-        from . import utils
-        self.process = utils.resolve_process_path()
-        self.environments = utils.find_environments(None)
-
-        if not os.path.exists(settings.CACHE_ROOT):
-            os.makedirs(settings.CACHE_ROOT)
-
-        install.create_sites()
-        system_user, plot_user, user = install.create_users('user', 'user')
-        install.add_group('Default')
-
-        setup_backend(qsetup.DEFAULT_CONFIGURATION)
-
-        Worker.objects.update(active=True)
-        env = Environment.objects.get(name='Python 2.7')
-        queue = Queue.objects.first()
-
-        template_data = dict(
-            system_user = system_user,
-            plot_user = plot_user,
-            user = user,
-            private = False,
-            queue = queue.name,
-            environment = dict(name=env.name, version=env.version),
-        )
-        prefix = os.path.join(
-            os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))),
-            'src',
-            'beat.examples',
-        )
-        install.install_contributions(prefix, 'system', template_data)
-        install.install_contributions(prefix, 'test', template_data)
-
-
-    def tearDown(self):
-        if os.path.exists(settings.CACHE_ROOT):
-            shutil.rmtree(settings.CACHE_ROOT)
-        if os.path.exists(settings.PREFIX):
-            shutil.rmtree(settings.PREFIX)
-
-
-    def test_success(self):
-
-        # tests an experiment can actually be run
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (non-blocking)
-        worker.work(self.environments, self.process)
-
-        def condition():
-            xp.refresh_from_db()
-            block = xp.blocks.first()
-            return block.status == Block.DONE
-
-        _sleep(120, condition)
-
-        # at this point, the split should have been successful, which shall
-        # trigger job deletion and block update
-        xp.refresh_from_db()
-        block = xp.blocks.first()
-
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(xp.status, Experiment.RUNNING)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in split.job.block.outputs.all()])
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        # assert we have no database traces after the block is done
-        self.assertEqual(Job.objects.filter(block=split.job.block).count(), 0)
-        self.assertEqual(JobSplit.objects.filter(job=split.job).count(), 0)
-        self.assertEqual(Result.objects.filter(job__isnull=True).count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # since this job was successful, the next one should be ready to run
-
-        # schedules the last block of the experiment
-        assert xp.blocks.last().job.runnable_date is not None
-        assigned_splits = schedule()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'analysis')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (non-blocking)
-        worker.work(self.environments, self.process)
-
-        def condition():
-            xp.refresh_from_db()
-            return xp.status == Experiment.DONE
-
-        _sleep(120, condition) #wait job completion
-
-        # checks the number of statistics objects has increased by 1
-        self.assertTrue(HourlyStatistics.objects.count() > current_stats)
-
-        # at this point, the split should have been successful, which shall
-        # trigger job deletion and block update
-        xp.refresh_from_db()
-        block = xp.blocks.last()
-
-        self.assertEqual(block.status, Block.DONE)
-        self.assertEqual(xp.status, Experiment.DONE)
-
-        # all caches must have been generated
-        assert all([k.check_checksums() for k in split.job.block.outputs.all()])
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-
-    def test_cancel_running(self):
-
-        # tests an experiment can be cancelled while running
-
-        current_stats = HourlyStatistics.objects.count()
-
-        fullname = 'user/user/single/1/single_sleep'
-        xp = Experiment.objects.get(name=fullname.split(os.sep)[-1])
-
-        # schedules the experiment and check it
-        xp.schedule()
-
-        # schedules the first runnable block
-        assert xp.blocks.first().job.runnable_date is None
-        assert xp.blocks.last().job.runnable_date is None
-
-        assigned_splits = schedule()
-
-        assert xp.blocks.first().job.runnable_date is not None
-        assert xp.blocks.last().job.runnable_date is None
-
-        worker = Worker.objects.get()
-
-        self.assertEqual(len(assigned_splits), 1)
-        split = assigned_splits[0]
-        self.assertEqual(split.job.block.experiment, xp)
-        self.assertEqual(split.job.block.name, 'echo')
-        self.assertEqual(split.worker, worker)
-        self.assertEqual(worker.name, qsetup.HOSTNAME)
-        self.assertEqual(worker.available_cores(), qsetup.CORES-1)
-
-        # actually runs the job (non-blocking)
-        worker.work(self.environments, self.process)
-
-        def condition():
-            xp.refresh_from_db()
-            return xp.status == Experiment.RUNNING
-
-        _sleep(20, condition)
-
-        # Just to be sure that the docker container really started
-        time.sleep(3)
-
-        # cancels the experiment
-        xp.cancel()
-        split.refresh_from_db()
-        self.assertEqual(split.status, Job.CANCEL)
-
-        # launch another working cycle to kill the process
-        worker.work(self.environments, self.process)
-
-        def condition():
-            xp.refresh_from_db()
-            return xp.status == Experiment.FAILED and Job.objects.count() == 0
-        _sleep(20, condition)
-        xp.refresh_from_db()
-
-        # assert we have no database traces after the last block is done
-        self.assertEqual(Job.objects.count(), 0)
-        self.assertEqual(JobSplit.objects.count(), 0)
-        self.assertEqual(Result.objects.count(), 0)
-
-        self.assertEqual(worker.available_cores(), qsetup.CORES)
-
-        # asserts the old experiment is still in a failed state
-        self.assertEqual(
-            [str(k) for k in xp.blocks.order_by('id').values_list('status',
-                flat=True)],
-            [Block.CANCELLED, Block.CANCELLED]
-        )
-        self.assertEqual(xp.status, Experiment.FAILED)
diff --git a/beat/web/backend/tests/__init__.py b/beat/web/backend/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/beat/web/backend/tests/common.py b/beat/web/backend/tests/common.py
new file mode 100755
index 0000000000000000000000000000000000000000..6eca5672367fdc95511a842dfc124b81c148add9
--- /dev/null
+++ b/beat/web/backend/tests/common.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+###############################################################################
+#                                                                             #
+# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/           #
+# Contact: beat.support@idiap.ch                                              #
+#                                                                             #
+# This file is part of the beat.web module of the BEAT platform.              #
+#                                                                             #
+# Commercial License Usage                                                    #
+# Licensees holding valid commercial BEAT licenses may use this file in       #
+# accordance with the terms contained in a written agreement between you      #
+# and Idiap. For further information contact tto@idiap.ch                     #
+#                                                                             #
+# Alternatively, this file may be used under the terms of the GNU Affero      #
+# Public License version 3 as published by the Free Software and appearing    #
+# in the file LICENSE.AGPL included in the packaging of this file.            #
+# The BEAT platform is distributed in the hope that it will be useful, but    #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  #
+# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
+#                                                                             #
+# You should have received a copy of the GNU Affero Public License along      #
+# with the BEAT platform. If not, see http://www.gnu.org/licenses/.           #
+#                                                                             #
+###############################################################################
+
+from django.test import TestCase
+from django.conf import settings
+
+from ...utils.management.commands import install
+from ...experiments.models import Experiment
+from ...experiments.models import Block
+from ...experiments.models import CachedFile
+from ...algorithms.models import Algorithm
+
+from ..models import Queue
+from ..models import Worker
+from ..models import Environment
+from ..models import Job
+from ..models import JobSplit
+
+from ..utils import setup_backend
+from ..management.commands import qsetup
+
+from beat.core.dataformat import DataFormat
+from beat.core.data import CachedDataSink
+import beat.core.hash
+
+import os
+import sys
+
+
+#----------------------------------------------------------
+
+
+ONE_QUEUE_TWO_WORKERS = {
+    "queues": {
+        "queue": {
+            "memory-limit": 4*1024,
+            "time-limit": 1440, #1 day
+            "cores-per-slot": 1,
+            "max-slots-per-user": 2,
+            "environments": [
+                'Python 2.7 (1.1.0)'
+            ],
+            "slots": {
+                'node1': {
+                    "quantity": 1,
+                    "priority": 0
+                },
+                'node2': {
+                    "quantity": 1,
+                    "priority": 0
+                }
+            },
+            "groups": [
+                "Default",
+            ],
+        }
+    },
+    "workers": {
+        "node1": {
+            "cores": 1,
+            "memory": 16*1024,
+        },
+        "node2": {
+            "cores": 1,
+            "memory": 16*1024,
+        }
+    },
+    "environments": {
+        "Python 2.7 (1.1.0)": {
+            "name": 'Python 2.7',
+            "version": '1.1.0',
+            "short_description": "Test",
+            "description": "Test environment",
+            "languages": "python",
+        },
+    },
+}
+
+
+#----------------------------------------------------------
+
+
+class BackendUtilitiesMixin(object):
+
+    @classmethod
+    def setup_test_data(cls):
+        install.create_sites()
+        system_user, plot_user, user = install.create_users('user', 'user')
+        install.add_group('Default')
+
+        setup_backend(qsetup.DEFAULT_CONFIGURATION)
+
+        Worker.objects.update(active=True)
+        env = Environment.objects.get(name='Python 2.7')
+        queue = Queue.objects.first()
+
+        template_data = dict(
+            system_user = system_user,
+            plot_user = plot_user,
+            user = user,
+            private = False,
+            queue = queue.name,
+            environment = dict(name=env.name, version=env.version),
+        )
+
+        prefix = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))),
+            'src',
+            'beat.examples',
+        )
+
+        install.install_contributions(prefix, 'system', template_data)
+        install.install_contributions(prefix, 'test', template_data)
+
+
+    def clean_cache(self):
+        for p, dirs, files in os.walk(settings.CACHE_ROOT, topdown=False):
+
+            files = [f for f in files if not f.startswith('.')]
+            dirs[:] = [d for d in dirs if not d.startswith('.')] #note: in-place
+
+            for f in files:
+                fullpath = os.path.join(p, f)
+                os.remove(fullpath)
+
+            for d in dirs:
+                fullpath = os.path.join(p, d)
+                os.rmdir(fullpath)
+
+
+    def set_experiment_state(self, experiment, experiment_status=None, block_status=None,
+                             cache_status=None):
+        if block_status:
+            for name, status in block_status.items():
+                block = experiment.blocks.get(name=name)
+                block.status = status
+                block.save()
+
+        if cache_status:
+            for name, status in cache_status.items():
+                block = experiment.blocks.get(name=name)
+                for cached_file in block.outputs.all():
+                    cached_file.status = status
+                    cached_file.save()
+
+        if experiment_status:
+            experiment.status = experiment_status
+            experiment.save()
+
+
+    def generate_cached_files(self, hash, splits):
+        dataformat = DataFormat(settings.PREFIX, 'system/integer/1')
+
+        path = os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash))
+        os.makedirs(os.path.dirname(path))
+
+        value = 0
+
+        for index, split in enumerate(splits):
+            sink = CachedDataSink()
+            sink.setup(path, dataformat, process_id=index)
+
+            for indices in split:
+                if not isinstance(indices, tuple):
+                    start = indices
+                    end = indices
+                else:
+                    start = indices[0]
+                    end = indices[1]
+
+                sink.write({
+                        'value': value,
+                    },
+                    start_data_index = start,
+                    end_data_index = end
+                )
+
+                value += 1
+
+            sink.close()
+
+
+#----------------------------------------------------------
+
+
+class BaseBackendTestCase(TestCase, BackendUtilitiesMixin):
+
+    @classmethod
+    def setUpTestData(cls):
+        cls.setup_test_data()
+
+
+    def setUp(self):
+        self.clean_cache()
+
+
+    def tearDown(self):
+        self.clean_cache()
+
+
+    def check_single(self, xp):
+        '''Checks user/user/single/1/single'''
+
+        self.assertEqual(xp.blocks.count(), 2)
+
+        b0 = xp.blocks.all()[0]
+
+        self.assertEqual(b0.name, 'echo')
+        self.assertEqual(b0.status, Block.PENDING)
+        self.assertEqual(b0.algorithm, Algorithm.objects.get(name='integers_echo'))
+        self.assertEqual(b0.dependencies.count(), 0)
+        self.assertEqual(b0.dependents.count(), 1)
+        self.assertEqual(b0.queue.name, 'queue')
+        self.assertEqual(b0.environment.name, 'Python 2.7')
+        self.assertEqual(b0.required_slots, 1)
+        self.assertEqual(b0.inputs.count(), 1)
+        self.assertEqual(b0.outputs.count(), 1)
+        self.assertEqual(b0.job.splits.count(), 0) #not scheduled yet
+
+        assert not b0.done()
+
+        b1 = xp.blocks.all()[1]
+
+        self.assertEqual(b1.name, 'analysis')
+        self.assertEqual(b1.status, Block.PENDING)
+        self.assertEqual(b1.algorithm, Algorithm.objects.get(name='integers_echo_analyzer'))
+        self.assertEqual(b1.dependencies.count(), 1)
+        self.assertEqual(b1.dependents.count(), 0)
+        self.assertEqual(b1.queue.name, 'queue')
+        self.assertEqual(b1.environment.name, 'Python 2.7')
+        self.assertEqual(b1.required_slots, 1)
+        self.assertEqual(b1.inputs.count(), 1)
+        self.assertEqual(b1.outputs.count(), 1)
+        self.assertEqual(b1.job.splits.count(), 0) #not scheduled yet
+
+        assert not b1.done()
diff --git a/beat/web/backend/tests/test_api.py b/beat/web/backend/tests/test_api.py
new file mode 100755
index 0000000000000000000000000000000000000000..f4b74736bf42b8674b58a3bd4dd691f12b4ea42a
--- /dev/null
+++ b/beat/web/backend/tests/test_api.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+###############################################################################
+#                                                                             #
+# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/           #
+# Contact: beat.support@idiap.ch                                              #
+#                                                                             #
+# This file is part of the beat.web module of the BEAT platform.              #
+#                                                                             #
+# Commercial License Usage                                                    #
+# Licensees holding valid commercial BEAT licenses may use this file in       #
+# accordance with the terms contained in a written agreement between you      #
+# and Idiap. For further information contact tto@idiap.ch                     #
+#                                                                             #
+# Alternatively, this file may be used under the terms of the GNU Affero      #
+# Public License version 3 as published by the Free Software and appearing    #
+# in the file LICENSE.AGPL included in the packaging of this file.            #
+# The BEAT platform is distributed in the hope that it will be useful, but    #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  #
+# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
+#                                                                             #
+# You should have received a copy of the GNU Affero Public License along      #
+# with the BEAT platform. If not, see http://www.gnu.org/licenses/.           #
+#                                                                             #
+###############################################################################
+
+from ...common.testutils import BaseTestCase as APITestCase
+
+from django.contrib.auth.models import User
+from django.core.urlresolvers import reverse
+
+
+class CancelAllExperimentsAPI(APITestCase):
+
+    def setUp(self):
+        self.url = reverse('backend:cancel-experiments')
+
+
+    def test_no_access_for_anonymous_user(self):
+        response = self.client.get(self.url)
+        self.checkResponse(response, 302) #redirects to login page
+
+
+    def test_no_access_for_non_superuser(self):
+        User.objects.create_user('johndoe', 'johndoe@test.org', '1234')
+        self.client.login(username='johndoe', password='1234')
+        response = self.client.get(self.url)
+        self.checkResponse(response, 403)
diff --git a/beat/web/backend/tests/test_cache.py b/beat/web/backend/tests/test_cache.py
new file mode 100755
index 0000000000000000000000000000000000000000..2cafb114972ab1f8aea2e24bd6ac9e45b6ffa4ce
--- /dev/null
+++ b/beat/web/backend/tests/test_cache.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+###############################################################################
+#                                                                             #
+# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/           #
+# Contact: beat.support@idiap.ch                                              #
+#                                                                             #
+# This file is part of the beat.web module of the BEAT platform.              #
+#                                                                             #
+# Commercial License Usage                                                    #
+# Licensees holding valid commercial BEAT licenses may use this file in       #
+# accordance with the terms contained in a written agreement between you      #
+# and Idiap. For further information contact tto@idiap.ch                     #
+#                                                                             #
+# Alternatively, this file may be used under the terms of the GNU Affero      #
+# Public License version 3 as published by the Free Software and appearing    #
+# in the file LICENSE.AGPL included in the packaging of this file.            #
+# The BEAT platform is distributed in the hope that it will be useful, but    #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  #
+# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
+#                                                                             #
+# You should have received a copy of the GNU Affero Public License along     #
+# with the BEAT platform. If not, see http://www.gnu.org/licenses/.
# +# # +############################################################################### + +import os +import sys +import time +import shutil +import tempfile + +from django.core import management +from django.test import TestCase + +from ..utils import cleanup_cache + + +class CacheCleanUp(TestCase): + + + def setUp(self): + self.cache = tempfile.mkdtemp(prefix='beat_') + + + def tearDown(self): + shutil.rmtree(self.cache) + + + def touch(self, f, times=None): + """Replicates the `touch' command-line utility""" + with open(f, 'a'): os.utime(f, times) + + + def J(self, *args): + return os.path.join(*((self.cache,) + args)) + + + def prepare_cleanup_full(self): + + # creates a temporary directory structure + os.makedirs(self.J('a', 'b', 'c')) + os.makedirs(self.J('a', 'c', 'd')) + os.makedirs(self.J('a', 'c', 'e')) + self.touch(self.J('a', 'b', 'c', 'd.json')) + self.touch(self.J('a', 'c', 'd', 'e.json')) + + + def check_cleanup_full(self): + + assert not os.listdir(self.cache) + + + def test_cache_cleanup_full(self): + + self.prepare_cleanup_full() + cleanup_cache(self.cache, delete=True) + self.check_cleanup_full() + + + def test_cmd_cleanup_full(self): + + self.prepare_cleanup_full() + management.call_command('cleanup_cache', path=self.cache, + verbosity=0, delete=True) + self.check_cleanup_full() + + + def prepare_cleanup_aged(self): + + two_min_ago = time.time() - 60*2 + + # creates a temporary directory structure + os.makedirs(self.J('a', 'b', 'c')) + os.makedirs(self.J('a', 'c', 'd')) + os.makedirs(self.J('a', 'c', 'e')) + self.touch(self.J('a', 'b', 'c', 'd.json'), (two_min_ago, two_min_ago)) + self.touch(self.J('a', 'c', 'd', 'e.json')) + + + def check_cleanup_aged(self): + + assert os.path.exists(self.J('a', 'c', 'd', 'e.json')) + assert not os.path.exists(self.J('a', 'b', 'c')) + assert not os.path.exists(self.J('a', 'b', 'c', 'd.json')) + assert not os.path.exists(self.J('a', 'b', 'e')) + + + def test_cache_cleanup_aged(self): + + self.prepare_cleanup_aged() + cleanup_cache(self.cache, age_in_minutes=2, delete=True) + self.check_cleanup_aged() + + + def test_cmd_cleanup_aged(self): + + self.prepare_cleanup_aged() + management.call_command('cleanup_cache', path=self.cache, + verbosity=0, olderthan=2, delete=True) + self.check_cleanup_aged() + + + def prepare_cleanup_lock(self): + + two_min_ago = time.time() - 60*2 + ten_min_ago = time.time() - 60*10 + + # creates a temporary directory structure + os.makedirs(self.J('a', 'b', 'c')) + os.makedirs(self.J('a', 'c', 'd')) + os.makedirs(self.J('a', 'c', 'e')) + self.touch(self.J('a', 'b', 'c', 'd.json'), (two_min_ago, two_min_ago)) + self.touch(self.J('a', 'c', 'd', 'e.json'), (ten_min_ago, ten_min_ago)) + + self.touch(self.J('a', 'c', 'd', 'e.lock')) #create a lock + + + def check_cleanup_lock(self): + + assert os.path.exists(self.J('a', 'c', 'd', 'e.json')) + assert not os.path.exists(self.J('a', 'b', 'c')) + assert not os.path.exists(self.J('a', 'b', 'c', 'd.json')) + assert not os.path.exists(self.J('a', 'b', 'e')) + + + def test_cache_cleanup_lock(self): + + self.prepare_cleanup_lock() + cleanup_cache(self.cache, delete=True) + self.check_cleanup_lock() + + + def test_cmd_cleanup_lock(self): + + self.prepare_cleanup_lock() + management.call_command('cleanup_cache', path=self.cache, + verbosity=0, delete=True) + self.check_cleanup_lock() diff --git a/beat/web/backend/tests/test_helpers.py b/beat/web/backend/tests/test_helpers.py new file mode 100755 index 
0000000000000000000000000000000000000000..e1fb8985bc992bccdf7d4fd4c93f312e1415b17c --- /dev/null +++ b/beat/web/backend/tests/test_helpers.py @@ -0,0 +1,3380 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : + +############################################################################### +# # +# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/ # +# Contact: beat.support@idiap.ch # +# # +# This file is part of the beat.web module of the BEAT platform. # +# # +# Commercial License Usage # +# Licensees holding valid commercial BEAT licenses may use this file in # +# accordance with the terms contained in a written agreement between you # +# and Idiap. For further information contact tto@idiap.ch # +# # +# Alternatively, this file may be used under the terms of the GNU Affero # +# Public License version 3 as published by the Free Software and appearing # +# in the file LICENSE.AGPL included in the packaging of this file. # +# The BEAT platform is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # +# or FITNESS FOR A PARTICULAR PURPOSE. # +# # +# You should have received a copy of the GNU Affero Public License along # +# with the BEAT platform. If not, see http://www.gnu.org/licenses/. # +# # +############################################################################### + +from django.conf import settings + +from .common import BaseBackendTestCase +from .common import ONE_QUEUE_TWO_WORKERS + +from ...experiments.models import Experiment +from ...experiments.models import Block +from ...experiments.models import CachedFile + +from ..models import Job +from ..models import JobSplit +from ..models import Result +from ..models import Worker +from ..models import Queue + +from ..helpers import schedule_experiment +from ..helpers import cancel_experiment +from ..helpers import split_new_jobs +from ..helpers import process_newly_cancelled_experiments +from ..helpers import is_cache_complete +from ..helpers import assign_splits_to_workers +from ..helpers import get_configuration_for_split +from ..helpers import on_split_started +from ..helpers import on_split_done +from ..helpers import on_split_fail +from ..helpers import on_split_cancelled + +from ..utils import setup_backend + +import beat.core.hash + +import os +import sys + + +#---------------------------------------------------------- + + +class ScheduleExperimentTest(BaseBackendTestCase): + + def check_pending_block_of_pending_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + experiment = block.experiment + + self.assertEqual(block.status, Block.PENDING) + self.assertTrue(block.creation_date is not None) + self.assertTrue(block.start_date is None) + self.assertTrue(block.end_date is None) + + self.assertEqual(experiment.status, Experiment.PENDING) + + + def check_pending_block_of_scheduled_experiment(self, block, runnable=True): + block.refresh_from_db() + block.job.refresh_from_db() + block.experiment.refresh_from_db() + + job = block.job + experiment = block.experiment + + self.assertEqual(block.status, Block.PENDING) + self.assertTrue(block.creation_date is not None) + self.assertTrue(block.start_date is None) + self.assertTrue(block.end_date is None) + + if runnable: + self.assertTrue(job.runnable_date is not None) + else: + self.assertTrue(job.runnable_date is None) + + self.assertTrue(job.start_date is None) + self.assertTrue(job.end_date is None) + + self.assertEqual(experiment.status, 
Experiment.SCHEDULED) + + + def check_done_block_of_scheduled_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + experiment = block.experiment + + self.assertEqual(block.status, Block.DONE) + self.assertTrue(block.creation_date is not None) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(experiment.status, Experiment.SCHEDULED) + + + def check_done_block_of_done_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + experiment = block.experiment + + self.assertEqual(block.status, Block.DONE) + self.assertTrue(block.creation_date is not None) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(experiment.status, Experiment.DONE) + + + def test_success(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.check_pending_block_of_pending_experiment(b0) + self.check_pending_block_of_pending_experiment(b1) + + self.assertEqual(Job.objects.count(), 0) + + schedule_experiment(xp) + + self.check_pending_block_of_scheduled_experiment(b0) + self.check_pending_block_of_scheduled_experiment(b1, runnable=False) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_first_block_in_cache(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + self.set_experiment_state( + xp, + cache_status={ + 'echo': CachedFile.CACHED, + } + ) + + schedule_experiment(xp) + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.check_done_block_of_scheduled_experiment(b0) + self.check_pending_block_of_scheduled_experiment(b1) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_all_blocks_in_cache(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + self.set_experiment_state( + xp, + cache_status={ + 'echo': CachedFile.CACHED, + 'analysis': CachedFile.CACHED, + } + ) + + schedule_experiment(xp) + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.check_done_block_of_done_experiment(b0) + self.check_done_block_of_done_experiment(b1) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_already_scheduled(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + xp.refresh_from_db() + + schedule_experiment(xp) + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.check_pending_block_of_scheduled_experiment(b0) + self.check_pending_block_of_scheduled_experiment(b1, runnable=False) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_running_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + self.set_experiment_state( + xp, + experiment_status=Experiment.RUNNING, + block_status={ + 'echo': Block.PROCESSING, + }, + cache_status={ + 'echo': CachedFile.PROCESSING, + } + ) + + schedule_experiment(xp) + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(b0.status, Block.PROCESSING) + self.assertEqual(b1.status, Block.PENDING) + 
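+        # the experiment keeps its RUNNING status: scheduling only applies to
+        # pending experiments, so the schedule_experiment() call was a no-op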
self.assertEqual(xp.status, Experiment.RUNNING) + + # schedule_experiment() didn't do anything + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_done_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + self.set_experiment_state( + xp, + experiment_status=Experiment.DONE, + block_status={ + 'echo': Block.DONE, + 'analysis': Block.DONE, + }, + cache_status={ + 'echo': CachedFile.CACHED, + 'analysis': CachedFile.CACHED, + } + ) + + schedule_experiment(xp) + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(b0.status, Block.DONE) + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(xp.status, Experiment.DONE) + + # schedule_experiment() didn't do anything + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_failed_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + self.set_experiment_state( + xp, + experiment_status=Experiment.FAILED, + block_status={ + 'echo': Block.FAILED, + }, + ) + + schedule_experiment(xp) + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(b0.status, Block.FAILED) + self.assertEqual(b1.status, Block.PENDING) + self.assertEqual(xp.status, Experiment.FAILED) + + # schedule_experiment() didn't do anything + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_cancelling_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + self.set_experiment_state( + xp, + experiment_status=Experiment.CANCELLING, + block_status={ + 'echo': Block.CANCELLED, + }, + ) + + schedule_experiment(xp) + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(b0.status, Block.CANCELLED) + self.assertEqual(b1.status, Block.PENDING) + self.assertEqual(xp.status, Experiment.CANCELLING) + + # schedule_experiment() didn't do anything + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + + def test_two_different_experiments(self): + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_add' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + b1_0 = xp1.blocks.all()[0] + b1_1 = xp1.blocks.all()[1] + b2_0 = xp2.blocks.all()[0] + b2_1 = xp2.blocks.all()[1] + + self.check_pending_block_of_scheduled_experiment(b1_0) + self.check_pending_block_of_scheduled_experiment(b1_1, runnable=False) + + self.check_pending_block_of_scheduled_experiment(b2_0) + self.check_pending_block_of_scheduled_experiment(b2_1, runnable=False) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 0) + + self.assertNotEqual(b1_0.job.key, b2_0.job.key) + + + def test_two_similar_experiments(self): + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_split_2' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + b1_0 = xp1.blocks.all()[0] + 
b1_1 = xp1.blocks.all()[1] + b2_0 = xp2.blocks.all()[0] + b2_1 = xp2.blocks.all()[1] + + self.check_pending_block_of_scheduled_experiment(b1_0) + self.check_pending_block_of_scheduled_experiment(b1_1, runnable=False) + + self.check_pending_block_of_scheduled_experiment(b2_0) + self.check_pending_block_of_scheduled_experiment(b2_1, runnable=False) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 0) + + self.assertEqual(b1_0.job.key, b2_0.job.key) + + +#---------------------------------------------------------- + + +class IsCacheCompleteTest(BaseBackendTestCase): + + def test_one_split(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ]) + + self.assertTrue(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 10)) + + + def test_two_splits(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + ]) + + self.assertTrue(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 20)) + + + def test_three_splits(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + ]) + + self.assertTrue(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_one_split_with_ranges(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [(0, 2), (3, 5), (6, 8)], + ]) + + self.assertTrue(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 3)) + + + def test_two_splits_with_ranges(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [(0, 2), (3, 5), (6, 8)], + [(9, 11), (12, 20)], + ]) + + self.assertTrue(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 5)) + + + def test_missing_indexes(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.index'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_missing_indexes_checksum(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.index.checksum'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_missing_data(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.data'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_missing_data_checksum(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.data.checksum'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_missing_indexes_and_data(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.index'))) + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, 
suffix='.0.9.index.checksum'))) + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.data'))) + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.0.9.data.checksum'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_splitted_missing_indexes(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.10.19.index'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_splitted_missing_indexes_checksum(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.10.19.index.checksum'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_splitted_missing_data(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.10.19.data'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_splitted_missing_data_checksum(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + ]) + + os.remove(os.path.join(settings.CACHE_ROOT, beat.core.hash.toPath(hash, suffix='.10.19.data.checksum'))) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + + def test_splitted_missing_indexes_and_data(self): + hash = '0123456789ABCDEF' + + self.generate_cached_files(hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [20, 21, 22, 23, 24], + ]) + + self.assertFalse(is_cache_complete(beat.core.hash.toPath(hash, suffix=''), 25)) + + +#---------------------------------------------------------- + + +class SplitNewJobsTest(BaseBackendTestCase): + + def check_split(self, split, split_index=None, start_index=None, end_index=None): + self.assertTrue(split.worker is None) + self.assertEqual(split.split_index, split_index) + self.assertEqual(split.start_index, start_index) + self.assertEqual(split.end_index, end_index) + self.assertEqual(split.status, JobSplit.QUEUED) + self.assertTrue(split.result is None) + self.assertTrue(split.start_date is None) + self.assertTrue(split.end_date is None) + + + def test_one_experiment_one_slot(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + split_new_jobs() + + self.assertEqual(JobSplit.objects.count(), 1) + + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + split = b0.job.splits.all()[0] + + self.check_split(split, split_index=0) + + + def test_one_experiment_two_slots(self): + fullname = 'user/user/single/1/single_split_2' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + b0 = xp.blocks.all()[0] + + schedule_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + 
self.assertEqual(JobSplit.objects.count(), 0) + + self.generate_cached_files(b0.inputs.all()[0].database.hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + split_new_jobs() + + self.assertEqual(JobSplit.objects.count(), 2) + + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + + self.assertEqual(b0.job.splits.count(), 2) + + split1 = b0.job.splits.all()[0] + split2 = b0.job.splits.all()[1] + + self.check_split(split1, split_index=0, start_index=0, end_index=4) + self.check_split(split2, split_index=1, start_index=5, end_index=9) + + + def test_two_different_experiments(self): + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_add' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 0) + + split_new_jobs() + + self.assertEqual(JobSplit.objects.count(), 2) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + b0 = xp1.blocks.all()[0] + split = b0.job.splits.all()[0] + + self.check_split(split, split_index=0) + + b0 = xp2.blocks.all()[0] + split = b0.job.splits.all()[0] + + self.check_split(split, split_index=0) + + + def test_two_similar_experiments(self): + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_split_2' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 0) + + b0 = xp1.blocks.all()[0] + + self.generate_cached_files(b0.inputs.all()[0].database.hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + split_new_jobs() + + self.assertEqual(JobSplit.objects.count(), 3) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + b0 = xp1.blocks.all()[0] + + self.assertEqual(b0.job.splits.count(), 1) + + split = b0.job.splits.all()[0] + + self.check_split(split, split_index=0) + + b0 = xp2.blocks.all()[0] + + self.assertEqual(b0.job.splits.count(), 2) + + split1 = b0.job.splits.all()[0] + split2 = b0.job.splits.all()[1] + + self.check_split(split1, split_index=0, start_index=0, end_index=4) + self.check_split(split2, split_index=1, start_index=5, end_index=9) + + + def test_one_experiment_two_uneven_slots(self): + fullname = 'user/user/single/1/single_split_2' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + b0 = xp.blocks.all()[0] + + schedule_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + self.generate_cached_files(b0.inputs.all()[0].database.hash, [ + [(0, 2), (3, 5), (6, 8)] + ]) + + split_new_jobs() + + self.assertEqual(JobSplit.objects.count(), 2) + + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + + self.assertEqual(b0.job.splits.count(), 2) + + split1 = b0.job.splits.all()[0] + split2 = b0.job.splits.all()[1] + + self.check_split(split1, split_index=0, start_index=0, end_index=5) + self.check_split(split2, split_index=1, start_index=6, end_index=8) + + + def test_one_experiment_too_much_slots(self): + fullname = 'user/user/single/1/single_split_10' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + b0 = xp.blocks.all()[0] + + schedule_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + 
self.generate_cached_files(b0.inputs.all()[0].database.hash, [ + [(0, 2), (3, 5), (6, 8)] + ]) + + split_new_jobs() + + self.assertEqual(JobSplit.objects.count(), 3) + + xp.refresh_from_db() + + b0 = xp.blocks.all()[0] + + self.assertEqual(b0.job.splits.count(), 3) + + split1 = b0.job.splits.all()[0] + split2 = b0.job.splits.all()[1] + split3 = b0.job.splits.all()[2] + + self.check_split(split1, split_index=0, start_index=0, end_index=2) + self.check_split(split2, split_index=1, start_index=3, end_index=5) + self.check_split(split3, split_index=2, start_index=6, end_index=8) + + + def test_similar_experiment_after_assignation(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.update(active=True) + + fullname = 'user/user/single/1/single' + + xp1 = Experiment.objects.get(name=fullname.split('/')[-1]) + xp2 = xp1.fork(name='single_fork') + + schedule_experiment(xp1) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 1) + self.assertEqual(len(assigned_splits), 1) + + schedule_experiment(xp2) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 1) + + split = assigned_splits[0] + + self.assertTrue(split.worker is not None) + self.assertFalse(split.job.mirror) + + mirror_job = Job.objects.get(block__experiment=xp2, runnable_date__isnull=False) + + self.assertEqual(split.job.key, mirror_job.key) + self.assertEqual(mirror_job.splits.count(), 0) + self.assertTrue(mirror_job.mirror) + + + def test_similar_experiment_during_processing(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.update(active=True) + + fullname = 'user/user/single/1/single' + + xp1 = Experiment.objects.get(name=fullname.split('/')[-1]) + xp2 = xp1.fork(name='single_fork') + + schedule_experiment(xp1) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 1) + self.assertEqual(len(assigned_splits), 1) + + split = assigned_splits[0] + + split.status = JobSplit.PROCESSING + split.save() + + schedule_experiment(xp2) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 1) + + split = assigned_splits[0] + + self.assertTrue(split.worker is not None) + self.assertFalse(split.job.mirror) + + mirror_job = Job.objects.get(block__experiment=xp2, runnable_date__isnull=False) + + self.assertEqual(split.job.key, mirror_job.key) + self.assertEqual(mirror_job.splits.count(), 0) + self.assertTrue(mirror_job.mirror) + + + def test_similar_experiment_after_completion_of_first_block(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + self.set_experiment_state( + xp, + cache_status={ + 'echo': CachedFile.CACHED, + } + ) + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.generate_cached_files(b0.outputs.all()[0].hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + split_new_jobs() + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 1) + + xp.refresh_from_db() + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(b0.status, Block.DONE) + self.assertEqual(b1.status, 
Block.PENDING) + + split = b1.job.splits.all()[0] + + self.check_split(split, split_index=0) + + + def test_similar_experiment_after_completion(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 0) + + self.set_experiment_state( + xp, + cache_status={ + 'echo': CachedFile.CACHED, + 'analysis': CachedFile.CACHED, + } + ) + + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.generate_cached_files(b0.outputs.all()[0].hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + self.generate_cached_files(b1.outputs.all()[0].hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + split_new_jobs() + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + xp.refresh_from_db() + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(b0.status, Block.DONE) + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(xp.status, Experiment.DONE) + + +#---------------------------------------------------------- + + +class AssignSplitsToWorkersTest(BaseBackendTestCase): + + def test_one_experiment_one_slot(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.filter(name='node1').update(active=True) + + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 1) + self.assertEqual(len(assigned_splits), 1) + + split = assigned_splits[0] + + self.assertTrue(split.worker is not None) + self.assertFalse(split.job.mirror) + + + def test_two_different_experiments_one_slot(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.filter(name='node1').update(active=True) + + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_add' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 2) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(len(assigned_splits), 1) + + split = assigned_splits[0] + + self.assertTrue(split.worker is not None) + self.assertFalse(split.job.mirror) + + + def test_two_different_experiments_two_slots(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.update(active=True) + + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_add' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 2) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(len(assigned_splits), 2) + + split1 = assigned_splits[0] + + self.assertTrue(split1.worker is not None) + self.assertFalse(split1.job.mirror) + + split2 = assigned_splits[1] + + self.assertTrue(split2.worker is not None) + self.assertFalse(split2.job.mirror) + + 
self.assertNotEqual(split1.job.key, split2.job.key) + self.assertNotEqual(split1.worker, split2.worker) + + + def test_two_different_experiments_two_slots_user_limitation(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.update(active=True) + Queue.objects.filter(name='queue').update(max_slots_per_user=1) + + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_add' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 2) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(len(assigned_splits), 1) + + split1 = assigned_splits[0] + + self.assertTrue(split1.worker is not None) + self.assertFalse(split1.job.mirror) + + + def test_two_similar_experiments_one_slot(self): + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.filter(name='node1').update(active=True) + + fullname = 'user/user/single/1/single' + + xp1 = Experiment.objects.get(name=fullname.split('/')[-1]) + xp2 = xp1.fork(name='single_fork') + + schedule_experiment(xp1) + schedule_experiment(xp2) + split_new_jobs() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 2) + + assigned_splits = assign_splits_to_workers() + + self.assertEqual(JobSplit.objects.count(), 1) + self.assertEqual(len(assigned_splits), 1) + + split = assigned_splits[0] + + self.assertTrue(split.worker is not None) + self.assertFalse(split.job.mirror) + + mirror_job = Job.objects.get(mirror=True) + + self.assertEqual(split.job.key, mirror_job.key) + self.assertEqual(mirror_job.splits.count(), 0) + + +#---------------------------------------------------------- + + +class GetConfigurationForSplitTest(BaseBackendTestCase): + + def setUp(self): + super(GetConfigurationForSplitTest, self).setUp() + + self.previous_DATASETS_UID = settings.DATASETS_UID + self.previous_DATASETS_ROOT_PATH = settings.DATASETS_ROOT_PATH + + settings.DATASETS_UID = None + settings.DATASETS_ROOT_PATH = None + + + def tearDown(self): + super(GetConfigurationForSplitTest, self).tearDown() + + settings.DATASETS_UID = self.previous_DATASETS_UID + settings.DATASETS_ROOT_PATH = self.previous_DATASETS_ROOT_PATH + + + def prepare_experiment(self, name): + xp = Experiment.objects.get(name=name.split('/')[-1]) + + schedule_experiment(xp) + + b0 = xp.blocks.all()[0] + + self.generate_cached_files(b0.inputs.all()[0].database.hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + split_new_jobs() + + xp.refresh_from_db() + + return xp + + + def test_one_split(self): + xp = self.prepare_experiment('user/user/single/1/single') + + b0 = xp.blocks.all()[0] + split = b0.job.splits.all()[0] + + configuration = get_configuration_for_split(split) + + self.assertTrue('datasets_uid' not in configuration) + self.assertTrue('datasets_root_path' not in configuration) + self.assertTrue('range' not in configuration) + + + def test_two_splits(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + + b0 = xp.blocks.all()[0] + split = b0.job.splits.all()[0] + + configuration = get_configuration_for_split(split) + + self.assertTrue('datasets_uid' not in configuration) + self.assertTrue('datasets_root_path' not in configuration) + self.assertTrue('range' in configuration) + + self.assertEqual(configuration['range'][0], 0) + 
self.assertEqual(configuration['range'][1], 4) + + split = b0.job.splits.all()[1] + + configuration = get_configuration_for_split(split) + + self.assertTrue('datasets_uid' not in configuration) + self.assertTrue('datasets_root_path' not in configuration) + self.assertTrue('range' in configuration) + + self.assertEqual(configuration['range'][0], 5) + self.assertEqual(configuration['range'][1], 9) + + + def test_dataset_uid(self): + settings.DATASETS_UID = 12345 + + xp = self.prepare_experiment('user/user/single/1/single') + + b0 = xp.blocks.all()[0] + split = b0.job.splits.all()[0] + + configuration = get_configuration_for_split(split) + + self.assertTrue('datasets_uid' in configuration) + self.assertTrue('datasets_root_path' not in configuration) + self.assertTrue('range' not in configuration) + + self.assertEqual(configuration['datasets_uid'], settings.DATASETS_UID) + + + def test_datasets_root_path(self): + settings.DATASETS_ROOT_PATH = '/some/path/' + + xp = self.prepare_experiment('user/user/single/1/single') + + b0 = xp.blocks.all()[0] + split = b0.job.splits.all()[0] + + configuration = get_configuration_for_split(split) + + self.assertTrue('datasets_uid' not in configuration) + self.assertTrue('datasets_root_path' in configuration) + self.assertTrue('range' not in configuration) + + self.assertEqual(configuration['datasets_root_path'], settings.DATASETS_ROOT_PATH) + + +#---------------------------------------------------------- + + +class SplitHelpersBaseTest(BaseBackendTestCase): + + def prepare_experiment(self, name, generate_cache=True): + xp = Experiment.objects.get(name=name.split('/')[-1]) + + schedule_experiment(xp) + + b0 = xp.blocks.all()[0] + + if generate_cache: + self.generate_cached_files(b0.inputs.all()[0].database.hash, [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ]) + + split_new_jobs() + + xp.refresh_from_db() + + return xp + + + def check_cached_file_statistics(self, reference, block, factor=1.0, cached=True): + cpu = reference['statistics']['cpu'] + memory = reference['statistics']['memory'] + data = reference['statistics']['data'] + + for cached_file in block.outputs.all(): + self.assertEqual(cached_file.status, CachedFile.CACHED if cached else CachedFile.NOT_CACHED) + self.assertEqual(cached_file.cpu_time, (cpu['user'] + cpu['system']) * factor) + self.assertEqual(cached_file.max_memory, memory['rss'] * factor) + self.assertEqual(cached_file.data_read_size, data['volume']['read'] * factor) + self.assertEqual(cached_file.data_read_nb_blocks, data['blocks']['read'] * factor) + self.assertEqual(cached_file.data_read_time, data['time']['read'] * factor) + self.assertEqual(cached_file.data_written_size, data['volume']['write'] * factor) + self.assertEqual(cached_file.data_written_nb_blocks, data['blocks']['write'] * factor) + self.assertEqual(cached_file.data_written_time, data['time']['write'] * factor) + + + def check_split_before_start(self, split, experiment_status=Experiment.SCHEDULED): + self.assertTrue(split.worker is not None) + self.assertEqual(split.status, JobSplit.QUEUED) + self.assertTrue(split.start_date is None) + self.assertTrue(split.end_date is None) + + self.check_job_before_start(split.job, experiment_status) + + + def check_job_before_start(self, job, experiment_status=Experiment.SCHEDULED): + self.assertTrue(job.start_date is None) + self.assertTrue(job.end_date is None) + + self.assertEqual(job.block.status, Block.PENDING) + self.assertTrue(job.block.start_date is None) + self.assertTrue(job.block.end_date is None) + + 
self.assertEqual(job.block.experiment.status, experiment_status) + + if experiment_status == Experiment.SCHEDULED: + self.assertTrue(job.block.experiment.start_date is None) + self.assertTrue(job.block.experiment.end_date is None) + + elif experiment_status == Experiment.RUNNING: + self.assertTrue(job.block.experiment.start_date is not None) + self.assertTrue(job.block.experiment.end_date is None) + + + def check_processing_split(self, split): + split.refresh_from_db() + split.job.refresh_from_db() + split.job.block.refresh_from_db() + split.job.block.experiment.refresh_from_db() + + job = split.job + block = job.block + xp = block.experiment + + self.assertEqual(split.status, JobSplit.PROCESSING) + self.assertTrue(split.start_date is not None) + self.assertTrue(split.end_date is None) + + self.assertTrue(job.start_date is not None) + self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(split.start_date >= job.start_date) + self.assertEqual(block.start_date, job.start_date) + self.assertTrue(job.start_date >= xp.start_date) + + + def check_queued_split_of_running_job(self, split): + split.refresh_from_db() + split.job.refresh_from_db() + split.job.block.refresh_from_db() + split.job.block.experiment.refresh_from_db() + + job = split.job + block = job.block + xp = block.experiment + + self.assertEqual(split.status, JobSplit.QUEUED) + self.assertTrue(split.start_date is None) + self.assertTrue(split.end_date is None) + + self.assertTrue(job.start_date is not None) + self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertEqual(xp.start_date, job.start_date) + + + def check_done_split_of_running_job(self, split): + split.refresh_from_db() + split.job.refresh_from_db() + split.job.block.refresh_from_db() + split.job.block.experiment.refresh_from_db() + + job = split.job + block = job.block + xp = block.experiment + + self.assertEqual(split.status, JobSplit.COMPLETED) + self.assertTrue(split.start_date is not None) + self.assertTrue(split.end_date is not None) + + self.assertTrue(job.start_date is not None) + self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(split.start_date >= job.start_date) + self.assertEqual(block.start_date, job.start_date) + self.assertEqual(xp.start_date, job.start_date) + + + def check_failed_split_of_running_job(self, split): + split.refresh_from_db() + split.job.refresh_from_db() + split.job.block.refresh_from_db() + split.job.block.experiment.refresh_from_db() + + job = split.job + block = job.block + xp = block.experiment + + self.assertEqual(split.status, JobSplit.FAILED) + self.assertTrue(split.start_date is not None) + self.assertTrue(split.end_date is not None) + + self.assertTrue(job.start_date is not None) + 
self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(split.end_date >= split.start_date) + self.assertTrue(split.start_date >= job.start_date) + self.assertTrue(job.start_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + + + def check_cancelling_split_of_cancelling_experiment( + self, split, experiment_status=Experiment.RUNNING): + split.refresh_from_db() + split.job.refresh_from_db() + split.job.block.refresh_from_db() + split.job.block.experiment.refresh_from_db() + + job = split.job + block = job.block + xp = block.experiment + + self.assertEqual(split.status, JobSplit.CANCELLING) + self.assertTrue(split.start_date is not None) + self.assertTrue(split.end_date is None) + + self.assertTrue(job.start_date is not None) + self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, experiment_status) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(split.start_date >= job.start_date) + self.assertTrue(job.start_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + + + def check_cancelled_split_of_cancelling_experiment( + self, split, experiment_status=Experiment.RUNNING): + + split.refresh_from_db() + split.job.refresh_from_db() + split.job.block.refresh_from_db() + split.job.block.experiment.refresh_from_db() + + job = split.job + block = job.block + xp = block.experiment + + self.assertEqual(split.status, JobSplit.CANCELLED) + self.assertTrue(split.start_date is not None) + self.assertTrue(split.end_date is not None) + + self.assertTrue(job.start_date is not None) + self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, experiment_status) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(split.end_date >= split.start_date) + self.assertTrue(split.start_date >= job.start_date) + self.assertTrue(job.start_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + + + def check_processing_mirror_job(self, job): + job.refresh_from_db() + job.block.refresh_from_db() + job.block.experiment.refresh_from_db() + + block = job.block + xp = block.experiment + + self.assertTrue(job.start_date is not None) + self.assertTrue(job.end_date is None) + + self.assertEqual(block.status, Block.PROCESSING) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertEqual(block.start_date, job.start_date) + self.assertEqual(xp.start_date, job.start_date) + + + def check_queued_job_of_running_experiment(self, job): + job.refresh_from_db() + job.block.refresh_from_db() + job.block.experiment.refresh_from_db() + + block = job.block + xp = block.experiment + + self.assertTrue(job.start_date is None) + self.assertTrue(job.end_date is None) + + 
self.assertEqual(block.status, Block.PENDING) + self.assertTrue(block.start_date is None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + + def check_done_block_of_running_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.DONE) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + + + def check_pending_block_of_running_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.PENDING) + self.assertTrue(block.start_date is None) + self.assertTrue(block.end_date is None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + + def check_done_block_of_complete_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.DONE) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, Experiment.DONE) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is not None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + self.assertTrue(xp.end_date >= block.end_date) + + + def check_failed_block_of_failed_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.FAILED) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, Experiment.FAILED) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is not None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + self.assertTrue(xp.end_date >= block.end_date) + + + def check_cancelled_block_of_failed_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.CANCELLED) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, Experiment.FAILED) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is not None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + self.assertTrue(xp.end_date >= block.end_date) + + + def check_failed_block_of_cancelling_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.FAILED) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, Experiment.RUNNING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= 
xp.start_date) + + + def check_cancelled_block_of_cancelling_experiment( + self, block, experiment_status=Experiment.RUNNING): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.CANCELLED) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, experiment_status) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + + + def check_cancelled_block_of_cancelled_experiment(self, block): + block.refresh_from_db() + block.experiment.refresh_from_db() + + xp = block.experiment + + self.assertEqual(block.status, Block.CANCELLED) + self.assertTrue(block.start_date is not None) + self.assertTrue(block.end_date is not None) + + self.assertEqual(xp.status, Experiment.PENDING) + self.assertTrue(xp.start_date is not None) + self.assertTrue(xp.end_date is not None) + + self.assertTrue(block.end_date >= block.start_date) + self.assertTrue(block.start_date >= xp.start_date) + self.assertTrue(xp.end_date >= block.end_date) + + +#---------------------------------------------------------- + + +class OnSplitStartedTest(SplitHelpersBaseTest): + + def test_one_split(self): + xp = self.prepare_experiment('user/user/single/1/single') + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.check_split_before_start(split) + + on_split_started(split) + + self.check_processing_split(split) + + + def test_two_splits(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + assigned_splits = assign_splits_to_workers() + + split0 = assigned_splits[0] + split1 = assigned_splits[1] + + self.check_split_before_start(split0) + self.check_split_before_start(split1) + + on_split_started(split0) + + self.check_processing_split(split0) + self.check_queued_split_of_running_job(split1) + + on_split_started(split1) + + self.check_processing_split(split0) + self.check_processing_split(split1) + + split0.refresh_from_db() + split1.refresh_from_db() + + self.assertTrue(split1.start_date > split0.start_date) + + + def test_mirror_jobs_one_split(self): + xp1 = self.prepare_experiment('user/user/single/1/single') + xp2 = self.prepare_experiment('user/user/single/1/single_split_2', + generate_cache=False) + + assigned_splits = assign_splits_to_workers() + + b1 = xp1.blocks.all()[0] + b2 = xp2.blocks.all()[0] + split = assigned_splits[0] + + self.assertEqual(split.job, b1.job) + + self.check_split_before_start(split) + self.check_job_before_start(b2.job) + + on_split_started(split) + + self.check_processing_split(split) + self.check_processing_mirror_job(b2.job) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + self.assertEqual(xp2.start_date, xp1.start_date) + + + def test_mirror_jobs_two_splits(self): + xp1 = self.prepare_experiment('user/user/single/1/single_split_2') + xp2 = self.prepare_experiment('user/user/single/1/single', + generate_cache=False) + + assigned_splits = assign_splits_to_workers() + + b1 = xp1.blocks.all()[0] + b2 = xp2.blocks.all()[0] + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + self.assertEqual(split1.job, b1.job) + self.assertEqual(split2.job, b1.job) + + self.check_split_before_start(split1) + self.check_split_before_start(split2) + self.check_job_before_start(b2.job) + + on_split_started(split1) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + 
self.check_processing_split(split1) + self.check_queued_split_of_running_job(split2) + self.check_processing_mirror_job(b2.job) + + self.assertEqual(xp2.start_date, xp1.start_date) + + on_split_started(split2) + + self.check_processing_split(split1) + self.check_processing_split(split2) + self.check_processing_mirror_job(b2.job) + + xp1.refresh_from_db() + xp2.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.assertTrue(split2.start_date > split1.start_date) + self.assertEqual(xp2.start_date, xp1.start_date) + + +#---------------------------------------------------------- + + +class OnSplitDoneTest(SplitHelpersBaseTest): + + RESULT = dict( + status = 0, + stdout = '', + stderr = '', + user_error = '', + statistics = dict( + data = dict( + volume = dict( + read = 100, + write = 50 + ), + blocks = dict( + read = 10, + write = 5 + ), + time = dict( + read = 20, + write = 15 + ), + network = dict( + wait_time = 4 + ), + files = [], + ), + cpu = dict( + user = 30.0, + system = 40.0, + total = 80.0, + percent = 0.0, + processors = 1, + ), + memory = dict( + rss = 1024, + limit = 2048, + percent = 50.0, + ), + ) + ) + + + def check_cached_file_statistics(self, block, factor=1.0): + super(OnSplitDoneTest, self).check_cached_file_statistics( + OnSplitDoneTest.RESULT, block, factor=factor + ) + + + def test_one_split_per_block(self): + xp = self.prepare_experiment('user/user/single/1/single') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(Job.objects.count(), 2) + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.check_split_before_start(split) + + self.assertEqual(split.job.block, b0) + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.check_processing_split(split) + + on_split_done(split, OnSplitDoneTest.RESULT) + + self.assertEqual(Job.objects.count(), 1) + + self.check_done_block_of_running_experiment(b0) + self.check_pending_block_of_running_experiment(b1) + + self.check_cached_file_statistics(b0) + + # Block 1 + split_new_jobs() + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.check_split_before_start(split, experiment_status=Experiment.RUNNING) + + self.assertEqual(split.job.block, b1) + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.check_processing_split(split) + + on_split_done(split, OnSplitDoneTest.RESULT) + + self.assertEqual(Job.objects.count(), 0) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.check_done_block_of_complete_experiment(b0) + self.check_done_block_of_complete_experiment(b1) + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + + self.check_cached_file_statistics(b1) + + + def test_two_splits_for_first_block_sequential(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(Job.objects.count(), 2) + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + self.check_split_before_start(split1) + self.check_split_before_start(split2) + + self.assertEqual(split1.job.block, b0) + self.assertEqual(split2.job.block, b0) + + # Block 0 - Split 0 + on_split_started(split1) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + 
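# only the split that was started is processing; its sibling remains queued +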
self.check_processing_split(split1) + + on_split_done(split1, OnSplitDoneTest.RESULT) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(Job.objects.count(), 2) + + self.check_done_split_of_running_job(split1) + self.check_queued_split_of_running_job(split2) + self.check_queued_job_of_running_experiment(b1.job) + + self.assertEqual(xp.start_date, b0.start_date) + + # Block 0 - Split 1 + on_split_started(split2) + + xp.refresh_from_db() + split2.refresh_from_db() + + self.check_processing_split(split2) + + on_split_done(split2, OnSplitDoneTest.RESULT) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(JobSplit.objects.count(), 0) + self.assertEqual(Job.objects.count(), 1) + + self.check_done_block_of_running_experiment(b0) + self.check_queued_job_of_running_experiment(b1.job) + + self.assertEqual(xp.start_date, b0.start_date) + + self.check_cached_file_statistics(b0, factor=2) + + # Block 1 + split_new_jobs() + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.check_split_before_start(split, experiment_status=Experiment.RUNNING) + + self.assertEqual(split.job.block, b1) + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.check_processing_split(split) + + on_split_done(split, OnSplitDoneTest.RESULT) + + self.assertEqual(Job.objects.count(), 0) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.check_done_block_of_complete_experiment(b0) + self.check_done_block_of_complete_experiment(b1) + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + + self.check_cached_file_statistics(b1) + + + def test_two_splits_for_first_block_in_parallel(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(Job.objects.count(), 2) + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + self.check_split_before_start(split1) + self.check_split_before_start(split2) + + self.assertEqual(split1.job.block, b0) + self.assertEqual(split2.job.block, b0) + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.check_processing_split(split1) + self.check_processing_split(split2) + + on_split_done(split1, OnSplitDoneTest.RESULT) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(JobSplit.objects.count(), 2) + self.assertEqual(Job.objects.count(), 2) + + self.check_done_split_of_running_job(split1) + self.check_processing_split(split2) + self.check_queued_job_of_running_experiment(b1.job) + + self.assertEqual(xp.start_date, b0.start_date) + + on_split_done(split2, OnSplitDoneTest.RESULT) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(JobSplit.objects.count(), 0) + self.assertEqual(Job.objects.count(), 1) + + self.check_done_block_of_running_experiment(b0) + self.check_queued_job_of_running_experiment(b1.job) + + self.assertEqual(xp.start_date, b0.start_date) + + self.check_cached_file_statistics(b0, factor=2) + + # Block 1 + split_new_jobs() + assigned_splits = 
assign_splits_to_workers() + + split = assigned_splits[0] + + self.check_split_before_start(split, experiment_status=Experiment.RUNNING) + + self.assertEqual(split.job.block, b1) + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.check_processing_split(split) + + on_split_done(split, OnSplitDoneTest.RESULT) + + self.assertEqual(Job.objects.count(), 0) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.check_done_block_of_complete_experiment(b0) + self.check_done_block_of_complete_experiment(b1) + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + + self.check_cached_file_statistics(b1) + + + def test_mirror_experiments_one_split_per_block(self): + xp1 = self.prepare_experiment('user/user/single/1/single') + xp2 = self.prepare_experiment('user/user/single/1/single_split_2', + generate_cache=False) + b1_0 = xp1.blocks.all()[0] + b1_1 = xp1.blocks.all()[1] + b2_0 = xp2.blocks.all()[0] + b2_1 = xp2.blocks.all()[1] + + self.assertEqual(Job.objects.count(), 4) + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.assertEqual(split.job.block, b1_0) + + self.check_split_before_start(split) + self.check_job_before_start(b2_0.job) + + on_split_started(split) + + self.check_processing_split(split) + + on_split_done(split, OnSplitDoneTest.RESULT) + + self.assertEqual(Job.objects.count(), 2) + + self.check_done_block_of_running_experiment(b1_0) + self.check_pending_block_of_running_experiment(b1_1) + + self.check_done_block_of_running_experiment(b2_0) + self.check_pending_block_of_running_experiment(b2_1) + + self.check_cached_file_statistics(b1_0) + + # Block 1 + split_new_jobs() + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.assertEqual(split.job.block, b1_1) + + self.check_split_before_start(split, experiment_status=Experiment.RUNNING) + self.check_job_before_start(b2_1.job, experiment_status=Experiment.RUNNING) + + on_split_started(split) + + self.check_processing_split(split) + + on_split_done(split, OnSplitDoneTest.RESULT) + + self.assertEqual(Job.objects.count(), 0) + + xp1.refresh_from_db() + xp2.refresh_from_db() + b1_0.refresh_from_db() + b1_1.refresh_from_db() + b2_0.refresh_from_db() + b2_1.refresh_from_db() + + self.check_done_block_of_complete_experiment(b1_0) + self.check_done_block_of_complete_experiment(b1_1) + self.check_done_block_of_complete_experiment(b2_0) + self.check_done_block_of_complete_experiment(b2_1) + + self.assertEqual(xp1.start_date, b1_0.start_date) + self.assertEqual(xp1.end_date, b1_1.end_date) + + self.assertEqual(xp2.start_date, b2_0.start_date) + self.assertEqual(xp2.end_date, b2_1.end_date) + + self.assertEqual(xp2.start_date, xp1.start_date) + self.assertEqual(xp2.end_date, xp1.end_date) + + self.check_cached_file_statistics(b1_1) + + +#---------------------------------------------------------- + + +class OnSplitFailTest(SplitHelpersBaseTest): + + RESULT = dict( + status = 1, + stdout = '', + stderr = '', + user_error = 'Some error', + statistics = dict( + data = dict( + volume = dict( + read = 100, + write = 50 + ), + blocks = dict( + read = 10, + write = 5 + ), + time = dict( + read = 20, + write = 15 + ), + network = dict( + wait_time = 4 + ), + files = [], + ), + cpu = dict( + user = 30.0, + system = 40.0, + total = 80.0, + percent = 0.0, + processors = 1, + ), + memory = dict( + rss = 1024, + limit = 2048, + percent = 50.0, + ), + ) + ) + + + def 
check_cached_file_statistics(self, block, factor=1.0, cached=True): + super(OnSplitFailTest, self).check_cached_file_statistics( + OnSplitFailTest.RESULT, block, factor=factor, cached=cached + ) + + + def test_system_error(self): + xp = self.prepare_experiment('user/user/single/1/single_error') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + + splits_to_cancel = on_split_fail(split, "Some error") + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + self.check_cached_file_statistics(b0, cached=False, factor=0) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_one_split_per_block(self): + xp = self.prepare_experiment('user/user/single/1/single_error') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + + splits_to_cancel = on_split_fail(split, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + self.check_cached_file_statistics(b0, cached=False) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_two_splits_only_one_running(self): + xp = self.prepare_experiment('user/user/single/1/single_error_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + xp.refresh_from_db() + split.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = on_split_fail(split, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + self.check_cached_file_statistics(b0, cached=False) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_two_running_splits(self): + xp = self.prepare_experiment('user/user/single/1/single_error_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 
2) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 1) + self.assertEqual(splits_to_cancel[0], split2) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + self.check_failed_split_of_running_job(split1) + self.check_cancelling_split_of_cancelling_experiment(split2) + self.check_queued_job_of_running_experiment(b1.job) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_two_splits_with_one_failure_and_one_success(self): + xp = self.prepare_experiment('user/user/single/1/single_error_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 1) + self.assertEqual(splits_to_cancel[0], split2) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + self.check_failed_split_of_running_job(split1) + self.check_cancelling_split_of_cancelling_experiment(split2) + self.check_queued_job_of_running_experiment(b1.job) + + # Surprise: split2 successfully finished before we could cancel it + on_split_done(split2, OnSplitFailTest.RESULT) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + self.check_cached_file_statistics(b0, factor=2, cached=False) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_two_splits_with_one_success_and_one_failure(self): + xp = self.prepare_experiment('user/user/single/1/single_error_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + on_split_done(split1, OnSplitFailTest.RESULT) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + self.check_done_split_of_running_job(split1) + + splits_to_cancel = on_split_fail(split2, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + self.check_cached_file_statistics(b0, factor=2, cached=False) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def 
test_two_splits_both_failing(self): + xp = self.prepare_experiment('user/user/single/1/single_error_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 1) + self.assertEqual(splits_to_cancel[0], split2) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + self.check_failed_split_of_running_job(split1) + self.check_cancelling_split_of_cancelling_experiment(split2) + self.check_queued_job_of_running_experiment(b1.job) + + # Surprise: split2 failed before we could cancel it + splits_to_cancel = on_split_fail(split2, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + self.check_cached_file_statistics(b0, factor=2, cached=False) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_mirror_experiments_one_split_per_block(self): + xp1 = self.prepare_experiment('user/user/single/1/single_error') + xp2 = self.prepare_experiment('user/user/single/1/single_error_split_2', + generate_cache=False) + b1_0 = xp1.blocks.all()[0] + b1_1 = xp1.blocks.all()[1] + b2_0 = xp2.blocks.all()[0] + b2_1 = xp2.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 1) + + splits_to_cancel = on_split_fail(split, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b1_0) + self.check_cancelled_block_of_failed_experiment(b1_1) + + self.check_failed_block_of_failed_experiment(b2_0) + self.check_cancelled_block_of_failed_experiment(b2_1) + + self.check_cached_file_statistics(b1_0, cached=False) + + xp1.refresh_from_db() + xp2.refresh_from_db() + b1_0.refresh_from_db() + b1_1.refresh_from_db() + b2_0.refresh_from_db() + b2_1.refresh_from_db() + + self.assertEqual(xp1.start_date, b1_0.start_date) + self.assertEqual(xp1.end_date, b1_1.end_date) + + self.assertEqual(xp2.start_date, b2_0.start_date) + self.assertEqual(xp2.end_date, b2_1.end_date) + + self.assertEqual(xp2.start_date, xp1.start_date) + self.assertTrue(xp2.end_date >= xp1.end_date) + + + def test_two_blocks_in_parallel_one_split_per_block_only_one_running(self): + xp = self.prepare_experiment('user/user/triangle/1/triangle') + b1 = xp.blocks.get(name='echo1') + b2 = xp.blocks.get(name='echo2') + b3 = xp.blocks.get(name='echo3') + b4 = xp.blocks.get(name='analysis') + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + + xp.refresh_from_db() + 
split1.refresh_from_db() + + self.assertEqual(Job.objects.count(), 4) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b1) + self.check_cancelled_block_of_failed_experiment(b2) + self.check_cancelled_block_of_failed_experiment(b3) + self.check_cancelled_block_of_failed_experiment(b4) + + self.check_cached_file_statistics(b1, cached=False) + + xp.refresh_from_db() + b1.refresh_from_db() + b2.refresh_from_db() + b3.refresh_from_db() + b4.refresh_from_db() + + self.assertEqual(xp.start_date, b1.start_date) + self.assertEqual(b2.start_date, b2.end_date) + self.assertEqual(b3.start_date, b3.end_date) + self.assertEqual(b4.start_date, b4.end_date) + self.assertEqual(xp.end_date, b4.end_date) + + + def test_two_blocks_in_parallel_one_split_per_block_both_running(self): + xp = self.prepare_experiment('user/user/triangle/1/triangle') + b1 = xp.blocks.get(name='echo1') + b2 = xp.blocks.get(name='echo2') + b3 = xp.blocks.get(name='echo3') + b4 = xp.blocks.get(name='analysis') + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 4) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 1) + + self.assertEqual(Job.objects.count(), 1) + + split2.refresh_from_db() + + self.assertEqual(splits_to_cancel[0], split2) + + self.check_failed_block_of_cancelling_experiment(b1) + self.check_cancelling_split_of_cancelling_experiment(split2) + self.check_cancelled_block_of_cancelling_experiment(b3) + self.check_cancelled_block_of_cancelling_experiment(b4) + + self.check_cached_file_statistics(b1, cached=False) + + xp.refresh_from_db() + b1.refresh_from_db() + b2.refresh_from_db() + b3.refresh_from_db() + b4.refresh_from_db() + + self.assertEqual(xp.start_date, b1.start_date) + self.assertEqual(b3.start_date, b3.end_date) + self.assertEqual(b4.start_date, b4.end_date) + + +#---------------------------------------------------------- + + +class CancelExperimentTest(SplitHelpersBaseTest): + + def test_pending_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + cancel_experiment(xp) + + # No change + self.assertEqual(xp.status, Experiment.PENDING) + + + def test_scheduled_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + + cancel_experiment(xp) + + xp.refresh_from_db() + + self.assertEqual(xp.status, Experiment.CANCELLING) + + + def test_running_experiment(self): + xp = self.prepare_experiment('user/user/single/1/single') + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + cancel_experiment(xp) + + xp.refresh_from_db() + + self.assertEqual(xp.status, Experiment.CANCELLING) + + + def test_done_experiment(self): + xp = self.prepare_experiment('user/user/single/1/single') + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + on_split_done(split, OnSplitDoneTest.RESULT) + + split_new_jobs() + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + 
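# finish the last block: the experiment becomes DONE and cancelling it must be a no-op +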
on_split_done(split, OnSplitDoneTest.RESULT) + + cancel_experiment(xp) + + xp.refresh_from_db() + + self.assertEqual(xp.status, Experiment.DONE) + + + def test_failed_experiment(self): + xp = self.prepare_experiment('user/user/single/1/single') + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + on_split_fail(split, OnSplitFailTest.RESULT) + + cancel_experiment(xp) + + xp.refresh_from_db() + + self.assertEqual(xp.status, Experiment.FAILED) + + + def test_cancelling_experiment(self): + fullname = 'user/user/single/1/single' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + + cancel_experiment(xp) + + cancel_experiment(xp) + + xp.refresh_from_db() + + self.assertEqual(xp.status, Experiment.CANCELLING) + + +#---------------------------------------------------------- + + +class ProcessNewlyCancelledExperimentsTest(SplitHelpersBaseTest): + + def test_no_split_running(self): + xp = self.prepare_experiment('user/user/single/1/single') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + cancel_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 0) + + self.check_cancelled_block_of_cancelled_experiment(b0) + self.check_cancelled_block_of_cancelled_experiment(b1) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(b1.start_date, b1.end_date) + self.assertEqual(xp.end_date, b1.end_date) + + + def test_one_split_running(self): + xp = self.prepare_experiment('user/user/single/1/single') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + cancel_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 1) + + self.assertEqual(Job.objects.count(), 1) + + self.check_cancelling_split_of_cancelling_experiment(split, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b1, experiment_status=Experiment.CANCELLING) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_one_split_of_two_in_a_block_running(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + + cancel_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 1) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 2) + + self.check_cancelling_split_of_cancelling_experiment(split1, experiment_status=Experiment.CANCELLING) + self.check_cancelled_split_of_cancelling_experiment(split2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b1, experiment_status=Experiment.CANCELLING) + + 
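# b1 never started, so cancellation sets its start and end dates to the same instant +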
xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_two_splits_of_same_block_running(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + cancel_experiment(xp) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 2) + + self.assertEqual(Job.objects.count(), 1) + + self.check_cancelling_split_of_cancelling_experiment(split1, experiment_status=Experiment.CANCELLING) + self.check_cancelling_split_of_cancelling_experiment(split2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b1, experiment_status=Experiment.CANCELLING) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_two_splits_of_different_blocks_running(self): + xp = self.prepare_experiment('user/user/triangle/1/triangle') + b0 = xp.blocks.get(name='echo1') + b1 = xp.blocks.get(name='echo2') + b2 = xp.blocks.get(name='echo3') + b3 = xp.blocks.get(name='analysis') + + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + cancel_experiment(xp) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 2) + + self.assertEqual(Job.objects.count(), 2) + + self.check_cancelling_split_of_cancelling_experiment(split1, experiment_status=Experiment.CANCELLING) + self.check_cancelling_split_of_cancelling_experiment(split2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b3, experiment_status=Experiment.CANCELLING) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + b2.refresh_from_db() + b3.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b2.start_date, b2.end_date) + self.assertEqual(b3.start_date, b3.end_date) + + + def test_mirror_block(self): + xp1 = self.prepare_experiment('user/user/single/1/single') + xp2 = self.prepare_experiment('user/user/single/1/single_split_2', + generate_cache=False) + b1_0 = xp1.blocks.all()[0] + b1_1 = xp1.blocks.all()[1] + + b2_0 = xp2.blocks.all()[0] + b2_1 = xp2.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.assertEqual(split.job.block, b1_0) + + on_split_started(split) + + cancel_experiment(xp2) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 1) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + self.check_cancelled_block_of_cancelled_experiment(b2_0) + self.check_cancelled_block_of_cancelled_experiment(b2_1) + + 
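# the original experiment is untouched: its split keeps running and its next job stays queued +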
self.check_processing_split(split) + self.check_queued_job_of_running_experiment(b1_1.job) + + xp2.refresh_from_db() + b2_0.refresh_from_db() + b2_1.refresh_from_db() + + self.assertEqual(xp2.start_date, b2_0.start_date) + self.assertEqual(b2_1.start_date, b2_1.end_date) + + + def test_mirrored_block(self): + xp1 = self.prepare_experiment('user/user/single/1/single') + xp2 = self.prepare_experiment('user/user/single/1/single_split_2', + generate_cache=False) + b1_0 = xp1.blocks.all()[0] + b1_1 = xp1.blocks.all()[1] + + b2_0 = xp2.blocks.all()[0] + b2_1 = xp2.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + self.assertEqual(split.job.block, b1_0) + + on_split_started(split) + + cancel_experiment(xp1) + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 1) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 0) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 1) + + split.refresh_from_db() + self.assertEqual(split.job.block, b2_0) + + self.check_cancelled_block_of_cancelled_experiment(b1_0) + self.check_cancelled_block_of_cancelled_experiment(b1_1) + + self.check_processing_split(split) + self.check_queued_job_of_running_experiment(b2_1.job) + + xp1.refresh_from_db() + b1_0.refresh_from_db() + b1_1.refresh_from_db() + + self.assertEqual(xp1.start_date, b1_0.start_date) + self.assertEqual(b1_1.start_date, b1_1.end_date) + + +#---------------------------------------------------------- + + +class OnSplitCancelledTest(SplitHelpersBaseTest): + + def check_cached_file_statistics(self, block, factor=1.0, cached=True): + super(OnSplitCancelledTest, self).check_cached_file_statistics( + OnSplitFailTest.RESULT, block, factor=factor, cached=cached + ) + + + def test_failure_in_sibling_split(self): + xp = self.prepare_experiment('user/user/single/1/single_error_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + # Block 0 + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split1.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 1) + self.assertEqual(splits_to_cancel[0], split2) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + self.check_failed_split_of_running_job(split1) + self.check_cancelling_split_of_cancelling_experiment(split2) + self.check_queued_job_of_running_experiment(b1.job) + + on_split_cancelled(split2) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b0) + self.check_cancelled_block_of_failed_experiment(b1) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(xp.end_date, b1.end_date) + self.assertEqual(b1.start_date, b1.end_date) + + + def test_failure_in_parallel_block(self): + xp = self.prepare_experiment('user/user/triangle/1/triangle') + b1 = xp.blocks.get(name='echo1') + b2 = xp.blocks.get(name='echo2') + b3 = xp.blocks.get(name='echo3') + b4 = xp.blocks.get(name='analysis') + + # Block 0 + assigned_splits = 
assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + xp.refresh_from_db() + split2.refresh_from_db() + + self.assertEqual(Job.objects.count(), 4) + self.assertEqual(JobSplit.objects.count(), 2) + + splits_to_cancel = on_split_fail(split1, OnSplitFailTest.RESULT) + + self.assertEqual(len(splits_to_cancel), 1) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 1) + + split2.refresh_from_db() + + self.assertEqual(splits_to_cancel[0], split2) + + self.check_failed_block_of_cancelling_experiment(b1) + self.check_cancelling_split_of_cancelling_experiment(split2) + self.check_cancelled_block_of_cancelling_experiment(b3) + self.check_cancelled_block_of_cancelling_experiment(b4) + + self.check_cached_file_statistics(b1, cached=False) + + on_split_cancelled(split2) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_failed_block_of_failed_experiment(b1) + self.check_cancelled_block_of_failed_experiment(b2) + self.check_cancelled_block_of_failed_experiment(b3) + self.check_cancelled_block_of_failed_experiment(b4) + + xp.refresh_from_db() + b1.refresh_from_db() + b2.refresh_from_db() + b3.refresh_from_db() + b4.refresh_from_db() + + self.assertEqual(xp.start_date, b1.start_date) + self.assertEqual(b3.start_date, b3.end_date) + self.assertEqual(b4.start_date, b4.end_date) + self.assertEqual(xp.end_date, b2.end_date) + + + def test_cancelled_experiment_one_split_running(self): + xp = self.prepare_experiment('user/user/single/1/single') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split = assigned_splits[0] + + on_split_started(split) + + cancel_experiment(xp) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 1) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 1) + + on_split_cancelled(splits_to_cancel[0]) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_cancelled_block_of_cancelled_experiment(b0) + self.check_cancelled_block_of_cancelled_experiment(b1) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b1.start_date, b1.end_date) + self.assertEqual(xp.end_date, b0.end_date) + + + def test_cancelled_experiment_one_split_of_two_in_a_block_running(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + + cancel_experiment(xp) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 1) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 2) + + on_split_cancelled(splits_to_cancel[0]) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_cancelled_block_of_cancelled_experiment(b0) + self.check_cancelled_block_of_cancelled_experiment(b1) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b1.start_date, b1.end_date) + self.assertEqual(xp.end_date, b0.end_date) + + + def 
test_cancelled_experiment_two_splits_of_same_block_running(self): + xp = self.prepare_experiment('user/user/single/1/single_split_2') + b0 = xp.blocks.all()[0] + b1 = xp.blocks.all()[1] + + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + cancel_experiment(xp) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 2) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 2) + + on_split_cancelled(split1) + + self.check_cancelled_split_of_cancelling_experiment(split1, experiment_status=Experiment.CANCELLING) + self.check_cancelling_split_of_cancelling_experiment(split2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b1, experiment_status=Experiment.CANCELLING) + + on_split_cancelled(split2) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_cancelled_block_of_cancelled_experiment(b0) + self.check_cancelled_block_of_cancelled_experiment(b1) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b1.start_date, b1.end_date) + self.assertEqual(xp.end_date, b0.end_date) + + + def test_two_splits_of_different_blocks_running(self): + xp = self.prepare_experiment('user/user/triangle/1/triangle') + b0 = xp.blocks.get(name='echo1') + b1 = xp.blocks.get(name='echo2') + b2 = xp.blocks.get(name='echo3') + b3 = xp.blocks.get(name='analysis') + + assigned_splits = assign_splits_to_workers() + + split1 = assigned_splits[0] + split2 = assigned_splits[1] + + on_split_started(split1) + on_split_started(split2) + + cancel_experiment(xp) + + splits_to_cancel = process_newly_cancelled_experiments() + + self.assertEqual(len(splits_to_cancel), 2) + + self.assertEqual(Job.objects.count(), 2) + self.assertEqual(JobSplit.objects.count(), 2) + + on_split_cancelled(split1) + + self.assertEqual(Job.objects.count(), 1) + self.assertEqual(JobSplit.objects.count(), 1) + + self.check_cancelled_block_of_cancelling_experiment(b0, experiment_status=Experiment.CANCELLING) + self.check_cancelling_split_of_cancelling_experiment(split2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b2, experiment_status=Experiment.CANCELLING) + self.check_cancelled_block_of_cancelling_experiment(b3, experiment_status=Experiment.CANCELLING) + + on_split_cancelled(split2) + + self.assertEqual(Job.objects.count(), 0) + self.assertEqual(JobSplit.objects.count(), 0) + + self.check_cancelled_block_of_cancelled_experiment(b0) + self.check_cancelled_block_of_cancelled_experiment(b1) + self.check_cancelled_block_of_cancelled_experiment(b2) + self.check_cancelled_block_of_cancelled_experiment(b3) + + xp.refresh_from_db() + b0.refresh_from_db() + b1.refresh_from_db() + b2.refresh_from_db() + b3.refresh_from_db() + + self.assertEqual(xp.start_date, b0.start_date) + self.assertEqual(b2.start_date, b2.end_date) + self.assertEqual(b3.start_date, b3.end_date) + self.assertEqual(xp.end_date, b1.end_date) diff --git a/beat/web/backend/tests/test_scheduler.py b/beat/web/backend/tests/test_scheduler.py new file mode 100755 index 0000000000000000000000000000000000000000..9652820be855d0397bd3bb73785670a962484aa5 --- /dev/null +++ b/beat/web/backend/tests/test_scheduler.py @@ -0,0 +1,569 @@ +#!/usr/bin/env python +# vim: set 
fileencoding=utf-8 : + +############################################################################### +# # +# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/ # +# Contact: beat.support@idiap.ch # +# # +# This file is part of the beat.web module of the BEAT platform. # +# # +# Commercial License Usage # +# Licensees holding valid commercial BEAT licenses may use this file in # +# accordance with the terms contained in a written agreement between you # +# and Idiap. For further information contact tto@idiap.ch # +# # +# Alternatively, this file may be used under the terms of the GNU Affero # +# Public License version 3 as published by the Free Software and appearing # +# in the file LICENSE.AGPL included in the packaging of this file. # +# The BEAT platform is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # +# or FITNESS FOR A PARTICULAR PURPOSE. # +# # +# You should have received a copy of the GNU Affero Public License along # +# with the BEAT platform. If not, see http://www.gnu.org/licenses/. # +# # +############################################################################### + +import multiprocessing +from time import time +from time import sleep + +from django.conf import settings +from django.test import TransactionTestCase + +from .common import BaseBackendTestCase +from .common import BackendUtilitiesMixin +from .common import ONE_QUEUE_TWO_WORKERS + +from ..models import Worker + +from ..utils import setup_backend +from ..helpers import schedule_experiment +from ..helpers import cancel_experiment + +from ...scripts import scheduler +from ...experiments.models import Experiment +from ...experiments.models import Block + +from beat.core.scripts import worker + + +#---------------------------------------------------------- + + +# class SchedulerThread(threading.Thread): +class SchedulerThread(multiprocessing.Process): + + def __init__(self, queue, arguments): + super(SchedulerThread, self).__init__() + + self.queue = queue + self.arguments = arguments + + + def run(self): + self.queue.put('STARTED') + scheduler.main(self.arguments) + + +#---------------------------------------------------------- + + +# class WorkerThread(threading.Thread): +class WorkerThread(multiprocessing.Process): + + def __init__(self, queue, arguments): + super(WorkerThread, self).__init__() + + self.queue = queue + self.arguments = arguments + + + def run(self): + self.queue.put('STARTED') + worker.main(self.arguments) + + +#---------------------------------------------------------- + + +class TestSchedulerBase(TransactionTestCase, BackendUtilitiesMixin): + + def __init__(self, methodName='runTest'): + super(TestSchedulerBase, self).__init__(methodName) + self.scheduler_thread = None + self.worker_threads = {} + + + def setUp(self): + self.tearDown() + + TestSchedulerBase.setup_test_data() + setup_backend(ONE_QUEUE_TWO_WORKERS) + Worker.objects.update(active=False) + + + def tearDown(self): + for name in list(self.worker_threads.keys()): + self.stop_worker(name) + + self.worker_threads = {} + + self.stop_scheduler() + + + def start_scheduler(self): + args = [ + '--settings=beat.web.settings.test', + '--interval=1', + '--address=127.0.0.1', + '--port=50000', + ] + + self.scheduler_thread = SchedulerThread(multiprocessing.Queue(), args) + self.scheduler_thread.start() + self.scheduler_thread.queue.get() + + + def stop_scheduler(self): + if self.scheduler_thread is not None: + 
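# terminate the scheduler process and wait for it to exit +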
self.scheduler_thread.terminate() + self.scheduler_thread.join() + self.scheduler_thread = None + + + def start_worker(self, name): + args = [ + '--prefix=%s' % settings.PREFIX, + '--cache=%s' % settings.CACHE_ROOT, + '--name=%s' % name, + 'tcp://127.0.0.1:50000', + ] + + worker_thread = WorkerThread(multiprocessing.Queue(), args) + worker_thread.start() + worker_thread.queue.get() + + self.worker_threads[name] = worker_thread + + + def stop_worker(self, name): + if name in self.worker_threads: + self.worker_threads[name].terminate() + self.worker_threads[name].join() + del self.worker_threads[name] + + + def check_worker_status(self, name, active): + start = time() + while Worker.objects.filter(name=name, active=active).count() == 0: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + + +#---------------------------------------------------------- + + +class TestConnection(TestSchedulerBase): + + def test_worker_connection(self): + self.start_scheduler() + + self.assertEqual(Worker.objects.filter(active=True).count(), 0) + + self.start_worker('node1') + + self.check_worker_status('node1', True) + + self.assertEqual(Worker.objects.filter(active=True).count(), 1) + + + def test_worker_disconnection(self): + self.start_scheduler() + self.start_worker('node1') + + self.check_worker_status('node1', True) + + self.worker_threads['node1'].terminate() + self.worker_threads['node1'].join() + del self.worker_threads['node1'] + + self.check_worker_status('node1', False) + + + def test_two_workers_connection(self): + self.start_scheduler() + + self.assertEqual(Worker.objects.filter(active=True).count(), 0) + + self.start_worker('node1') + self.start_worker('node2') + + self.check_worker_status('node1', True) + self.check_worker_status('node2', True) + + self.assertEqual(Worker.objects.filter(active=True).count(), 2) + + + def test_scheduler_last(self): + self.start_worker('node1') + sleep(1) + + self.start_scheduler() + + self.check_worker_status('node1', True) + + +#---------------------------------------------------------- + + +class TestExecution(TestSchedulerBase): + + def setUp(self): + super(TestExecution, self).setUp() + + self.start_scheduler() + self.start_worker('node1') + self.start_worker('node2') + self.check_worker_status('node1', True) + self.check_worker_status('node2', True) + + + def test_successful_experiment(self): + fullname = 'user/user/double/1/double' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + xp.refresh_from_db() + + start = time() + while xp.status != Experiment.DONE: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp.refresh_from_db() + + b1 = xp.blocks.get(name='echo1') + b2 = xp.blocks.get(name='echo2') + b3 = xp.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.DONE) + self.assertEqual(b3.status, Block.DONE) + + + def test_successful_splitted_experiment(self): + fullname = 'user/user/double/1/double_split_2' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + xp.refresh_from_db() + + start = time() + while xp.status != Experiment.DONE: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp.refresh_from_db() + + b1 = xp.blocks.get(name='echo1') + b2 = xp.blocks.get(name='echo2') + b3 = xp.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.DONE) + self.assertEqual(b3.status, Block.DONE) + + + def test_two_similar_experiments(self): 
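+ # two experiments computing identical blocks should share the underlying jobs and report identical dates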
+ fullname1 = 'user/user/double/1/double' + fullname2 = 'user/user/double/1/double_split_2' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + start = time() + while (xp1.status != Experiment.DONE) or (xp2.status != Experiment.DONE): + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp1.refresh_from_db() + xp2.refresh_from_db() + + self.assertEqual(xp1.start_date, xp2.start_date) + self.assertEqual(xp1.end_date, xp2.end_date) + + b1 = xp1.blocks.get(name='echo1') + b2 = xp2.blocks.get(name='echo1') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.DONE) + self.assertEqual(b1.start_date, b2.start_date) + self.assertEqual(b1.end_date, b2.end_date) + + b1 = xp1.blocks.get(name='echo2') + b2 = xp2.blocks.get(name='echo2') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.DONE) + self.assertEqual(b1.start_date, b2.start_date) + self.assertEqual(b1.end_date, b2.end_date) + + b1 = xp1.blocks.get(name='analysis') + b2 = xp2.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.DONE) + self.assertEqual(b1.start_date, b2.start_date) + self.assertEqual(b1.end_date, b2.end_date) + + + def test_two_different_experiments(self): + fullname1 = 'user/user/single/1/single' + fullname2 = 'user/user/single/1/single_add' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + start = time() + while (xp1.status != Experiment.DONE) or (xp2.status != Experiment.DONE): + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp1.refresh_from_db() + xp2.refresh_from_db() + + self.assertNotEqual(xp1.start_date, xp2.start_date) + self.assertNotEqual(xp1.end_date, xp2.end_date) + + + def test_two_delayed_similar_experiments(self): + fullname1 = 'user/user/double/1/double' + fullname2 = 'user/user/double/1/double_split_2' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + + xp1.refresh_from_db() + + start = time() + while xp1.status != Experiment.RUNNING: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp1.refresh_from_db() + + schedule_experiment(xp2) + xp2.refresh_from_db() + + start = time() + while (xp1.status != Experiment.DONE) or (xp2.status != Experiment.DONE): + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp1.refresh_from_db() + xp2.refresh_from_db() + + + def test_failed_experiment(self): + fullname = 'user/user/single/1/single_error' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + xp.refresh_from_db() + + start = time() + while xp.status != Experiment.FAILED: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp.refresh_from_db() + + b1 = xp.blocks.get(name='echo') + b2 = xp.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.FAILED) + self.assertEqual(b2.status, Block.CANCELLED) + + + def test_failed_splitted_experiment(self): + fullname = 'user/user/double/1/double_error_split_2' + + xp = Experiment.objects.get(name=fullname.split('/')[-1]) + + schedule_experiment(xp) + xp.refresh_from_db() + + start = 
time() + while xp.status != Experiment.FAILED: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp.refresh_from_db() + + b1 = xp.blocks.get(name='echo1') + b2 = xp.blocks.get(name='echo2') + b3 = xp.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.FAILED) + self.assertEqual(b3.status, Block.CANCELLED) + + + def test_failed_mirror_experiments(self): + fullname1 = 'user/user/double/1/double_error' + fullname2 = 'user/user/double/1/double_error_split_2' + + xp1 = Experiment.objects.get(name=fullname1.split('/')[-1]) + xp2 = Experiment.objects.get(name=fullname2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + + xp1.refresh_from_db() + xp2.refresh_from_db() + + start = time() + while (xp1.status != Experiment.FAILED) or (xp2.status != Experiment.FAILED): + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp1.refresh_from_db() + xp2.refresh_from_db() + + b1 = xp1.blocks.get(name='echo1') + b2 = xp1.blocks.get(name='echo2') + b3 = xp1.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.FAILED) + self.assertEqual(b3.status, Block.CANCELLED) + + b1 = xp2.blocks.get(name='echo1') + b2 = xp2.blocks.get(name='echo2') + b3 = xp2.blocks.get(name='analysis') + + self.assertEqual(b1.status, Block.DONE) + self.assertEqual(b2.status, Block.FAILED) + self.assertEqual(b3.status, Block.CANCELLED) + + +#---------------------------------------------------------- + + +class TestCancellation(TestSchedulerBase): + + def setUp(self): + super(TestCancellation, self).setUp() + + self.start_scheduler() + self.start_worker('node1') + self.start_worker('node2') + self.check_worker_status('node1', True) + self.check_worker_status('node2', True) + + + def process(self, experiment_name, block_name=None): + xp = Experiment.objects.get(name=experiment_name.split('/')[-1]) + + schedule_experiment(xp) + xp.refresh_from_db() + + start = time() + while xp.status != Experiment.RUNNING: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp.refresh_from_db() + + if block_name is not None: + block = xp.blocks.get(name=block_name) + + start = time() + while block.status != Block.PROCESSING: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + block.refresh_from_db() + + cancel_experiment(xp) + xp.refresh_from_db() + + start = time() + while xp.status != Experiment.PENDING: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp.refresh_from_db() + + self.assertTrue(xp.blocks.filter(status=Block.CANCELLED).count() > 0) + self.assertEqual(xp.blocks.filter(status=Block.PENDING).count(), 0) + self.assertEqual(xp.blocks.filter(status=Block.PROCESSING).count(), 0) + + + def process2(self, experiment_name1, experiment_name2, cancel_index=0): + xp1 = Experiment.objects.get(name=experiment_name1.split('/')[-1]) + xp2 = Experiment.objects.get(name=experiment_name2.split('/')[-1]) + + schedule_experiment(xp1) + schedule_experiment(xp2) + xp1.refresh_from_db() + xp2.refresh_from_db() + + start = time() + while (xp1.status != Experiment.RUNNING) or (xp2.status != Experiment.RUNNING): + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp1.refresh_from_db() + xp2.refresh_from_db() + + if cancel_index == 0: + xp_to_cancel = xp1 + xp_to_finish = xp2 + else: + xp_to_cancel = xp2 + xp_to_finish = xp1 + + cancel_experiment(xp_to_cancel) + + xp_to_cancel.refresh_from_db() + xp_to_finish.refresh_from_db() + + start = time() + while 
xp_to_cancel.status != Experiment.PENDING: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp_to_cancel.refresh_from_db() + + xp_to_finish.refresh_from_db() + + start = time() + while xp_to_finish.status != Experiment.DONE: + self.assertTrue(time() - start < 10) # Fail after 10 seconds + xp_to_finish.refresh_from_db() + + self.assertTrue(xp_to_cancel.blocks.filter(status=Block.CANCELLED).count() > 0) + self.assertEqual(xp_to_cancel.blocks.filter(status=Block.PENDING).count(), 0) + self.assertEqual(xp_to_cancel.blocks.filter(status=Block.PROCESSING).count(), 0) + + self.assertEqual(xp_to_finish.blocks.filter(status=Block.CANCELLED).count(), 0) + self.assertEqual(xp_to_finish.blocks.filter(status=Block.PENDING).count(), 0) + self.assertEqual(xp_to_finish.blocks.filter(status=Block.PROCESSING).count(), 0) + + + def test_one_split_running(self): + self.process('user/user/single/1/single_sleep_4') + + + def test_one_split_of_two_in_a_block_running(self): + self.stop_worker('node2') + self.process('user/user/double/1/double_sleep_split_2', block_name='echo2') + + + def test_two_splits_of_same_block_running(self): + self.process('user/user/double/1/double_sleep_split_2', block_name='echo2') + + + def test_two_splits_of_different_blocks_running(self): + self.process('user/user/triangle/1/triangle_sleep_4') + + + def test_mirror_block(self): + self.stop_worker('node2') + self.process2('user/user/single/1/single_sleep_4', 'user/user/single/1/single_sleep_5', cancel_index=1) + + + def test_mirrored_block(self): + self.stop_worker('node2') + self.process2('user/user/single/1/single_sleep_4', 'user/user/single/1/single_sleep_5', cancel_index=0) diff --git a/beat/web/backend/tests/test_setup.py b/beat/web/backend/tests/test_setup.py new file mode 100755 index 0000000000000000000000000000000000000000..09498cc027380f0818d529f57e17d51f79de5dd0 --- /dev/null +++ b/beat/web/backend/tests/test_setup.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : + +############################################################################### +# # +# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/ # +# Contact: beat.support@idiap.ch # +# # +# This file is part of the beat.web module of the BEAT platform. # +# # +# Commercial License Usage # +# Licensees holding valid commercial BEAT licenses may use this file in # +# accordance with the terms contained in a written agreement between you # +# and Idiap. For further information contact tto@idiap.ch # +# # +# Alternatively, this file may be used under the terms of the GNU Affero # +# Public License version 3 as published by the Free Software and appearing # +# in the file LICENSE.AGPL included in the packaging of this file. # +# The BEAT platform is distributed in the hope that it will be useful, but # +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # +# or FITNESS FOR A PARTICULAR PURPOSE. # +# # +# You should have received a copy of the GNU Affero Public License along # +# with the BEAT platform. If not, see http://www.gnu.org/licenses/. 
+#                                                                             #
+###############################################################################
+
+import time
+import collections
+
+from django.core import management
+
+from .common import BaseBackendTestCase
+
+from ...experiments.models import Experiment
+
+from ..models import Queue
+from ..models import Worker
+from ..models import Slot
+from ..utils import dump_backend
+from ..utils import setup_backend
+from ..management.commands import qsetup
+
+
+
+# Example configuration with 3 queues with increasing amounts of resources,
+# all running on the same host
+QUEUES_WITHOUT_PRIORITY = {
+    "queues": collections.OrderedDict([
+        ("q1", {
+            "memory-limit": 4*1024,
+            "time-limit": 180, #3 hours
+            "cores-per-slot": 1,
+            "max-slots-per-user": 4,
+            "environments": ['Python 2.7 (1.1.0)'],
+            "groups": [
+                "Default",
+            ],
+            "slots": {
+                "node1": {
+                    "quantity": 4,
+                    "priority": 0
+                }
+            }
+        }),
+        ("q2", {
+            "memory-limit": 8*1024,
+            "time-limit": 360, #6 hours
+            "cores-per-slot": 2,
+            "max-slots-per-user": 2,
+            "environments": ['Python 2.7 (1.1.0)'],
+            "groups": [
+                "Default",
+            ],
+            "slots": {
+                "node1": {
+                    "quantity": 2,
+                    "priority": 0
+                },
+            }
+        }),
+        ("q4", {
+            "memory-limit": 16*1024,
+            "time-limit": 720, #12 hours
+            "cores-per-slot": 4,
+            "max-slots-per-user": 1,
+            "environments": ['Python 2.7 (1.1.0)'],
+            "groups": [
+                "Default",
+            ],
+            "slots": {
+                "node1": {
+                    "quantity": 1,
+                    "priority": 0
+                },
+            }
+        }),
+    ]),
+    "workers": {
+        "node1": {
+            "cores": 4,
+            "memory": 16*1024,
+        }
+    },
+    "environments": {
+        'Python 2.7 (1.1.0)': {
+            "name": 'Python 2.7',
+            "version": '1.1.0',
+            "short_description": "Test",
+            "description": "Test environment",
+            "languages": "python",
+        },
+    },
+}
+
+# Example configuration with 3 queues sharing slots on 2 hosts
+PRIORITY_QUEUES = {
+    "queues": collections.OrderedDict([
+        ("q1", {
+            "memory-limit": 4*1024,
+            "time-limit": 180, #3 hours
+            "cores-per-slot": 1,
+            "max-slots-per-user": 2,
+            "environments": ['Python 2.7 (1.1.0)'],
+            "groups": [
+                "Default",
+            ],
+            "slots": {
+                "node1": {
+                    "quantity": 4,
+                    "priority": 5
+                },
+                "node2": {
+                    "quantity": 4,
+                    "priority": 0
+                },
+            }
+        }),
+        ("q2", {
+            "memory-limit": 8*1024,
+            "time-limit": 360, #6 hours
+            "cores-per-slot": 2,
+            "max-slots-per-user": 1,
+            "environments": ['Python 2.7 (1.1.0)'],
+            "groups": [
+                "Default",
+            ],
+            "slots": {
+                "node1": {
+                    "quantity": 2,
+                    "priority": 0
+                },
+                "node2": {
+                    "quantity": 2,
+                    "priority": 10
+                }
+            }
+        }),
+        ("q1_special", {
+            "memory-limit": 4*1024,
+            "time-limit": 180, #3 hours
+            "cores-per-slot": 1,
+            "max-slots-per-user": 8,
+            "environments": ['Python 2.7 (1.1.0)'],
+            "groups": [
+                "Default",
+            ],
+            "slots": {
+                "node1": {
+                    "quantity": 4,
+                    "priority": 0
+                },
+                "node2": {
+                    "quantity": 4,
+                    "priority": 5
+                }
+            }
+        }),
+    ]),
+    "workers": collections.OrderedDict([
+        ("node1", {
+            "cores": 4,
+            "memory": 32*1024,
+        }),
+        ("node2", {
+            "cores": 4,
+            "memory": 16*1024,
+        }),
+    ]),
+    "environments": {
+        'Python 2.7 (1.1.0)': {
+            "name": 'Python 2.7',
+            "version": '1.1.0',
+            "short_description": "Test",
+            "description": "Test environment",
+            "languages": "python",
+        },
+    },
+}
+
+
+
+class BackendSetup(BaseBackendTestCase):
+
+    def check_default_config(self):
+
+        # checks that everything is there
+        self.assertEqual(dump_backend(), qsetup.DEFAULT_CONFIGURATION)
+
+        worker = Worker.objects.get()
+        queue = Queue.objects.get()
+        Worker.objects.update(active=True)
+
+        self.assertEqual(worker.available_cores(), qsetup.CORES)
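+
+        # the default setup binds every slot of the single queue to the
+        # single declared worker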
+        self.assertEqual(list(worker.slots.values_list('id', flat=True)),
+                         list(queue.slots.values_list('id', flat=True)))
+
+        # worker has no job splits assigned to it
+        self.assertEqual(worker.splits.count(), 0)
+
+        self.assertEqual(queue.availability(), qsetup.CORES)
+        self.assertEqual(queue.number_of_slots(), qsetup.CORES)
+        self.assertEqual(queue.worker_availability(), [worker])
+
+        # checks the single slot and priority
+        slot = queue.slots.get()
+        self.assertEqual(slot.quantity, qsetup.CORES)
+        self.assertEqual(slot.priority, 0)
+        self.assertEqual(slot.worker, worker)
+
+        # checks no orphan slots exist
+        self.assertEqual(Slot.objects.filter(queue=None).count(), 0)
+        self.assertEqual(Slot.objects.filter(worker=None).count(), 0)
+
+
+    def test_setup(self):
+
+        self.check_default_config()
+
+
+    def test_cmd_reset(self):
+
+        # reinstalls the default configuration via the management command
+        management.call_command('qsetup', verbosity=0, reset=True)
+        self.check_default_config()
+
+
+    def check_noprior_config(self):
+
+        qs = Queue.objects.all()
+
+        self.assertEqual(qs.count(), 3)
+
+        q1, q2, q3 = qs.order_by('name')
+
+        self.assertEqual(q1.name, 'q1')
+        self.assertEqual(q2.name, 'q2')
+        self.assertEqual(q3.name, 'q4')
+        self.assertEqual(q1.splits().count(), 0)
+        self.assertEqual(q2.splits().count(), 0)
+        self.assertEqual(q3.splits().count(), 0)
+
+        self.assertEqual(q1.number_of_slots(), 4)
+        self.assertEqual(q2.number_of_slots(), 2)
+        self.assertEqual(q3.number_of_slots(), 1)
+        self.assertEqual(q1.availability(), 4)
+        self.assertEqual(q2.availability(), 2)
+        self.assertEqual(q3.availability(), 1)
+        self.assertEqual(q1.environments.count(), 1)
+        self.assertEqual(q2.environments.count(), 1)
+        self.assertEqual(q3.environments.count(), 1)
+
+        self.assertEqual(q1.environments.first(), q2.environments.first())
+        self.assertEqual(q2.environments.first(), q3.environments.first())
+
+        env = q1.environments.first()
+
+        self.assertEqual(env.name, 'Python 2.7')
+        self.assertEqual(env.version, '1.1.0')
+
+        self.assertEqual(q1.slots.count(), 1)
+        self.assertEqual(q2.slots.count(), 1)
+        self.assertEqual(q3.slots.count(), 1)
+
+        slot1 = q1.slots.first()
+        slot2 = q2.slots.first()
+        slot3 = q3.slots.first()
+
+        self.assertEqual(slot1.quantity, 4)
+        self.assertEqual(slot1.priority, 0)
+        self.assertEqual(slot1.queue, q1)
+        self.assertEqual(slot2.quantity, 2)
+        self.assertEqual(slot2.priority, 0)
+        self.assertEqual(slot2.queue, q2)
+        self.assertEqual(slot3.quantity, 1)
+        self.assertEqual(slot3.priority, 0)
+        self.assertEqual(slot3.queue, q3)
+
+        worker1 = slot1.worker
+        worker2 = slot2.worker
+        worker3 = slot3.worker
+
+        self.assertEqual(worker1, worker2)
+        self.assertEqual(worker2, worker3)
+
+        self.assertEqual(worker1.name, 'node1')
+        self.assertEqual(list(worker1.splits.all()), [])
+        self.assertEqual(worker1.memory, 16*1024)
+        self.assertEqual(worker1.cores, 4)
+        self.assertEqual(worker1.available_cores(), 4)
+
+        self.assertEqual(worker1.slots.count(), 3)
+
+        self.assertEqual(set(worker1.slots.all()),
+                         set(list(q1.slots.all()) + list(q2.slots.all()) + \
+                             list(q3.slots.all())))
+
+        avail1 = q1.worker_availability()
+        self.assertEqual(avail1, [worker1])
+
+        avail2 = q2.worker_availability()
+        self.assertEqual(avail2, [worker1])
+
+        avail3 = q3.worker_availability()
+        self.assertEqual(avail3, [worker1])
+
+        # checks no orphan slots exist
+        self.assertEqual(Slot.objects.filter(queue=None).count(), 0)
+        self.assertEqual(Slot.objects.filter(worker=None).count(), 0)
+
+
+    def test_reconfigure_noprior(self):
+
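+        # reconfigure the backend with the priority-less setup and make sure
+        # the new configuration took effect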
+        setup_backend(QUEUES_WITHOUT_PRIORITY)
+        Worker.objects.update(active=True)
+        self.check_noprior_config()
+
+
+    def test_reconfigure_fail_qenv_used(self):
+
+        fullname = 'user/user/single/1/single'
+        xp = Experiment.objects.get(name=fullname.split('/')[-1])
+
+        # schedules the experiment and checks it
+        xp.schedule()
+        self.check_single(xp)
+
+        try:
+            setup_backend(QUEUES_WITHOUT_PRIORITY)
+        except RuntimeError as e:
+            assert str(e).find('on the following queue/environment combinations') != -1
+        else:
+            assert False, 'Queue re-configuration worked with q/env in use'
+
+
+    def check_prior_config(self):
+
+        qs = Queue.objects.all()
+
+        self.assertEqual(qs.count(), 3)
+
+        q1, q1_special, q2 = qs.order_by('name')
+
+        self.assertEqual(q1.name, 'q1')
+        self.assertEqual(q2.name, 'q2')
+        self.assertEqual(q1_special.name, 'q1_special')
+        self.assertEqual(q1.splits().count(), 0)
+        self.assertEqual(q2.splits().count(), 0)
+        self.assertEqual(q1_special.splits().count(), 0)
+
+        self.assertEqual(q1.number_of_slots(), 8)
+        self.assertEqual(q2.number_of_slots(), 4)
+        self.assertEqual(q1_special.number_of_slots(), 8)
+        self.assertEqual(q1.availability(), 8)
+        self.assertEqual(q2.availability(), 4)
+        self.assertEqual(q1_special.availability(), 8)
+        self.assertEqual(q1.environments.count(), 1)
+        self.assertEqual(q2.environments.count(), 1)
+        self.assertEqual(q1_special.environments.count(), 1)
+
+        self.assertEqual(q1.environments.first(), q2.environments.first())
+        self.assertEqual(q2.environments.first(),
+                         q1_special.environments.first())
+
+        env = q1.environments.first()
+
+        self.assertEqual(env.name, 'Python 2.7')
+        self.assertEqual(env.version, '1.1.0')
+
+        self.assertEqual(q1.slots.count(), 2)
+        self.assertEqual(q1_special.slots.count(), 2)
+        self.assertEqual(q2.slots.count(), 2)
+
+        slot11, slot12 = q1.slots.all()
+        slot1_special1, slot1_special2 = q1_special.slots.all()
+        slot21, slot22 = q2.slots.all()
+
+        self.assertEqual(slot11.quantity, 4)
+        self.assertEqual(slot11.priority, 5)
+        self.assertEqual(slot12.quantity, 4)
+        self.assertEqual(slot12.priority, 0)
+        self.assertEqual(slot11.queue, q1)
+        self.assertEqual(slot12.queue, q1)
+
+        self.assertEqual(slot21.quantity, 2)
+        self.assertEqual(slot21.priority, 0)
+        self.assertEqual(slot22.quantity, 2)
+        self.assertEqual(slot22.priority, 10)
+        self.assertEqual(slot21.queue, q2)
+        self.assertEqual(slot22.queue, q2)
+
+        self.assertEqual(slot1_special1.quantity, 4)
+        self.assertEqual(slot1_special1.priority, 0)
+        self.assertEqual(slot1_special2.quantity, 4)
+        self.assertEqual(slot1_special2.priority, 5)
+        self.assertEqual(slot1_special1.queue, q1_special)
+        self.assertEqual(slot1_special2.queue, q1_special)
+
+        worker1 = slot11.worker
+        worker2 = slot12.worker
+        worker21 = slot21.worker
+        worker22 = slot22.worker
+        worker1_special1 = slot1_special1.worker
+        worker1_special2 = slot1_special2.worker
+
+        self.assertEqual(worker1, worker21)
+        self.assertEqual(worker1, worker1_special1)
+        self.assertEqual(worker2, worker22)
+        self.assertEqual(worker2, worker1_special2)
+
+        self.assertEqual(worker1.name, 'node1')
+        self.assertEqual(worker1.splits.count(), 0)
+        self.assertEqual(worker1.memory, 32*1024)
+        self.assertEqual(worker1.cores, 4)
+        self.assertEqual(worker1.available_cores(), 4)
+
+        self.assertEqual(worker2.name, 'node2')
+        self.assertEqual(worker2.splits.count(), 0)
+        self.assertEqual(worker2.memory, 16*1024)
+        self.assertEqual(worker2.cores, 4)
+        self.assertEqual(worker2.available_cores(), 4)
+
+        self.assertEqual(worker1.slots.count(), 3)
+        self.assertEqual(worker2.slots.count(), 3)
+
+        avail1 = q1.worker_availability()
+        self.assertEqual(avail1, [worker1, worker2])
+
+        avail2 = q2.worker_availability()
+        self.assertEqual(avail2, [worker2, worker1])
+
+        avail1_special = q1_special.worker_availability()
+        self.assertEqual(avail1_special, [worker2, worker1])
+
+        # checks no orphan slots exist
+        self.assertEqual(Slot.objects.filter(queue=None).count(), 0)
+        self.assertEqual(Slot.objects.filter(worker=None).count(), 0)
+
+
+    def test_reconfigure_priors(self):
+
+        setup_backend(PRIORITY_QUEUES)
+        Worker.objects.update(active=True)
+        self.check_prior_config()
diff --git a/beat/web/backend/utils.py b/beat/web/backend/utils.py
index 89049f18e691cee04071e7ae94df8fac47712d14..8822e0080229446f169117856ffd38ab7da778ad 100755
--- a/beat/web/backend/utils.py
+++ b/beat/web/backend/utils.py
@@ -46,8 +46,16 @@ from guardian.shortcuts import assign_perm
 
 from ..code.models import Code
 from ..common.models import Shareable
-from ..experiments.models import CachedFile, Block, Experiment
-from .models import Queue, Worker, Job, Environment, EnvironmentLanguage, Slot
+from ..experiments.models import CachedFile
+from ..experiments.models import Block
+from ..experiments.models import Experiment
+from .models import Queue
+from .models import Worker
+from .models import Job
+from .models import JobSplit
+from .models import Environment
+from .models import EnvironmentLanguage
+from .models import Slot
 
 
 def cleanup_cache(path, age_in_minutes=0, delete=False):
@@ -174,9 +182,9 @@ def setup_backend(d):
     delete_q_envs = q_envs.difference(config_q_envs)
 
     # 4. We figure out which combinations of queue/environment's are currently
-    # used by running or queued jobs.
+    # used by queued jobs.
     used_q_envs = set([(job.block.queue.name, str(job.block.environment)) \
-        for job in Job.objects.filter(status__in=(Job.PROCESSING, Job.QUEUED))])
+        for job in Job.objects.filter(start_date__isnull=True)])
 
     # 5. We request that no jobs should be either executing or scheduled for
     # execution on queue/environment combinations that need to be deleted.
diff --git a/beat/web/experiments/models/block.py b/beat/web/experiments/models/block.py
index 548d2ef7ca0c649283b5c3903bdd422377c166d5..adb363bbb5f8c5f37d999edcac8bf2d45cc554ab 100755
--- a/beat/web/experiments/models/block.py
+++ b/beat/web/experiments/models/block.py
@@ -43,6 +43,7 @@ from .result import Result
 
 import os
 import simplejson
+from datetime import datetime
 
 
 #----------------------------------------------------------
@@ -127,6 +128,7 @@ class Block(models.Model):
     def __str__(self):
         return self.experiment.fullname() + ', ' + self.name + ' (%s)' % self.get_status_display()
 
+
     def natural_key(self):
         return (
             self.name,
@@ -135,61 +137,103 @@ class Block(models.Model):
             self.experiment.toolchain.name,
             self.experiment.toolchain.version,
             self.experiment.name,
-            )
+        )
 
-    # Accessors for statistics
+    def save(self, *args, **kwargs):
+        # Keep the state of the block consistent; we expect the caller to set
+        # it up properly, this is only a safety net
+        if self.status == Block.PENDING:
+            try:
+                self.results.all().delete()
+            except AttributeError:
+                # no cached file (and therefore no results) exists yet
+                pass
+
+            self.start_date = None
+            self.end_date = None
+
+        elif self.status == Block.PROCESSING:
+            if self.start_date is None:
+                self.start_date = datetime.now()
+
+            self.end_date = None
+
+        else:
+            if self.end_date is None:
+                self.end_date = datetime.now()
+
+        super(Block, self).save(*args, **kwargs)
+
+
+    # Accessors for statistics
     def __return_first__(self, field, default=None):
         return getattr(self.outputs.first(), field, default)
 
+
     def first_cache(self):
         return self.outputs.first()
 
+
     def error_report(self):
         return self.__return_first__('error_report')
 
+
     def stdout(self):
         return self.__return_first__('stdout')
 
+
     def stderr(self):
         return self.__return_first__('stderr')
 
+
     def speed_up_real(self):
         return self.__return_first__('speed_up_real')
 
+
     def speed_up_maximal(self):
         return self.__return_first__('speed_up_maximal')
 
+
     def linear_execution_time(self):
         return self.__return_first__('linear_execution_time')
 
+
     def queuing_time(self):
         return self.__return_first__('queuing_time')
 
+
     def cpu_time(self):
         return self.__return_first__('cpu_time')
 
+
     def max_memory(self):
         return self.__return_first__('max_memory')
 
+
     def data_read_size(self):
         return self.__return_first__('data_read_size')
 
+
     def data_read_nb_blocks(self):
         return self.__return_first__('data_read_nb_blocks')
 
+
     def data_read_time(self):
         return self.__return_first__('data_read_time')
 
+
     def data_written_size(self):
         return self.__return_first__('data_written_size')
 
+
     def data_written_nb_blocks(self):
         return self.__return_first__('data_written_nb_blocks')
 
+
     def data_written_time(self):
         return self.__return_first__('data_written_time')
 
+
     # Accessor for results
     results = property(lambda self: self.__return_first__('results'))
 
@@ -200,164 +244,6 @@ class Block(models.Model):
         return self.status not in (Block.PENDING, Block.PROCESSING)
 
 
-    def _cancel(self):
-        '''Cancels the execution of this block on the backend.
-
-        This method should only be called from the experiment equivalent. It is
-        not part of the Block's public API.
-        '''
-
-        # lock self - avoids concurrent update from scheduler/worker subsystem
-        self_ = Block.objects.select_for_update().get(pk=self.pk)
-
-        if self_.done(): return
-
-        if hasattr(self, 'job'):
-            self.job._cancel()
-        else:
-            self.status = Block.CANCELLED
-            self.save()
-            self.experiment._update_state()
-
-
     def is_runnable(self):
         '''Checks if a block is runnable presently'''
-
-        return all([k.status == Block.DONE \
-            for k in self.dependencies.all()]) and \
-            (hasattr(self, 'job') and self.job.parent is None)
-
-
-    def _cascade_updates(self):
-        '''Cascade updates to blocks once I'm done.
-        '''
-
-        for b in self.dependents.all():
-            if any([k.status in (Block.FAILED, Block.CANCELLED) \
-                for k in b.dependencies.all()]):
-                b._cancel()
-            if b.is_runnable(): b.job._make_runnable()
-
-        # Update eventual running siblings in case of a failure
-        if self.status == Block.FAILED:
-            for b in Block.objects.filter(experiment=self.experiment,
-                status=Block.PROCESSING):
-                b._cancel()
-
-
-    def _update_state(self, timings=None):
-        '''Updates self state as a result of backend running
-
-
-        Parameters:
-
-          timings (dict, Optional): A dictionary containing key-value pairs
-          corresponding to:
-
-            * queuing time (in seconds)
-            * sequential execution time (in seconds)
-            * real speed-up obtained
-            * maximum speed-up obtainable
-
-
-        This method is supposed to be called only by the underlying job
-        instance. It is not part of the Block's public API.
-
-        '''
-
-        # lock self - avoids concurrent update from scheduler/worker subsystem
-        self_ = Block.objects.select_for_update().get(pk=self.pk)
-
-        if self_.done(): return
-
-        if self.start_date is None:
-            self.start_date = self.job.start_date
-
-        if self.job.result:
-
-            statistics = self.job.result.stats
-
-            result_stdout = self.job.result.stdout
-            result_stderr = self.job.result.stderr
-
-            if result_stdout == '\n':
-                result_stdout = ''
-
-            if result_stderr == '\n':
-                result_stderr = ''
-
-            info = dict(
-                cpu_time = statistics.cpu['user'] + statistics.cpu['system'],
-                max_memory = statistics.memory['rss'],
-                stdout = result_stdout,
-                stderr = result_stderr,
-                error_report = self.job.result.usrerr,
-            )
-
-            if 'volume' in statistics.data:
-                info['data_read_size'] = statistics.data['volume'].get('read', 0)
-                info['data_written_size'] = statistics.data['volume'].get('write', 0)
-
-            if 'blocks' in statistics.data:
-                info['data_read_nb_blocks'] = statistics.data['blocks'].get('read', 0)
-                info['data_written_nb_blocks'] = statistics.data['blocks'].get('write', 0)
-
-            if 'time' in statistics.data:
-                info['data_read_time'] = statistics.data['time'].get('read', 0)
-                info['data_written_time'] = statistics.data['time'].get('write', 0)
-
-
-            if timings:
-                info.update(dict(
-                    queuing_time = timings['queuing'],
-                    linear_execution_time = timings['linear_execution'],
-                    speed_up_real = timings['speed_up_real'],
-                    speed_up_maximal = timings['speed_up_maximal'],
-                ))
-
-            self.outputs.update(**info)
-
-        if self.job.status == Job.SKIPPED:
-            self.status = Block.DONE
-        else:
-            self.status = self.job.status
-
-        if self.job.done():
-            self.end_date = self.job.end_date
-            r = self.job.result
-            self.job.delete()
-            if r: r.delete()
-
-        # Update the associated cached files
-        for cached_file in self.outputs.all():
-            cached_file.update(self.status)
-
-        # Loads Results from cache
-        if self.job.result and self.analyzer and self.status == Block.DONE:
-            cache = self.first_cache()
-            data_source = beat.core.data.CachedDataSource()
-            data_source.setup(os.path.join(settings.CACHE_ROOT,
-                beat.core.hash.toPath(cache.hash)), settings.PREFIX)
-            output_data = data_source.next()[0]
-            if output_data is not None:
-                algorithm = beat.core.algorithm.Algorithm(settings.PREFIX,
-                    self.algorithm.fullname())
-                for field, value in output_data.as_dict().items():
-                    res, _ = Result.objects.get_or_create(name=field,
-                        cache=cache)
-                    res.primary = algorithm.results[field]['display']
-                    res.type = algorithm.results[field]["type"]
-
-                    if res.type in ['int32', 'float32', 'bool', 'string']:
-                        res.data_value = str(value)
-                    else:
-                        res.data_value = simplejson.dumps(value, indent=4,
-                            cls=NumpyJSONEncoder)
-
-                    res.save()
-
-            data_source.close()
-
-        self.save()
-        self._cascade_updates()
-        self.experiment._update_state()
+        return all([ k.status == Block.DONE for k in self.dependencies.all() ])
diff --git a/beat/web/experiments/models/experiment.py b/beat/web/experiments/models/experiment.py
index cf441b6104890666b28090bd74445f559c914583..b22bb5ba624d702f24c6cdf75af467eb04e7ef59 100755
--- a/beat/web/experiments/models/experiment.py
+++ b/beat/web/experiments/models/experiment.py
@@ -26,7 +26,6 @@
 ###############################################################################
 
 from django.db import models
-from django.db import transaction
 from django.contrib.auth.models import User
 from django.core.urlresolvers import reverse
 from django.conf import settings
@@ -159,12 +158,15 @@ class ExperimentManager(ContributionManager):
             name=name
         )
 
+
     def from_author(self, user, author_name, add_public=False):
         return super(ExperimentManager, self).from_author(user, author_name, add_public).order_by('-creation_date', 'name')
 
+
     def from_author_and_public(self, user, author_name):
         return super(ExperimentManager, self).from_author_and_public(user, author_name).order_by('-creation_date', 'name')
 
+
     def create_experiment(self, author, toolchain, name, declaration, short_description='', description=''):
         """Creates a new experiment in pending state"""
 
@@ -216,7 +218,7 @@ class Experiment(Shareable):
     RUNNING = 'R'
     DONE = 'D'
     FAILED = 'F'
-    CANCELING = 'C'
+    CANCELLING = 'C'
 
     STATUS = (
         (PENDING, 'Pending'),
@@ -224,7 +226,7 @@ class Experiment(Shareable):
         (RUNNING, 'Running'),
         (DONE, 'Done'),
         (FAILED, 'Failed'),
-        (CANCELING, 'Canceling'),
+        (CANCELLING, 'Cancelling'),
     )
 
 
@@ -605,7 +607,7 @@ class Experiment(Shareable):
     #_____ Methods __________
 
     def is_busy(self):
-        return self.status in [Experiment.PENDING, Experiment.SCHEDULED, Experiment.CANCELING]
+        return self.status in [Experiment.PENDING, Experiment.SCHEDULED, Experiment.CANCELLING]
 
 
     def is_done(self):
@@ -765,58 +767,6 @@ class Experiment(Shareable):
         return storage.get_file_content(self, 'declaration_file')
 
 
-    def _update_state(self):
-        '''Update self state based on associated block states
-
-        This method is called by the underlying block. It is not part of the
-        Experiment's public API and must not be called by any other user code.
-        '''
-
-        self_ = Experiment.objects.select_for_update().get(pk=self.pk)
-
-        if self_.is_done(): return
-
-        if self.start_date is None:
-            d = self.blocks.filter(start_date__isnull=False).\
-                order_by('start_date')
-            if d:
-                self.start_date = d.first().start_date
-            else:
-                self.start_date = datetime.now()
-
-        block_statuses = self.blocks.values_list('status', flat=True)
-
-        # Process main state and state from job results
-        if Block.FAILED in block_statuses or Block.CANCELLED in block_statuses:
-            if Block.PROCESSING in block_statuses:
-                self.status = Experiment.CANCELING
-            else:
-                self.status = Experiment.FAILED
-
-        elif (Block.PROCESSING in block_statuses) or \
-            ((Block.PENDING in block_statuses) and \
-            (Block.DONE in block_statuses)):
-            self.status = Experiment.RUNNING
-
-        elif Block.PENDING not in block_statuses:
-            self.status = Experiment.DONE
-
-        else:
-            self.status = Experiment.SCHEDULED
-
-        # Set end date if experiment is done
-        if self.is_done() and self.end_date is None:
-            d = self.blocks.filter(end_date__isnull=False).\
-                order_by('-end_date')
-            if d:
-                self.end_date = d.first().end_date
-            else:
-                self.end_date = datetime.now()
-
-
-        self.save()
-
-
     def schedule(self):
         '''Schedules this experiment for execution at the backend'''
 
@@ -824,26 +774,11 @@ class Experiment(Shareable):
         schedule_experiment(self)
 
 
-    @transaction.atomic
     def cancel(self):
-        '''Cancels the execution of this experiment on the backend.
-
-        .. caution::
-
-          After each block is scheduled, it is possible some or all splits for
-          a given block are under execution. We must select-for-update all
-          Blocks and associated Jobs, as to avoid concurrent resetting from a
-          separate scheduling process.
-
-        '''
-
-        self_ = Experiment.objects.get(pk=self.pk)
-
-        if self_.status not in (Experiment.SCHEDULED, Experiment.RUNNING):
-            return
+        '''Cancels the execution of this experiment on the backend.'''
 
-        with transaction.atomic():
-            for b in self.blocks.all(): b._cancel()
+        from ...backend.helpers import cancel_experiment
+        cancel_experiment(self)
 
 
     def fork(self, username=None, name=None):
diff --git a/beat/web/experiments/serializers.py b/beat/web/experiments/serializers.py
index deb9c6709b7c6ff8d80e4132c3dddba3b21316bc..2709c2142bab8fa68a6f45286c314912c3f6fe07 100755
--- a/beat/web/experiments/serializers.py
+++ b/beat/web/experiments/serializers.py
@@ -217,7 +217,7 @@ class ExperimentResultsSerializer(ShareableSerializer):
                   'execution_info', 'execution_order']
 
     def get_started(self, obj):
-        return obj.status not in [Experiment.PENDING, Experiment.SCHEDULED, Experiment.CANCELING]
+        return obj.status not in [Experiment.PENDING, Experiment.SCHEDULED, Experiment.CANCELLING]
 
     def get_done(self, obj):
         return obj.status in [Experiment.DONE, Experiment.FAILED]
diff --git a/beat/web/experiments/signals.py b/beat/web/experiments/signals.py
old mode 100644
new mode 100755
index aa4967670a99ab9fe097d016a3cf73d957e2e1d8..e05055c70e8db1f42e6c438ee17aa62e7ed79f62
--- a/beat/web/experiments/signals.py
+++ b/beat/web/experiments/signals.py
@@ -70,33 +70,6 @@ def auto_delete_file_on_change(sender, instance, **kwargs):
         old_descr.delete(save=False)
 
 
-@receiver(models.signals.pre_save, sender=Block)
-def log_dates_on_state_change(sender, instance, **kwargs):
-    """Logs the relevant dates upon the setting of a block's state"""
-
-    if not instance.pk: #object creation, we may ignore
-        return False
-
-    old_block = Block.objects.filter(pk=instance.pk)
-
-    if not old_block: #restoring from archive, ignore
-        return False
-
-    old_status = old_block[0].status
-    new_status = instance.status
-
-    if old_status != new_status: #status has changed
-
-        if new_status == Block.PENDING: #admin reset
-            instance.results.all().delete()
-
-        elif new_status == Block.PROCESSING: #started to process
-            instance.start_date = datetime.now()
-
-        else: #finishing as FAILED or CACHED
-            instance.end_date = datetime.now()
-
-
 #_________ Algorithms _________
 
 
 def build_user_algorithm_set(user):
diff --git a/beat/web/scripts/scheduler.py b/beat/web/scripts/scheduler.py
old mode 100644
new mode 100755
index f8a346f6b80e6f1aee65f4d08a725e1b7fe6bcf5..b518685dcd066a65ae78792bdcddef0f2901456d
--- a/beat/web/scripts/scheduler.py
+++ b/beat/web/scripts/scheduler.py
@@ -30,23 +30,27 @@ Starts the scheduling process.
 
 Usage:
-  %(prog)s [-v ... | --verbose ...] [--settings=<file>] [--period=<seconds>]
+  %(prog)s [-v ... | --verbose ...] [--settings=<file>] [--interval=<seconds>]
+           [--address=<address>] [--port=<port>]
   %(prog)s (-h | --help)
   %(prog)s (-V | --version)
 
 
 Options:
-  -h, --help                        Show this help message
-  -V, --version                     Show program's version number
-  -v, --verbose                     Increases the output verbosity level
-  -S <file>, --settings=<file>      The module name to the Django settings
-                                    file [default: beat.web.settings.settings]
-  -p <seconds, --period=<seconds>   The time, in seconds, in which this
-                                    scheduler will try to allocate job splits
-                                    to existing workers. If not set, use the
-                                    value available on the Django settings
-                                    file, at the variable
-                                    `SCHEDULING_INTERVAL`.
+  -h, --help                          Show this help message
+  -V, --version                       Show program's version number
+  -v, --verbose                       Increases the output verbosity level
+  -S <file>, --settings=<file>        The module name of the Django settings
+                                      file [default: beat.web.settings.settings]
+  -i <seconds>, --interval=<seconds>  The interval, in seconds, at which this
+                                      scheduler tries to allocate job splits
+                                      to existing workers. If not set, the
+                                      value of the `SCHEDULING_INTERVAL`
+                                      variable in the Django settings file is
+                                      used.
+  -a <address>, --address=<address>   The address the processing nodes must
+                                      connect to
+  -p <port>, --port=<port>            The port the processing nodes must
+                                      connect to
 
 
 Examples:
@@ -63,33 +67,111 @@ Examples:
 
 import os
 import sys
-import time
 import signal
 import docopt
 import logging
+import simplejson
+
+from ..version import __version__
+
+from beat.core.worker import WorkerController
+
+from ..backend.models import JobSplit
+from ..backend.helpers import split_new_jobs
+from ..backend.helpers import process_newly_cancelled_experiments
+from ..backend.helpers import assign_splits_to_workers
+from ..backend.helpers import get_configuration_for_split
+from ..backend.helpers import on_split_started
+from ..backend.helpers import on_split_done
+from ..backend.helpers import on_split_fail
+from ..backend.helpers import on_split_cancelled
+
+
+logger = None
+
+
+#----------------------------------------------------------
+
+
+def onWorkerReady(name):
+    from ..backend.models import Worker
+
+    logger.info("Worker '%s' is ready", name)
+
+    try:
+        worker = Worker.objects.get(name=name)
+        worker.active = True
+        worker.save()
+    except Worker.DoesNotExist:
+        logger.error("No worker named '%s' found in the database", name)
+
+
+#----------------------------------------------------------
+
+
+def onWorkerGone(name):
+    from ..backend.models import Worker
+
+    logger.info("Worker '%s' is gone", name)
+
+    try:
+        worker = Worker.objects.get(name=name)
+        worker.active = False
+        worker.save()
+    except Worker.DoesNotExist:
+        logger.error("No worker named '%s' found in the database", name)
+
+
+#----------------------------------------------------------
+
+
+def remove_split_id_from(split_list, split_id):
+    # Removes the given split identifier from the list; does nothing when it
+    # isn't there
+    try:
+        split_list.remove(split_id)
+    except ValueError:
+        pass
+
+
+#----------------------------------------------------------
+
 
 stop = False
 
 def main(user_input=None):
 
+    # Parse the command-line arguments
+    if user_input is not None:
+        arguments = user_input
+    else:
+        arguments = sys.argv[1:]
+
     arguments = docopt.docopt(
         __doc__ % dict(
             prog=os.path.basename(sys.argv[0]),
         ),
+        argv=arguments,
+        version='v%s' % __version__,
     )
 
+    # Initialisation of the application
     os.environ.setdefault('DJANGO_SETTINGS_MODULE', arguments['--settings'])
     from django.conf import settings
     from django import setup
     setup()
 
-    logger = logging.getLogger('beat.web')
-    if arguments['--verbose'] == 1: logger.setLevel(logging.INFO)
-    elif arguments['--verbose'] >= 2: logger.setLevel(logging.DEBUG)
+    global logger
+    logger = logging.getLogger('beat.scheduler')
+    if arguments['--verbose'] == 1:
+        logger.setLevel(logging.INFO)
+    elif arguments['--verbose'] >= 2:
+        logger.setLevel(logging.DEBUG)
 
-    # installs SIGTERM handler
+
+    # Installs SIGTERM handler
     def handler(signum, frame):
-        #ignore further signals
+        # Ignore further signals
         signal.signal(signal.SIGTERM, signal.SIG_IGN)
         signal.signal(signal.SIGINT, signal.SIG_IGN)
 
@@ -100,20 +182,151 @@ def main(user_input=None):
     signal.signal(signal.SIGTERM, handler)
     signal.signal(signal.SIGINT, handler)
 
-    from ..backend import schedule
 
-    timing = int(arguments['--period']) \
-        if arguments['--period'] else settings.SCHEDULING_INTERVAL
-    logger.info("Scheduling every %d seconds", timing)
+    # Initialisation of the worker controller
+    # TODO: Default values
+    worker_controller = WorkerController(
+        arguments['--address'],
+        int(arguments['--port']),
+        callbacks=dict(
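+            # keep each worker's 'active' flag in the database in sync with
+            # its connection state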
+            onWorkerReady = onWorkerReady,
+            onWorkerGone = onWorkerGone,
+        )
+    )
+
+
+    # Processing loop
+    interval = int(arguments['--interval']) \
+               if arguments['--interval'] else settings.SCHEDULING_INTERVAL
+    logger.info("Scheduling every %d seconds", interval)
+
+    running_job_splits = []
+    cancelling_jobs = []
 
     global stop
     while not stop:
-
-        start = time.time()
         logger.debug("Starting scheduler cycle...")
-        schedule.schedule()
-        duration = time.time() - start
-        if duration < timing:
-            time.sleep(timing - duration)
 
+        # Process all the incoming messages
+        splits_to_cancel = []
+
+        while True:
+            # Wait for a message
+            message = worker_controller.process(interval * 1000)
+            if message is None:
+                break
+
+            (address, status, split_id, data) = message
+
+            # Was there an error that can't be tied to a job split?
+            if status == WorkerController.ERROR:
+                if split_id is None:
+                    if data != "Worker isn't busy":
+                        logger.error("Worker '%s' sent: %s", address, data)
+                    continue
+
+            split_id = int(split_id)
+
+            # Retrieve the job split
+            try:
+                split = JobSplit.objects.get(id=split_id)
+            except JobSplit.DoesNotExist:
+                logger.error("Received message '%s' for unknown job split #%d",
+                             status, split_id)
+                continue
+
+            # Is the job done?
+            if status == WorkerController.DONE:
+                logger.info("Job split #%d (%s %d/%d @ %s) on '%s' is DONE",
+                            split.id, split.job.block.name, split.split_index,
+                            split.job.splits.count(),
+                            split.job.block.experiment.fullname(),
+                            split.worker.name)
+
+                on_split_done(split, simplejson.loads(data[0]))
+                remove_split_id_from(running_job_splits, split_id)
+
+            # Has the job failed?
+            elif status == WorkerController.JOB_ERROR:
+                logger.info("Job split #%d (%s %d/%d @ %s) on '%s' returned an error",
+                            split.id, split.job.block.name, split.split_index,
+                            split.job.splits.count(),
+                            split.job.block.experiment.fullname(),
+                            split.worker.name)
+
+                splits_to_cancel.extend(on_split_fail(split, simplejson.loads(data[0])))
+                remove_split_id_from(running_job_splits, split_id)
+
+            # Was the job cancelled?
+            elif status == WorkerController.CANCELLED:
+                logger.info("Job split #%d (%s %d/%d @ %s) on '%s' is CANCELLED",
+                            split.id, split.job.block.name, split.split_index,
+                            split.job.splits.count(),
+                            split.job.block.experiment.fullname(),
+                            split.worker.name)
+
+                on_split_cancelled(split)
+                remove_split_id_from(cancelling_jobs, split_id)
+
+            # Was there an error?
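+            # (this time for a split that was actually running: the worker
+            # reported a system error rather than a job failure)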
+            elif status == WorkerController.ERROR:
+                if split_id in running_job_splits:
+                    logger.info("Job split #%d (%s %d/%d @ %s) on '%s' returned a system error: %s",
+                                split.id, split.job.block.name, split.split_index,
+                                split.job.splits.count(),
+                                split.job.block.experiment.fullname(),
+                                split.worker.name, data[0])
+
+                    splits_to_cancel.extend(on_split_fail(split, data[0]))
+                    remove_split_id_from(running_job_splits, split_id)
+
+        # Effectively cancel newly-cancelled experiments
+        splits_to_cancel.extend(process_newly_cancelled_experiments())
+
+        # Cancel the necessary jobs (if any)
+        for split_to_cancel in splits_to_cancel:
+            if split_to_cancel.id in running_job_splits:
+                logger.info("Cancelling job split #%d (%s %d/%d @ %s) on '%s'",
+                            split_to_cancel.id, split_to_cancel.job.block.name,
+                            split_to_cancel.split_index,
+                            split_to_cancel.job.splits.count(),
+                            split_to_cancel.job.block.experiment.fullname(),
+                            split_to_cancel.worker.name)
+
+                worker_controller.cancel(split_to_cancel.worker.name)
+                remove_split_id_from(running_job_splits, split_to_cancel.id)
+                cancelling_jobs.append(split_to_cancel.id)
+
+        # If we must stop, don't start new jobs
+        if stop:
+            break
+
+        # Start new jobs
+        split_new_jobs()
+        assigned_splits = assign_splits_to_workers()
+
+        for split in assigned_splits:
+            running_job_splits.append(split.id)
+
+            configuration = get_configuration_for_split(split)
+
+            logger.info("Starting job split #%d (%s %d/%d @ %s) on '%s'",
+                        split.id, split.job.block.name, split.split_index,
+                        split.job.splits.count(),
+                        split.job.block.experiment.fullname(),
+                        split.worker.name)
+
+            worker_controller.execute(split.worker.name, split.id, configuration)
+            on_split_started(split)
+
+
+    # Cleanup
     logger.info("Gracefully exiting the scheduler")
+    worker_controller.destroy()
diff --git a/beat/web/scripts/worker.py b/beat/web/scripts/worker.py
deleted file mode 100755
index 8b888b0f1fe9cf50b80d949955073065e20abb03..0000000000000000000000000000000000000000
--- a/beat/web/scripts/worker.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-# vim: set fileencoding=utf-8 :
-
-###############################################################################
-#                                                                             #
-# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/          #
-# Contact: beat.support@idiap.ch                                              #
-#                                                                             #
-# This file is part of the beat.web module of the BEAT platform.              #
-#                                                                             #
-# Commercial License Usage                                                    #
-# Licensees holding valid commercial BEAT licenses may use this file in      #
-# accordance with the terms contained in a written agreement between you     #
-# and Idiap. For further information contact tto@idiap.ch                     #
-#                                                                             #
-# Alternatively, this file may be used under the terms of the GNU Affero     #
-# Public License version 3 as published by the Free Software and appearing   #
-# in the file LICENSE.AGPL included in the packaging of this file.            #
-# The BEAT platform is distributed in the hope that it will be useful, but   #
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
-# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
-#                                                                             #
-# You should have received a copy of the GNU Affero Public License along     #
-# with the BEAT platform. If not, see http://www.gnu.org/licenses/.          #
-#                                                                             #
-###############################################################################
-
-
-"""\
-Starts the worker process.
-
-Usage:
-  %(prog)s [-v ... | --verbose ...]
-           [--settings=<file>] [--period=<seconds>]
-           [--environments=<path>] [--name=<name>]
-  %(prog)s (-h | --help)
-  %(prog)s (-V | --version)
-
-
-Options:
-  -h, --help                        Show this help message
-  -V, --version                     Show program's version number
-  -v, --verbose                     Increases the output verbosity level
-  -S <file>, --settings=<file>      The module name to the Django settings
-                                    file [default: beat.web.settings.settings]
-  -e <path>, --environments=<path>  The path to the installation root of
-                                    available environments.
-  -n <name>, --name=<name>          The unique name of this worker on the
-                                    database. This is typically the assigned
-                                    hostname of the node, but not necessarily
-                                    [default: %(hostname)s]
-  -p <seconds, --period=<seconds>   The time, in seconds, in which this worker
-                                    will probe the database for jobs to run or
-                                    cancel. If not set, use the value available
-                                    on the Django settings file, at the
-                                    variable `WORKER_INTERVAL`.
-
-
-Examples:
-
-  To start the worker do the following:
-
-    $ %(prog)s
-
-  You can pass the ``-v`` flag to start the worker with the logging level set
-  to ``INFO`` or ``-vv`` to set it to ``DEBUG``. By default, the logging level
-  is set to ``WARNING`` if no ``-v`` flag is passed.
-
-"""
-
-import os
-import sys
-import time
-import socket
-import signal
-import docopt
-import logging
-
-stop = False
-
-def main(user_input=None):
-
-    arguments = docopt.docopt(
-        __doc__ % dict(
-            prog=os.path.basename(sys.argv[0]),
-            hostname=socket.gethostname(),
-        ),
-    )
-
-    os.environ.setdefault('DJANGO_SETTINGS_MODULE', arguments['--settings'])
-    from django.conf import settings
-    from django import setup
-    setup()
-
-    logger = logging.getLogger('beat.web')
-    if arguments['--verbose'] == 1: logger.setLevel(logging.INFO)
-    elif arguments['--verbose'] >= 2: logger.setLevel(logging.DEBUG)
-
-    # installs SIGTERM handler
-    def handler(signum, frame):
-        #ignore further signals
-        signal.signal(signal.SIGTERM, signal.SIG_IGN)
-        signal.signal(signal.SIGINT, signal.SIG_IGN)
-
-        logger.info("Signal %d caught, terminating...", signum)
-        global stop
-        stop = True
-
-    signal.signal(signal.SIGTERM, handler)
-    signal.signal(signal.SIGINT, handler)
-
-    from ..backend import utils
-    from ..backend.models import Worker
-
-    try:
-        worker = Worker.objects.get(name=arguments['--name'])
-    except Worker.DoesNotExist:
-        logger.error("Cannot find worker `%s' in database, aborting",
-            arguments['--name'])
-        sys.exit(1)
-    else:
-        logger.info("Found worker `%s' in database, proceeding...",
-            arguments['--name'])
-
-    # figure out paths to programs I need to use
-    process = utils.resolve_process_path()
-    logger.debug("(path) process: `%s'", process)
-
-    from django.utils import six
-    paths = arguments['--environments']
-    if isinstance(paths, six.string_types):
-        paths = paths.split(os.pathsep)
-    environments = utils.find_environments(paths)
-    logger.debug("Environments: %s", ", ".join(environments))
-
-    # check environments
-    missing, unused = worker.check_environments(environments)
-    if unused:
-        logger.info("The following environments where found on your " \
-            "setup, but will not be used with the current queue " \
-            "configuration: %s" % ", ".join(unused))
-    if missing:
-        raise RuntimeError("The following environments are currently " \
-            "missing from your setup: %s" % ", ".join(missing))
-    else:
-        logger.info("All required software environments were found")
-
-    timing = int(arguments['--period']) \
-        if arguments['--period'] else settings.WORKER_INTERVAL
-    logger.info("Working at `%s' every %d seconds", arguments['--name'], timing)
-
-    global stop
-    with worker:
-
-        while not stop:
-
-            start = time.time()
-            logger.debug("Starting work cycle...")
-            worker.work(environments, process)
-            duration = time.time() - start
-            if duration < timing:
-                time.sleep(timing - duration)
-
-    logger.info("Gracefully exiting worker `%s'" % arguments['--name'])
diff --git a/beat/web/settings/test.py b/beat/web/settings/test.py
index 779aaa939479272aa3f501c5c5f3705e6f95f31d..4b72e0697ebe775b16dd0efde780953aa9da6a08 100755
--- a/beat/web/settings/test.py
+++ b/beat/web/settings/test.py
@@ -40,6 +40,7 @@ ALLOWED_HOSTS = [
 
 DATABASES['default']['NAME'] = 'test.sql3'
 DATABASES['default']['TEST'] = {'NAME': DATABASES['default']['NAME']}
+DATABASES['default']['OPTIONS']['timeout'] = 30
 
 import sys
 if 'beat.cmdline' in sys.argv:
diff --git a/beat/web/utils/management/commands/restore.py b/beat/web/utils/management/commands/restore.py
index e7b769523bbcc651c3256b243f183d4a824a5ebe..6276b5a66cdc010784428065551db6e2bc2ca1df 100644
--- a/beat/web/utils/management/commands/restore.py
+++ b/beat/web/utils/management/commands/restore.py
@@ -162,5 +162,5 @@ class Command(BaseCommand):
         # reset all experiments in a transient state (notice Job and
         # JobSplit are not backed-up because of circular dependence issues
         # on the "experiments" app).
-        transient = (Experiment.RUNNING, Experiment.CANCELING)
+        transient = (Experiment.RUNNING, Experiment.CANCELLING)
         for e in Experiment.objects.filter(status__in=transient):
             e.reset()
diff --git a/setup.py b/setup.py
index 383e2a4849b14fc8261033bdde29a9f72034fba1..56ff205293cb078fc3b036e6c004ee2f5498b752 100755
--- a/setup.py
+++ b/setup.py
@@ -87,7 +87,6 @@ setup(
     entry_points={
         'console_scripts': [
             'process = beat.web.scripts.process:main',
-            'worker = beat.web.scripts.worker:main',
             'scheduler = beat.web.scripts.scheduler:main',
         ],
     },