diff --git a/beat/web/backend/models.py b/beat/web/backend/models.py index 052390ffb302f53c2f368dca741e8f3ac5b39bca..540d199b3d3ce7a8f25fb49ae49aa608c5838f05 100644 --- a/beat/web/backend/models.py +++ b/beat/web/backend/models.py @@ -434,9 +434,22 @@ class Worker(models.Model): # cancel job splits by killing associated processes for j in JobSplit.objects.filter(worker=self, status=Job.CANCEL, - end_date__isnull=True, process_id__isnull=False): - if psutil.pid_exists(j.process_id): + end_date__isnull=True): + if j.process_id is not None and psutil.pid_exists(j.process_id): os.kill(j.process_id, signal.SIGTERM) + else: # process went away without any apparent reason + with transaction.atomic(): + message = "Split %d/%d running at worker `%s' for " \ + "block `%s' of experiment `%s' finished without any " \ + "apparent reason. Checking-out job split at " \ + "database by force..." % (j.split_index+1, + j.job.block.required_slots, + self, + j.job.block.name, + j.job.block.experiment.fullname(), + ) + j.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR, + syserr=message)) # cmdline base argument cmdline = [process]