From 2a712e438de6e2e141dad54d3bab6c9afc798ddd Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.anjos@idiap.ch>
Date: Thu, 23 Jun 2016 17:24:26 +0200
Subject: [PATCH] [backend] Improve the handling of splits to be cancelled

---
 beat/web/backend/models.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/beat/web/backend/models.py b/beat/web/backend/models.py
index 052390ffb..540d199b3 100644
--- a/beat/web/backend/models.py
+++ b/beat/web/backend/models.py
@@ -434,9 +434,22 @@ class Worker(models.Model):
 
         # cancel job splits by killing associated processes
         for j in JobSplit.objects.filter(worker=self, status=Job.CANCEL,
-            end_date__isnull=True, process_id__isnull=False):
-            if psutil.pid_exists(j.process_id):
+            end_date__isnull=True):
+            if j.process_id is not None and psutil.pid_exists(j.process_id):
                 os.kill(j.process_id, signal.SIGTERM)
+            else: # no process id recorded, or process went away without any apparent reason
+                with transaction.atomic():
+                    message = "Split %d/%d running at worker `%s' for " \
+                        "block `%s' of experiment `%s' finished without any " \
+                        "apparent reason. Checking-out job split at " \
+                        "database by force..." % (j.split_index+1,
+                            j.job.block.required_slots,
+                            self,
+                            j.job.block.name,
+                            j.job.block.experiment.fullname(),
+                            )
+                    j.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR,
+                      syserr=message))
 
         # cmdline base argument
         cmdline = [process]
-- 
GitLab