From 460ce7e75dfd9abf32c1ec7b7492a00a48bbb0d5 Mon Sep 17 00:00:00 2001
From: Guilherme Gallo <guilherme.gallo@collabora.com>
Date: Fri, 10 Sep 2021 12:58:27 -0300
Subject: [PATCH] gitlab-ci: Implement a simple timeout detection for LAVA jobs

* Retry twice if the job does not generates logs for 5 minutes.
* Only active the timeout detection when the job starts.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12808>
---
 .gitlab-ci/lava/lava_job_submitter.py | 35 ++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index af20abdbc64..e5ff2d8917e 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -34,7 +34,7 @@ import urllib.parse
 import xmlrpc
 import yaml
 
-from datetime import datetime
+from datetime import datetime, timedelta
 from lavacli.utils import loader
 
 
@@ -210,24 +210,44 @@ def get_job_results(proxy, job_id, test_suite, test_case):
 
     return True
 
+def wait_until_job_is_started(proxy, job_id):
+    print_log(f"Waiting for job {job_id} to start.")
+    current_state = "Submitted"
+    waiting_states = ["Submitted", "Scheduling", "Scheduled"]
+    while current_state in waiting_states:
+        job_state = _call_proxy(proxy.scheduler.job_state, job_id)
+        current_state = job_state["job_state"]
+
+        time.sleep(30)
+    print_log(f"Job {job_id} started.")
 
 def follow_job_execution(proxy, job_id):
     line_count = 0
     finished = False
+    last_time_logs = datetime.now()
     while not finished:
         (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
         logs = yaml.load(str(data), Loader=loader(False))
         if logs:
+            # Reset the timeout
+            last_time_logs = datetime.now()
             for line in logs:
                 print("{} {}".format(line["dt"], line["msg"]))
 
             line_count += len(logs)
 
+        else:
+            time_limit = timedelta(minutes=1)
+            if datetime.now() - last_time_logs > time_limit:
+                print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
+                return False
+
         # `proxy.scheduler.jobs.logs` does not block, even when there is no
         # new log to be fetched. To avoid dosing the LAVA dispatcher
         # machine, let's add a sleep to save them some stamina.
         time.sleep(5)
 
+    return True
 
 def show_job_data(proxy, job_id):
     show = _call_proxy(proxy.scheduler.jobs.show, job_id)
@@ -262,13 +282,22 @@ def main(args):
         print("LAVA job definition validated successfully")
         return
 
+    retry_count = 2
 
-    while True:
+    while retry_count >= 0:
         job_id = submit_job(proxy, yaml_file)
 
         print_log("LAVA job id: {}".format(job_id))
 
-        follow_job_execution(proxy, job_id)
+        wait_until_job_is_started(proxy, job_id)
+
+        if not follow_job_execution(proxy, job_id):
+            print_log(f"Job {job_id} has timed out. Cancelling it.")
+            # Cancel the job as it is considered unreachable by Mesa CI.
+            proxy.scheduler.jobs.cancel(job_id)
+
+            retry_count -= 1
+            continue
 
         show_job_data(proxy, job_id)