From 460ce7e75dfd9abf32c1ec7b7492a00a48bbb0d5 Mon Sep 17 00:00:00 2001 From: Guilherme Gallo Date: Fri, 10 Sep 2021 12:58:27 -0300 Subject: [PATCH] gitlab-ci: Implement a simple timeout detection for LAVA jobs * Retry twice if the job does not generate logs for 5 minutes. * Only activate the timeout detection once the job has started. Part-of: --- .gitlab-ci/lava/lava_job_submitter.py | 35 ++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index af20abdbc64..e5ff2d8917e 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -34,7 +34,7 @@ import urllib.parse import xmlrpc import yaml -from datetime import datetime +from datetime import datetime, timedelta from lavacli.utils import loader @@ -210,24 +210,44 @@ def get_job_results(proxy, job_id, test_suite, test_case): return True +def wait_until_job_is_started(proxy, job_id): + print_log(f"Waiting for job {job_id} to start.") + current_state = "Submitted" + waiting_states = ["Submitted", "Scheduling", "Scheduled"] + while current_state in waiting_states: + job_state = _call_proxy(proxy.scheduler.job_state, job_id) + current_state = job_state["job_state"] + + time.sleep(30) + print_log(f"Job {job_id} started.") def follow_job_execution(proxy, job_id): line_count = 0 finished = False + last_time_logs = datetime.now() while not finished: (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count) logs = yaml.load(str(data), Loader=loader(False)) if logs: + # Reset the timeout + last_time_logs = datetime.now() for line in logs: print("{} {}".format(line["dt"], line["msg"])) line_count += len(logs) + else: + time_limit = timedelta(minutes=1) + if datetime.now() - last_time_logs > time_limit: + print_log("LAVA job {} doesn't advance (machine got hung?). 
Retry.".format(job_id)) + return False + # `proxy.scheduler.jobs.logs` does not block, even when there is no # new log to be fetched. To avoid dosing the LAVA dispatcher # machine, let's add a sleep to save them some stamina. time.sleep(5) + return True def show_job_data(proxy, job_id): show = _call_proxy(proxy.scheduler.jobs.show, job_id) @@ -262,13 +282,22 @@ def main(args): print("LAVA job definition validated successfully") return + retry_count = 2 - while True: + while retry_count >= 0: job_id = submit_job(proxy, yaml_file) print_log("LAVA job id: {}".format(job_id)) - follow_job_execution(proxy, job_id) + wait_until_job_is_started(proxy, job_id) + + if not follow_job_execution(proxy, job_id): + print_log(f"Job {job_id} has timed out. Cancelling it.") + # Cancel the job as it is considered unreachable by Mesa CI. + proxy.scheduler.jobs.cancel(job_id) + + retry_count -= 1 + continue show_job_data(proxy, job_id)