gitlab-ci: Implement a simple timeout detection for LAVA jobs
* Retry twice if the job does not generates logs for 5 minutes. * Only active the timeout detection when the job starts. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12808>
This commit is contained in:
@@ -34,7 +34,7 @@ import urllib.parse
|
|||||||
import xmlrpc
|
import xmlrpc
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta
|
||||||
from lavacli.utils import loader
|
from lavacli.utils import loader
|
||||||
|
|
||||||
|
|
||||||
@@ -210,24 +210,44 @@ def get_job_results(proxy, job_id, test_suite, test_case):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def wait_until_job_is_started(proxy, job_id):
|
||||||
|
print_log(f"Waiting for job {job_id} to start.")
|
||||||
|
current_state = "Submitted"
|
||||||
|
waiting_states = ["Submitted", "Scheduling", "Scheduled"]
|
||||||
|
while current_state in waiting_states:
|
||||||
|
job_state = _call_proxy(proxy.scheduler.job_state, job_id)
|
||||||
|
current_state = job_state["job_state"]
|
||||||
|
|
||||||
|
time.sleep(30)
|
||||||
|
print_log(f"Job {job_id} started.")
|
||||||
|
|
||||||
def follow_job_execution(proxy, job_id):
|
def follow_job_execution(proxy, job_id):
|
||||||
line_count = 0
|
line_count = 0
|
||||||
finished = False
|
finished = False
|
||||||
|
last_time_logs = datetime.now()
|
||||||
while not finished:
|
while not finished:
|
||||||
(finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
|
(finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
|
||||||
logs = yaml.load(str(data), Loader=loader(False))
|
logs = yaml.load(str(data), Loader=loader(False))
|
||||||
if logs:
|
if logs:
|
||||||
|
# Reset the timeout
|
||||||
|
last_time_logs = datetime.now()
|
||||||
for line in logs:
|
for line in logs:
|
||||||
print("{} {}".format(line["dt"], line["msg"]))
|
print("{} {}".format(line["dt"], line["msg"]))
|
||||||
|
|
||||||
line_count += len(logs)
|
line_count += len(logs)
|
||||||
|
|
||||||
|
else:
|
||||||
|
time_limit = timedelta(minutes=1)
|
||||||
|
if datetime.now() - last_time_logs > time_limit:
|
||||||
|
print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
|
||||||
|
return False
|
||||||
|
|
||||||
# `proxy.scheduler.jobs.logs` does not block, even when there is no
|
# `proxy.scheduler.jobs.logs` does not block, even when there is no
|
||||||
# new log to be fetched. To avoid dosing the LAVA dispatcher
|
# new log to be fetched. To avoid dosing the LAVA dispatcher
|
||||||
# machine, let's add a sleep to save them some stamina.
|
# machine, let's add a sleep to save them some stamina.
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
def show_job_data(proxy, job_id):
|
def show_job_data(proxy, job_id):
|
||||||
show = _call_proxy(proxy.scheduler.jobs.show, job_id)
|
show = _call_proxy(proxy.scheduler.jobs.show, job_id)
|
||||||
@@ -262,13 +282,22 @@ def main(args):
|
|||||||
print("LAVA job definition validated successfully")
|
print("LAVA job definition validated successfully")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
retry_count = 2
|
||||||
|
|
||||||
while True:
|
while retry_count >= 0:
|
||||||
job_id = submit_job(proxy, yaml_file)
|
job_id = submit_job(proxy, yaml_file)
|
||||||
|
|
||||||
print_log("LAVA job id: {}".format(job_id))
|
print_log("LAVA job id: {}".format(job_id))
|
||||||
|
|
||||||
follow_job_execution(proxy, job_id)
|
wait_until_job_is_started(proxy, job_id)
|
||||||
|
|
||||||
|
if not follow_job_execution(proxy, job_id):
|
||||||
|
print_log(f"Job {job_id} has timed out. Cancelling it.")
|
||||||
|
# Cancel the job as it is considered unreachable by Mesa CI.
|
||||||
|
proxy.scheduler.jobs.cancel(job_id)
|
||||||
|
|
||||||
|
retry_count -= 1
|
||||||
|
continue
|
||||||
|
|
||||||
show_job_data(proxy, job_id)
|
show_job_data(proxy, job_id)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user