diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index 5d1f469e7c6..a83662afd5c 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -31,25 +31,27 @@ import time import traceback import urllib.parse import xmlrpc + from datetime import datetime, timedelta +from os import getenv import lavacli import yaml from lavacli.utils import loader -# Timeout in minutes to decide if the device from the dispatched LAVA job has +# Timeout in seconds to decide if the device from the dispatched LAVA job has # hung or not due to the lack of new log output. -DEVICE_HANGING_TIMEOUT_MIN = 5 +DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60)) # How many seconds the script should wait before try a new polling iteration to # check if the dispatched LAVA job is running or waiting in the job queue. -WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10 +WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10)) # How many seconds to wait between log output LAVA RPC calls. -LOG_POLLING_TIME_SEC = 5 +LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5)) # How many retries should be made when a timeout happen. -NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2 +NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)) def print_log(msg): @@ -61,14 +63,7 @@ def fatal_err(msg): def hide_sensitive_data(yaml_data, hide_tag="HIDEME"): - out_data = "" - - for line in yaml_data.splitlines(True): - if hide_tag in line: - continue - out_data += line - - return out_data + return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line) def generate_lava_yaml(args): @@ -211,7 +206,6 @@ def _call_proxy(fn, *args): fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg)) else: time.sleep(15) - pass except xmlrpc.client.Fault as err: traceback.print_exc() fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode)) @@ -222,8 +216,8 @@ def get_job_results(proxy, job_id, test_suite, test_case): results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id) results = yaml.load(results_yaml, Loader=loader(False)) for res in results: - metadata = res['metadata'] - if not 'result' in metadata or metadata['result'] != 'fail': + metadata = res["metadata"] + if "result" not in metadata or metadata["result"] != "fail": continue if 'error_type' in metadata and metadata['error_type'] == "Infrastructure": print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id)) @@ -260,8 +254,7 @@ def follow_job_execution(proxy, job_id): last_time_logs = datetime.now() while not finished: (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count) - logs = yaml.load(str(data), Loader=loader(False)) - if logs: + if logs := yaml.load(str(data), Loader=loader(False)): # Reset the timeout last_time_logs = datetime.now() for line in logs: @@ -270,7 +263,7 @@ def follow_job_execution(proxy, job_id): line_count += len(logs) else: - time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN) + time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC) if datetime.now() - last_time_logs > time_limit: print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id)) return False @@ -298,21 +291,7 @@ def submit_job(proxy, job_file): return _call_proxy(proxy.scheduler.jobs.submit, job_file) -def main(args): - proxy = setup_lava_proxy() - - yaml_file = generate_lava_yaml(args) - - if args.dump_yaml: - print(hide_sensitive_data(generate_lava_yaml(args))) - - if args.validate_only: - ret = validate_job(proxy, yaml_file) - if not ret: - fatal_err("Error in LAVA job definition") - print("LAVA job definition validated successfully") - return - +def retriable_follow_job(proxy, yaml_file): retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION while retry_count >= 0: @@ -332,8 +311,36 @@ def main(args): show_job_data(proxy, job_id) - if get_job_results(proxy, job_id, "0_mesa", "mesa") == True: - break + if get_job_results(proxy, job_id, "0_mesa", "mesa") == True: + break + else: + # The script attempted all the retries. The job seemed to fail. + return False + + return True + + +def main(args): + proxy = setup_lava_proxy() + + yaml_file = generate_lava_yaml(args) + + if args.dump_yaml: + print(hide_sensitive_data(generate_lava_yaml(args))) + + if args.validate_only: + ret = validate_job(proxy, yaml_file) + if not ret: + fatal_err("Error in LAVA job definition") + print("LAVA job definition validated successfully") + return + + if not retriable_follow_job(proxy, yaml_file): + fatal_err( + "Job failed after it exceeded the number of" + f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries." + ) + def create_parser(): parser = argparse.ArgumentParser("LAVA job submitter")