ci/lava: Add a simple Structural Logger into submitter
Refactor some pieces of the submitter to improve the clarity of its functions, and create a simple dictionary with data aggregated from the submitter execution, which is dumped to a file when the script exits. Also add support for the AutoSaveDict-based structured logger, which will come in a follow-up commit.

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22500>
committed by Marge Bot
parent 41f29c5333
commit 0ac3824922
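The idea, in outline: a module-level dict collects data while the submitter runs and is written out once when the script exits, and a follow-up commit can swap in an AutoSaveDict-based StructuredLogger that persists every update on its own. A minimal sketch of that pattern, not the submitter's actual API (the atexit hook and the dump_structural_log name are illustrative):

    import atexit
    import json
    from collections import defaultdict

    # Plain dict-like aggregator; a dict-like autosaving logger can replace it later.
    STRUCTURAL_LOG = defaultdict(list)

    def dump_structural_log(path="lava_job_detail.json"):
        # Illustrative only: write the aggregated data once, at interpreter exit.
        with open(path, "w") as log_file:
            json.dump(STRUCTURAL_LOG, log_file, indent=4)

    atexit.register(dump_structural_log)

    STRUCTURAL_LOG["dut_job_type"] = "some-device-type"
    STRUCTURAL_LOG["dut_jobs"].append({"status": "pass"})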
@@ -64,4 +64,5 @@ PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \
 --visibility-group "${VISIBILITY_GROUP}" \
 --lava-tags "${LAVA_TAGS}" \
 --mesa-job-name "$CI_JOB_NAME" \
+--structured-log-file "results/lava_job_detail.json" \
 >> results/lava.log

@@ -11,9 +11,11 @@
 import contextlib
 import json
 import pathlib
 import sys
 import time
+from collections import defaultdict
 from dataclasses import dataclass, fields
 from datetime import datetime, timedelta
 from io import StringIO

@@ -43,6 +45,10 @@ from lava.utils import (
 )
 from lavacli.utils import flow_yaml as lava_yaml

+# Initialize structural logging with a defaultdict, it can be changed for more
+# sophisticated dict-like data abstractions.
+STRUCTURAL_LOG = defaultdict(list)
+
 # Timeout in seconds to decide if the device from the dispatched LAVA job has
 # hung or not due to the lack of new log output.
 DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))

@@ -68,7 +74,11 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
     getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
 )

-def find_exception_from_metadata(metadata, job_id):
+def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
+    """
+    Investigate infrastructure errors from the job metadata.
+    If it finds an error, raise it as MesaCIException.
+    """
     if "result" not in metadata or metadata["result"] != "fail":
         return
     if "error_type" in metadata:

@@ -90,23 +100,22 @@ def find_exception_from_metadata(metadata, job_id):
         raise MesaCIException(
             f"LAVA job {job_id} failed validation (possible download error). Retry."
         )
-    return metadata


-def find_lava_error(job) -> None:
-    # Look for infrastructure errors and retry if we see them.
+def raise_lava_error(job) -> None:
+    # Look for infrastructure errors, raise them, and retry if we see them.
     results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
     results = lava_yaml.load(results_yaml)
     for res in results:
         metadata = res["metadata"]
-        find_exception_from_metadata(metadata, job.job_id)
+        raise_exception_from_metadata(metadata, job.job_id)

     # If we reach this far, it means that the job ended without hwci script
     # result and no LAVA infrastructure problem was found
     job.status = "fail"


-def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
+def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
     with GitlabSection(
         "job_data",
         "LAVA job info",

@@ -233,7 +242,8 @@ def follow_job_execution(job, log_follower):
     # If this does not happen, it probably means a LAVA infrastructure error
     # happened.
     if job.status not in ["pass", "fail"]:
-        find_lava_error(job)
+        raise_lava_error(job)


 def structural_log_phases(job, log_follower):
     phases: dict[str, Any] = {

@@ -244,6 +254,7 @@ def structural_log_phases(job, log_follower):
     }
     job.log["dut_job_phases"] = phases


 def print_job_final_status(job):
     if job.status == "running":
         job.status = "hung"

@@ -256,13 +267,19 @@ def print_job_final_status(job):
     )

     job.refresh_log()
     job.log["status"] = job.status
-    show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
+    show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")


-def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAVAJob]:
+def execute_job_with_retries(
+    proxy, job_definition, retry_count, jobs_log
+) -> Optional[LAVAJob]:
     for attempt_no in range(1, retry_count + 2):
-        job = LAVAJob(proxy, job_definition)
+        # Need to get the logger value from its object to enable autosave
+        # features
+        jobs_log.append({})
+        job_log = jobs_log[-1]
+        job = LAVAJob(proxy, job_definition, job_log)
+        STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no

         try:
             submit_job(job)

@@ -280,6 +297,7 @@ def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAV
             )

         finally:
+            job_log["finished_time"] = datetime.now().isoformat()
             print_job_final_status(job)

@@ -287,7 +305,7 @@ def retriable_follow_job(proxy, job_definition) -> LAVAJob:
     number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION

     if finished_job := execute_job_with_retries(
-        proxy, job_definition, number_of_retries
+        proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"]
     ):
         return finished_job

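The STRUCTURAL_LOG["dut_jobs"] argument above works because each retry appends a fresh dict to that shared list and hands the very same dict object to the LAVAJob, so whatever the job records later (status, phases, finished_time) lands in the aggregated log without any copying. A small self-contained illustration of that append-then-mutate pattern (the attempt fields below are made up):

    from collections import defaultdict

    structural_log = defaultdict(list)

    def run_attempts(jobs_log, retry_count):
        # One dict per DUT attempt, mirroring the submitter's retry loop.
        for attempt_no in range(1, retry_count + 2):
            jobs_log.append({})
            job_log = jobs_log[-1]  # same object a job would keep mutating
            job_log["attempt"] = attempt_no  # illustrative field
            job_log["status"] = "hung" if attempt_no == 1 else "pass"

    run_attempts(structural_log["dut_jobs"], retry_count=1)
    print(structural_log["dut_jobs"])
    # [{'attempt': 1, 'status': 'hung'}, {'attempt': 2, 'status': 'pass'}]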
@@ -335,12 +353,16 @@ class LAVAJobSubmitter(PathResolver):
     validate_only: bool = False  # Whether to only validate the job, not execute it
     visibility_group: str = None  # Only affects LAVA farm maintainers
     job_rootfs_overlay_url: str = None
+    structured_log_file: pathlib.Path = None  # Log file path with structured LAVA log

     def __post_init__(self) -> None:
         super().__post_init__()
         # Remove mesa job names with spaces, which breaks the lava-test-case command
         self.mesa_job_name = self.mesa_job_name.split(" ")[0]

+        if self.structured_log_file:
+            self.setup_structured_logger()
+
     def dump(self, job_definition):
         if self.dump_yaml:
             with GitlabSection(

@@ -377,10 +399,32 @@ class LAVAJobSubmitter(PathResolver):
         if self.validate_only:
             return

-        finished_job = retriable_follow_job(proxy, job_definition)
+        try:
+            finished_job = retriable_follow_job(proxy, job_definition)
+        except Exception as exception:
+            STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception)
+            raise exception
+
         exit_code = 0 if finished_job.status == "pass" else 1
+        STRUCTURAL_LOG["job_combined_status"] = job.status
         sys.exit(exit_code)

+    def setup_structured_logger(self):
+        try:
+            global STRUCTURAL_LOG
+            STRUCTURAL_LOG = StructuredLogger(
+                self.structured_log_file, truncate=True
+            ).data
+        except NameError as e:
+            print(
+                f"Could not import StructuredLogger library: {e}. "
+                "Falling back to DummyLogger"
+            )
+
+        STRUCTURAL_LOG["fixed_tags"] = self.lava_tags
+        STRUCTURAL_LOG["dut_job_type"] = self.device_type
+        STRUCTURAL_LOG["job_combined_fail_reason"] = None
+        STRUCTURAL_LOG["job_combined_status"] = "not_submitted"
+

 if __name__ == "__main__":
     # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->

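Putting the keys set in this hunk together with the per-job entries appended in execute_job_with_retries gives a rough picture of what results/lava_job_detail.json can end up containing. Only the key names below come from the diff above; the concrete values are illustrative placeholders:

    import json

    # Key names taken from the diff; values are made-up examples.
    example_structured_log = {
        "fixed_tags": None,
        "dut_job_type": "some-device-type",
        "dut_attempt_counter": 1,
        "job_combined_fail_reason": None,
        "job_combined_status": "pass",
        "dut_jobs": [
            {
                "status": "pass",
                "finished_time": "2023-01-01T00:00:00",
                "dut_job_phases": {},
            }
        ],
    }

    print(json.dumps(example_structured_log, indent=4))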
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from unittest.mock import MagicMock, patch

 import pytest

@@ -32,7 +33,7 @@ RESULT_GET_TESTJOB_RESULTS = [{"metadata": {"result": "test"}}]


 @pytest.fixture
-def mock_proxy():
+def mock_proxy(frozen_time):
     def create_proxy_mock(
         job_results=RESULT_GET_TESTJOB_RESULTS,
         testsuite_results=[generate_testsuite_result()],

@@ -51,6 +52,20 @@ def mock_proxy():
         proxy_logs_mock = proxy_mock.scheduler.jobs.logs
         proxy_logs_mock.return_value = jobs_logs_response()

+        proxy_job_state = proxy_mock.scheduler.job_state
+        proxy_job_state.return_value = {"job_state": "Running"}
+        proxy_job_state.side_effect = frozen_time.tick(1)
+
+        proxy_show_mock = proxy_mock.scheduler.jobs.show
+        proxy_show_mock.return_value = defaultdict(
+            str,
+            {
+                "device_type": "test_device",
+                "device": "test_device-cbg-1",
+                "state": "created",
+            },
+        )
+
         for key, value in kwargs.items():
             setattr(proxy_logs_mock, key, value)

@@ -9,6 +9,7 @@ import xmlrpc.client
 from contextlib import nullcontext as does_not_raise
 from datetime import datetime
 from itertools import chain, repeat
+from pathlib import Path

 import pytest
 from lava.exceptions import MesaCIException, MesaCIRetryError

@@ -16,6 +17,7 @@ from lava.lava_job_submitter import (
     DEVICE_HANGING_TIMEOUT_SEC,
     NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
     LAVAJob,
+    LAVAJobSubmitter,
     bootstrap_log_follower,
     follow_job_execution,
     retriable_follow_job,

@@ -299,7 +301,7 @@ def test_parse_job_result_from_log(message, expectation, mock_proxy):
 @pytest.mark.slow(
     reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml"
 )
-def test_full_yaml_log(mock_proxy, frozen_time):
+def test_full_yaml_log(mock_proxy, frozen_time, tmp_path):
     import random

     from lavacli.utils import flow_yaml as lava_yaml

@@ -350,7 +352,10 @@ def test_full_yaml_log(mock_proxy, frozen_time):
     def reset_logs(*args):
         proxy.scheduler.jobs.logs.side_effect = load_lines()

+    tmp_file = Path(tmp_path) / "log.json"
+    LAVAJobSubmitter(structured_log_file=tmp_file)
     proxy.scheduler.jobs.submit = reset_logs
     with pytest.raises(MesaCIRetryError):
         time_travel_to_test_time()
         retriable_follow_job(proxy, "")
+    print(tmp_file.read_text())