ci/lava: Retry when data fetching log RPC call is corrupted

On rare occasions the jobs.logs RPC call returns corrupted data, such as
malformed YAML. Since this is a known but very rare failure, retry the
RPC call several times to give it a chance to recover.

Retrying does not drop any log lines, since we keep track of how many
log lines have already been fetched for each job.

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15938>
This commit is contained in:
Guilherme Gallo
2022-04-20 20:55:47 -03:00
committed by Marge Bot
parent 4ffd21ca70
commit 201b0b6d29
3 changed files with 64 additions and 12 deletions

View File

@@ -14,4 +14,8 @@ class MesaCITimeoutError(MesaCIException):
class MesaCIRetryError(MesaCIException):
    """Exception that records a retry count alongside the usual arguments.

    Args:
        *args: Positional arguments forwarded unchanged to the parent
            exception constructor.
        retry_count: Keyword-only integer stored on the instance as
            ``retry_count``.
    """

    def __init__(self, *args, retry_count: int) -> None:
        super().__init__(*args)
        self.retry_count = retry_count
class MesaCIParseException(MesaCIException):
    """Error raised when LAVA job log data cannot be fetched or parsed."""

View File

@@ -24,7 +24,9 @@
"""Send a job to LAVA, track it and collect log back"""
import argparse
import contextlib
import pathlib
import re
import sys
@@ -38,7 +40,12 @@ from typing import Any, Optional
import lavacli
import yaml
from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCITimeoutError
from lava.exceptions import (
MesaCIException,
MesaCIParseException,
MesaCIRetryError,
MesaCITimeoutError,
)
from lavacli.utils import loader
# Timeout in seconds to decide if the device from the dispatched LAVA job has
@@ -264,20 +271,26 @@ class LAVAJob:
)
return job_state["job_state"] not in waiting_states
def _load_log_from_data(self, data) -> list[str]:
    """Parse the raw log payload and advance the log cursor.

    Args:
        data: Payload returned by the log RPC call; it is converted with
            ``str()`` and parsed as YAML.

    Returns:
        The parsed log entries, or an empty list when the YAML document is
        empty (i.e. there is no new log data).
    """
    # NOTE(review): parsing relies on lavacli's ``loader(False)``; confirm
    # it is a safe loader, since the payload comes from a remote service.
    parsed = yaml.load(str(data), Loader=loader(False))
    if not parsed:
        # Empty YAML means nothing new arrived; leave the cursor untouched.
        return []

    # Non-empty log data shows that the device is still alive.
    self.heartbeat()
    self.last_log_line += len(parsed)
    return parsed
def get_logs(self) -> list[str]:
    """Fetch the log lines produced since the last successful call.

    Requests the job log starting at ``self.last_log_line``, records the
    ``finished`` flag returned by the RPC in ``self.is_finished`` and hands
    the payload to ``_load_log_from_data`` for parsing.

    Returns:
        The newly fetched log entries (possibly an empty list).

    Raises:
        MesaCIParseException: If the RPC call or the parsing of its payload
            raises any exception; the original error is chained as the
            cause.
    """
    try:
        (finished, data) = _call_proxy(
            self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line
        )
        self.is_finished = finished
        return self._load_log_from_data(data)
    except Exception as mesa_ci_err:
        # Surface a dedicated exception type so callers can single out
        # log-fetching failures (e.g. to retry them).
        raise MesaCIParseException(
            f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
        ) from mesa_ci_err
@@ -381,7 +394,15 @@ def fetch_logs(job, max_idle_time):
time.sleep(LOG_POLLING_TIME_SEC)
new_lines = job.get_logs()
# The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
# Retry the log fetching several times before exposing the error.
for _ in range(5):
with contextlib.suppress(MesaCIParseException):
new_lines = job.get_logs()
break
else:
raise MesaCIParseException
parsed_lines = parse_lava_lines(new_lines)
for line in parsed_lines: