ci/lava: Follow job execution via LogFollower

LogFollower is now used to process the LAVA logs. This commit also adds a timeout for each GitLab section: if a section takes longer than expected, the job is cancelled and retried. Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16323>
2022-04-04 11:26:17 -03:00
parent 2569d7d7df
commit aa26a6ab72
5 changed files with 300 additions and 62 deletions
--- a/.gitlab-ci/tests/lava/helpers.py
+++ b/.gitlab-ci/tests/lava/helpers.py
@@ -1,10 +1,23 @@
 from contextlib import nullcontext as does_not_raise
-from datetime import datetime, timedelta
+from datetime import datetime
 from itertools import cycle
 from typing import Callable, Generator, Iterable, Tuple, Union

 import yaml
 from freezegun import freeze_time
+from lava.utils.lava_log import (
+    DEFAULT_GITLAB_SECTION_TIMEOUTS,
+    FALLBACK_GITLAB_SECTION_TIMEOUT,
+    LogSectionType,
+)
+
+
+def section_timeout(section_type: LogSectionType) -> int:
+    """Return the timeout of ``section_type`` as a whole number of seconds.
+
+    The duration is looked up in DEFAULT_GITLAB_SECTION_TIMEOUTS; section
+    types without an entry use FALLBACK_GITLAB_SECTION_TIMEOUT instead.
+    """
+    duration = DEFAULT_GITLAB_SECTION_TIMEOUTS.get(
+        section_type, FALLBACK_GITLAB_SECTION_TIMEOUT
+    )
+    return int(duration.total_seconds())


 def create_lava_yaml_msg(
@@ -21,8 +34,6 @@ def generate_testsuite_result(
    if extra is None:
        extra = {}
    return {"metadata": {"result": result, **metadata_extra}, "name": name}
-
-
 def jobs_logs_response(
    finished=False, msg=None, lvl="target", result=None
 ) -> Tuple[bool, str]:
@@ -36,6 +47,19 @@ def jobs_logs_response(
    return finished, yaml.safe_dump(logs)


+def section_aware_message_generator(
+    messages: dict[LogSectionType, Iterable[int]]
+) -> Iterable[tuple[dict, Iterable[int]]]:
+    """Yield a ``(signal message, ticks)`` pair for every LogSectionType.
+
+    ``messages`` maps a section type to the ticks to pair with its signal;
+    section types missing from the mapping are paired with ``[1]``.
+    """
+    # One shared fallback list, handed out for every unlisted section type.
+    fallback_ticks = [1]
+    for section in LogSectionType:
+        ticks = messages.get(section, fallback_ticks)
+        signal = mock_lava_signal(section)
+        yield signal, ticks
+
+
+def message_generator():
+    """Yield the mocked LAVA signal message of each LogSectionType in turn."""
+    yield from map(mock_lava_signal, LogSectionType)
+

 def level_generator():
    # Tests all known levels by default
@@ -80,3 +104,28 @@ def to_iterable(tick_fn):
    else:
        tick_gen = cycle((tick_fn,))
    return tick_gen
+
+
+def mock_logs(messages=None, result="pass"):
+    """Yield mocked LAVA log responses that simulate a complete job run.
+
+    Time is frozen at the moment iteration starts. For every section type,
+    the section's signal message is yielded once per entry of its tick list
+    and the frozen clock is advanced by that entry (in seconds) right after.
+    A final response marks the job as finished.
+
+    :param messages: mapping of LogSectionType to a list of ticks, forwarded
+        to section_aware_message_generator; ``None`` means an empty mapping.
+    :param result: result reported by the final, finished response.
+    """
+    # Use a None sentinel so calls never share one mutable default dict.
+    if messages is None:
+        messages = {}
+    with freeze_time(datetime.now()) as time_travel:
+        # Simulate a complete run: one signal per section, repeated per tick
+        for msg, tick_list in section_aware_message_generator(messages):
+            for tick_sec in tick_list:
+                yield jobs_logs_response(finished=False, msg=[msg])
+                time_travel.tick(tick_sec)
+
+        # Honour the caller's result instead of a hard-coded "pass"
+        yield jobs_logs_response(finished=True, result=result)
+
+
+def mock_lava_signal(type: LogSectionType) -> dict[str, str]:
+    """Return the mocked LAVA log message associated with a section type.
+
+    TEST_CASE, TEST_SUITE and LAVA_POST_PROCESSING each have a dedicated
+    message; any other section type gets ``create_lava_yaml_msg()`` built
+    with its default arguments.
+    """
+    signal_by_section = {
+        LogSectionType.TEST_CASE: create_lava_yaml_msg(
+            msg="<STARTTC> case", lvl="debug"
+        ),
+        LogSectionType.TEST_SUITE: create_lava_yaml_msg(
+            msg="<STARTRUN> suite", lvl="debug"
+        ),
+        LogSectionType.LAVA_POST_PROCESSING: create_lava_yaml_msg(
+            msg="<LAVA_SIGNAL_ENDTC case>", lvl="target"
+        ),
+    }
+    fallback_signal = create_lava_yaml_msg()
+    return signal_by_section.get(type, fallback_signal)
--- a/.gitlab-ci/tests/test_lava_job_submitter.py
+++ b/.gitlab-ci/tests/test_lava_job_submitter.py
@@ -36,12 +36,15 @@ from lava.lava_job_submitter import (
    follow_job_execution,
    retriable_follow_job,
 )
+from lava.utils.lava_log import LogSectionType

 from .lava.helpers import (
    create_lava_yaml_msg,
    generate_n_logs,
    generate_testsuite_result,
    jobs_logs_response,
+    mock_logs,
+    section_timeout,
 )

 NUMBER_OF_MAX_ATTEMPTS = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1
@@ -74,17 +77,43 @@ XMLRPC_FAULT = xmlrpc.client.Fault(0, "test")

 PROXY_SCENARIOS = {
    "finish case": (generate_n_logs(1), does_not_raise(), True, {}),
-    "works at last retry": (
-        generate_n_logs(n=NUMBER_OF_MAX_ATTEMPTS, tick_fn=[ DEVICE_HANGING_TIMEOUT_SEC + 1 ] * NUMBER_OF_RETRIES_TIMEOUT_DETECTION + [1]),
+    "boot works at last retry": (
+        mock_logs(
+            {
+                LogSectionType.LAVA_BOOT: [
+                    section_timeout(LogSectionType.LAVA_BOOT) + 1
+                ]
+                * NUMBER_OF_RETRIES_TIMEOUT_DETECTION
+                + [1]
+            },
+        ),
        does_not_raise(),
        True,
        {},
    ),
-    "timed out more times than retry attempts": (
-        generate_n_logs(
-            n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1
+    "post process test case took too long": pytest.param(
+        mock_logs(
+            {
+                LogSectionType.LAVA_POST_PROCESSING: [
+                    section_timeout(LogSectionType.LAVA_POST_PROCESSING) + 1
+                ]
+                * (NUMBER_OF_MAX_ATTEMPTS + 1)
+            },
        ),
        pytest.raises(MesaCIRetryError),
+        True,
+        {},
+        marks=pytest.mark.xfail(
+            reason=(
+                "The time travel mock is not behaving as expected. "
+                "It makes a gitlab section end in the past when an "
+                "exception happens."
+            )
+        ),
+    ),
+    "timed out more times than retry attempts": (
+        generate_n_logs(n=4, tick_fn=9999999),
+        pytest.raises(MesaCIRetryError),
        False,
        {},
    ),
@@ -150,15 +179,20 @@ PROXY_SCENARIOS = {


@pytest.mark.parametrize(
-    "side_effect, expectation, job_result, proxy_args",
+    "test_log, expectation, job_result, proxy_args",
    PROXY_SCENARIOS.values(),
    ids=PROXY_SCENARIOS.keys(),
 )
 def test_retriable_follow_job(
-    mock_sleep, side_effect, expectation, job_result, proxy_args, mock_proxy
+    mock_sleep,
+    test_log,
+    expectation,
+    job_result,
+    proxy_args,
+    mock_proxy,
 ):
    with expectation:
-        proxy = mock_proxy(side_effect=side_effect, **proxy_args)
+        proxy = mock_proxy(side_effect=test_log, **proxy_args)
        job: LAVAJob = retriable_follow_job(proxy, "")
        assert job_result == (job.status == "pass")

@@ -196,6 +230,7 @@ def test_simulate_a_long_wait_to_start_a_job(
    assert delta_time.total_seconds() >= wait_time


+
 CORRUPTED_LOG_SCENARIOS = {
    "too much subsequent corrupted data": (
        [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)],
--- a/.gitlab-ci/tests/utils/test_lava_log.py
+++ b/.gitlab-ci/tests/utils/test_lava_log.py
@@ -22,13 +22,15 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from datetime import datetime
+from datetime import datetime, timedelta

 import pytest
 import yaml
+from lava.exceptions import MesaCITimeoutError
 from lava.utils.lava_log import (
    GitlabSection,
    LogFollower,
+    LogSectionType,
    filter_debug_messages,
    fix_lava_color_log,
    fix_lava_gitlab_section_log,
@@ -66,8 +68,14 @@ GITLAB_SECTION_SCENARIOS = {
    ids=GITLAB_SECTION_SCENARIOS.keys(),
 )
 def test_gitlab_section(method, collapsed, expectation):
-    gs = GitlabSection(id="my_first_section", header="my_header", start_collapsed=collapsed)
-    gs.get_timestamp = lambda: "mock_date"
+    gs = GitlabSection(
+        id="my_first_section",
+        header="my_header",
+        type=LogSectionType.TEST_CASE,
+        start_collapsed=collapsed,
+    )
+    gs.get_timestamp = lambda x: "mock_date"
+    gs.start()
    result = getattr(gs, method)()
    assert result == expectation

@@ -274,3 +282,49 @@ LAVA_DEBUG_SPAM_MESSAGES = {
 )
 def test_filter_debug_messages(message, expectation):
    assert filter_debug_messages(message) == expectation
+
+
+# Offset applied to the section timeout, paired with the expected outcome.
+WATCHDOG_SCENARIOS = {
+    "1 second before timeout": ({"seconds": -1}, does_not_raise()),
+    "1 second after timeout": ({"seconds": 1}, pytest.raises(MesaCITimeoutError)),
+}
+
+
+@pytest.mark.parametrize(
+    "timedelta_kwargs, exception",
+    WATCHDOG_SCENARIOS.values(),
+    ids=WATCHDOG_SCENARIOS.keys(),
+)
+def test_log_follower_watchdog(frozen_time, timedelta_kwargs, exception):
+    """Feed a LogFollower again just before/after the TEST_CASE timeout.
+
+    The clock is moved to the one-minute TEST_CASE timeout plus the scenario
+    offset; the second feed must then match the scenario's expectation.
+    """
+    # presumably this <STARTTC> signal opens a TEST_CASE section -- confirm
+    start_signal = {
+        "dt": datetime.now(),
+        "lvl": "debug",
+        "msg": "Received signal: <STARTTC> mesa-ci_iris-kbl-traces",
+    }
+    timeouts = {LogSectionType.TEST_CASE: timedelta(minutes=1)}
+    follower = LogFollower(timeout_durations=timeouts)
+    follower.feed([start_signal])
+    offset = timedelta(**timedelta_kwargs)
+    frozen_time.tick(follower.timeout_durations[LogSectionType.TEST_CASE] + offset)
+    # Build the follow-up line only after the clock has moved.
+    follow_up = [create_lava_yaml_msg()]
+    with exception:
+        follower.feed(follow_up)
+
+
+# (name given to the section, id the section is expected to expose)
+GITLAB_SECTION_ID_SCENARIOS = [
+    ("a-good_name", "a-good_name"),
+    ("spaces are not welcome", "spaces-are-not-welcome"),
+    ("abc:amd64 1/3", "abc-amd64-1-3"),
+]
+
+
+@pytest.mark.parametrize("case_name, expected_id", GITLAB_SECTION_ID_SCENARIOS)
+def test_gitlab_section_id(case_name, expected_id):
+    """Check the id a GitlabSection exposes for a given raw name."""
+    section = GitlabSection(
+        id=case_name,
+        header=case_name,
+        type=LogSectionType.LAVA_POST_PROCESSING,
+    )
+    assert section.id == expected_id