From 72c182f873bd0b6c98018cc9f15d2c3bac7c06cb Mon Sep 17 00:00:00 2001 From: Deborah Brouwer Date: Fri, 5 Jul 2024 15:28:24 -0700 Subject: [PATCH] ci/lava: Detect a6xx gpu recovery failures Sporadically a6xx gpu will fail to recover causing the lava job a660_vk_full to loop on error messages for three hours before timing out. A few sporadic error messages may still be recoverable, but when multiple errors occur over a short period, successful recovery is unlikely. Parse the logs to look for repeated error messages within a short time period. If found, cancel the lava job and rerun it. Also add unit tests for this behaviour. cc: mesa-stable Reported-by: Valentine Burley Acked-by: Daniel Stone Reviewed-by: Guilherme Gallo Signed-off-by: Deborah Brouwer Part-of: --- .gitlab-ci/lava/utils/constants.py | 7 ++++ .gitlab-ci/lava/utils/lava_log_hints.py | 29 ++++++++++++++- .gitlab-ci/tests/utils/test_lava_log.py | 49 ++++++++++++++++++++++++- 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci/lava/utils/constants.py b/.gitlab-ci/lava/utils/constants.py index 8a688fb04d3..82f0b66fc7e 100644 --- a/.gitlab-ci/lava/utils/constants.py +++ b/.gitlab-ci/lava/utils/constants.py @@ -23,3 +23,10 @@ KNOWN_ISSUE_R8152_PATTERNS: tuple[str, ...] = ( # This is considered noise, since LAVA produces this log after receiving a package of feedback # messages. LOG_DEBUG_FEEDBACK_NOISE = "Listened to connection for namespace 'dut' done" + +A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN = 3 +A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT = 30 +A6XX_GPU_RECOVERY_FAILURE_MESSAGE = ( + "cx gdsc didn't collapse", + "Timeout waiting for GMU OOB", +) diff --git a/.gitlab-ci/lava/utils/lava_log_hints.py b/.gitlab-ci/lava/utils/lava_log_hints.py index 04c158eb34b..3a35ae97c3c 100644 --- a/.gitlab-ci/lava/utils/lava_log_hints.py +++ b/.gitlab-ci/lava/utils/lava_log_hints.py @@ -1,8 +1,9 @@ from __future__ import annotations import re +from datetime import datetime, timedelta from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any, Optional, Sequence if TYPE_CHECKING: from lava.utils import LogFollower @@ -13,6 +14,9 @@ from lava.utils.constants import ( KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER, LOG_DEBUG_FEEDBACK_NOISE, KNOWN_ISSUE_R8152_PATTERNS, + A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN, + A6XX_GPU_RECOVERY_FAILURE_MESSAGE, + A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT, ) from lava.utils.log_section import LogSectionType @@ -29,6 +33,8 @@ class LAVALogHints: log_follower: LogFollower r8152_issue_consecutive_counter: int = field(default=0, init=False) reboot_counter: int = field(default=0, init=False) + a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False) + a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False) def raise_known_issue(self, message) -> None: raise MesaCIKnownIssueException( @@ -44,6 +50,7 @@ class LAVALogHints: continue self.detect_r8152_issue(line) self.detect_forced_reboot(line) + self.detect_a6xx_gpu_recovery_failure(line) def detect_r8152_issue(self, line): if self.log_follower.phase in ( @@ -77,3 +84,23 @@ class LAVALogHints: self.raise_known_issue( "Forced reboot detected during test phase, failing the job..." ) + + # If the a6xx gpu repeatedly fails to recover over a short period of time, + # then successful recovery is unlikely so cancel the job preemptively. + def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None: + if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]): + time_of_failure = datetime.fromisoformat(line["dt"]) + self.a6xx_gpu_recovery_fail_counter += 1 + + if self.a6xx_gpu_first_fail_time is None: + self.a6xx_gpu_first_fail_time = time_of_failure + + if self.a6xx_gpu_recovery_fail_counter == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT: + time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time + if time_since_first_fail <= timedelta(minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN): + self.raise_known_issue( + "Repeated GPU recovery failure detected: cancelling the job" + ) + else: + self.a6xx_gpu_first_fail_time = None + self.a6xx_gpu_recovery_fail_counter = 0 diff --git a/.gitlab-ci/tests/utils/test_lava_log.py b/.gitlab-ci/tests/utils/test_lava_log.py index 7bf5510a610..18a0af4e046 100644 --- a/.gitlab-ci/tests/utils/test_lava_log.py +++ b/.gitlab-ci/tests/utils/test_lava_log.py @@ -16,7 +16,13 @@ from lava.utils import ( fix_lava_gitlab_section_log, hide_sensitive_data, ) -from lava.utils.constants import KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER +from lava.utils.constants import ( + KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER, + A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN, + A6XX_GPU_RECOVERY_FAILURE_MESSAGE, + A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT, +) +from lava.utils.lava_log_hints import LAVALogHints from ..lava.helpers import ( create_lava_yaml_msg, @@ -390,3 +396,44 @@ def test_detect_failure(messages, expectation): lf = LogFollower(starting_section=boot_section) with expectation: lf.feed(messages) + +def test_detect_a6xx_gpu_recovery_failure(frozen_time): + log_follower = LogFollower() + lava_log_hints = LAVALogHints(log_follower=log_follower) + failure_message = { + "dt": datetime.now().isoformat(), + "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0], + "lvl": "feedback", + } + with pytest.raises(MesaCIKnownIssueException): + for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT): + lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message) + # Simulate the passage of time within the watch period + frozen_time.tick(1) + failure_message["dt"] = datetime.now().isoformat() + +def test_detect_a6xx_gpu_recovery_success(frozen_time): + log_follower = LogFollower() + lava_log_hints = LAVALogHints(log_follower=log_follower) + failure_message = { + "dt": datetime.now().isoformat(), + "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0], + "lvl": "feedback", + } + # Simulate sending a tolerable number of failure messages + for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT - 1): + lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message) + frozen_time.tick(1) + failure_message["dt"] = datetime.now().isoformat() + + # Simulate the passage of time outside of the watch period + frozen_time.tick(60 * A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN + 1) + failure_message = { + "dt": datetime.now().isoformat(), + "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[1], + "lvl": "feedback", + } + with does_not_raise(): + lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message) + assert lava_log_hints.a6xx_gpu_first_fail_time is None, "a6xx_gpu_first_fail_time is not None" + assert lava_log_hints.a6xx_gpu_recovery_fail_counter == 0, "a6xx_gpu_recovery_fail_counter is not 0"