Sporadically the a6xx GPU will fail to recover, causing the LAVA job a660_vk_full to loop on error messages for three hours before timing out. A few sporadic error messages may still be recoverable, but when multiple errors occur over a short period, successful recovery is unlikely. Parse the logs for repeated error messages within a short time period; if found, cancel the LAVA job and rerun it. Also add unit tests for this behaviour.

cc: mesa-stable
Reported-by: Valentine Burley <valentine.burley@gmail.com>
Acked-by: Daniel Stone <daniel.stone@collabora.com>
Reviewed-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Signed-off-by: Deborah Brouwer <deborah.brouwer@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30032>
from __future__ import annotations

import re
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional, Sequence

if TYPE_CHECKING:
    from lava.utils import LogFollower

from lava.exceptions import MesaCIKnownIssueException
from lava.utils.console_format import CONSOLE_LOG
from lava.utils.constants import (
    KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
    LOG_DEBUG_FEEDBACK_NOISE,
    KNOWN_ISSUE_R8152_PATTERNS,
    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
)
from lava.utils.log_section import LogSectionType


def search_known_issue_patterns(patterns: Sequence[str], line: str) -> str:
    for pattern in patterns:
        if re.search(pattern, line):
            return pattern
    return ""


@dataclass
class LAVALogHints:
    log_follower: LogFollower
    r8152_issue_consecutive_counter: int = field(default=0, init=False)
    reboot_counter: int = field(default=0, init=False)
    a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False)
    a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False)

    def raise_known_issue(self, message) -> None:
        raise MesaCIKnownIssueException(
            "Found known issue: "
            f"{CONSOLE_LOG['FG_MAGENTA']}"
            f"{message}"
            f"{CONSOLE_LOG['RESET']}"
        )

    def detect_failure(self, new_lines: list[dict[str, Any]]):
        for line in new_lines:
            if line["msg"] == LOG_DEBUG_FEEDBACK_NOISE:
                continue
            self.detect_r8152_issue(line)
            self.detect_forced_reboot(line)
            self.detect_a6xx_gpu_recovery_failure(line)

    def detect_r8152_issue(self, line):
        if self.log_follower.phase in (
            LogSectionType.LAVA_BOOT,
            LogSectionType.TEST_CASE,
        ) and line["lvl"] in ("feedback", "target"):
            if search_known_issue_patterns(KNOWN_ISSUE_R8152_PATTERNS, line["msg"]):
                if (
                    self.r8152_issue_consecutive_counter
                    < KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER
                ):
                    self.r8152_issue_consecutive_counter += 1
                    return

                self.raise_known_issue(
                    "Probable network issue failure encountered, retrying the job"
                )

        # Reset the status, as the `nfs... still trying` complaint was not detected
        self.r8152_issue_consecutive_counter = 0

    def detect_forced_reboot(self, line: dict[str, Any]) -> None:
        if (
            self.log_follower.phase == LogSectionType.TEST_CASE
            and line["lvl"] == "feedback"
        ):
            if re.search(r"^Reboot requested", line["msg"]):
                self.reboot_counter += 1

                if self.reboot_counter > 0:
                    self.raise_known_issue(
                        "Forced reboot detected during test phase, failing the job..."
                    )

    # If the a6xx gpu repeatedly fails to recover over a short period of time,
    # then successful recovery is unlikely so cancel the job preemptively.
    def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None:
        if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]):
            time_of_failure = datetime.fromisoformat(line["dt"])
            self.a6xx_gpu_recovery_fail_counter += 1

            if self.a6xx_gpu_first_fail_time is None:
                self.a6xx_gpu_first_fail_time = time_of_failure

            if self.a6xx_gpu_recovery_fail_counter == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT:
                time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time
                if time_since_first_fail <= timedelta(
                    minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
                ):
                    self.raise_known_issue(
                        "Repeated GPU recovery failure detected: cancelling the job"
                    )
                else:
                    self.a6xx_gpu_first_fail_time = None
                    self.a6xx_gpu_recovery_fail_counter = 0
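
The commit message above also calls for unit tests of this behaviour. Below is a minimal pytest-style sketch of what such a test could look like, assuming this file is importable as lava.utils.lava_log_hints (consistent with the other lava.utils imports) and that the Mesa bin/ci helpers are on PYTHONPATH; the "gpu fault" message string and the threshold values are patched-in assumptions for illustration, not the real patterns and limits defined in lava.utils.constants.

# Hedged unit-test sketch for detect_a6xx_gpu_recovery_failure(); the module
# path, the "gpu fault" message, and the threshold values below are assumptions
# made for illustration only.
from datetime import datetime, timedelta
from unittest.mock import MagicMock

import pytest

import lava.utils.lava_log_hints as lava_log_hints
from lava.exceptions import MesaCIKnownIssueException
from lava.utils.lava_log_hints import LAVALogHints


@pytest.fixture
def log_hints(monkeypatch):
    # Patch the constants the detector reads at call time, so the test does not
    # depend on the exact values defined in lava.utils.constants.
    monkeypatch.setattr(lava_log_hints, "A6XX_GPU_RECOVERY_FAILURE_MESSAGE", [r"gpu fault"])
    monkeypatch.setattr(lava_log_hints, "A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT", 3)
    monkeypatch.setattr(lava_log_hints, "A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN", 5)
    return LAVALogHints(log_follower=MagicMock())


def fake_line(msg: str, when: datetime) -> dict:
    return {"msg": msg, "lvl": "feedback", "dt": when.isoformat()}


def test_repeated_failures_within_watch_period_cancel_the_job(log_hints):
    start = datetime(2024, 1, 1, 12, 0, 0)
    # Three matching messages one minute apart fall inside the watch period,
    # so the known-issue exception should be raised on the last one.
    with pytest.raises(MesaCIKnownIssueException):
        for i in range(3):
            log_hints.detect_a6xx_gpu_recovery_failure(
                fake_line("gpu fault", start + timedelta(minutes=i))
            )


def test_failures_spread_past_watch_period_reset_the_counter(log_hints):
    start = datetime(2024, 1, 1, 12, 0, 0)
    # The same number of failures spread over 20 minutes exceeds the watch
    # period, so the detector resets its state instead of raising.
    for i in range(3):
        log_hints.detect_a6xx_gpu_recovery_failure(
            fake_line("gpu fault", start + timedelta(minutes=10 * i))
        )
    assert log_hints.a6xx_gpu_recovery_fail_counter == 0
    assert log_hints.a6xx_gpu_first_fail_time is None

The second test exercises the else branch that clears the counter and the first-failure timestamp once the matching messages are spread out past the watch period, which is what distinguishes a recoverable sporadic fault from the repeated-failure case the job should be cancelled for.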