Files
mesa/.gitlab-ci/tests/utils/test_lava_log.py
Guilherme Gallo 9024e0df83 ci/lava: Don't fail if the section times mismatch
Time drift can occur during LAVA job execution due to transitions
between three different clocks.

The process begins in the GitLab job [1], using the CI_JOB_STARTED_AT
variable. If SSH is enabled, we then connect to the DUT through an
Alpine-based SSH client container inside the LAVA dispatcher [2], where
some GitLab-related steps are timestamped by lava_job_submitter.
Finally, the DUT [3] runs and uses the setup-test-env.sh helper to
handle GitLab sections, potentially using a third distinct clock.

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35222>
2025-06-05 22:32:37 +00:00

503 lines
16 KiB
Python

#!/usr/bin/env python3
#
# Copyright (C) 2022 Collabora Limited
# Author: Guilherme Gallo <guilherme.gallo@collabora.com>
#
# SPDX-License-Identifier: MIT
from datetime import UTC, datetime, timedelta
import pytest
from lava.exceptions import MesaCIKnownIssueException, MesaCITimeoutError
from lava.utils import (
GitlabSection,
LogFollower,
LogSectionType,
fix_lava_gitlab_section_log,
hide_sensitive_data,
)
from lava.utils.constants import (
KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
)
from lava.utils.lava_log_hints import LAVALogHints
from ..lava.helpers import (
create_lava_yaml_msg,
does_not_raise,
lava_yaml,
mock_lava_signal,
yaml_dump,
)
# Expected GitLab collapsible-section control sequences, keyed by scenario
# name: (section method to call, start_collapsed flag, expected output).
# The timestamp is mocked to the literal "mock_date" by the test.
# NOTE: the first segment of each expected string has no placeholders, so it
# carries no f-prefix (ruff F541); only the colour segment is an f-string.
GITLAB_SECTION_SCENARIOS = {
    "start collapsed": (
        "start",
        True,
        "\x1b[0Ksection_start:mock_date:my_first_section[collapsed=true]\r\x1b[0K"
        f"{GitlabSection.colour}my_header\x1b[0m",
    ),
    "start non_collapsed": (
        "start",
        False,
        "\x1b[0Ksection_start:mock_date:my_first_section\r\x1b[0K"
        f"{GitlabSection.colour}my_header\x1b[0m",
    ),
    # section_end markers are identical regardless of the collapsed flag.
    "end collapsed": (
        "end",
        True,
        "\x1b[0Ksection_end:mock_date:my_first_section\r\x1b[0K",
    ),
    "end non_collapsed": (
        "end",
        False,
        "\x1b[0Ksection_end:mock_date:my_first_section\r\x1b[0K",
    ),
}
@pytest.mark.parametrize(
    "method, collapsed, expectation",
    GITLAB_SECTION_SCENARIOS.values(),
    ids=GITLAB_SECTION_SCENARIOS.keys(),
)
def test_gitlab_section(method, collapsed, expectation):
    """Check that start()/end() emit the exact GitLab section control string."""
    section = GitlabSection(
        id="my_first_section",
        header="my_header",
        type=LogSectionType.TEST_CASE,
        start_collapsed=collapsed,
    )
    # Pin the timestamp so the emitted control sequence is deterministic.
    section.get_timestamp = lambda _: "mock_date"
    section.start()
    assert getattr(section, method)() == expectation
def test_gl_sections():
    """Feed a LAVA log sequence and verify the GitLab sections it produces."""

    def log_line(level: str, message: str) -> dict:
        return {"dt": datetime.now(tz=UTC), "lvl": level, "msg": message}

    lines = [
        log_line(
            "debug",
            "Received signal: <STARTRUN> 0_setup-ssh-server 10145749_1.3.2.3.1",
        ),
        log_line("debug", "Received signal: <STARTRUN> 0_mesa 5971831_1.3.2.3.1"),
        # Redundant log message which triggers the same Gitlab Section, it
        # should be ignored, unless the id is different
        log_line(
            "target", "[ 7.778836] <LAVA_SIGNAL_STARTRUN 0_mesa 5971831_1.3.2.3.1>"
        ),
        log_line("debug", "Received signal: <STARTTC> mesa-ci_iris-kbl-traces"),
        # Another redundant log message
        log_line("target", "[ 16.997829] <LAVA_SIGNAL_STARTTC mesa-ci_iris-kbl-traces>"),
        log_line("target", "<LAVA_SIGNAL_ENDTC mesa-ci_iris-kbl-traces>"),
    ]

    lf = LogFollower()
    with lf:
        for line in lines:
            lf.manage_gl_sections(line)
        parsed_lines = lf.flush()

    section_types = [section.type for section in lf.section_history]

    # Even indices open collapsed sections, odd indices close them.
    expected_markers = (
        "section_start",
        "section_end",
        "section_start",
        "section_end",
        "section_start",
    )
    for index, marker in enumerate(expected_markers):
        assert marker in parsed_lines[index]
        if marker == "section_start":
            assert "collapsed=true" in parsed_lines[index]

    assert section_types == [
        # LogSectionType.LAVA_BOOT, True, if LogFollower started with Boot section
        LogSectionType.TEST_SUITE,
        LogSectionType.TEST_CASE,
        LogSectionType.LAVA_POST_PROCESSING,
    ]
def test_log_follower_flush():
    """flush() must drain buffered lines and produce output again on re-feed."""
    lines = [
        {
            "dt": datetime.now(tz=UTC),
            "lvl": "debug",
            "msg": "Received signal: <STARTTC> mesa-ci_iris-kbl-traces",
        },
        {
            "dt": datetime.now(tz=UTC),
            "lvl": "target",
            "msg": "<LAVA_SIGNAL_ENDTC mesa-ci_iris-kbl-traces>",
        },
    ]
    follower = LogFollower()

    follower.feed(lines)
    first_flush = follower.flush()
    # A second flush with nothing fed in between must come back empty.
    drained = follower.flush()
    follower.feed(lines)
    second_flush = follower.flush()

    assert first_flush
    assert not drained
    assert second_flush
# Scenarios for hide_sensitive_data, keyed by scenario name:
# (input lines, expected output lines, tag words passed to the function).
# NOTE(review): with two tags only the DELETETHISLINE-tagged line is removed
# while the NOTANYMORE-tagged line survives — presumably the tags delimit a
# hide range rather than each marking a line; verify against hide_sensitive_data.
SENSITIVE_DATA_SCENARIOS = {
    "no sensitive data tagged": (
        ["bla bla", "mytoken: asdkfjsde1341=="],
        ["bla bla", "mytoken: asdkfjsde1341=="],
        ["HIDEME"],
    ),
    "sensitive data tagged": (
        ["bla bla", "mytoken: asdkfjsde1341== # HIDEME"],
        ["bla bla"],
        ["HIDEME"],
    ),
    "sensitive data tagged with custom word": (
        ["bla bla", "mytoken: asdkfjsde1341== # DELETETHISLINE", "third line # NOTANYMORE"],
        ["bla bla", "third line # NOTANYMORE"],
        ["DELETETHISLINE", "NOTANYMORE"],
    ),
}
@pytest.mark.parametrize(
    # Renamed "input" -> "input_data": it shadowed the builtin input().
    "input_data, expectation, tags",
    SENSITIVE_DATA_SCENARIOS.values(),
    ids=SENSITIVE_DATA_SCENARIOS.keys(),
)
def test_hide_sensitive_data(input_data, expectation, tags):
    """Round-trip the scenario through YAML and check the filtered result."""
    yaml_data = yaml_dump(input_data)
    yaml_result = hide_sensitive_data(yaml_data, *tags)
    result = lava_yaml.load(yaml_result)

    assert result == expectation
# Scenarios for fix_lava_gitlab_section_log, keyed by scenario name:
# (expected message(s) after processing, raw messages as received from LAVA).
# LAVA can split a GitLab section control sequence at the "\r" into two log
# lines; in the first two cases the split halves should be recoverable, in the
# third the two lines are unrelated and must pass through unchanged.
GITLAB_SECTION_SPLIT_SCENARIOS = {
    "Split section_start at target level": (
        "\x1b[0Ksection_start:1668454947:test_post_process[collapsed=true]\r\x1b[0K"
        "post-processing test results",
        (
            "\x1b[0Ksection_start:1668454947:test_post_process[collapsed=true]",
            "\x1b[0Kpost-processing test results",
        ),
    ),
    "Split section_end at target level": (
        "\x1b[0Ksection_end:1666309222:test_post_process\r\x1b[0K",
        ("\x1b[0Ksection_end:1666309222:test_post_process", "\x1b[0K"),
    ),
    "Second line is not split from the first": (
        ("\x1b[0Ksection_end:1666309222:test_post_process", "Any message"),
        ("\x1b[0Ksection_end:1666309222:test_post_process", "Any message"),
    ),
}
@pytest.mark.parametrize(
    "expected_message, messages",
    GITLAB_SECTION_SPLIT_SCENARIOS.values(),
    ids=GITLAB_SECTION_SPLIT_SCENARIOS.keys(),
)
def test_fix_lava_gitlab_section_log(expected_message, messages):
    """Drive the fix_lava_gitlab_section_log coroutine line by line and check
    that the expected (possibly reconstructed) message appears in its output.
    """
    fixed_messages: list = []
    gen = fix_lava_gitlab_section_log()
    # Prime the coroutine so it is ready to receive log lines via send().
    next(gen)

    for message in messages:
        lava_log = create_lava_yaml_msg(msg=message, lvl="target")
        # send() returns a truthy recovered line when the coroutine decides a
        # previously-buffered split marker can now be emitted.
        if recovered_line := gen.send(lava_log):
            fixed_messages.append((recovered_line, lava_log["msg"]))

        # Every raw message is collected as well, so pass-through scenarios
        # can also be matched by the membership assert below.
        fixed_messages.append(lava_log["msg"])

    assert expected_message in fixed_messages
@pytest.mark.parametrize(
    "expected_message, messages",
    GITLAB_SECTION_SPLIT_SCENARIOS.values(),
    ids=GITLAB_SECTION_SPLIT_SCENARIOS.keys(),
)
def test_lava_gitlab_section_log_collabora(expected_message, messages, monkeypatch):
    """Check if LogFollower does not change the message if we are running in Collabora farm."""
    monkeypatch.setenv("RUNNER_TAG", "mesa-ci-x86_64-lava-test")
    follower = LogFollower()
    for message in messages:
        follower.feed([create_lava_yaml_msg(msg=message)])
    flushed = follower.flush()
    # Normalize to the scenario shape: a tuple for multi-line output, the bare
    # string for single-line output.
    if len(flushed) > 1:
        flushed = tuple(flushed)
    else:
        flushed = flushed[0]
    assert flushed == expected_message
# Scenarios for carriage-return handling, keyed by scenario name:
# (expected flushed messages, raw messages fed to LogFollower).
# A line ending in a bare "\r" is merged with the following line; a trailing
# "\r\n" is stripped from the message.
CARRIAGE_RETURN_SCENARIOS = {
    "Carriage return at the end of the previous line": (
        (
            "\x1b[0Ksection_start:1677609903:test_setup[collapsed=true]\r\x1b[0K\x1b[0;36m[303:44] "
            "deqp: preparing test setup\x1b[0m",
        ),
        (
            "\x1b[0Ksection_start:1677609903:test_setup[collapsed=true]\r",
            "\x1b[0K\x1b[0;36m[303:44] deqp: preparing test setup\x1b[0m\r\n",
        ),
    ),
    "Newline at the end of the line": (
        ("\x1b[0K\x1b[0;36m[303:44] deqp: preparing test setup\x1b[0m", "log"),
        ("\x1b[0K\x1b[0;36m[303:44] deqp: preparing test setup\x1b[0m\r\n", "log"),
    ),
}
@pytest.mark.parametrize(
    "expected_message, messages",
    CARRIAGE_RETURN_SCENARIOS.values(),
    ids=CARRIAGE_RETURN_SCENARIOS.keys(),
)
def test_lava_log_merge_carriage_return_lines(expected_message, messages):
    """Lines ending in a carriage return must be merged with the next line."""
    follower = LogFollower()
    for message in messages:
        follower.feed([create_lava_yaml_msg(msg=message)])
    assert tuple(follower.flush()) == expected_message
# Watchdog scenarios: the timedelta kwargs added on top of the TEST_CASE
# timeout, and whether feeding a line afterwards is expected to raise
# MesaCITimeoutError.
WATCHDOG_SCENARIOS = {
    "1 second before timeout": ({"seconds": -1}, does_not_raise()),
    "1 second after timeout": ({"seconds": 1}, pytest.raises(MesaCITimeoutError)),
}
@pytest.mark.parametrize(
    "timedelta_kwargs, exception",
    WATCHDOG_SCENARIOS.values(),
    ids=WATCHDOG_SCENARIOS.keys(),
)
def test_log_follower_watchdog(frozen_time, timedelta_kwargs, exception):
    """Crossing the section timeout must make the next feed() raise."""
    start_line = {
        "dt": datetime.now(tz=UTC),
        "lvl": "debug",
        "msg": "Received signal: <STARTTC> mesa-ci_iris-kbl-traces",
    }
    timeouts = {LogSectionType.TEST_CASE: timedelta(minutes=1)}
    follower = LogFollower(timeout_durations=timeouts)
    follower.feed([start_line])

    # Advance the frozen clock to just before/after the TEST_CASE timeout.
    drift = timedelta(**timedelta_kwargs)
    frozen_time.tick(follower.timeout_durations[LogSectionType.TEST_CASE] + drift)

    with exception:
        follower.feed([create_lava_yaml_msg()])
# (raw section name, expected sanitized GitLab section id) pairs: the data
# shows spaces, colons and slashes being replaced with hyphens.
GITLAB_SECTION_ID_SCENARIOS = [
    ("a-good_name", "a-good_name"),
    ("spaces are not welcome", "spaces-are-not-welcome"),
    ("abc:amd64 1/3", "abc-amd64-1-3"),
]
@pytest.mark.parametrize("case_name, expected_id", GITLAB_SECTION_ID_SCENARIOS)
def test_gitlab_section_id(case_name, expected_id):
    """GitlabSection must sanitize its id into a GitLab-safe identifier."""
    section = GitlabSection(
        id=case_name,
        header=case_name,
        type=LogSectionType.LAVA_POST_PROCESSING,
    )
    assert section.id == expected_id
def a618_network_issue_logs(level: str = "target") -> list:
    """Build the a618 r8152 known-issue log sequence: the maximum tolerated
    run of consecutive Tx-error messages followed by an NFS timeout message.
    """
    tx_error = create_lava_yaml_msg(
        msg="[ 1733.599402] r8152 2-1.3:1.0 eth0: Tx status -71",
        lvl=level,
    )
    nfs_timeout = create_lava_yaml_msg(
        msg="[ 1733.604506] nfs: server 192.168.201.1 not responding, still trying",
        lvl=level,
    )
    return [tx_error] * KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER + [nfs_timeout]
# LAVA signal marking the transition into the test-case phase.
TEST_PHASE_LAVA_SIGNAL = mock_lava_signal(LogSectionType.TEST_CASE)
# Known-issue sequences for the boot phase ("feedback" level) and the test
# phase ("target" level, preceded by the test-phase signal).
A618_NET_ISSUE_BOOT = a618_network_issue_logs(level="feedback")
A618_NET_ISSUE_TEST = [TEST_PHASE_LAVA_SIGNAL, *a618_network_issue_logs(level="target")]

# Scenarios, keyed by name: (log messages to feed, expected raise context).
# Only a full run of consecutive R8152 errors plus the NFS message should
# trigger MesaCIKnownIssueException; partial prefixes must pass.
A618_NETWORK_ISSUE_SCENARIOS = {
    "Fail - R8152 kmsg during boot phase": (
        A618_NET_ISSUE_BOOT,
        pytest.raises(MesaCIKnownIssueException),
    ),
    "Fail - R8152 kmsg during test phase": (
        A618_NET_ISSUE_TEST,
        pytest.raises(MesaCIKnownIssueException),
    ),
    "Pass - Partial (1) R8152 kmsg during test phase": (
        A618_NET_ISSUE_TEST[:1],
        does_not_raise(),
    ),
    "Pass - Partial (2) R8152 kmsg during test phase": (
        A618_NET_ISSUE_TEST[:2],
        does_not_raise(),
    ),
    "Pass - Partial (3) subsequent R8152 kmsg during test phase": (
        [
            TEST_PHASE_LAVA_SIGNAL,
            A618_NET_ISSUE_TEST[1],
            A618_NET_ISSUE_TEST[1],
        ],
        does_not_raise(),
    ),
    "Pass - Partial (4) subsequent nfs kmsg during test phase": (
        [
            TEST_PHASE_LAVA_SIGNAL,
            A618_NET_ISSUE_TEST[-1],
            A618_NET_ISSUE_TEST[-1],
        ],
        does_not_raise(),
    ),
}
@pytest.mark.parametrize(
    "messages, expectation",
    A618_NETWORK_ISSUE_SCENARIOS.values(),
    ids=A618_NETWORK_ISSUE_SCENARIOS.keys(),
)
def test_detect_failure(messages, expectation):
    """Feed known-issue log patterns and check whether LogFollower flags them."""
    boot_section = GitlabSection(
        id="dut_boot",
        header="Booting hardware device",
        type=LogSectionType.LAVA_BOOT,
        start_collapsed=True,
    )
    boot_section.start()

    follower = LogFollower(starting_section=boot_section)
    with expectation:
        follower.feed(messages)
def test_detect_a6xx_gpu_recovery_failure(frozen_time):
    """Hitting the max a6xx recovery-failure count within the watch period
    must raise MesaCIKnownIssueException."""
    hints = LAVALogHints(log_follower=LogFollower())
    failure_message = {
        "dt": datetime.now(tz=UTC).isoformat(),
        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0],
        "lvl": "feedback",
    }
    with pytest.raises(MesaCIKnownIssueException):
        for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT):
            hints.detect_a6xx_gpu_recovery_failure(failure_message)
            # Simulate the passage of time within the watch period
            frozen_time.tick(1)
            failure_message["dt"] = datetime.now(tz=UTC).isoformat()
def test_detect_a6xx_gpu_recovery_success(frozen_time):
    """Counters must reset once the watch period elapses before the maximum
    failure count is reached."""
    hints = LAVALogHints(log_follower=LogFollower())

    def feedback_msg(text: str) -> dict:
        return {
            "dt": datetime.now(tz=UTC).isoformat(),
            "msg": text,
            "lvl": "feedback",
        }

    failure_message = feedback_msg(A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0])
    # Simulate sending a tolerable number of failure messages
    for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT - 1):
        hints.detect_a6xx_gpu_recovery_failure(failure_message)
        frozen_time.tick(1)
        failure_message["dt"] = datetime.now(tz=UTC).isoformat()

    # Simulate the passage of time outside of the watch period
    frozen_time.tick(60 * A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN + 1)

    late_message = feedback_msg(A6XX_GPU_RECOVERY_FAILURE_MESSAGE[1])
    with does_not_raise():
        hints.detect_a6xx_gpu_recovery_failure(late_message)

    assert hints.a6xx_gpu_first_fail_time is None, (
        "a6xx_gpu_first_fail_time is not None"
    )
    assert hints.a6xx_gpu_recovery_fail_counter == 0, (
        "a6xx_gpu_recovery_fail_counter is not 0"
    )
@pytest.mark.parametrize(
    "start_offset",
    [
        timedelta(hours=0),
        timedelta(hours=1),
    ],
    ids=["equal timestamps", "negative delta"],
)
def test_gitlab_section_relative_time_clamping(start_offset):
    """Test that delta time is clamped to zero if start_time <= timestamp_relative_to."""
    # A reference timestamp at or after the section start forces a zero or
    # negative relative time.
    reference_time = datetime.now(tz=UTC) + start_offset
    section = GitlabSection(
        id="clamp_section",
        header=f"clamp_section header {start_offset}",
        type=LogSectionType.TEST_CASE,
        timestamp_relative_to=reference_time,
    )
    section.start()
    output = section.print_start_section()
    assert "[00:00]" in output, f"Expected clamped relative time, got: {output}"
@pytest.mark.parametrize(
    "delta_seconds,expected_seconds",
    [
        (-5, 0),  # Negative delta should be clamped to 0
        (0, 0),  # Zero delta should remain 0
        (5, 5),  # Positive delta should remain unchanged
    ],
    ids=["negative delta", "zero delta", "positive delta"],
)
def test_gitlab_section_delta_time(frozen_time, delta_seconds, expected_seconds):
    """Test that delta_time() properly clamps negative deltas to zero."""
    section = GitlabSection(
        id="delta_section",
        header=f"delta_section header {delta_seconds}",
        type=LogSectionType.TEST_CASE,
    )
    with section:
        frozen_time.tick(delta_seconds)

    # The internal _delta_time() must report the raw (possibly negative) delta.
    raw_delta = section._delta_time()
    expected_raw = timedelta(seconds=delta_seconds)
    assert raw_delta == expected_raw, (
        f"_delta_time() returned {raw_delta}, expected {expected_raw}"
    )

    # The public delta_time() must report the clamped delta.
    clamped_delta = section.delta_time()
    expected_clamped = timedelta(seconds=expected_seconds)
    assert clamped_delta == expected_clamped, (
        f"delta_time() returned {clamped_delta}, expected {expected_clamped}"
    )