mesa/.gitlab-ci/lava/utils/log_section.py

import re
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum, auto
from os import getenv
from typing import Optional, Pattern, Union

from lava.utils.gitlab_section import GitlabSection


class LogSectionType(Enum):
    """Kinds of log sections a LAVA job log can be split into.

    The numeric values are assigned by ``auto()`` in declaration order, so
    the order of the members below is part of the enum's definition.
    """

    # Placeholder kind; it has no dedicated timeout constant in this module.
    UNKNOWN = auto()

    # Submitting the job to LAVA.
    LAVA_SUBMIT = auto()

    # Waiting for a device to become available.
    LAVA_QUEUE = auto()

    # Deploying the device (kernel, modules, dtb and overlays).
    LAVA_DEPLOY = auto()

    # Booting the device.
    LAVA_BOOT = auto()

    # Test suite whose initialization happens in the DUT, not on docker.
    TEST_DUT_SUITE = auto()

    # Test suite whose initialization happens on docker.
    TEST_SUITE = auto()

    # A single test case.
    TEST_CASE = auto()

    # Test suite teardown, or adjustments before the next test case starts.
    LAVA_POST_PROCESSING = auto()

# Every *_TIMEOUT constant below is a number of minutes.

# How long to wait whilst we try to submit a job; make it fairly short,
# since the job will be retried.
LAVA_SUBMIT_TIMEOUT = int(getenv("LAVA_SUBMIT_TIMEOUT", 5))

# How long should we wait for a device to become available?
# For post-merge jobs, this should be ~infinite, but we can fail more
# aggressively for pre-merge.
LAVA_QUEUE_TIMEOUT = int(getenv("LAVA_QUEUE_TIMEOUT", 60))

# How long should we wait for a device to be deployed?
# The deploy involves downloading and decompressing the kernel, modules, dtb and the overlays.
# We should retry, to overcome network issues.
LAVA_DEPLOY_TIMEOUT = int(getenv("LAVA_DEPLOY_TIMEOUT", 5))

# Empirically, successful device deploy+boot in LAVA time takes less than 3 minutes.
# LAVA itself is configured to attempt `failure_retry` times (NUMBER_OF_ATTEMPTS_LAVA_BOOT) to boot
# the device.
# It is better to retry the boot than cancel the job and re-submit to avoid
# the enqueue delay.
LAVA_BOOT_TIMEOUT = int(getenv("LAVA_BOOT_TIMEOUT", 5))

# Estimated overhead in minutes for a job from GitLab to reach the test phase,
# including LAVA scheduling and boot duration
LAVA_TEST_OVERHEAD_MIN = 5

# GitLab exports the job timeout, in seconds, as CI_JOB_TIMEOUT. The variable
# is unset (or may be empty) when this module is imported outside of a GitLab
# job, e.g. from a local run or a unit test; `int(None)` would then raise a
# TypeError at import time. Fall back to one hour, GitLab's default job
# timeout, so the module stays importable.
_DEFAULT_CI_JOB_TIMEOUT_SEC = 60 * 60
_CI_JOB_TIMEOUT_MIN = int(getenv("CI_JOB_TIMEOUT") or _DEFAULT_CI_JOB_TIMEOUT_SEC) // 60

# Test DUT suite phase is where the initialization happens in DUT, not on docker.
# The device will be listening to SSH session until the end of the job.
LAVA_TEST_DUT_SUITE_TIMEOUT = _CI_JOB_TIMEOUT_MIN - LAVA_TEST_OVERHEAD_MIN

# Test suite phase is where the initialization happens on docker.
LAVA_TEST_SUITE_TIMEOUT = int(getenv("LAVA_TEST_SUITE_TIMEOUT", 5))

# Test cases may take a long time, this script has no right to interrupt
# them. But if the test case takes almost as long as the GitLab job timeout,
# it will never succeed, so budget the job timeout minus the overhead.
LAVA_TEST_CASE_TIMEOUT = _CI_JOB_TIMEOUT_MIN - LAVA_TEST_OVERHEAD_MIN

# LAVA post processing may refer to a test suite teardown, or the
# adjustments to start the next test_case
LAVA_POST_PROCESSING_TIMEOUT = int(getenv("LAVA_POST_PROCESSING_TIMEOUT", 5))

# Generic ten-minute timeout.
# NOTE(review): presumably meant for section types that have no entry in the
# table below; it is not referenced in the code visible here - confirm against
# the callers.
FALLBACK_GITLAB_SECTION_TIMEOUT = timedelta(minutes=10)

# Timeout of each section type, converted from the per-phase minute constants
# into `timedelta` objects. LogSectionType.UNKNOWN deliberately has no entry.
DEFAULT_GITLAB_SECTION_TIMEOUTS = {
    section_type: timedelta(minutes=timeout_minutes)
    for section_type, timeout_minutes in (
        (LogSectionType.LAVA_SUBMIT, LAVA_SUBMIT_TIMEOUT),
        (LogSectionType.LAVA_QUEUE, LAVA_QUEUE_TIMEOUT),
        (LogSectionType.LAVA_DEPLOY, LAVA_DEPLOY_TIMEOUT),
        (LogSectionType.LAVA_BOOT, LAVA_BOOT_TIMEOUT),
        (LogSectionType.TEST_DUT_SUITE, LAVA_TEST_DUT_SUITE_TIMEOUT),
        (LogSectionType.TEST_SUITE, LAVA_TEST_SUITE_TIMEOUT),
        (LogSectionType.TEST_CASE, LAVA_TEST_CASE_TIMEOUT),
        (LogSectionType.LAVA_POST_PROCESSING, LAVA_POST_PROCESSING_TIMEOUT),
    )
}


@dataclass(frozen=True)
class LogSection:
    """Rule that turns a matching LAVA log line into a ``GitlabSection``.

    A rule matches a log line when the line's level is one of ``levels`` and
    ``regex`` is found in the line's message. The groups captured by ``regex``
    are substituted positionally into the ``section_id`` and
    ``section_header`` format strings.
    """

    # Pattern searched (not anchored) in the log line's message.
    regex: Union[Pattern, str]
    # Log levels this rule applies to. A bare string is tolerated and treated
    # as a single level.
    levels: tuple[str, ...]
    # `str.format` template for the section id, fed with the regex groups.
    section_id: str
    # `str.format` template for the section header, fed with the regex groups.
    section_header: str
    # Kind of section produced; also selects the timeout shown in the header.
    section_type: LogSectionType
    # Forwarded to GitlabSection as `start_collapsed`.
    collapsed: bool = False

    def from_log_line_to_section(
        self,
        lava_log_line: dict[str, str],
        main_test_case: Optional[str],
        timestamp_relative_to: Optional[datetime],
    ) -> Optional[GitlabSection]:
        """Build the section announced by ``lava_log_line``, if any.

        Args:
            lava_log_line: Log entry; its ``"lvl"`` and ``"msg"`` keys are read.
            main_test_case: Section id for which both the start and the end
                markers are suppressed, or ``None``.
            timestamp_relative_to: Passed through to ``GitlabSection``.

        Returns:
            A new ``GitlabSection`` when the level and the regex both match,
            otherwise ``None``.
        """
        # `("info")` is a plain string, not a one-element tuple. Without this
        # normalisation `in` would perform a substring test, so a level such
        # as "bug" would wrongly be accepted by a rule declared for "debug".
        levels = (self.levels,) if isinstance(self.levels, str) else self.levels
        if lava_log_line["lvl"] not in levels:
            return None

        match = re.search(self.regex, lava_log_line["msg"])
        if match is None:
            return None

        captured = match.groups()
        section_id = self.section_id.format(*captured)
        section_header = self.section_header.format(*captured)
        is_main_test_case = section_id == main_test_case
        # Section types without a dedicated entry (e.g. UNKNOWN) use the
        # generic fallback instead of raising KeyError.
        timeout = DEFAULT_GITLAB_SECTION_TIMEOUTS.get(
            self.section_type, FALLBACK_GITLAB_SECTION_TIMEOUT
        )
        return GitlabSection(
            id=section_id,
            header=f"{section_header} - Timeout: {timeout}",
            type=self.section_type,
            start_collapsed=self.collapsed,
            suppress_start=is_main_test_case,
            suppress_end=is_main_test_case,
            timestamp_relative_to=timestamp_relative_to,
        )


# Rules used to recognise section boundaries in the LAVA log.
#
# Every `section_id`/`section_header` template contains a single "{}", so only
# the first group captured by each regex ends up in the id and the header.
#
# Single-level rules need the trailing comma: `("info")` is just the string
# "info", which would turn the level check into a substring test.
LOG_SECTIONS = (
    # "start: 2 <name> (timeout <value>)..." lines at "info" level. The
    # captured timeout (second group) is not used by the templates.
    LogSection(
        regex=re.compile(r"start: 2 (\S+) \(timeout ([^)]+)\).*"),
        levels=("info",),
        section_id="{}",
        section_header="Booting via {}",
        section_type=LogSectionType.LAVA_BOOT,
        collapsed=True,
    ),
    # "STARTTC" marker (optionally wrapped in "<" and ">"), followed by the name.
    LogSection(
        regex=re.compile(r"<?STARTTC>? ([^>]*)"),
        levels=("target", "debug"),
        section_id="{}",
        section_header="test_case {}",
        section_type=LogSectionType.TEST_CASE,
    ),
    # "STARTRUN" marker whose name contains "ssh" followed by "server".
    # NOTE(review): presumably listed before the generic STARTRUN rule so it
    # takes precedence when callers stop at the first match - confirm.
    LogSection(
        regex=re.compile(r"<?STARTRUN>? ([^>]*ssh.*server.*)"),
        levels=("debug",),
        section_id="{}",
        section_header="[dut] test_suite {}",
        section_type=LogSectionType.TEST_DUT_SUITE,
    ),
    # Any other "STARTRUN" marker.
    LogSection(
        regex=re.compile(r"<?STARTRUN>? ([^>]*)"),
        levels=("debug",),
        section_id="{}",
        section_header="[docker] test_suite {}",
        section_type=LogSectionType.TEST_SUITE,
    ),
    # "ENDTC" marker: opens the collapsed post-processing section that follows
    # the named test case.
    LogSection(
        regex=re.compile(r"ENDTC>? ([^>]+)"),
        levels=("target", "debug"),
        section_id="post-{}",
        section_header="Post test_case {}",
        collapsed=True,
        section_type=LogSectionType.LAVA_POST_PROCESSING,
    ),
)