ci/lava: Don't run jobs if the remaining execution time is too short

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28778>
This commit is contained in:
Guilherme Gallo
2024-04-11 18:51:54 -03:00
committed by Marge Bot
parent 3e33171471
commit e96e25f323
2 changed files with 51 additions and 4 deletions

View File

@@ -16,7 +16,7 @@ import sys
import time
from collections import defaultdict
from dataclasses import dataclass, fields
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from os import environ, getenv, path
from typing import Any, Optional
@@ -83,6 +83,17 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
)
CI_JOB_TIMEOUT_SEC = int(getenv("CI_JOB_TIMEOUT", 3600))
# How many seconds the script will wait to let LAVA run the job and give the final details.
EXPECTED_JOB_DURATION_SEC = int(getenv("EXPECTED_JOB_DURATION_SEC", 60 * 10))
# CI_JOB_STARTED is given by GitLab CI/CD in UTC timezone by default.
CI_JOB_STARTED_AT_RAW = getenv("CI_JOB_STARTED_AT", "")
CI_JOB_STARTED_AT: datetime = (
datetime.fromisoformat(CI_JOB_STARTED_AT_RAW)
if CI_JOB_STARTED_AT_RAW
else datetime.now(timezone.utc)
)
def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
"""
@@ -221,9 +232,23 @@ def submit_job(job):
) from mesa_ci_err
def wait_for_job_get_started(job):
def wait_for_job_get_started(job, attempt_no):
print_log(f"Waiting for job {job.job_id} to start.")
while not job.is_started():
current_job_duration_sec: int = int(
(datetime.now(timezone.utc) - CI_JOB_STARTED_AT).total_seconds()
)
remaining_time_sec: int = max(0, CI_JOB_TIMEOUT_SEC - current_job_duration_sec)
if remaining_time_sec < EXPECTED_JOB_DURATION_SEC:
job.cancel()
raise MesaCIFatalException(
f"{CONSOLE_LOG['BOLD']}"
f"{CONSOLE_LOG['FG_YELLOW']}"
f"Job {job.job_id} only has {remaining_time_sec} seconds "
"remaining to run, but it is expected to take at least "
f"{EXPECTED_JOB_DURATION_SEC} seconds."
f"{CONSOLE_LOG['RESET']}",
)
time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
job.refresh_log()
print_log(f"Job {job.job_id} started.")
@@ -299,7 +324,7 @@ def execute_job_with_retries(
try:
job_log["submitter_start_time"] = datetime.now().isoformat()
submit_job(job)
wait_for_job_get_started(job)
wait_for_job_get_started(job, attempt_no)
log_follower: LogFollower = bootstrap_log_follower()
follow_job_execution(job, log_follower)
return job

View File

@@ -15,7 +15,7 @@ from typing import Generator
from unittest.mock import MagicMock, patch
import pytest
from lava.exceptions import MesaCIException, MesaCIRetryError
from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCIFatalException
from lava.lava_job_submitter import (
DEVICE_HANGING_TIMEOUT_SEC,
NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
@@ -24,6 +24,7 @@ from lava.lava_job_submitter import (
bootstrap_log_follower,
follow_job_execution,
retriable_follow_job,
wait_for_job_get_started,
)
from lava.utils import LogSectionType
@@ -257,6 +258,27 @@ def test_simulate_a_long_wait_to_start_a_job(
assert delta_time.total_seconds() >= wait_time
LONG_LAVA_QUEUE_SCENARIOS = {
"no_time_to_run": (0, pytest.raises(MesaCIFatalException)),
"enough_time_to_run": (9999999999, does_not_raise()),
}
@pytest.mark.parametrize(
"job_timeout, expectation",
LONG_LAVA_QUEUE_SCENARIOS.values(),
ids=LONG_LAVA_QUEUE_SCENARIOS.keys(),
)
def test_wait_for_job_get_started_no_time_to_run(monkeypatch, job_timeout, expectation):
monkeypatch.setattr("lava.lava_job_submitter.CI_JOB_TIMEOUT_SEC", job_timeout)
job = MagicMock()
# Make it escape the loop
job.is_started.side_effect = (False, False, True)
with expectation as e:
wait_for_job_get_started(job, 1)
if e:
job.cancel.assert_called_with()
CORRUPTED_LOG_SCENARIOS = {
"too much subsequent corrupted data": (