ci/lava: Don't run jobs if the remaining execution time is too short
Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28778>
This commit is contained in:

committed by
Marge Bot

parent
3e33171471
commit
e96e25f323
@@ -16,7 +16,7 @@ import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, fields
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from os import environ, getenv, path
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -83,6 +83,17 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
|
||||
getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
|
||||
)
|
||||
|
||||
CI_JOB_TIMEOUT_SEC = int(getenv("CI_JOB_TIMEOUT", 3600))
# How many seconds the script will wait to let LAVA run the job and give the final details.
EXPECTED_JOB_DURATION_SEC = int(getenv("EXPECTED_JOB_DURATION_SEC", 60 * 10))


def parse_ci_job_started_at(raw: str) -> datetime:
    """
    Convert the raw CI_JOB_STARTED_AT value into a timezone-aware datetime.

    An empty (or blank) value falls back to the current time in UTC.
    A trailing "Z" designator is rewritten as "+00:00", because
    datetime.fromisoformat() only accepts it starting with Python 3.11.
    A timestamp without any offset is taken as UTC, so that it can be
    subtracted from datetime.now(timezone.utc) without raising TypeError.

    Raises ValueError when the value is not a valid ISO 8601 timestamp.
    """
    raw = raw.strip()
    if not raw:
        return datetime.now(timezone.utc)

    if raw.endswith(("Z", "z")):
        raw = f"{raw[:-1]}+00:00"

    started_at = datetime.fromisoformat(raw)
    if started_at.tzinfo is None:
        # Never mix naive and aware datetimes: assume UTC when no offset is given.
        started_at = started_at.replace(tzinfo=timezone.utc)
    return started_at


# CI_JOB_STARTED_AT is given by GitLab CI/CD in UTC timezone by default.
CI_JOB_STARTED_AT_RAW = getenv("CI_JOB_STARTED_AT", "")
CI_JOB_STARTED_AT: datetime = parse_ci_job_started_at(CI_JOB_STARTED_AT_RAW)
|
||||
|
||||
|
||||
def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
|
||||
"""
|
||||
@@ -221,9 +232,23 @@ def submit_job(job):
|
||||
) from mesa_ci_err
|
||||
|
||||
|
||||
def wait_for_job_get_started(job):
|
||||
def wait_for_job_get_started(job, attempt_no):
    """
    Poll the LAVA job until it reports that it has started.

    While the job has not started, the time left in the CI job is computed
    from CI_JOB_STARTED_AT and CI_JOB_TIMEOUT_SEC. If fewer than
    EXPECTED_JOB_DURATION_SEC seconds are left, the job is cancelled and
    MesaCIFatalException is raised; otherwise the function sleeps for
    WAIT_FOR_DEVICE_POLLING_TIME_SEC and polls again. Once the job has
    started, its log is refreshed.

    `attempt_no` is accepted but not used by this function.
    """
    print_log(f"Waiting for job {job.job_id} to start.")

    while True:
        if job.is_started():
            break

        elapsed = datetime.now(timezone.utc) - CI_JOB_STARTED_AT
        # Clamp at zero so an already expired CI job reports 0 seconds left.
        time_left_sec = max(0, CI_JOB_TIMEOUT_SEC - int(elapsed.total_seconds()))

        if time_left_sec >= EXPECTED_JOB_DURATION_SEC:
            time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
            continue

        # Not enough time left to run: give the device back before bailing out.
        job.cancel()
        message = (
            f"{CONSOLE_LOG['BOLD']}"
            f"{CONSOLE_LOG['FG_YELLOW']}"
            f"Job {job.job_id} only has {time_left_sec} seconds "
            "remaining to run, but it is expected to take at least "
            f"{EXPECTED_JOB_DURATION_SEC} seconds."
            f"{CONSOLE_LOG['RESET']}"
        )
        raise MesaCIFatalException(message)

    job.refresh_log()
    print_log(f"Job {job.job_id} started.")
|
||||
@@ -299,7 +324,7 @@ def execute_job_with_retries(
|
||||
try:
|
||||
job_log["submitter_start_time"] = datetime.now().isoformat()
|
||||
submit_job(job)
|
||||
wait_for_job_get_started(job)
|
||||
wait_for_job_get_started(job, attempt_no)
|
||||
log_follower: LogFollower = bootstrap_log_follower()
|
||||
follow_job_execution(job, log_follower)
|
||||
return job
|
||||
|
@@ -15,7 +15,7 @@ from typing import Generator
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from lava.exceptions import MesaCIException, MesaCIRetryError
|
||||
from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCIFatalException
|
||||
from lava.lava_job_submitter import (
|
||||
DEVICE_HANGING_TIMEOUT_SEC,
|
||||
NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
|
||||
@@ -24,6 +24,7 @@ from lava.lava_job_submitter import (
|
||||
bootstrap_log_follower,
|
||||
follow_job_execution,
|
||||
retriable_follow_job,
|
||||
wait_for_job_get_started,
|
||||
)
|
||||
from lava.utils import LogSectionType
|
||||
|
||||
@@ -257,6 +258,27 @@ def test_simulate_a_long_wait_to_start_a_job(
|
||||
assert delta_time.total_seconds() >= wait_time
|
||||
|
||||
|
||||
# Each scenario maps to (CI job timeout in seconds, expected outcome).
LONG_LAVA_QUEUE_SCENARIOS = {
    "no_time_to_run": (0, pytest.raises(MesaCIFatalException)),
    "enough_time_to_run": (9999999999, does_not_raise()),
}


@pytest.mark.parametrize(
    "job_timeout, expectation",
    LONG_LAVA_QUEUE_SCENARIOS.values(),
    ids=LONG_LAVA_QUEUE_SCENARIOS.keys(),
)
def test_wait_for_job_get_started_no_time_to_run(monkeypatch, job_timeout, expectation):
    """
    Check that waiting for a job gives up only when the CI job is short on time.

    With a CI job timeout of 0 the wait must cancel the job exactly once and
    raise MesaCIFatalException; with a very large timeout it must return
    normally without ever cancelling the job.
    """
    monkeypatch.setattr("lava.lava_job_submitter.CI_JOB_TIMEOUT_SEC", job_timeout)
    job = MagicMock()
    # Make it escape the loop
    job.is_started.side_effect = (False, False, True)

    with expectation as e:
        wait_for_job_get_started(job, 1)

    if e:
        # The exception was raised: the job must have been cancelled exactly once.
        job.cancel.assert_called_once_with()
    else:
        # The job started in time: it must not have been cancelled.
        job.cancel.assert_not_called()
|
||||
|
||||
|
||||
CORRUPTED_LOG_SCENARIOS = {
|
||||
"too much subsequent corrupted data": (
|
||||
|
Reference in New Issue
Block a user