ci/lava: Don't run jobs if the remaining execution time is too short
Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28778>
This commit is contained in:

committed by
Marge Bot

parent
3e33171471
commit
e96e25f323
@@ -16,7 +16,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from dataclasses import dataclass, fields
|
from dataclasses import dataclass, fields
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta, timezone
|
||||||
from os import environ, getenv, path
|
from os import environ, getenv, path
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
@@ -83,6 +83,17 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
|
|||||||
getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
|
getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Total time budget of the CI job, in seconds; falls back to one hour when
# CI_JOB_TIMEOUT is not set in the environment.
CI_JOB_TIMEOUT_SEC = int(getenv("CI_JOB_TIMEOUT", 3600))
# Minimum number of seconds that must still be left in the CI job so that LAVA
# can run the job and give the final details (10 minutes by default).
EXPECTED_JOB_DURATION_SEC = int(getenv("EXPECTED_JOB_DURATION_SEC", 60 * 10))


def parse_ci_job_started_at(raw: str) -> datetime:
    """Turn the raw CI_JOB_STARTED_AT value into a timezone-aware datetime.

    Args:
        raw: ISO 8601 timestamp, or an empty string when the variable is unset.

    Returns:
        The parsed timestamp. An empty ``raw`` yields the current time in UTC,
        and a timestamp carrying no UTC offset is taken to be UTC.

    Raises:
        ValueError: if ``raw`` is non-empty and not a valid ISO 8601 timestamp.
    """
    if not raw:
        return datetime.now(timezone.utc)
    # datetime.fromisoformat() only understands the "Z" UTC designator from
    # Python 3.11 onwards, so spell it as an explicit offset.
    if raw.endswith(("Z", "z")):
        raw = f"{raw[:-1]}+00:00"
    started_at = datetime.fromisoformat(raw)
    # Keep the result timezone-aware: subtracting a naive datetime from
    # datetime.now(timezone.utc) raises TypeError.
    if started_at.tzinfo is None:
        started_at = started_at.replace(tzinfo=timezone.utc)
    return started_at


# CI_JOB_STARTED_AT is given by GitLab CI/CD in UTC timezone by default.
CI_JOB_STARTED_AT_RAW = getenv("CI_JOB_STARTED_AT", "")
CI_JOB_STARTED_AT: datetime = parse_ci_job_started_at(CI_JOB_STARTED_AT_RAW)
|
||||||
|
|
||||||
|
|
||||||
def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
|
def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -221,9 +232,23 @@ def submit_job(job):
|
|||||||
) from mesa_ci_err
|
) from mesa_ci_err
|
||||||
|
|
||||||
|
|
||||||
def wait_for_job_get_started(job):
|
def wait_for_job_get_started(job, attempt_no):
|
||||||
print_log(f"Waiting for job {job.job_id} to start.")
|
print_log(f"Waiting for job {job.job_id} to start.")
|
||||||
while not job.is_started():
|
while not job.is_started():
|
||||||
|
current_job_duration_sec: int = int(
|
||||||
|
(datetime.now(timezone.utc) - CI_JOB_STARTED_AT).total_seconds()
|
||||||
|
)
|
||||||
|
remaining_time_sec: int = max(0, CI_JOB_TIMEOUT_SEC - current_job_duration_sec)
|
||||||
|
if remaining_time_sec < EXPECTED_JOB_DURATION_SEC:
|
||||||
|
job.cancel()
|
||||||
|
raise MesaCIFatalException(
|
||||||
|
f"{CONSOLE_LOG['BOLD']}"
|
||||||
|
f"{CONSOLE_LOG['FG_YELLOW']}"
|
||||||
|
f"Job {job.job_id} only has {remaining_time_sec} seconds "
|
||||||
|
"remaining to run, but it is expected to take at least "
|
||||||
|
f"{EXPECTED_JOB_DURATION_SEC} seconds."
|
||||||
|
f"{CONSOLE_LOG['RESET']}",
|
||||||
|
)
|
||||||
time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
|
time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
|
||||||
job.refresh_log()
|
job.refresh_log()
|
||||||
print_log(f"Job {job.job_id} started.")
|
print_log(f"Job {job.job_id} started.")
|
||||||
@@ -299,7 +324,7 @@ def execute_job_with_retries(
|
|||||||
try:
|
try:
|
||||||
job_log["submitter_start_time"] = datetime.now().isoformat()
|
job_log["submitter_start_time"] = datetime.now().isoformat()
|
||||||
submit_job(job)
|
submit_job(job)
|
||||||
wait_for_job_get_started(job)
|
wait_for_job_get_started(job, attempt_no)
|
||||||
log_follower: LogFollower = bootstrap_log_follower()
|
log_follower: LogFollower = bootstrap_log_follower()
|
||||||
follow_job_execution(job, log_follower)
|
follow_job_execution(job, log_follower)
|
||||||
return job
|
return job
|
||||||
|
@@ -15,7 +15,7 @@ from typing import Generator
|
|||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from lava.exceptions import MesaCIException, MesaCIRetryError
|
from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCIFatalException
|
||||||
from lava.lava_job_submitter import (
|
from lava.lava_job_submitter import (
|
||||||
DEVICE_HANGING_TIMEOUT_SEC,
|
DEVICE_HANGING_TIMEOUT_SEC,
|
||||||
NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
|
NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
|
||||||
@@ -24,6 +24,7 @@ from lava.lava_job_submitter import (
|
|||||||
bootstrap_log_follower,
|
bootstrap_log_follower,
|
||||||
follow_job_execution,
|
follow_job_execution,
|
||||||
retriable_follow_job,
|
retriable_follow_job,
|
||||||
|
wait_for_job_get_started,
|
||||||
)
|
)
|
||||||
from lava.utils import LogSectionType
|
from lava.utils import LogSectionType
|
||||||
|
|
||||||
@@ -257,6 +258,27 @@ def test_simulate_a_long_wait_to_start_a_job(
|
|||||||
assert delta_time.total_seconds() >= wait_time
|
assert delta_time.total_seconds() >= wait_time
|
||||||
|
|
||||||
|
|
||||||
|
# Scenario id -> (value patched into CI_JOB_TIMEOUT_SEC, expected outcome).
LONG_LAVA_QUEUE_SCENARIOS = {
    "no_time_to_run": (0, pytest.raises(MesaCIFatalException)),
    "enough_time_to_run": (9999999999, does_not_raise()),
}


@pytest.mark.parametrize(
    "job_timeout, expectation",
    LONG_LAVA_QUEUE_SCENARIOS.values(),
    ids=LONG_LAVA_QUEUE_SCENARIOS.keys(),
)
def test_wait_for_job_get_started_no_time_to_run(monkeypatch, job_timeout, expectation):
    """Check that a queued job is cancelled only when the CI time budget is too short.

    ``job_timeout`` replaces ``CI_JOB_TIMEOUT_SEC`` in the submitter module and
    ``expectation`` is the context manager describing whether
    ``wait_for_job_get_started`` must raise.
    """
    monkeypatch.setattr("lava.lava_job_submitter.CI_JOB_TIMEOUT_SEC", job_timeout)
    job = MagicMock()
    # Make it escape the loop
    job.is_started.side_effect = (False, False, True)
    with expectation as e:
        wait_for_job_get_started(job, 1)
    # NOTE(review): assumes does_not_raise() yields a falsy value (e.g. None),
    # while pytest.raises() yields a truthy exception-info object - confirm.
    if e:
        # The fatal path must cancel the LAVA job exactly once.
        job.cancel.assert_called_once_with()
    else:
        # With enough time left, the job must never be cancelled.
        job.cancel.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
CORRUPTED_LOG_SCENARIOS = {
|
CORRUPTED_LOG_SCENARIOS = {
|
||||||
"too much subsequent corrupted data": (
|
"too much subsequent corrupted data": (
|
||||||
|
Reference in New Issue
Block a user