ci/lava: Add bridge function for job definition

Add a bridge function that selects the supported job definition depending
on some Mesa CI job characteristics.

The strategy here is to use LAVA with a containerized SSH session to
follow the job output, avoiding dumping data to the UART, which
proves to be error-prone on some devices.

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22870>
This commit is contained in:
Guilherme Gallo
2023-05-05 08:57:32 -03:00
committed by Marge Bot
parent 02d07f3380
commit 8626a52637
5 changed files with 272 additions and 264 deletions

View File

@@ -37,7 +37,7 @@ from lava.utils import (
LogSectionType,
call_proxy,
fatal_err,
generate_lava_yaml_payload,
generate_lava_job_definition,
hide_sensitive_data,
print_log,
setup_lava_proxy,
@@ -399,9 +399,7 @@ class LAVAJobSubmitter(PathResolver):
minutes=self.job_timeout_min
)
job_definition_stream = StringIO()
lava_yaml.dump(generate_lava_yaml_payload(self), job_definition_stream)
job_definition = job_definition_stream.getvalue()
job_definition = generate_lava_job_definition(self)
if self.dump_yaml:
self.dump_job_definition(job_definition)

View File

@@ -1,7 +1,7 @@
from .console_format import CONSOLE_LOG
from .gitlab_section import GitlabSection
from .lava_job import LAVAJob
from .lava_job_definition import generate_lava_yaml_payload
from .lava_job_definition import generate_lava_job_definition
from .lava_proxy import call_proxy, setup_lava_proxy
from .log_follower import (
LogFollower,

View File

@@ -1,7 +1,16 @@
# How many attempts should be made when a timeout happen during LAVA device boot.
from os import getenv
from typing import Any
from io import StringIO
from typing import TYPE_CHECKING, Any
import re
from lava.utils.lava_farm import LavaFarm, get_lava_farm
from ruamel.yaml.scalarstring import LiteralScalarString
from ruamel.yaml import YAML
from os import getenv
if TYPE_CHECKING:
from lava.lava_job_submitter import LAVAJobSubmitter
# How many attempts should be made when a timeout happens during LAVA device boot.
NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3))
# Supports any integers in [0, 100].
@@ -10,8 +19,58 @@ NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3
JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75))
def generate_lava_yaml_payload(args) -> dict[str, Any]:
# General metadata and permissions, plus also inexplicably kernel arguments
def generate_lava_yaml_payload(job_submitter: "LAVAJobSubmitter") -> dict[str, Any]:
    """
    Pick the job definition flavour that suits the given Mesa CI job.

    Whenever possible, the job output is followed through a containerized SSH
    session instead of the UART, since dumping data to the UART proves to be
    error-prone on some devices. The UART job definition is the fallback.
    """
    from lava.utils.ssh_job_definition import (
        generate_lava_yaml_payload as build_ssh_payload,
    )
    from lava.utils.uart_job_definition import (
        generate_lava_yaml_payload as build_uart_payload,
    )

    # Following the job over SSH requires running a docker container as a LAVA
    # action, and only Collabora's farm supports that.
    farm = get_lava_farm()

    # The SSH job definition does not support fastboot yet.
    boots_with_fastboot: bool = job_submitter.boot_method == "fastboot"

    can_follow_over_ssh = farm == LavaFarm.COLLABORA and not boots_with_fastboot
    build_payload = build_ssh_payload if can_follow_over_ssh else build_uart_payload
    return build_payload(job_submitter)
def generate_lava_job_definition(job_submitter: "LAVAJobSubmitter") -> str:
    """Return the LAVA job payload of `job_submitter` serialized as YAML text."""
    buffer = StringIO()

    serializer = YAML()
    # NOTE(review): presumably a very large width keeps ruamel from wrapping
    # the long shell command lines of the job definition; confirm.
    serializer.width = 4096

    payload = generate_lava_yaml_payload(job_submitter)
    serializer.dump(payload, buffer)
    return buffer.getvalue()
def to_yaml_block(steps_array: list[str], escape_vars=()) -> LiteralScalarString:
    """
    Join shell steps into a single YAML literal block scalar.

    Blank steps and steps starting with "#" are dropped; the remaining ones
    are joined with newlines. For every name in `escape_vars`, each
    occurrence of "$<name>" in the joined text gets a backslash prepended.
    The match is a prefix match, so asking for "FOO" also escapes "$FOOBAR".

    `escape_vars` accepts any iterable of variable names (without the "$").
    """

    def escape_envvar(match):
        return "\\" + match.group(0)

    filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")]
    final_str = "\n".join(filtered_array)

    for escape_var in escape_vars:
        # Find env vars and add '\\' before them.
        # The name is matched literally: a trailing "*" in the pattern would
        # make the last character of the name optional (so "$FO" would be
        # escaped when "FOO" was requested), and re.escape keeps any regex
        # metacharacter in the name from being interpreted.
        final_str = re.sub(rf"\${re.escape(escape_var)}", escape_envvar, final_str)

    return LiteralScalarString(final_str)
def generate_metadata(args) -> dict[str, Any]:
# General metadata and permissions
values = {
"job_name": f"mesa: {args.pipeline_info}",
"device_type": args.device_type,
@@ -25,7 +84,7 @@ def generate_lava_yaml_payload(args) -> dict[str, Any]:
"actions": {
"depthcharge-retry": {
# Could take between 1 and 1.5 min in slower boots
"minutes": 2
"minutes": 4
},
"depthcharge-start": {
# Should take less than 1 min.
@@ -34,7 +93,7 @@ def generate_lava_yaml_payload(args) -> dict[str, Any]:
"depthcharge-action": {
# This timeout englobes the entire depthcharge timing,
# including retries
"minutes": 2
"minutes": 5
* NUMBER_OF_ATTEMPTS_LAVA_BOOT,
},
},
@@ -44,176 +103,39 @@ def generate_lava_yaml_payload(args) -> dict[str, Any]:
if args.lava_tags:
values["tags"] = args.lava_tags.split(",")
# URLs to our kernel rootfs to boot from, both generated by the base
# container build
return values
nfsrootfs = {
"url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst",
"compression": "zstd",
}
fastboot_deploy_nfs = {
"timeout": {"minutes": 10},
"to": "nfs",
"nfsrootfs": nfsrootfs,
}
fastboot_deploy_prepare = {
"timeout": {"minutes": 5},
"to": "downloads",
"os": "oe",
"images": {
"kernel": {
"url": f"{args.kernel_url_prefix}/{args.kernel_image_name}",
},
},
"postprocess": {
"docker": {
"image": "registry.gitlab.collabora.com/lava/health-check-docker",
"steps": [
'gzip Image',
"cat Image.gz " + args.dtb_filename + ".dtb > Image.gz+dtb",
"mkbootimg --kernel Image.gz+dtb" +
' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"' +
" --pagesize 4096 --base 0x80000000 -o boot.img",
],
},
}
}
if args.kernel_image_type:
fastboot_deploy_prepare["images"]["kernel"]["type"] = args.kernel_image_type
if args.dtb_filename:
fastboot_deploy_prepare["images"]["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"}
tftp_deploy = {
"timeout": {"minutes": 5},
"to": "tftp",
"os": "oe",
"kernel": {
"url": f"{args.kernel_url_prefix}/{args.kernel_image_name}",
},
"nfsrootfs": nfsrootfs,
}
if args.kernel_image_type:
tftp_deploy["kernel"]["type"] = args.kernel_image_type
if args.dtb_filename:
tftp_deploy["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"}
fastboot_deploy = {
"timeout": {"minutes": 2},
"to": "fastboot",
"docker": {
"image": "registry.gitlab.collabora.com/lava/health-check-docker",
},
"images": {
"boot": {"url": "downloads://boot.img"},
},
}
fastboot_boot = {
"timeout": {"minutes": 2},
"docker": {"image": "registry.gitlab.collabora.com/lava/health-check-docker"},
"failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
"method": args.boot_method,
"prompts": ["lava-shell:"],
"commands": ["set_active a"]
}
tftp_boot = {
"failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
"method": args.boot_method,
"prompts": ["lava-shell:"],
"commands": "nfs"
}
# skeleton test definition: only declaring each job as a single 'test'
# since LAVA's test parsing is not useful to us
run_steps = []
test = {
"timeout": {"minutes": args.job_timeout_min},
"failure_retry": 1,
"definitions": [
{
"name": "mesa",
"from": "inline",
"lava-signal": "kmsg",
"path": "inline/mesa.yaml",
"repository": {
"metadata": {
"name": "mesa",
"description": "Mesa test plan",
"os": ["oe"],
"scope": ["functional"],
"format": "Lava-Test Test Definition 1.0",
},
"run": {"steps": run_steps},
},
}
],
}
# job execution script:
# - inline .gitlab-ci/common/init-stage1.sh
# - fetch and unpack per-pipeline build artifacts from build job
# - fetch and unpack per-job environment from lava-submit.sh
# - exec .gitlab-ci/common/init-stage2.sh
with open(args.first_stage_init, "r") as init_sh:
run_steps += [
x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip()
]
# We cannot distribute the Adreno 660 shader firmware inside rootfs,
# since the license isn't bundled inside the repository
if args.device_type == "sm8350-hdk":
run_steps.append(
"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " +
"https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn " +
"-o \"/lib/firmware/qcom/sm8350/a660_zap.mbn\""
)
run_steps.append(
def artifact_download_steps(args):
"""
This function is responsible for setting up the SSH server in the DUT and to
export the first boot environment to a file.
"""
# Putting JWT pre-processing and mesa download, within init-stage1.sh file,
# as we do with non-SSH version.
download_steps = [
"set -ex",
"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
f"{args.job_rootfs_overlay_url} | tar -xz -C /",
)
f"mkdir -p {args.ci_project_dir}",
f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | "
f"tar --zstd -x -C {args.ci_project_dir}",
]
# If the JWT file is provided, we will use it to authenticate with the cloud
# storage provider and will hide it from the job output in Gitlab.
if args.jwt_file:
with open(args.jwt_file) as jwt_file:
run_steps += [
download_steps += [
"set +x",
f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME',
"set -x",
f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
]
else:
run_steps += [
"echo Could not find jwt file, disabling MINIO requests...",
download_steps += [
"echo Could not find jwt file, disabling S3 requests...",
"sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh",
]
run_steps += [
f"mkdir -p {args.ci_project_dir}",
f"curl {args.build_url} | tar --zstd -x -C {args.ci_project_dir}",
# Sleep a bit to give time for bash to dump shell xtrace messages into
# console which may cause interleaving with LAVA_SIGNAL_STARTTC in some
# devices like a618.
"sleep 1",
# Putting CI_JOB name as the testcase name, it may help LAVA farm
# maintainers with monitoring
f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh",
]
if args.boot_method == "fastboot":
values["actions"] = [
{"deploy": fastboot_deploy_nfs},
{"deploy": fastboot_deploy_prepare},
{"deploy": fastboot_deploy},
{"boot": fastboot_boot},
{"test": test},
]
else: # tftp
values["actions"] = [
{"deploy": tftp_deploy},
{"boot": tftp_boot},
{"test": test},
]
return values
return download_steps

View File

@@ -27,20 +27,15 @@ Therefore, we have divided the job definition into four parts:
script after sourcing "dut-env-vars.sh" again for the second SSH test case.
"""
import re
from os import getenv
from pathlib import Path
from typing import Any
from ruamel.yaml.scalarstring import LiteralScalarString
# How many attempts should be made when a timeout happen during LAVA device boot.
NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3))
# Supports any integers in [0, 100].
# The scheduler considers the job priority when ordering the queue
# to consider which job should run next.
JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75))
from .lava_job_definition import (
artifact_download_steps,
to_yaml_block,
generate_metadata,
NUMBER_OF_ATTEMPTS_LAVA_BOOT,
)
# Very early SSH server setup. Uses /dut_ready file to flag it is done.
SSH_SERVER_COMMANDS = {
@@ -83,55 +78,6 @@ lava_ssh_test_case() {
]
def to_yaml_block(steps_array: list[str], escape_vars=()) -> LiteralScalarString:
    """
    Join shell steps into a single YAML literal block scalar.

    Blank steps and steps starting with "#" are dropped; the remaining ones
    are joined with newlines. For every name in `escape_vars`, each
    occurrence of "$<name>" in the joined text gets a backslash prepended.
    The match is a prefix match, so asking for "FOO" also escapes "$FOOBAR".

    `escape_vars` accepts any iterable of variable names (without the "$").
    """

    def escape_envvar(match):
        return "\\" + match.group(0)

    filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")]
    final_str = "\n".join(filtered_array)

    for escape_var in escape_vars:
        # Find env vars and add '\\' before them.
        # The name is matched literally: a trailing "*" in the pattern would
        # make the last character of the name optional (so "$FO" would be
        # escaped when "FOO" was requested), and re.escape keeps any regex
        # metacharacter in the name from being interpreted.
        final_str = re.sub(rf"\${re.escape(escape_var)}", escape_envvar, final_str)

    return LiteralScalarString(final_str)
def artifact_download_steps(args):
    """
    Build the shell steps that fetch the job artifacts.

    The returned list enables `set -ex`, sources /dut-env-vars.sh, unpacks
    the rootfs overlay into / and unpacks the build artifacts into
    `args.ci_project_dir`. When `args.jwt_file` is set, its content is read
    and steps are added to recreate that file and export CI_JOB_JWT_FILE;
    otherwise steps are added to drop MINIO_RESULTS_UPLOAD from
    /set-job-env-vars.sh.
    """
    steps = [
        "set -ex",
        "source /dut-env-vars.sh",
        "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
        f"{args.job_rootfs_overlay_url} | tar -xz -C /",
        f"mkdir -p {args.ci_project_dir}",
        f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | "
        f"tar --zstd -x -C {args.ci_project_dir}",
    ]

    # Without a JWT file there is nothing to authenticate with.
    if not args.jwt_file:
        steps.extend(
            (
                "echo Could not find jwt file, disabling S3 requests...",
                "sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh",
            )
        )
        return steps

    with open(args.jwt_file) as token_file:
        token = token_file.read()

    # xtrace is switched off around the token; the HIDEME marker presumably
    # lets the log filter hide this line from the Gitlab job output.
    steps.extend(
        (
            "set +x",
            f'echo -n "{token}" > "{args.jwt_file}" # HIDEME',
            "set -x",
            f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
        )
    )
    return steps
def generate_dut_test(args):
# Commands executed on DUT.
# Trying to execute the minimal number of commands, because the console data is
@@ -200,6 +146,7 @@ def generate_docker_test(args):
to_yaml_block(
(
"lava_ssh_test_case 'artifact_download' 'bash --' << EOF",
"source /dut-env-vars.sh",
*artifact_download_steps(args),
"EOF",
)
@@ -216,38 +163,7 @@ def generate_docker_test(args):
def generate_lava_yaml_payload(args) -> dict[str, Any]:
# General metadata and permissions
values = {
"job_name": f"mesa: {args.pipeline_info}",
"device_type": args.device_type,
"visibility": {"group": [args.visibility_group]},
"priority": JOB_PRIORITY,
"context": {
"extra_nfsroot_args": " init=/init rootwait usbcore.quirks=0bda:8153:k"
},
"timeouts": {
"job": {"minutes": args.job_timeout_min},
"actions": {
"depthcharge-retry": {
# Could take between 1 and 1.5 min in slower boots
"minutes": 4
},
"depthcharge-start": {
# Should take less than 1 min.
"minutes": 1,
},
"depthcharge-action": {
# This timeout englobes the entire depthcharge timing,
# including retries
"minutes": 5
* NUMBER_OF_ATTEMPTS_LAVA_BOOT,
},
},
},
}
if args.lava_tags:
values["tags"] = args.lava_tags.split(",")
values = generate_metadata(args)
# URLs to our kernel rootfs to boot from, both generated by the base
# container build

View File

@@ -0,0 +1,172 @@
from typing import Any
from .lava_job_definition import (
generate_metadata,
NUMBER_OF_ATTEMPTS_LAVA_BOOT,
artifact_download_steps,
)
def generate_lava_yaml_payload(args) -> dict[str, Any]:
    """
    Build the LAVA job payload for jobs whose output is followed via UART.

    The payload starts from the dict returned by `generate_metadata(args)`
    and gains an "actions" list. When `args.boot_method` is "fastboot" the
    list holds the nfs, downloads and fastboot deploys followed by the
    fastboot boot; otherwise it holds the tftp deploy and its boot. Both
    variants end with a single inline "mesa" test action.

    `args` is presumably the LAVAJobSubmitter instance; only attribute access
    is used here. The file at `args.first_stage_init` is read from disk.
    """
    values = generate_metadata(args)

    # URLs to our kernel rootfs to boot from, both generated by the base
    # container build
    nfsrootfs = {
        "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst",
        "compression": "zstd",
    }

    if args.boot_method == "fastboot":
        actions = _fastboot_actions(args, nfsrootfs)
    else:  # tftp
        actions = _tftp_actions(args, nfsrootfs)

    values["actions"] = [*actions, {"test": _test_action(args)}]

    return values


def _kernel_image(args) -> dict[str, Any]:
    """Return the kernel image entry used by the fastboot and tftp deploys."""
    kernel = {"url": f"{args.kernel_url_prefix}/{args.kernel_image_name}"}
    if args.kernel_image_type:
        kernel["type"] = args.kernel_image_type
    return kernel


def _fastboot_actions(args, nfsrootfs: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the deploy and boot actions for a device booted via fastboot."""
    docker_image = "registry.gitlab.collabora.com/lava/health-check-docker"

    deploy_nfs = {
        "timeout": {"minutes": 10},
        "to": "nfs",
        "nfsrootfs": nfsrootfs,
    }

    deploy_prepare = {
        "timeout": {"minutes": 5},
        "to": "downloads",
        "os": "oe",
        "images": {"kernel": _kernel_image(args)},
        "postprocess": {
            "docker": {
                "image": docker_image,
                "steps": [
                    "gzip Image",
                    f"cat Image.gz {args.dtb_filename}.dtb > Image.gz+dtb",
                    "mkbootimg --kernel Image.gz+dtb"
                    + ' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"'
                    + " --pagesize 4096 --base 0x80000000 -o boot.img",
                ],
            }
        },
    }
    if args.dtb_filename:
        deploy_prepare["images"]["dtb"] = {
            "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"
        }

    deploy_fastboot = {
        "timeout": {"minutes": 2},
        "to": "fastboot",
        "docker": {
            "image": docker_image,
        },
        "images": {
            "boot": {"url": "downloads://boot.img"},
        },
    }

    boot = {
        "timeout": {"minutes": 2},
        "docker": {"image": docker_image},
        "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
        "method": args.boot_method,
        "prompts": ["lava-shell:"],
        "commands": ["set_active a"],
    }

    return [
        {"deploy": deploy_nfs},
        {"deploy": deploy_prepare},
        {"deploy": deploy_fastboot},
        {"boot": boot},
    ]


def _tftp_actions(args, nfsrootfs: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the deploy and boot actions for a device booted via tftp."""
    deploy = {
        "timeout": {"minutes": 5},
        "to": "tftp",
        "os": "oe",
        "kernel": _kernel_image(args),
        "nfsrootfs": nfsrootfs,
    }
    if args.dtb_filename:
        deploy["dtb"] = {
            "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"
        }

    boot = {
        "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT,
        "method": args.boot_method,
        "prompts": ["lava-shell:"],
        "commands": "nfs",
    }

    return [{"deploy": deploy}, {"boot": boot}]


def _run_steps(args) -> list[str]:
    """Return the shell lines run by the inline "mesa" test definition."""
    # job execution script:
    #   - inline .gitlab-ci/common/init-stage1.sh
    #   - fetch and unpack per-pipeline build artifacts from build job
    #   - fetch and unpack per-job environment from lava-submit.sh
    #   - exec .gitlab-ci/common/init-stage2.sh
    with open(args.first_stage_init, "r") as init_sh:
        steps = [
            x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip()
        ]

    # We cannot distribute the Adreno 660 shader firmware inside rootfs,
    # since the license isn't bundled inside the repository
    if args.device_type == "sm8350-hdk":
        steps.append(
            "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 "
            + "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn "
            + '-o "/lib/firmware/qcom/sm8350/a660_zap.mbn"'
        )

    # artifact_download_steps already creates args.ci_project_dir and unpacks
    # the build artifacts into it (with curl retries), so that download is
    # not repeated here.
    steps += artifact_download_steps(args)

    steps += [
        # Sleep a bit to give time for bash to dump shell xtrace messages into
        # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some
        # devices like a618.
        "sleep 1",
        # Putting CI_JOB name as the testcase name, it may help LAVA farm
        # maintainers with monitoring
        f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh",
    ]
    return steps


def _test_action(args) -> dict[str, Any]:
    """Return the test action that declares the whole job as one LAVA test."""
    # skeleton test definition: only declaring each job as a single 'test'
    # since LAVA's test parsing is not useful to us
    return {
        "timeout": {"minutes": args.job_timeout_min},
        "failure_retry": 1,
        "definitions": [
            {
                "name": "mesa",
                "from": "inline",
                "lava-signal": "kmsg",
                "path": "inline/mesa.yaml",
                "repository": {
                    "metadata": {
                        "name": "mesa",
                        "description": "Mesa test plan",
                        "os": ["oe"],
                        "scope": ["functional"],
                        "format": "Lava-Test Test Definition 1.0",
                    },
                    "run": {"steps": _run_steps(args)},
                },
            }
        ],
    }