
We have job-level retry on failure now, and will continue to need it in order to work around fd.o infrastructure flakes. If we stop retrying inside the job, then we can crank down the gitlab-level timeouts on test jobs to be closer to our CI guidelines and avoid blocking a runner for an hour when things go wrong (for example, cheza #16 failing to boot in a recognized way and continuously looping due to the intra-job retry). Plus, the job logs will be more readable when you don't have two boots in one job, and we'll get the flakes surfaced in our monitoring dashboards. If internal retries were really doing useful work, we may see an increase in flakes as a result of this. I'm committing to turning off boards or reducing coverage as necessary to handle this. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25790>
159 lines
6.0 KiB
Python
Executable File
159 lines
6.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Copyright © 2020 Google LLC
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
|
|
from serial_buffer import SerialBuffer
|
|
|
|
|
|
class CrosServoRun:
    """Run one boot-and-test cycle on a board reached over two serial lines.

    The CPU console and the EC console are each wrapped in a SerialBuffer.
    run() asks the EC to reboot the board, follows the CPU serial log through
    the bootloader and the test phase, and returns a process-style exit code.
    """

    # Ordered (pattern, message) pairs checked against every test-phase line.
    # The first matching entry ends the run with a failure; a message of None
    # means the failure is reported silently.  Order matters: the generic
    # "end Kernel panic" trailer must stay ahead of the more specific entries
    # so that lines matching several patterns are handled as before.
    _TEST_FAILURES = (
        (re.compile(r"---. end Kernel panic"), None),

        # There are very infrequent bus errors during power management
        # transitions on cheza, which we don't expect to be the case on
        # future boards.
        (re.compile(r"Kernel panic - not syncing: Asynchronous SError Interrupt"),
         "Detected cheza power management bus error, abandoning run."),

        # If the network device dies, it's probably not graphics's fault.
        (re.compile(r"NETDEV WATCHDOG"),
         "Detected network device failure, abandoning run."),

        # These HFI response errors started appearing with the introduction
        # of piglit runs.  CosmicPenguin says:
        #
        # "message ID 106 isn't a thing, so likely what happened is that we
        # got confused when parsing the HFI queue.  If it happened on only
        # one run, then memory corruption could be a possible clue"
        #
        # Given that it seems to trigger randomly near a GPU fault and then
        # break many tests after that, give up on the whole run.
        (re.compile(r"a6xx_hfi_send_msg.*Unexpected message id .* on the response queue"),
         "Detected cheza HFI response queue error, abandoning run."),

        (re.compile(r"coreboot.*bootblock starting"),
         "Detected spontaneous reboot, abandoning run."),

        (re.compile(r"arm-smmu 5040000.iommu: TLB sync timed out -- SMMU may be deadlocked"),
         "Detected cheza MMU fail, abandoning run."),
    )

    # Final verdict line emitted by the test job; group 1 is the result word.
    _RESULT_RE = re.compile(r"hwci: mesa: (\S*)")

    def __init__(self, cpu, ec, test_timeout):
        """Open the CPU and EC serial buffers.

        Args:
            cpu: CPU serial device, handed to SerialBuffer.
            ec: EC serial device, handed to SerialBuffer.
            test_timeout: timeout passed to the CPU buffer's lines() during
                the test phase (the caller in this file passes seconds).
        """
        self.cpu_ser = SerialBuffer(
            cpu, "results/serial.txt", "R SERIAL-CPU> ")
        # Merge the EC serial into the cpu_ser's line stream so that we can
        # effectively poll on both at the same time.
        self.ec_ser = SerialBuffer(
            ec, "results/serial-ec.txt", "R SERIAL-EC> ", line_queue=self.cpu_ser.line_queue)
        self.test_timeout = test_timeout

    def close(self):
        """Close both serial buffers, EC first."""
        self.ec_ser.close()
        self.cpu_ser.close()

    def ec_write(self, s):
        """Echo `s` to stdout and write it, encoded, to the EC serial port."""
        print("W SERIAL-EC> %s" % s)
        self.ec_ser.serial.write(s.encode())

    def cpu_write(self, s):
        """Echo `s` to stdout and write it, encoded, to the CPU serial port."""
        print("W SERIAL-CPU> %s" % s)
        self.cpu_ser.serial.write(s.encode())

    def print_error(self, message):
        """Print `message` to stdout wrapped in ANSI red color codes."""
        RED = '\033[0;31m'
        NO_COLOR = '\033[0m'
        print(RED + message + NO_COLOR)

    def _wait_for_bootloader(self):
        """Follow the CPU log until the board is past the bootloader.

        Returns:
            True once the bootloader prompt was answered or the kernel
            started booting; False (after printing the reason) on a
            recognized failure or when the log ends first.
        """
        tftp_failures = 0
        for line in self.cpu_ser.lines(timeout=120, phase="bootloader"):
            # This is emitted right when the bootloader pauses to check for
            # input.  Emit a ^N character to request network boot, because
            # we don't have a direct-to-netboot firmware on cheza.
            if re.search("load_archive: loading locale_en.bin", line):
                self.cpu_write("\016")
                return True

            # The Cheza firmware seems to occasionally get stuck looping in
            # this error state during TFTP booting, possibly based on amount
            # of network traffic around it, but it'll usually recover after
            # a reboot.  Currently mostly visible on
            # google-freedreno-cheza-14.
            if re.search("R8152: Bulk read error 0xffffffbf", line):
                tftp_failures += 1
                if tftp_failures >= 10:
                    # Nothing in this script retries; the run simply fails.
                    self.print_error(
                        "Detected intermittent tftp failure, abandoning run.")
                    return False

            # If the board has a netboot firmware and we made it to booting
            # the kernel, proceed to processing of the test run.
            if re.search("Booting Linux", line):
                return True

            # The Cheza boards have issues with failing to bring up power to
            # the system sometimes, possibly dependent on ambient
            # temperature in the farm.
            if re.search("POWER_GOOD not seen in time", line):
                self.print_error(
                    "Detected intermittent poweron failure, abandoning run.")
                return False

        print("Failed to make it through bootloader, abandoning run.")
        return False

    def _watch_test(self):
        """Follow the CPU log through the test phase and return its exit code.

        Returns:
            0 when the log reports "hwci: mesa: pass"; 1 for any other
            reported result, any recognized failure line, or when the log
            ends without a result.
        """
        for line in self.cpu_ser.lines(timeout=self.test_timeout, phase="test"):
            for pattern, message in self._TEST_FAILURES:
                if pattern.search(line):
                    if message is not None:
                        self.print_error(message)
                    return 1

            result = self._RESULT_RE.search(line)
            if result:
                return 0 if result.group(1) == "pass" else 1

        self.print_error(
            "Reached the end of the CPU serial log without finding a result")
        return 1

    def run(self):
        """Reboot the board via the EC and watch the run to completion.

        Returns:
            0 if the test job reported a pass, 1 otherwise.
        """
        # Flush any partial commands in the EC's prompt, then ask for a reboot.
        self.ec_write("\n")
        self.ec_write("reboot\n")

        if not self._wait_for_bootloader():
            return 1
        return self._watch_test()
|
|
|
|
|
|
def main():
    """Parse the command line, run one servo session, and exit with its status.

    Required options: --cpu and --ec (serial devices) and --test-timeout
    (minutes).  The process exit code is whatever CrosServoRun.run() returns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cpu', type=str,
                        help='CPU Serial device', required=True)
    parser.add_argument(
        '--ec', type=str, help='EC Serial device', required=True)
    parser.add_argument(
        '--test-timeout', type=int, help='Test phase timeout (minutes)', required=True)
    args = parser.parse_args()

    # The option is given in minutes; CrosServoRun is handed seconds.
    servo = CrosServoRun(args.cpu, args.ec, args.test_timeout * 60)
    try:
        retval = servo.run()
    finally:
        # Always leave the board powered down and the serial buffers closed,
        # even when run() raises instead of returning a status.
        try:
            # power down the CPU on the device
            servo.ec_write("power off\n")
        finally:
            servo.close()

    sys.exit(retval)
|
|
|
|
|
|
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|