radv: Remove check_status

Following discussion on kernel mailing list[1], we are not gaining
anything from this right now, and it does not handle soft recovery.

We will hear about the context loss and rationale when we vkQueueSubmit
next.

We can come back to this if there is ever a Vulkan extension for
figuring out innocent vs guilty like GL_EXT_robustness.

This does mean however that we return VK_SUCCESS for cancelled semaphore
and fence waits, but this is legal per the Vulkan spec:

"Commands that wait indefinitely for device execution (namely
vkDeviceWaitIdle, vkQueueWaitIdle, vkWaitForFences with a maximum
timeout, and vkGetQueryPoolResults with the VK_QUERY_RESULT_WAIT_BIT
bit set in flags) must return in finite time even in the case of a lost
device, and return either VK_SUCCESS or VK_ERROR_DEVICE_LOST."

"If device loss occurs (see Lost Device) before the timeout has expired,
vkWaitSemaphores must return in finite time with either VK_SUCCESS or
VK_ERROR_DEVICE_LOST."

[1]: https://lists.freedesktop.org/archives/amd-gfx/2024-January/103337.html

Signed-off-by: Joshua Ashton <joshua@froggi.es>

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27091>
This commit is contained in:
Joshua Ashton
2024-01-16 00:42:45 +00:00
committed by Marge Bot
parent b24a4b8949
commit f62bdde703
2 changed files with 16 additions and 46 deletions

View File

@@ -83,8 +83,6 @@ typedef void *drmDevicePtr;
#include "ac_llvm_util.h"
#endif
#include "ac_debug.h"
static bool
radv_spm_trace_enabled(struct radv_instance *instance)
{
@@ -619,49 +617,6 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *ph
add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
}
/**
 * Print GPUVM fault details to stderr when a fault is reported.
 *
 * A zero-initialized fault_info is handed to radv_vm_fault_occurred(); when
 * that call returns true, the faulting address and the decoded fault status
 * are written to stderr. Otherwise nothing is printed.
 */
static void
radv_report_gpuvm_fault(struct radv_device *device)
{
   struct radv_winsys_gpuvm_fault_info fault_info = {0};

   if (radv_vm_fault_occurred(device, &fault_info)) {
      fprintf(stderr, "radv: GPUVM fault detected at address 0x%08" PRIx64 ".\n", fault_info.addr);
      ac_print_gpuvm_fault_status(stderr, device->physical_device->rad_info.gfx_level, fault_info.status);
   }
}
static VkResult
radv_check_status(struct vk_device *vk_device)
{
struct radv_device *device = container_of(vk_device, struct radv_device, vk);
enum radv_reset_status status;
bool context_reset = false;
/* If an INNOCENT_CONTEXT_RESET is found in one of the contexts, we need to
* keep querying in case there's a guilty one, so we can correctly log if the
* hung happened in this app or not */
for (int i = 0; i < RADV_NUM_HW_CTX; i++) {
if (device->hw_ctx[i]) {
status = device->ws->ctx_query_reset_status(device->hw_ctx[i]);
if (status == RADV_GUILTY_CONTEXT_RESET) {
radv_report_gpuvm_fault(device);
return vk_device_set_lost(&device->vk, "GPU hung detected in this process");
} else if (status == RADV_INNOCENT_CONTEXT_RESET) {
context_reset = true;
}
}
}
if (context_reset) {
radv_report_gpuvm_fault(device);
return vk_device_set_lost(&device->vk, "GPU hung triggered by other process");
}
return VK_SUCCESS;
}
static VkResult
capture_trace(VkQueue _queue)
{
@@ -907,7 +862,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
device->vk.capture_trace = capture_trace;
device->vk.command_buffer_ops = &radv_cmd_buffer_ops;
device->vk.check_status = radv_check_status;
device->instance = physical_device->instance;
device->physical_device = physical_device;

View File

@@ -31,6 +31,8 @@
#include "vk_semaphore.h"
#include "vk_sync.h"
#include "ac_debug.h"
enum radeon_ctx_priority
radv_get_queue_global_priority(const VkDeviceQueueGlobalPriorityCreateInfoKHR *pObj)
{
@@ -1687,6 +1689,18 @@ fail:
return result;
}
/**
 * Print GPUVM fault details to stderr when a fault is reported.
 *
 * A zero-initialized fault_info is handed to radv_vm_fault_occurred(); when
 * that call returns true, the faulting address and the decoded fault status
 * are written to stderr. Otherwise nothing is printed.
 */
static void
radv_report_gpuvm_fault(struct radv_device *device)
{
   struct radv_winsys_gpuvm_fault_info fault_info = {0};

   if (radv_vm_fault_occurred(device, &fault_info)) {
      fprintf(stderr, "radv: GPUVM fault detected at address 0x%08" PRIx64 ".\n", fault_info.addr);
      ac_print_gpuvm_fault_status(stderr, device->physical_device->rad_info.gfx_level, fault_info.status);
   }
}
static VkResult
radv_queue_sparse_submit(struct vk_queue *vqueue, struct vk_queue_submit *submission)
{
@@ -1722,6 +1736,7 @@ fail:
* VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
* to submit the same job again to this device.
*/
radv_report_gpuvm_fault(queue->device);
result = vk_device_set_lost(&queue->device->vk, "vkQueueSubmit() failed");
}
return result;
@@ -1759,6 +1774,7 @@ fail:
* VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
* to submit the same job again to this device.
*/
radv_report_gpuvm_fault(queue->device);
result = vk_device_set_lost(&queue->device->vk, "vkQueueSubmit() failed");
}
return result;