radv: Remove check_status
Following discussion on the kernel mailing list [1], we are not gaining anything from this right now, and it does not handle soft recovery.

We will hear about the context loss and rationale when we vkQueueSubmit next.

We can come back to this if there is ever a Vulkan extension for figuring out innocent vs guilty like GL_EXT_robustness.

This does mean, however, that we return VK_SUCCESS for cancelled semaphore and fence waits, but this is legal per the Vulkan spec:

"Commands that wait indefinitely for device execution (namely vkDeviceWaitIdle, vkQueueWaitIdle, vkWaitForFences with a maximum timeout, and vkGetQueryPoolResults with the VK_QUERY_RESULT_WAIT_BIT bit set in flags) must return in finite time even in the case of a lost device, and return either VK_SUCCESS or VK_ERROR_DEVICE_LOST."

"If device loss occurs (see Lost Device) before the timeout has expired, vkWaitSemaphores must return in finite time with either VK_SUCCESS or VK_ERROR_DEVICE_LOST."

[1]: https://lists.freedesktop.org/archives/amd-gfx/2024-January/103337.html

Signed-off-by: Joshua Ashton <joshua@froggi.es>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27091>
This commit is contained in:
@@ -83,8 +83,6 @@ typedef void *drmDevicePtr;
|
||||
#include "ac_llvm_util.h"
|
||||
#endif
|
||||
|
||||
#include "ac_debug.h"
|
||||
|
||||
static bool
|
||||
radv_spm_trace_enabled(struct radv_instance *instance)
|
||||
{
|
||||
@@ -619,49 +617,6 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *ph
|
||||
add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_report_gpuvm_fault(struct radv_device *device)
|
||||
{
|
||||
struct radv_winsys_gpuvm_fault_info fault_info = {0};
|
||||
|
||||
if (!radv_vm_fault_occurred(device, &fault_info))
|
||||
return;
|
||||
|
||||
fprintf(stderr, "radv: GPUVM fault detected at address 0x%08" PRIx64 ".\n", fault_info.addr);
|
||||
ac_print_gpuvm_fault_status(stderr, device->physical_device->rad_info.gfx_level, fault_info.status);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
radv_check_status(struct vk_device *vk_device)
|
||||
{
|
||||
struct radv_device *device = container_of(vk_device, struct radv_device, vk);
|
||||
enum radv_reset_status status;
|
||||
bool context_reset = false;
|
||||
|
||||
/* If an INNOCENT_CONTEXT_RESET is found in one of the contexts, we need to
|
||||
* keep querying in case there's a guilty one, so we can correctly log if the
|
||||
* hung happened in this app or not */
|
||||
for (int i = 0; i < RADV_NUM_HW_CTX; i++) {
|
||||
if (device->hw_ctx[i]) {
|
||||
status = device->ws->ctx_query_reset_status(device->hw_ctx[i]);
|
||||
|
||||
if (status == RADV_GUILTY_CONTEXT_RESET) {
|
||||
radv_report_gpuvm_fault(device);
|
||||
return vk_device_set_lost(&device->vk, "GPU hung detected in this process");
|
||||
} else if (status == RADV_INNOCENT_CONTEXT_RESET) {
|
||||
context_reset = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (context_reset) {
|
||||
radv_report_gpuvm_fault(device);
|
||||
return vk_device_set_lost(&device->vk, "GPU hung triggered by other process");
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
static VkResult
|
||||
capture_trace(VkQueue _queue)
|
||||
{
|
||||
@@ -907,7 +862,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
|
||||
device->vk.capture_trace = capture_trace;
|
||||
|
||||
device->vk.command_buffer_ops = &radv_cmd_buffer_ops;
|
||||
device->vk.check_status = radv_check_status;
|
||||
|
||||
device->instance = physical_device->instance;
|
||||
device->physical_device = physical_device;
|
||||
|
@@ -31,6 +31,8 @@
|
||||
#include "vk_semaphore.h"
|
||||
#include "vk_sync.h"
|
||||
|
||||
#include "ac_debug.h"
|
||||
|
||||
enum radeon_ctx_priority
|
||||
radv_get_queue_global_priority(const VkDeviceQueueGlobalPriorityCreateInfoKHR *pObj)
|
||||
{
|
||||
@@ -1687,6 +1689,18 @@ fail:
|
||||
return result;
|
||||
}
|
||||
|
||||
static void
|
||||
radv_report_gpuvm_fault(struct radv_device *device)
|
||||
{
|
||||
struct radv_winsys_gpuvm_fault_info fault_info = {0};
|
||||
|
||||
if (!radv_vm_fault_occurred(device, &fault_info))
|
||||
return;
|
||||
|
||||
fprintf(stderr, "radv: GPUVM fault detected at address 0x%08" PRIx64 ".\n", fault_info.addr);
|
||||
ac_print_gpuvm_fault_status(stderr, device->physical_device->rad_info.gfx_level, fault_info.status);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
radv_queue_sparse_submit(struct vk_queue *vqueue, struct vk_queue_submit *submission)
|
||||
{
|
||||
@@ -1722,6 +1736,7 @@ fail:
|
||||
* VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
|
||||
* to submit the same job again to this device.
|
||||
*/
|
||||
radv_report_gpuvm_fault(queue->device);
|
||||
result = vk_device_set_lost(&queue->device->vk, "vkQueueSubmit() failed");
|
||||
}
|
||||
return result;
|
||||
@@ -1759,6 +1774,7 @@ fail:
|
||||
* VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
|
||||
* to submit the same job again to this device.
|
||||
*/
|
||||
radv_report_gpuvm_fault(queue->device);
|
||||
result = vk_device_set_lost(&queue->device->vk, "vkQueueSubmit() failed");
|
||||
}
|
||||
return result;
|
||||
|
Reference in New Issue
Block a user