radv: Remove check_status

Following discussion on the kernel mailing list[1], we are not gaining anything from this right now, and it does not handle soft recovery. We will hear about the context loss and rationale when we vkQueueSubmit next. We can come back to this if there is ever a Vulkan extension for figuring out innocent vs. guilty contexts, like GL_EXT_robustness. This does mean, however, that we return VK_SUCCESS for cancelled semaphore and fence waits, but this is legal per the Vulkan spec: "Commands that wait indefinitely for device execution (namely vkDeviceWaitIdle, vkQueueWaitIdle, vkWaitForFences with a maximum timeout, and vkGetQueryPoolResults with the VK_QUERY_RESULT_WAIT_BIT bit set in flags) must return in finite time even in the case of a lost device, and return either VK_SUCCESS or VK_ERROR_DEVICE_LOST." "If device loss occurs (see Lost Device) before the timeout has expired, vkWaitSemaphores must return in finite time with either VK_SUCCESS or VK_ERROR_DEVICE_LOST." [1]: https://lists.freedesktop.org/archives/amd-gfx/2024-January/103337.html Signed-off-by: Joshua Ashton <joshua@froggi.es> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27091>
2024-01-16 00:42:45 +00:00
parent b24a4b8949
commit f62bdde703
2 changed files with 16 additions and 46 deletions
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -83,8 +83,6 @@ typedef void *drmDevicePtr;
 #include "ac_llvm_util.h"
 #endif

-#include "ac_debug.h"
-
 static bool
 radv_spm_trace_enabled(struct radv_instance *instance)
 {
@@ -619,49 +617,6 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *ph
   add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
 }

-static void
-radv_report_gpuvm_fault(struct radv_device *device)
-{
-   struct radv_winsys_gpuvm_fault_info fault_info = {0};
-
-   if (!radv_vm_fault_occurred(device, &fault_info))
-      return;
-
-   fprintf(stderr, "radv: GPUVM fault detected at address 0x%08" PRIx64 ".\n", fault_info.addr);
-   ac_print_gpuvm_fault_status(stderr, device->physical_device->rad_info.gfx_level, fault_info.status);
-}
-
-static VkResult
-radv_check_status(struct vk_device *vk_device)
-{
-   struct radv_device *device = container_of(vk_device, struct radv_device, vk);
-   enum radv_reset_status status;
-   bool context_reset = false;
-
-   /* If an INNOCENT_CONTEXT_RESET is found in one of the contexts, we need to
-    * keep querying in case there's a guilty one, so we can correctly log if the
-    * hung happened in this app or not */
-   for (int i = 0; i < RADV_NUM_HW_CTX; i++) {
-      if (device->hw_ctx[i]) {
-         status = device->ws->ctx_query_reset_status(device->hw_ctx[i]);
-
-         if (status == RADV_GUILTY_CONTEXT_RESET) {
-            radv_report_gpuvm_fault(device);
-            return vk_device_set_lost(&device->vk, "GPU hung detected in this process");
-         } else if (status == RADV_INNOCENT_CONTEXT_RESET) {
-            context_reset = true;
-         }
-      }
-   }
-
-   if (context_reset) {
-      radv_report_gpuvm_fault(device);
-      return vk_device_set_lost(&device->vk, "GPU hung triggered by other process");
-   }
-
-   return VK_SUCCESS;
-}
-
 static VkResult
 capture_trace(VkQueue _queue)
 {
@@ -907,7 +862,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
   device->vk.capture_trace = capture_trace;

   device->vk.command_buffer_ops = &radv_cmd_buffer_ops;
-   device->vk.check_status = radv_check_status;

   device->instance = physical_device->instance;
   device->physical_device = physical_device;
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -31,6 +31,8 @@
 #include "vk_semaphore.h"
 #include "vk_sync.h"

+#include "ac_debug.h"
+
 enum radeon_ctx_priority
 radv_get_queue_global_priority(const VkDeviceQueueGlobalPriorityCreateInfoKHR *pObj)
 {
@@ -1687,6 +1689,18 @@ fail:
   return result;
 }

+static void
+radv_report_gpuvm_fault(struct radv_device *device)
+{
+   struct radv_winsys_gpuvm_fault_info fault_info = {0};
+   /* Diagnostic only: returns without printing unless radv_vm_fault_occurred() reports a fault. */
+   if (!radv_vm_fault_occurred(device, &fault_info))
+      return;
+   /* Log the faulting address, then pass the status and the device's gfx_level to the status printer. */
+   fprintf(stderr, "radv: GPUVM fault detected at address 0x%08" PRIx64 ".\n", fault_info.addr);
+   ac_print_gpuvm_fault_status(stderr, device->physical_device->rad_info.gfx_level, fault_info.status);
+}
+
 static VkResult
 radv_queue_sparse_submit(struct vk_queue *vqueue, struct vk_queue_submit *submission)
 {
@@ -1722,6 +1736,7 @@ fail:
       * VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
       * to submit the same job again to this device.
       */
+      radv_report_gpuvm_fault(queue->device);
      result = vk_device_set_lost(&queue->device->vk, "vkQueueSubmit() failed");
   }
   return result;
@@ -1759,6 +1774,7 @@ fail:
       * VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
       * to submit the same job again to this device.
       */
+      radv_report_gpuvm_fault(queue->device);
      result = vk_device_set_lost(&queue->device->vk, "vkQueueSubmit() failed");
   }
   return result;