tu: Re-enable bufferDeviceAddressCaptureReplay

We cannot immediately free a VMA range when its BO is freed; we have
to wait until the kernel stops considering the BO busy and frees its
internal VMA range. Otherwise userspace and kernel VMA state would get
desynchronized.

To fix this and re-enable replaying of BDA, we place the freed BO's
information into a queue of zombie VMAs (sketched below). The queue is
drained:
- on BO allocation;
- when we cannot allocate an iova passed from the client.
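
Below is a standalone sketch of this deferred ("zombie") free scheme. It
uses simplified stand-in types and helpers rather than the driver's real
u_vector/util_vma_heap/DRM fence code; the actual implementation is
tu_free_zombie_vma_locked() in the diff below.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct zombie_vma {
   int      fence;      /* last submitted fence when the BO was freed */
   uint32_t gem_handle;
   uint64_t iova;
   uint64_t size;
};

#define MAX_ZOMBIES 64

struct dev_state {
   int last_submitted_fence;  /* bumped on every queue submission */
   int last_signaled_fence;   /* what the kernel reports as retired */
   unsigned num_zombies;
   struct zombie_vma zombies[MAX_ZOMBIES];
};

/* Stand-ins for util_vma_heap_free() and tu_gem_close(). */
static void vma_heap_free(struct dev_state *dev, uint64_t iova, uint64_t size) {}
static void gem_close(struct dev_state *dev, uint32_t handle) {}

/* Instead of freeing the VMA range right away when a BO is freed,
 * remember it together with the last submitted fence. */
static void
defer_vma_free(struct dev_state *dev, uint32_t gem_handle,
               uint64_t iova, uint64_t size)
{
   assert(dev->num_zombies < MAX_ZOMBIES);
   struct zombie_vma *z = &dev->zombies[dev->num_zombies++];
   z->fence = dev->last_submitted_fence;
   z->gem_handle = gem_handle;
   z->iova = iova;
   z->size = size;
}

/* Drain the queue: release only entries whose fence has signaled, i.e.
 * the kernel no longer considers the BO busy.  `wait_for_oldest` models
 * the blocking wait used when a client-provided iova is still held by a
 * zombie.  The real code additionally clears the kernel-side iova
 * (MSM_INFO_SET_IOVA with value 0) before closing the GEM handle. */
static void
drain_zombie_vmas(struct dev_state *dev, bool wait_for_oldest)
{
   if (wait_for_oldest && dev->num_zombies)
      dev->last_signaled_fence = dev->zombies[0].fence; /* pretend we waited */

   unsigned kept = 0;
   for (unsigned i = 0; i < dev->num_zombies; i++) {
      struct zombie_vma *z = &dev->zombies[i];
      if (z->fence <= dev->last_signaled_fence) {
         gem_close(dev, z->gem_handle);         /* kernel frees its VMA */
         vma_heap_free(dev, z->iova, z->size);  /* now ours can follow  */
      } else {
         dev->zombies[kept++] = *z;             /* possibly still busy  */
      }
   }
   dev->num_zombies = kept;
}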

For more information about this, see:
https://gitlab.freedesktop.org/mesa/mesa/-/issues/7106
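
For additional context (not part of this change): the "iova passed from
the client" is the opaque capture address that a capture tool records
and a replayer hands back to the driver. On the application side this
looks roughly like the snippet below, where captured_addr and
mem_type_index are assumed to come from the capture run; this address is
roughly what reaches tu_allocate_userspace_iova() as client_iova in the
diff below.

#include <vulkan/vulkan.h>

/* Replay-side allocation: ask the implementation to place the memory at
 * the address recorded with vkGetDeviceMemoryOpaqueCaptureAddress()
 * during the capture run. */
static VkDeviceMemory
allocate_replayed_memory(VkDevice device, VkDeviceSize size,
                         uint32_t mem_type_index, uint64_t captured_addr)
{
   const VkMemoryOpaqueCaptureAddressAllocateInfo addr_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO,
      .opaqueCaptureAddress = captured_addr,
   };
   const VkMemoryAllocateFlagsInfo flags_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
      .pNext = &addr_info,
      .flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT |
               VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT,
   };
   const VkMemoryAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
      .pNext = &flags_info,
      .allocationSize = size,
      .memoryTypeIndex = mem_type_index,
   };

   VkDeviceMemory memory = VK_NULL_HANDLE;
   if (vkAllocateMemory(device, &alloc_info, NULL, &memory) != VK_SUCCESS)
      return VK_NULL_HANDLE;
   return memory;
}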

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18254>
Author:    Danylo Piliaiev
Date:      2022-08-25 18:25:30 +03:00
Committed: Marge Bot
Parent:    d2f9346d9d
Commit:    63904240f2
5 changed files with 168 additions and 57 deletions


@@ -1744,7 +1744,7 @@ tu_queue_init(struct tu_device *device,
return vk_startup_errorf(device->instance, VK_ERROR_INITIALIZATION_FAILED,
"submitqueue create failed");
queue->last_submit_timestamp = -1;
queue->fence = -1;
return VK_SUCCESS;
}
@@ -2412,6 +2412,17 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
goto fail_timeline_cond;
}
if (physical_device->has_set_iova) {
STATIC_ASSERT(TU_MAX_QUEUE_FAMILIES == 1);
if (!u_vector_init(&device->zombie_vmas, 64,
sizeof(struct tu_zombie_vma))) {
result = vk_startup_errorf(physical_device->instance,
VK_ERROR_INITIALIZATION_FAILED,
"zombie_vmas create failed");
goto fail_free_zombie_vma;
}
}
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
@@ -2439,6 +2450,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
*pDevice = tu_device_to_handle(device);
return VK_SUCCESS;
fail_free_zombie_vma:
u_vector_finish(&device->zombie_vmas);
fail_timeline_cond:
fail_prepare_perfcntrs_pass_cs:
free(device->perfcntrs_pass_cs_entries);
@@ -2534,6 +2547,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
util_sparse_array_finish(&device->bo_map);
u_rwlock_destroy(&device->dma_bo_lock);
u_vector_finish(&device->zombie_vmas);
for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
for (unsigned q = 0; q < device->queue_count[i]; q++)
tu_queue_finish(&device->queues[i][q]);


@@ -21,6 +21,7 @@
#include "tu_util.h"
#include "util/vma.h"
#include "util/u_vector.h"
/* queue types */
#define TU_QUEUE_GENERAL 0
@@ -153,7 +154,7 @@ struct tu_queue
uint32_t msm_queue_id;
int64_t last_submit_timestamp; /* timestamp of the last queue submission for kgsl */
int fence; /* timestamp/fence of the last queue submission */
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
@@ -317,6 +318,12 @@ struct tu_device
*/
struct util_sparse_array bo_map;
/* We cannot immediately free the VMA when freeing a BO; the kernel
* truly frees the BO only when it stops being busy.
* So we have to free our VMA only after the kernel does it.
*/
struct u_vector zombie_vmas;
/* Command streams to set pass index to a scratch reg */
struct tu_cs *perfcntrs_pass_cs;
struct tu_cs_entry *perfcntrs_pass_cs_entries;


@@ -78,6 +78,13 @@ struct tu_knl {
const struct vk_device_entrypoint_table *device_entrypoints;
};
struct tu_zombie_vma {
int fence;
uint32_t gem_handle;
uint64_t iova;
uint64_t size;
};
struct tu_timeline_sync {
struct vk_sync base;


@@ -273,6 +273,100 @@ sync_cache_bo(struct tu_device *dev,
VkDeviceSize size,
enum tu_mem_sync_op op);
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
{
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
tv->tv_sec = t.tv_sec + ns / 1000000000;
tv->tv_nsec = t.tv_nsec + ns % 1000000000;
}
static VkResult
tu_wait_fence(struct tu_device *dev,
uint32_t queue_id,
int fence,
uint64_t timeout_ns)
{
/* fence was created when no work was yet submitted */
if (fence < 0)
return VK_SUCCESS;
struct drm_msm_wait_fence req = {
.fence = fence,
.queueid = queue_id,
};
int ret;
get_abs_timeout(&req.timeout, timeout_ns);
ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req));
if (ret) {
if (ret == -ETIMEDOUT) {
return VK_TIMEOUT;
} else {
mesa_loge("tu_wait_fence failed! %d (%s)", ret, strerror(errno));
return VK_ERROR_UNKNOWN;
}
}
return VK_SUCCESS;
}
static VkResult
tu_free_zombie_vma_locked(struct tu_device *dev, bool wait)
{
if (!u_vector_length(&dev->zombie_vmas))
return VK_SUCCESS;
if (wait) {
struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
u_vector_head(&dev->zombie_vmas);
/* Wait for 3s (arbitrary timeout) */
VkResult ret = tu_wait_fence(dev, dev->queues[0]->msm_queue_id,
vma->fence, 3000000000);
if (ret != VK_SUCCESS)
return ret;
}
int last_signaled_fence = -1;
while (u_vector_length(&dev->zombie_vmas) > 0) {
struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
u_vector_tail(&dev->zombie_vmas);
if (vma->fence > last_signaled_fence) {
VkResult ret =
tu_wait_fence(dev, dev->queues[0]->msm_queue_id, vma->fence, 0);
if (ret != VK_SUCCESS)
return ret;
last_signaled_fence = vma->fence;
}
/* Ensure that the kernel's internal VMA is freed. */
struct drm_msm_gem_info req = {
.handle = vma->gem_handle,
.info = MSM_INFO_SET_IOVA,
.value = 0,
};
int ret =
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
if (ret < 0) {
mesa_loge("MSM_INFO_SET_IOVA(0) failed! %d (%s)", ret,
strerror(errno));
return VK_ERROR_UNKNOWN;
}
tu_gem_close(dev, vma->gem_handle);
util_vma_heap_free(&dev->vma, vma->iova, vma->size);
u_vector_remove(&dev->zombie_vmas);
}
return VK_SUCCESS;
}
static VkResult
tu_allocate_userspace_iova(struct tu_device *dev,
uint32_t gem_handle,
@@ -285,13 +379,24 @@ tu_allocate_userspace_iova(struct tu_device *dev,
*iova = 0;
tu_free_zombie_vma_locked(dev, false);
if (flags & TU_BO_ALLOC_REPLAYABLE) {
if (client_iova) {
if (util_vma_heap_alloc_addr(&dev->vma, client_iova,
size)) {
if (util_vma_heap_alloc_addr(&dev->vma, client_iova, size)) {
*iova = client_iova;
} else {
return VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS;
/* Address may be already freed by us, but not considered as
* freed by the kernel. We have to wait until all work that
* may hold the address is done. Since addresses are meant to
* be replayed only by debug tooling, it should be ok to wait.
*/
if (tu_free_zombie_vma_locked(dev, true) == VK_SUCCESS &&
util_vma_heap_alloc_addr(&dev->vma, client_iova, size)) {
*iova = client_iova;
} else {
return VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS;
}
}
} else {
/* We have to separate replayable IOVAs from ordinary ones in order to
@@ -299,8 +404,7 @@ tu_allocate_userspace_iova(struct tu_device *dev,
* them from the other end of the address space.
*/
dev->vma.alloc_high = true;
*iova =
util_vma_heap_alloc(&dev->vma, size, 0x1000);
*iova = util_vma_heap_alloc(&dev->vma, size, 0x1000);
}
} else {
dev->vma.alloc_high = false;
@@ -320,8 +424,10 @@ tu_allocate_userspace_iova(struct tu_device *dev,
int ret =
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
if (ret < 0)
if (ret < 0) {
mesa_loge("MSM_INFO_SET_IOVA failed! %d (%s)", ret, strerror(errno));
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
return VK_SUCCESS;
}
@@ -620,19 +726,26 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
if (dev->physical_device->has_set_iova) {
mtx_lock(&dev->vma_mutex);
util_vma_heap_free(&dev->vma, bo->iova, bo->size);
struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
u_vector_add(&dev->zombie_vmas);
vma->gem_handle = bo->gem_handle;
vma->iova = bo->iova;
vma->size = bo->size;
vma->fence = p_atomic_read(&dev->queues[0]->fence);
mtx_unlock(&dev->vma_mutex);
memset(bo, 0, sizeof(*bo));
} else {
/* Our BO structs are stored in a sparse array in the physical device,
* so we don't want to free the BO pointer, instead we want to reset it
* to 0, to signal that array entry as being free.
*/
uint32_t gem_handle = bo->gem_handle;
memset(bo, 0, sizeof(*bo));
tu_gem_close(dev, gem_handle);
}
/* Our BO structs are stored in a sparse array in the physical device,
* so we don't want to free the BO pointer, instead we want to reset it
* to 0, to signal that array entry as being free.
*/
uint32_t gem_handle = bo->gem_handle;
memset(bo, 0, sizeof(*bo));
tu_gem_close(dev, gem_handle);
u_rwlock_rdunlock(&dev->dma_bo_lock);
}
@@ -1166,6 +1279,8 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
if (ret)
return vk_device_set_lost(&queue->device->vk, "submit failed: %m");
p_atomic_set(&queue->fence, req.fence);
#if HAVE_PERFETTO
tu_perfetto_submit(queue->device, queue->device->submit_count);
#endif
@@ -1230,33 +1345,10 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
return VK_SUCCESS;
}
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
{
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
tv->tv_sec = t.tv_sec + ns / 1000000000;
tv->tv_nsec = t.tv_nsec + ns % 1000000000;
}
static VkResult
msm_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
{
struct drm_msm_wait_fence req = {
.fence = syncobj->fence,
.queueid = syncobj->msm_queue_id,
};
int ret;
get_abs_timeout(&req.timeout, 1000000000);
ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req));
if (ret && (ret != -ETIMEDOUT)) {
fprintf(stderr, "wait-fence failed! %d (%s)", ret, strerror(errno));
return VK_TIMEOUT;
}
return VK_SUCCESS;
return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000);
}
static VkResult
@@ -1413,18 +1505,8 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
goto fail;
}
/*
* device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start,
* &device->va_size);
*
* If BO is freed while kernel considers it busy, our VMA state gets
* desynchronized from kernel's VMA state, because kernel waits
* until BO stops being busy. And whether BO is busy decided at
* submission granularity.
*
* Disable this capability until solution is found.
*/
device->has_set_iova = false;
device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start,
&device->va_size);
/* Even if kernel is new enough, the GPU itself may not support it. */
device->has_cached_coherent_memory =


@@ -941,11 +941,11 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
}
struct kgsl_syncobj last_submit_sync;
if (queue->last_submit_timestamp >= 0)
if (queue->fence >= 0)
last_submit_sync = (struct kgsl_syncobj) {
.state = KGSL_SYNCOBJ_STATE_TS,
.queue = queue,
.timestamp = queue->last_submit_timestamp,
.timestamp = queue->fence,
};
else
last_submit_sync = (struct kgsl_syncobj) {
@@ -1124,7 +1124,7 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
return result;
}
queue->last_submit_timestamp = req.timestamp;
p_atomic_set(&queue->fence, req.timestamp);
for (uint32_t i = 0; i < vk_submit->signal_count; i++) {
struct kgsl_syncobj *signal_sync =