venus: dispatch background shader tasks to secondary ring

Summary:
- Add a perf option to force primary ring submission.
- Let the device own secondary ring(s), created on demand.
- Track, via TLS, the threads on which a swapchain or command pool gets
  created, and use that to steer ring dispatch.
- If a pipeline creation or pipeline cache retrieval happens on a
  background thread (one not tracked as a hot path), force it to be
  synchronous and dispatch it to the secondary ring after waiting for
  the primary ring to become current.
- If it happens on a hot-path thread, dispatch it to the primary ring so
  it is not blocked behind the tasks queued on the secondary ring.

Signed-off-by: Yiwei Zhang <zzyiwei@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26179>
Author: Yiwei Zhang
Date: 2023-12-05 20:17:44 -08:00
Committed by: Marge Bot
Commit: d17ddcc847 (parent 5b26bebcf4)
10 changed files with 189 additions and 14 deletions


@@ -355,6 +355,8 @@ vn_GetSwapchainGrallocUsage2ANDROID(
    if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID)
       *grallocProducerUsage |= vn_android_gralloc_get_shared_present_usage();
 
+   vn_tls_set_primary_ring_submission();
+
    return VK_SUCCESS;
 }


@@ -687,6 +687,8 @@ vn_CreateCommandPool(VkDevice device,
    vn_async_vkCreateCommandPool(dev->primary_ring, device, pCreateInfo, NULL,
                                 &pool_handle);
 
+   vn_tls_set_primary_ring_submission();
+
    *pCommandPool = pool_handle;
 
    return VK_SUCCESS;


@@ -51,6 +51,7 @@ static const struct debug_control vn_perf_options[] = {
    { "no_query_feedback", VN_PERF_NO_QUERY_FEEDBACK },
    { "no_async_mem_alloc", VN_PERF_NO_ASYNC_MEM_ALLOC },
    { "no_tiled_wsi_image", VN_PERF_NO_TILED_WSI_IMAGE },
+   { "no_multi_ring", VN_PERF_NO_MULTI_RING },
    { NULL, 0 },
    /* clang-format on */
 };
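
(Not part of the diff: like the other entries in vn_perf_options, the new
no_multi_ring flag is expected to be toggled at runtime through the VN_PERF
environment variable, e.g. VN_PERF=no_multi_ring, which keeps every
submission on the primary ring.)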
@@ -238,3 +239,40 @@ vn_relax(struct vn_relax_state *state)
    const uint32_t shift = util_last_bit(*iter) - busy_wait_order - 1;
    os_time_sleep(base_sleep_us << shift);
 }
+
+static void
+vn_tls_free(void *tls)
+{
+   free(tls);
+}
+
+static tss_t vn_tls_key;
+static bool vn_tls_key_valid;
+
+static void
+vn_tls_key_create_once(void)
+{
+   vn_tls_key_valid = tss_create(&vn_tls_key, vn_tls_free) == thrd_success;
+   if (!vn_tls_key_valid && VN_DEBUG(INIT))
+      vn_log(NULL, "WARNING: failed to create vn_tls_key");
+}
+
+struct vn_tls *
+vn_tls_get(void)
+{
+   static once_flag once = ONCE_FLAG_INIT;
+   call_once(&once, vn_tls_key_create_once);
+   if (unlikely(!vn_tls_key_valid))
+      return NULL;
+
+   struct vn_tls *tls = tss_get(vn_tls_key);
+   if (likely(tls))
+      return tls;
+
+   tls = calloc(1, sizeof(*tls));
+   if (tls && tss_set(vn_tls_key, tls) == thrd_success)
+      return tls;
+
+   free(tls);
+   return NULL;
+}
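
(Not part of the diff: tss_create/tss_get/tss_set and call_once come from
C11 <threads.h>; the destructor registered here, vn_tls_free(), runs at
thread exit for any non-NULL value, so each thread's struct vn_tls is
reclaimed automatically without an explicit teardown path.)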


@@ -124,6 +124,7 @@ enum vn_perf {
    VN_PERF_NO_QUERY_FEEDBACK = 1ull << 8,
    VN_PERF_NO_ASYNC_MEM_ALLOC = 1ull << 9,
    VN_PERF_NO_TILED_WSI_IMAGE = 1ull << 10,
+   VN_PERF_NO_MULTI_RING = 1ull << 11,
 };
 
 typedef uint64_t vn_object_id;
@@ -208,6 +209,16 @@ struct vn_relax_state {
    const char *reason;
 };
 
+struct vn_tls {
+   /* Track swapchain and command pool creations on threads so dispatch of the
+    * following on non-tracked threads can be routed as synchronous on the
+    * secondary ring:
+    * - pipeline creations
+    * - pipeline cache retrievals
+    */
+   bool primary_ring_submission;
+};
+
 void
 vn_env_init(void);
@@ -469,4 +480,24 @@ vn_gettid(void)
 #endif
 }
 
+struct vn_tls *
+vn_tls_get(void);
+
+static inline void
+vn_tls_set_primary_ring_submission(void)
+{
+   struct vn_tls *tls = vn_tls_get();
+   if (likely(tls))
+      tls->primary_ring_submission = true;
+}
+
+static inline bool
+vn_tls_get_primary_ring_submission(void)
+{
+   const struct vn_tls *tls = vn_tls_get();
+   if (likely(tls))
+      return tls->primary_ring_submission;
+   return true;
+}
+
 #endif /* VN_COMMON_H */
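
Not part of the commit: a minimal sketch of how the two helpers above are
meant to interact across threads. It assumes vn_common.h from this series;
the two example_* functions are hypothetical wrappers, not driver code.

#include "vn_common.h"

/* Hot-path side: entry points that create a swapchain or a command pool tag
 * the calling thread (see the vn_wsi.c and vn_command_buffer.c hunks). */
static void
example_tag_hot_path_thread(void)
{
   vn_tls_set_primary_ring_submission();
}

/* Dispatch side: the pipeline paths consult the flag. Untagged background
 * threads see false here and are routed to the secondary ring by
 * vn_get_target_ring() in vn_pipeline.c; if TLS setup ever fails, the getter
 * defaults to true so everything safely stays on the primary ring. */
static bool
example_is_hot_path_thread(void)
{
   return vn_tls_get_primary_ring_submission();
}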


@@ -436,6 +436,41 @@ vn_device_update_shader_cache_id(struct vn_device *dev)
 #endif
 }
 
+bool
+vn_device_secondary_ring_init_once(struct vn_device *dev)
+{
+   VN_TRACE_FUNC();
+   assert(!dev->force_primary_ring_submission);
+
+   static bool ok = true;
+   if (!ok)
+      return ok;
+
+   mtx_lock(&dev->ring_mutex);
+
+   /* allows caller to check secondary ring without holding a lock */
+   if (dev->secondary_ring)
+      goto out_unlock;
+
+   /* keep the extra for potential roundtrip sync on secondary ring */
+   static const size_t extra_size = sizeof(uint32_t);
+   /* only need a small ring for synchronous cmds on secondary ring */
+   static const size_t buf_size = 16 * 1024;
+   struct vn_ring_layout layout;
+   vn_ring_get_layout(buf_size, extra_size, &layout);
+
+   dev->secondary_ring = vn_ring_create(dev->instance, &layout);
+   if (!dev->secondary_ring) {
+      ok = false;
+      vn_log(dev->instance, "WARNING: failed to create secondary ring");
+   }
+
+out_unlock:
+   mtx_unlock(&dev->ring_mutex);
+
+   return ok;
+}
+
 static VkResult
 vn_device_init(struct vn_device *dev,
                struct vn_physical_device *physical_dev,
@@ -454,6 +489,9 @@ vn_device_init(struct vn_device *dev,
    dev->renderer = instance->renderer;
    dev->primary_ring = instance->ring.ring;
 
+   /* can be extended for app compat purpose */
+   dev->force_primary_ring_submission = VN_PERF(NO_MULTI_RING);
+
    create_info =
       vn_device_fix_create_info(dev, create_info, alloc, &local_create_info);
    if (!create_info)
@@ -469,6 +507,8 @@ vn_device_init(struct vn_device *dev,
    if (result != VK_SUCCESS)
       return result;
 
+   mtx_init(&dev->ring_mutex, mtx_plain);
+
    result = vn_device_memory_report_init(dev, create_info);
    if (result != VK_SUCCESS)
       goto out_destroy_device;
@@ -520,6 +560,7 @@ out_memory_report_fini:
    vn_device_memory_report_fini(dev);
 
 out_destroy_device:
+   mtx_destroy(&dev->ring_mutex);
    vn_call_vkDestroyDevice(dev->primary_ring, dev_handle, NULL);
 
    return result;
@@ -617,6 +658,11 @@ vn_DestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator)
       }
    }
 
+   if (dev->secondary_ring)
+      vn_ring_destroy(dev->secondary_ring);
+
+   mtx_destroy(&dev->ring_mutex);
+
    vk_free(alloc, dev->queues);
 
    vn_device_base_fini(&dev->base);
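
Not part of the commit: a hypothetical caller-side sketch of the pattern the
"allows caller to check secondary ring without holding a lock" comment refers
to. The unlocked fast path skips the mutex once the ring exists; threads that
race on first use are serialized inside vn_device_secondary_ring_init_once()
by dev->ring_mutex. example_get_secondary_ring() is illustration only.

#include "vn_device.h"

static struct vn_ring *
example_get_secondary_ring(struct vn_device *dev)
{
   /* fast path: a previous call already created the ring */
   if (dev->secondary_ring)
      return dev->secondary_ring;

   /* slow path: create it at most once under dev->ring_mutex */
   if (!vn_device_secondary_ring_init_once(dev))
      return NULL; /* creation failed; callers fall back to the primary ring */

   return dev->secondary_ring;
}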


@@ -29,6 +29,10 @@ struct vn_device {
    struct vn_physical_device *physical_device;
    struct vn_renderer *renderer;
    struct vn_ring *primary_ring;
+   bool force_primary_ring_submission;
+
+   mtx_t ring_mutex;
+   struct vn_ring *secondary_ring;
 
    struct vn_device_memory_report *memory_reports;
    uint32_t memory_report_count;
@@ -80,4 +84,7 @@ vn_device_emit_device_memory_report(struct vn_device *dev,
       dev->memory_reports[i].callback(&report, dev->memory_reports[i].data);
 }
 
+bool
+vn_device_secondary_ring_init_once(struct vn_device *dev);
+
 #endif /* VN_DEVICE_H */


@@ -417,6 +417,34 @@ vn_DestroyPipelineCache(VkDevice device,
    vk_free(alloc, cache);
 }
 
+static struct vn_ring *
+vn_get_target_ring(struct vn_device *dev)
+{
+   if (dev->force_primary_ring_submission)
+      return dev->primary_ring;
+
+   if (vn_tls_get_primary_ring_submission())
+      return dev->primary_ring;
+
+   if (!dev->secondary_ring) {
+      if (!vn_device_secondary_ring_init_once(dev)) {
+         /* fallback to primary ring submission */
+         return dev->primary_ring;
+      }
+   }
+
+   /* Ensure pipeline cache and pipeline deps are ready in the renderer.
+    *
+    * TODO:
+    * - For cache retrieval, track ring seqno of cache obj and only wait
+    *   for that seqno once.
+    * - For pipeline creation, track ring seqnos of pipeline layout and
+    *   renderpass objs it depends on, and only wait for those seqnos once.
+    */
+   vn_ring_wait_all(dev->primary_ring);
+
+   return dev->secondary_ring;
+}
+
 VkResult
 vn_GetPipelineCacheData(VkDevice device,
                         VkPipelineCache pipelineCache,
@@ -427,10 +455,13 @@ vn_GetPipelineCacheData(VkDevice device,
    struct vn_device *dev = vn_device_from_handle(device);
    struct vn_physical_device *physical_dev = dev->physical_device;
+   struct vn_ring *target_ring = vn_get_target_ring(dev);
+   assert(target_ring);
+
    struct vk_pipeline_cache_header *header = pData;
    VkResult result;
 
    if (!pData) {
-      result = vn_call_vkGetPipelineCacheData(dev->primary_ring, device,
+      result = vn_call_vkGetPipelineCacheData(target_ring, device,
                                               pipelineCache, pDataSize, NULL);
       if (result != VK_SUCCESS)
          return vn_error(dev->instance, result);
@@ -454,7 +485,7 @@ vn_GetPipelineCacheData(VkDevice device,
    *pDataSize -= header->header_size;
 
    result =
-      vn_call_vkGetPipelineCacheData(dev->primary_ring, device, pipelineCache,
+      vn_call_vkGetPipelineCacheData(target_ring, device, pipelineCache,
                                      pDataSize, pData + header->header_size);
    if (result < VK_SUCCESS)
       return vn_error(dev->instance, result);
@@ -1404,16 +1435,18 @@ vn_CreateGraphicsPipelines(VkDevice device,
          (const VkBaseInStructure *)pCreateInfos[i].pNext);
    }
 
-   if (want_sync) {
+   struct vn_ring *target_ring = vn_get_target_ring(dev);
+   assert(target_ring);
+
+   if (want_sync || target_ring == dev->secondary_ring) {
       result = vn_call_vkCreateGraphicsPipelines(
-         dev->primary_ring, device, pipelineCache, createInfoCount,
-         pCreateInfos, NULL, pPipelines);
+         target_ring, device, pipelineCache, createInfoCount, pCreateInfos,
+         NULL, pPipelines);
       if (result != VK_SUCCESS)
          vn_destroy_failed_pipelines(dev, createInfoCount, pPipelines, alloc);
    } else {
-      vn_async_vkCreateGraphicsPipelines(dev->primary_ring, device,
-                                         pipelineCache, createInfoCount,
-                                         pCreateInfos, NULL, pPipelines);
+      vn_async_vkCreateGraphicsPipelines(target_ring, device, pipelineCache,
+                                         createInfoCount, pCreateInfos, NULL,
+                                         pPipelines);
       result = VK_SUCCESS;
    }
@@ -1458,16 +1491,18 @@ vn_CreateComputePipelines(VkDevice device,
          (const VkBaseInStructure *)pCreateInfos[i].pNext);
    }
 
-   if (want_sync) {
+   struct vn_ring *target_ring = vn_get_target_ring(dev);
+   assert(target_ring);
+
+   if (want_sync || target_ring == dev->secondary_ring) {
       result = vn_call_vkCreateComputePipelines(
-         dev->primary_ring, device, pipelineCache, createInfoCount,
-         pCreateInfos, NULL, pPipelines);
+         target_ring, device, pipelineCache, createInfoCount, pCreateInfos,
+         NULL, pPipelines);
      if (result != VK_SUCCESS)
          vn_destroy_failed_pipelines(dev, createInfoCount, pPipelines, alloc);
    } else {
-      vn_async_vkCreateComputePipelines(dev->primary_ring, device,
-                                        pipelineCache, createInfoCount,
-                                        pCreateInfos, NULL, pPipelines);
+      vn_async_vkCreateComputePipelines(target_ring, device, pipelineCache,
                                        createInfoCount, pCreateInfos, NULL,
+                                        pPipelines);
      result = VK_SUCCESS;
   }
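
Not part of the diff: a concrete ordering example for why vn_get_target_ring()
calls vn_ring_wait_all() on the primary ring before handing out the secondary
ring. Suppose a hot-path thread has just issued vkCreatePipelineCache and
vkCreateRenderPass asynchronously on the primary ring, and a background
compile thread then calls vkCreateGraphicsPipelines referencing those handles.
Since the two rings are consumed independently by the renderer, the
secondary-ring command could otherwise be decoded before its dependencies
exist; waiting for the primary ring to become current closes that window, and
the TODO in vn_get_target_ring() is about narrowing the wait to per-object
seqnos later.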


@@ -172,6 +172,15 @@ vn_ring_wait_seqno(struct vn_ring *ring, uint32_t seqno)
    } while (true);
 }
 
+void
+vn_ring_wait_all(struct vn_ring *ring)
+{
+   /* load from tail rather than ring->cur for atomicity */
+   const uint32_t pending_seqno =
+      atomic_load_explicit(ring->shared.tail, memory_order_relaxed);
+   vn_ring_wait_seqno(ring, pending_seqno);
+}
+
 static bool
 vn_ring_has_space(const struct vn_ring *ring,
                   uint32_t size,
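
Not part of the diff: per the comment above, *ring->shared.tail holds the
seqno of the last command made visible to the renderer and is updated
atomically, so it can be read safely from a thread other than the one driving
the ring. Waiting on that seqno is what lets vn_get_target_ring() flush
everything already queued on the primary ring before switching a call over to
the secondary ring.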


@@ -63,6 +63,9 @@ vn_ring_unset_status_bits(struct vn_ring *ring, uint32_t mask);
 bool
 vn_ring_get_seqno_status(struct vn_ring *ring, uint32_t seqno);
 
+void
+vn_ring_wait_all(struct vn_ring *ring);
+
 struct vn_ring_submit_command {
    /* empty command implies errors */
    struct vn_cs_encoder command;


@@ -270,6 +270,8 @@ vn_CreateSwapchainKHR(VkDevice device,
                                  VN_WSI_PTR(pCreateInfo->oldSwapchain));
    }
 
+   vn_tls_set_primary_ring_submission();
+
    return vn_result(dev->instance, result);
 }