venus: dispatch background shader tasks to secondary ring

Summary:
- Add a perf option to force primary ring submission.
- Let the device own secondary ring(s), created on demand.
- Track, via TLS, the threads on which a swapchain or command pool gets
  created, and use that to steer ring dispatch.
- If a pipeline creation or pipeline cache retrieval happens on a
  background thread (one not tracked as a hot path), force it to be
  synchronous and dispatch it to the secondary ring after waiting for
  the primary ring to become current.
- If it happens on a hot-path thread, dispatch it to the primary ring so
  it is not blocked behind the tasks queued on the secondary ring.

Signed-off-by: Yiwei Zhang <zzyiwei@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26179>
Author: Yiwei Zhang
Date: 2023-12-05 20:17:44 -08:00
Committed by: Marge Bot
Commit: d17ddcc847 (parent 5b26bebcf4)
10 changed files with 189 additions and 14 deletions


@@ -355,6 +355,8 @@ vn_GetSwapchainGrallocUsage2ANDROID(
    if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID)
       *grallocProducerUsage |= vn_android_gralloc_get_shared_present_usage();
 
+   vn_tls_set_primary_ring_submission();
+
    return VK_SUCCESS;
 }


@@ -687,6 +687,8 @@ vn_CreateCommandPool(VkDevice device,
    vn_async_vkCreateCommandPool(dev->primary_ring, device, pCreateInfo, NULL,
                                 &pool_handle);
 
+   vn_tls_set_primary_ring_submission();
+
    *pCommandPool = pool_handle;
 
    return VK_SUCCESS;


@@ -51,6 +51,7 @@ static const struct debug_control vn_perf_options[] = {
    { "no_query_feedback", VN_PERF_NO_QUERY_FEEDBACK },
    { "no_async_mem_alloc", VN_PERF_NO_ASYNC_MEM_ALLOC },
    { "no_tiled_wsi_image", VN_PERF_NO_TILED_WSI_IMAGE },
+   { "no_multi_ring", VN_PERF_NO_MULTI_RING },
    { NULL, 0 },
    /* clang-format on */
 };
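
(Not part of the diff: like the other entries in vn_perf_options, the new
no_multi_ring flag is expected to be toggled at runtime through the VN_PERF
environment variable, e.g. VN_PERF=no_multi_ring, which keeps every
submission on the primary ring.)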
@@ -238,3 +239,40 @@ vn_relax(struct vn_relax_state *state)
    const uint32_t shift = util_last_bit(*iter) - busy_wait_order - 1;
    os_time_sleep(base_sleep_us << shift);
 }
+
+static void
+vn_tls_free(void *tls)
+{
+   free(tls);
+}
+
+static tss_t vn_tls_key;
+static bool vn_tls_key_valid;
+
+static void
+vn_tls_key_create_once(void)
+{
+   vn_tls_key_valid = tss_create(&vn_tls_key, vn_tls_free) == thrd_success;
+   if (!vn_tls_key_valid && VN_DEBUG(INIT))
+      vn_log(NULL, "WARNING: failed to create vn_tls_key");
+}
+
+struct vn_tls *
+vn_tls_get(void)
+{
+   static once_flag once = ONCE_FLAG_INIT;
+   call_once(&once, vn_tls_key_create_once);
+   if (unlikely(!vn_tls_key_valid))
+      return NULL;
+
+   struct vn_tls *tls = tss_get(vn_tls_key);
+   if (likely(tls))
+      return tls;
+
+   tls = calloc(1, sizeof(*tls));
+   if (tls && tss_set(vn_tls_key, tls) == thrd_success)
+      return tls;
+
+   free(tls);
+   return NULL;
+}
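
(Not part of the diff: tss_create/tss_get/tss_set and call_once come from
C11 <threads.h>; the destructor registered here, vn_tls_free(), runs at
thread exit for any non-NULL value, so each thread's struct vn_tls is
reclaimed automatically without an explicit teardown path.)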


@@ -124,6 +124,7 @@ enum vn_perf {
    VN_PERF_NO_QUERY_FEEDBACK = 1ull << 8,
    VN_PERF_NO_ASYNC_MEM_ALLOC = 1ull << 9,
    VN_PERF_NO_TILED_WSI_IMAGE = 1ull << 10,
+   VN_PERF_NO_MULTI_RING = 1ull << 11,
 };
 
 typedef uint64_t vn_object_id;
@@ -208,6 +209,16 @@ struct vn_relax_state {
    const char *reason;
 };
 
+struct vn_tls {
+   /* Track swapchain and command pool creations on threads so dispatch of the
+    * following on non-tracked threads can be routed as synchronous on the
+    * secondary ring:
+    * - pipeline creations
+    * - pipeline cache retrievals
+    */
+   bool primary_ring_submission;
+};
+
 void
 vn_env_init(void);
@@ -469,4 +480,24 @@ vn_gettid(void)
 #endif
 }
 
+struct vn_tls *
+vn_tls_get(void);
+
+static inline void
+vn_tls_set_primary_ring_submission(void)
+{
+   struct vn_tls *tls = vn_tls_get();
+   if (likely(tls))
+      tls->primary_ring_submission = true;
+}
+
+static inline bool
+vn_tls_get_primary_ring_submission(void)
+{
+   const struct vn_tls *tls = vn_tls_get();
+   if (likely(tls))
+      return tls->primary_ring_submission;
+   return true;
+}
+
 #endif /* VN_COMMON_H */
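
Not part of the commit: a minimal sketch of how the two helpers above are
meant to interact across threads. It assumes vn_common.h from this series;
the two example_* functions are hypothetical wrappers, not driver code.

#include "vn_common.h"

/* Hot-path side: entry points that create a swapchain or a command pool tag
 * the calling thread (see the vn_wsi.c and vn_command_buffer.c hunks). */
static void
example_tag_hot_path_thread(void)
{
   vn_tls_set_primary_ring_submission();
}

/* Dispatch side: the pipeline paths consult the flag. Untagged background
 * threads see false here and are routed to the secondary ring by
 * vn_get_target_ring() in vn_pipeline.c; if TLS setup ever fails, the getter
 * defaults to true so everything safely stays on the primary ring. */
static bool
example_is_hot_path_thread(void)
{
   return vn_tls_get_primary_ring_submission();
}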


@@ -436,6 +436,41 @@ vn_device_update_shader_cache_id(struct vn_device *dev)
 #endif
 }
 
+bool
+vn_device_secondary_ring_init_once(struct vn_device *dev)
+{
+   VN_TRACE_FUNC();
+   assert(!dev->force_primary_ring_submission);
+
+   static bool ok = true;
+   if (!ok)
+      return ok;
+
+   mtx_lock(&dev->ring_mutex);
+
+   /* allows caller to check secondary ring without holding a lock */
+   if (dev->secondary_ring)
+      goto out_unlock;
+
+   /* keep the extra for potential roundtrip sync on secondary ring */
+   static const size_t extra_size = sizeof(uint32_t);
+   /* only need a small ring for synchronous cmds on secondary ring */
+   static const size_t buf_size = 16 * 1024;
+   struct vn_ring_layout layout;
+   vn_ring_get_layout(buf_size, extra_size, &layout);
+
+   dev->secondary_ring = vn_ring_create(dev->instance, &layout);
+   if (!dev->secondary_ring) {
+      ok = false;
+      vn_log(dev->instance, "WARNING: failed to create secondary ring");
+   }
+
+out_unlock:
+   mtx_unlock(&dev->ring_mutex);
+
+   return ok;
+}
+
 static VkResult
 vn_device_init(struct vn_device *dev,
                struct vn_physical_device *physical_dev,
@@ -454,6 +489,9 @@ vn_device_init(struct vn_device *dev,
    dev->renderer = instance->renderer;
    dev->primary_ring = instance->ring.ring;
 
+   /* can be extended for app compat purpose */
+   dev->force_primary_ring_submission = VN_PERF(NO_MULTI_RING);
+
    create_info =
       vn_device_fix_create_info(dev, create_info, alloc, &local_create_info);
    if (!create_info)
@@ -469,6 +507,8 @@ vn_device_init(struct vn_device *dev,
    if (result != VK_SUCCESS)
       return result;
 
+   mtx_init(&dev->ring_mutex, mtx_plain);
+
    result = vn_device_memory_report_init(dev, create_info);
    if (result != VK_SUCCESS)
       goto out_destroy_device;
@@ -520,6 +560,7 @@ out_memory_report_fini:
    vn_device_memory_report_fini(dev);
 
 out_destroy_device:
+   mtx_destroy(&dev->ring_mutex);
    vn_call_vkDestroyDevice(dev->primary_ring, dev_handle, NULL);
 
    return result;
@@ -617,6 +658,11 @@ vn_DestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator)
       }
    }
 
+   if (dev->secondary_ring)
+      vn_ring_destroy(dev->secondary_ring);
+
+   mtx_destroy(&dev->ring_mutex);
+
    vk_free(alloc, dev->queues);
 
    vn_device_base_fini(&dev->base);
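
Not part of the commit: a hypothetical caller-side sketch of the pattern the
"allows caller to check secondary ring without holding a lock" comment refers
to. The unlocked fast path skips the mutex once the ring exists; threads that
race on first use are serialized inside vn_device_secondary_ring_init_once()
by dev->ring_mutex. example_get_secondary_ring() is illustration only.

#include "vn_device.h"

static struct vn_ring *
example_get_secondary_ring(struct vn_device *dev)
{
   /* fast path: a previous call already created the ring */
   if (dev->secondary_ring)
      return dev->secondary_ring;

   /* slow path: create it at most once under dev->ring_mutex */
   if (!vn_device_secondary_ring_init_once(dev))
      return NULL; /* creation failed; callers fall back to the primary ring */

   return dev->secondary_ring;
}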


@@ -29,6 +29,10 @@ struct vn_device {
    struct vn_physical_device *physical_device;
    struct vn_renderer *renderer;
    struct vn_ring *primary_ring;
+   bool force_primary_ring_submission;
+
+   mtx_t ring_mutex;
+   struct vn_ring *secondary_ring;
 
    struct vn_device_memory_report *memory_reports;
    uint32_t memory_report_count;
@@ -80,4 +84,7 @@ vn_device_emit_device_memory_report(struct vn_device *dev,
       dev->memory_reports[i].callback(&report, dev->memory_reports[i].data);
 }
 
+bool
+vn_device_secondary_ring_init_once(struct vn_device *dev);
+
 #endif /* VN_DEVICE_H */


@@ -417,6 +417,34 @@ vn_DestroyPipelineCache(VkDevice device,
    vk_free(alloc, cache);
 }
 
+static struct vn_ring *
+vn_get_target_ring(struct vn_device *dev)
+{
+   if (dev->force_primary_ring_submission)
+      return dev->primary_ring;
+
+   if (vn_tls_get_primary_ring_submission())
+      return dev->primary_ring;
+
+   if (!dev->secondary_ring) {
+      if (!vn_device_secondary_ring_init_once(dev)) {
+         /* fallback to primary ring submission */
+         return dev->primary_ring;
+      }
+   }
+
+   /* Ensure pipeline cache and pipeline deps are ready in the renderer.
+    *
+    * TODO:
+    * - For cache retrieval, track ring seqno of cache obj and only wait
+    *   for that seqno once.
+    * - For pipeline creation, track ring seqnos of pipeline layout and
+    *   renderpass objs it depends on, and only wait for those seqnos once.
+    */
+   vn_ring_wait_all(dev->primary_ring);
+
+   return dev->secondary_ring;
+}
+
 VkResult
 vn_GetPipelineCacheData(VkDevice device,
                         VkPipelineCache pipelineCache,
@@ -427,10 +455,13 @@ vn_GetPipelineCacheData(VkDevice device,
    struct vn_device *dev = vn_device_from_handle(device);
    struct vn_physical_device *physical_dev = dev->physical_device;
+   struct vn_ring *target_ring = vn_get_target_ring(dev);
+   assert(target_ring);
+
    struct vk_pipeline_cache_header *header = pData;
    VkResult result;
 
    if (!pData) {
-      result = vn_call_vkGetPipelineCacheData(dev->primary_ring, device,
+      result = vn_call_vkGetPipelineCacheData(target_ring, device,
                                               pipelineCache, pDataSize, NULL);
       if (result != VK_SUCCESS)
          return vn_error(dev->instance, result);
@@ -454,7 +485,7 @@ vn_GetPipelineCacheData(VkDevice device,
    *pDataSize -= header->header_size;
 
    result =
-      vn_call_vkGetPipelineCacheData(dev->primary_ring, device, pipelineCache,
+      vn_call_vkGetPipelineCacheData(target_ring, device, pipelineCache,
                                      pDataSize, pData + header->header_size);
    if (result < VK_SUCCESS)
       return vn_error(dev->instance, result);
@@ -1404,16 +1435,18 @@ vn_CreateGraphicsPipelines(VkDevice device,
          (const VkBaseInStructure *)pCreateInfos[i].pNext);
    }
 
-   if (want_sync) {
+   struct vn_ring *target_ring = vn_get_target_ring(dev);
+   assert(target_ring);
+
+   if (want_sync || target_ring == dev->secondary_ring) {
       result = vn_call_vkCreateGraphicsPipelines(
-         dev->primary_ring, device, pipelineCache, createInfoCount,
-         pCreateInfos, NULL, pPipelines);
+         target_ring, device, pipelineCache, createInfoCount, pCreateInfos,
+         NULL, pPipelines);
       if (result != VK_SUCCESS)
          vn_destroy_failed_pipelines(dev, createInfoCount, pPipelines, alloc);
    } else {
-      vn_async_vkCreateGraphicsPipelines(dev->primary_ring, device,
-                                         pipelineCache, createInfoCount,
-                                         pCreateInfos, NULL, pPipelines);
+      vn_async_vkCreateGraphicsPipelines(target_ring, device, pipelineCache,
+                                         createInfoCount, pCreateInfos, NULL,
+                                         pPipelines);
       result = VK_SUCCESS;
    }
@@ -1458,16 +1491,18 @@ vn_CreateComputePipelines(VkDevice device,
          (const VkBaseInStructure *)pCreateInfos[i].pNext);
    }
 
-   if (want_sync) {
+   struct vn_ring *target_ring = vn_get_target_ring(dev);
+   assert(target_ring);
+
+   if (want_sync || target_ring == dev->secondary_ring) {
       result = vn_call_vkCreateComputePipelines(
-         dev->primary_ring, device, pipelineCache, createInfoCount,
-         pCreateInfos, NULL, pPipelines);
+         target_ring, device, pipelineCache, createInfoCount, pCreateInfos,
+         NULL, pPipelines);
      if (result != VK_SUCCESS)
          vn_destroy_failed_pipelines(dev, createInfoCount, pPipelines, alloc);
    } else {
-      vn_async_vkCreateComputePipelines(dev->primary_ring, device,
-                                        pipelineCache, createInfoCount,
-                                        pCreateInfos, NULL, pPipelines);
+      vn_async_vkCreateComputePipelines(target_ring, device, pipelineCache,
                                        createInfoCount, pCreateInfos, NULL,
+                                        pPipelines);
      result = VK_SUCCESS;
   }
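
Not part of the diff: a concrete ordering example for why vn_get_target_ring()
calls vn_ring_wait_all() on the primary ring before handing out the secondary
ring. Suppose a hot-path thread has just issued vkCreatePipelineCache and
vkCreateRenderPass asynchronously on the primary ring, and a background
compile thread then calls vkCreateGraphicsPipelines referencing those handles.
Since the two rings are consumed independently by the renderer, the
secondary-ring command could otherwise be decoded before its dependencies
exist; waiting for the primary ring to become current closes that window, and
the TODO in vn_get_target_ring() is about narrowing the wait to per-object
seqnos later.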


@@ -172,6 +172,15 @@ vn_ring_wait_seqno(struct vn_ring *ring, uint32_t seqno)
    } while (true);
 }
 
+void
+vn_ring_wait_all(struct vn_ring *ring)
+{
+   /* load from tail rather than ring->cur for atomicity */
+   const uint32_t pending_seqno =
+      atomic_load_explicit(ring->shared.tail, memory_order_relaxed);
+   vn_ring_wait_seqno(ring, pending_seqno);
+}
+
 static bool
 vn_ring_has_space(const struct vn_ring *ring,
                   uint32_t size,
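
Not part of the diff: per the comment above, *ring->shared.tail holds the
seqno of the last command made visible to the renderer and is updated
atomically, so it can be read safely from a thread other than the one driving
the ring. Waiting on that seqno is what lets vn_get_target_ring() flush
everything already queued on the primary ring before switching a call over to
the secondary ring.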


@@ -63,6 +63,9 @@ vn_ring_unset_status_bits(struct vn_ring *ring, uint32_t mask);
 bool
 vn_ring_get_seqno_status(struct vn_ring *ring, uint32_t seqno);
 
+void
+vn_ring_wait_all(struct vn_ring *ring);
+
 struct vn_ring_submit_command {
    /* empty command implies errors */
    struct vn_cs_encoder command;


@@ -270,6 +270,8 @@ vn_CreateSwapchainKHR(VkDevice device,
                                  VN_WSI_PTR(pCreateInfo->oldSwapchain));
    }
 
+   vn_tls_set_primary_ring_submission();
+
    return vn_result(dev->instance, result);
 }