diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
index f0ee1dc7797..c85007c3115 100644
--- a/src/broadcom/ci/broadcom-rpi4-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -352,7 +352,6 @@ spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RGBA8I,Fail
 # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3510
 dEQP-VK.api.external.semaphore.opaque_fd.info_timeline,Fail
-dEQP-VK.api.external.semaphore.sync_fd.info_timeline,Fail
 dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero,Fail
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index abc7073a0b8..81d2e48203f 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -45,6 +45,7 @@
 #include "drm-uapi/v3d_drm.h"
 #include "format/u_format.h"
+#include "vk_drm_syncobj.h"
 #include "vk_util.h"
 #include "git_sha1.h"
@@ -844,6 +845,44 @@ physical_device_init(struct v3dv_physical_device *device,
    device->options.merge_jobs = getenv("V3DV_NO_MERGE_JOBS") == NULL;
+   device->drm_syncobj_type = vk_drm_syncobj_get_type(device->render_fd);
+
+   /* We don't support timelines in the uAPI yet and we don't want timeline
+    * support suddenly getting turned on by vk_drm_syncobj_get_type() without
+    * us adding v3dv code for it first.
+    */
+   device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE;
+
+   /* Sync file export is incompatible with the current model of execution
+    * where some jobs may run on the CPU. There are CTS tests which do the
+    * following:
+    *
+    *   1. Create a command buffer with a vkCmdWaitEvents()
+    *   2. Submit the command buffer
+    *   3. vkGetSemaphoreFdKHR() to try to get a sync_file
+    *   4. vkSetEvent()
+    *
+    * This deadlocks because we have to wait for the syncobj to get a real
+    * fence in vkGetSemaphoreFdKHR(), which only happens after all the work
+    * from the command buffer is complete, which in turn only happens after
+    * vkSetEvent(). No amount of CPU threading in userspace will ever fix
+    * this. Sadly, this is pretty explicitly allowed by the Vulkan spec:
+    *
+    *    VUID-vkCmdWaitEvents-pEvents-01163
+    *
+    *    "If pEvents includes one or more events that will be signaled by
+    *    vkSetEvent after commandBuffer has been submitted to a queue, then
+    *    vkCmdWaitEvents must not be called inside a render pass instance"
+    *
+    * Disable sync file support for now.
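+    *
+    * A sketch of the two halves of the deadlock (assuming the common
+    * runtime's threaded-submit model; not the exact call chain):
+    *
+    *   app thread:    vkGetSemaphoreFdKHR()
+    *                    -> blocks until the syncobj has a real fence,
+    *                       which only the submit thread can attach
+    *   submit thread: vkCmdWaitEvents() CPU job
+    *                    -> blocks until vkSetEvent(), which the app only
+    *                       calls after vkGetSemaphoreFdKHR() returns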
+ */ + device->drm_syncobj_type.import_sync_file = NULL; + device->drm_syncobj_type.export_sync_file = NULL; + + device->sync_types[0] = &device->drm_syncobj_type; + device->sync_types[1] = NULL; + device->vk.supported_sync_types = device->sync_types; + result = v3dv_wsi_init(device); if (result != VK_SUCCESS) { vk_error(instance, result); @@ -1845,6 +1884,17 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT); } +static void +destroy_queue_syncs(struct v3dv_queue *queue) +{ + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + if (queue->last_job_syncs.syncs[i]) { + drmSyncobjDestroy(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs[i]); + } + } +} + static VkResult queue_init(struct v3dv_device *device, struct v3dv_queue *queue, const VkDeviceQueueCreateInfo *create_info, @@ -1854,23 +1904,43 @@ queue_init(struct v3dv_device *device, struct v3dv_queue *queue, index_in_family); if (result != VK_SUCCESS) return result; + + result = vk_queue_enable_submit_thread(&queue->vk); + if (result != VK_SUCCESS) + goto fail_submit_thread; + queue->device = device; + queue->vk.driver_submit = v3dv_queue_driver_submit; + + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + queue->last_job_syncs.first[i] = true; + int ret = drmSyncobjCreate(device->pdevice->render_fd, + DRM_SYNCOBJ_CREATE_SIGNALED, + &queue->last_job_syncs.syncs[i]); + if (ret) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "syncobj create failed: %m"); + goto fail_last_job_syncs; + } + } + queue->noop_job = NULL; - list_inithead(&queue->submit_wait_list); - mtx_init(&queue->mutex, mtx_plain); - mtx_init(&queue->noop_mutex, mtx_plain); return VK_SUCCESS; + +fail_last_job_syncs: + destroy_queue_syncs(queue); +fail_submit_thread: + vk_queue_finish(&queue->vk); + return result; } static void queue_finish(struct v3dv_queue *queue) { - vk_queue_finish(&queue->vk); - assert(list_is_empty(&queue->submit_wait_list)); if (queue->noop_job) v3dv_job_destroy(queue->noop_job); - mtx_destroy(&queue->mutex); - mtx_destroy(&queue->noop_mutex); + destroy_queue_syncs(queue); + vk_queue_finish(&queue->vk); } static void @@ -1882,16 +1952,6 @@ init_device_meta(struct v3dv_device *device) v3dv_meta_texel_buffer_copy_init(device); } -static void -destroy_device_syncs(struct v3dv_device *device, - int render_fd) -{ - for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { - if (device->last_job_syncs.syncs[i]) - drmSyncobjDestroy(render_fd, device->last_job_syncs.syncs[i]); - } -} - static void destroy_device_meta(struct v3dv_device *device) { @@ -1944,10 +2004,12 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, device->instance = instance; device->pdevice = physical_device; - mtx_init(&device->mutex, mtx_plain); mtx_init(&device->query_mutex, mtx_plain); cnd_init(&device->query_ended); + vk_device_set_drm_fd(&device->vk, physical_device->render_fd); + vk_device_enable_threaded_submit(&device->vk); + result = queue_init(device, &device->queue, pCreateInfo->pQueueCreateInfos, 0); if (result != VK_SUCCESS) @@ -1973,17 +2035,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, if (device->features.robustBufferAccess) perf_debug("Device created with Robust Buffer Access enabled.\n"); - for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { - device->last_job_syncs.first[i] = true; - int ret = drmSyncobjCreate(physical_device->render_fd, - DRM_SYNCOBJ_CREATE_SIGNALED, - &device->last_job_syncs.syncs[i]); - if (ret) { - result = VK_ERROR_INITIALIZATION_FAILED; - goto fail; - 
} - } - #ifdef DEBUG v3dv_X(device, device_check_prepacked_sizes)(); #endif @@ -1999,10 +2050,8 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, return VK_SUCCESS; fail: - destroy_device_syncs(device, physical_device->render_fd); cnd_destroy(&device->query_ended); mtx_destroy(&device->query_mutex); - mtx_destroy(&device->mutex); vk_device_finish(&device->vk); vk_free(&device->vk.alloc, device); @@ -2015,10 +2064,8 @@ v3dv_DestroyDevice(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); - v3dv_DeviceWaitIdle(_device); + device->vk.dispatch_table.DeviceWaitIdle(_device); queue_finish(&device->queue); - mtx_destroy(&device->mutex); - destroy_device_syncs(device, device->pdevice->render_fd); destroy_device_meta(device); v3dv_pipeline_cache_finish(&device->default_pipeline_cache); @@ -2039,17 +2086,6 @@ v3dv_DestroyDevice(VkDevice _device, vk_free2(&device->vk.alloc, pAllocator, device); } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_DeviceWaitIdle(VkDevice _device) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue)); -} - static VkResult device_alloc(struct v3dv_device *device, struct v3dv_device_memory *mem, diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index c0b313c4f77..950fd3329c8 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -43,6 +43,7 @@ #include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" +#include "vk_sync.h" #include "vk_util.h" #include "vk_command_buffer.h" @@ -140,6 +141,9 @@ struct v3dv_physical_device { uint8_t device_uuid[VK_UUID_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; + struct vk_sync_type drm_syncobj_type; + const struct vk_sync_type *sync_types[2]; + struct disk_cache *disk_cache; mtx_t mutex; @@ -219,34 +223,30 @@ struct v3dv_instance { bool default_pipeline_cache_enabled; }; -/* Tracks wait threads spawned from a single vkQueueSubmit call */ -struct v3dv_queue_submit_wait_info { - /* struct vk_object_base base; ?*/ - struct list_head list_link; +/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd, + * tfu), we still need a syncobj to track the last overall job submitted + * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can + * start expecting multisync to be present and drop the legacy implementation + * together with this V3DV_QUEUE_ANY tracker. + */ +enum v3dv_queue_type { + V3DV_QUEUE_CL = 0, + V3DV_QUEUE_CSD, + V3DV_QUEUE_TFU, + V3DV_QUEUE_ANY, + V3DV_QUEUE_COUNT, +}; - struct v3dv_device *device; - - /* List of wait threads spawned for any command buffers in a particular - * call to vkQueueSubmit. - */ - uint32_t wait_thread_count; - struct { - pthread_t thread; - bool finished; - } wait_threads[16]; - - /* The master wait thread for the entire submit. This will wait for all - * other threads in this submit to complete before processing signal - * semaphores and fences. - */ - pthread_t master_wait_thread; - - /* List of semaphores (and fence) to signal after all wait threads completed - * and all command buffer jobs in the submission have been sent to the GPU. - */ - uint32_t signal_semaphore_count; - VkSemaphore *signal_semaphores; - VkFence fence; +/* For each GPU queue, we use a syncobj to track the last job submitted. 
We + * set the flag `first` to determine when we are starting a new cmd buffer + * batch and therefore a job submitted to a given queue will be the first in a + * cmd buf batch. + */ +struct v3dv_last_job_sync { + /* If the job is the first submitted to a GPU queue in a cmd buffer batch */ + bool first[V3DV_QUEUE_COUNT]; + /* Array of syncobj to track the last job submitted to a GPU queue */ + uint32_t syncs[V3DV_QUEUE_COUNT]; }; struct v3dv_queue { @@ -254,18 +254,14 @@ struct v3dv_queue { struct v3dv_device *device; - /* A list of active v3dv_queue_submit_wait_info */ - struct list_head submit_wait_list; - - /* A mutex to prevent concurrent access to the list of wait threads */ - mtx_t mutex; - - /* A mutex to prevent concurrent noop job submissions */ - mtx_t noop_mutex; + struct v3dv_last_job_sync last_job_syncs; struct v3dv_job *noop_job; }; +VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit); + #define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) #define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \ sizeof(VkComponentMapping)) @@ -438,32 +434,6 @@ struct v3dv_pipeline_cache { bool externally_synchronized; }; -/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd, - * tfu), we still need a syncobj to track the last overall job submitted - * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can - * start expecting multisync to be present and drop the legacy implementation - * together with this V3DV_QUEUE_ANY tracker. - */ -enum v3dv_queue_type { - V3DV_QUEUE_CL = 0, - V3DV_QUEUE_CSD, - V3DV_QUEUE_TFU, - V3DV_QUEUE_ANY, - V3DV_QUEUE_COUNT, -}; - -/* For each GPU queue, we use a syncobj to track the last job submitted. We - * set the flag `first` to determine when we are starting a new cmd buffer - * batch and therefore a job submitted to a given queue will be the first in a - * cmd buf batch. 
- */ -struct v3dv_last_job_sync { - /* If the job is the first submitted to a GPU queue in a cmd buffer batch */ - bool first[V3DV_QUEUE_COUNT]; - /* Array of syncobj to track the last job submitted to a GPU queue */ - uint32_t syncs[V3DV_QUEUE_COUNT]; -}; - struct v3dv_device { struct vk_device vk; @@ -473,12 +443,6 @@ struct v3dv_device { struct v3d_device_info devinfo; struct v3dv_queue queue; - /* Syncobjs to track the last job submitted to any GPU queue */ - struct v3dv_last_job_sync last_job_syncs; - - /* A mutex to prevent concurrent access to last_job_sync from the queue */ - mtx_t mutex; - /* Guards query->maybe_available and value for timestamps */ mtx_t query_mutex; @@ -1001,17 +965,14 @@ struct v3dv_copy_query_results_cpu_job_info { VkQueryResultFlags flags; }; -struct v3dv_submit_info_semaphores { - /* List of semaphores to wait before running a job */ - uint32_t wait_sem_count; - VkSemaphore *wait_sems; +struct v3dv_submit_sync_info { + /* List of syncs to wait before running a job */ + uint32_t wait_count; + struct vk_sync_wait *waits; - /* List of semaphores to signal when all jobs complete */ - uint32_t signal_sem_count; - VkSemaphore *signal_sems; - - /* A fence to signal when all jobs complete */ - VkFence fence; + /* List of syncs to signal when all jobs complete */ + uint32_t signal_count; + struct vk_sync_signal *signals; }; struct v3dv_event_set_cpu_job_info { @@ -1122,9 +1083,6 @@ struct v3dv_job { /* Whether we need to serialize this job in our command stream */ bool serialize; - /* Whether this job is in charge of signalling semaphores */ - bool do_sem_signal; - /* If this is a CL job, whether we should sync before binning */ bool needs_bcl_sync; @@ -1156,7 +1114,7 @@ struct v3dv_wait_thread_info { struct v3dv_job *job; /* Semaphores info for any postponed jobs after a wait event */ - struct v3dv_submit_info_semaphores *sems_info; + struct v3dv_submit_sync_info *sync_info; }; void v3dv_job_init(struct v3dv_job *job, @@ -1514,28 +1472,6 @@ void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, uint64_t obj, v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb); -struct v3dv_semaphore { - struct vk_object_base base; - - /* A syncobject handle associated with this semaphore */ - uint32_t sync; - - /* A temporary syncobject handle produced from a vkImportSemaphoreFd. */ - uint32_t temp_sync; - bool has_temp; -}; - -struct v3dv_fence { - struct vk_object_base base; - - /* A syncobject handle associated with this fence */ - uint32_t sync; - - /* A temporary syncobject handle produced from a vkImportFenceFd. 
*/ - uint32_t temp_sync; - bool has_temp; -}; - struct v3dv_event { struct vk_object_base base; int state; @@ -2210,7 +2146,6 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, base, VkDescriptorUpdateTemplate, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) -VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer, VK_OBJECT_TYPE_FRAMEBUFFER) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage, @@ -2229,8 +2164,6 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass, VK_OBJECT_TYPE_RENDER_PASS) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler, VK_OBJECT_TYPE_SAMPLER) -VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, base, VkSemaphore, - VK_OBJECT_TYPE_SEMAPHORE) static inline int v3dv_ioctl(int fd, unsigned long request, void *arg) diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index d5501371388..244f0166f02 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -25,7 +25,9 @@ #include "drm-uapi/v3d_drm.h" #include "broadcom/clif/clif_dump.h" +#include "util/libsync.h" #include "util/os_time.h" +#include "vk_drm_syncobj.h" #include #include @@ -69,95 +71,61 @@ v3dv_clif_dump(struct v3dv_device *device, } static VkResult -queue_submit_job(struct v3dv_queue *queue, - struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info, - pthread_t *wait_thread); - -/* Waits for active CPU wait threads spawned before the current thread to - * complete and submit all their GPU jobs. - */ -static void -cpu_queue_wait_idle(struct v3dv_queue *queue) +queue_wait_idle(struct v3dv_queue *queue, + struct v3dv_submit_sync_info *sync_info) { - const pthread_t this_thread = pthread_self(); - -retry: - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].finished) - continue; - - /* Because we are testing this against the list of spawned threads - * it will never match for the main thread, so when we call this from - * the main thread we are effectively waiting for all active threads - * to complete, and otherwise we are only waiting for work submitted - * before the wait thread that called this (a wait thread should never - * be waiting for work submitted after it). - */ - if (info->wait_threads[i].thread == this_thread) - goto done; - - /* Wait and try again */ - mtx_unlock(&queue->mutex); - usleep(500); /* 0.5 ms */ - goto retry; + if (queue->device->pdevice->caps.multisync) { + int ret = drmSyncobjWait(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs, 3, + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + if (ret) { + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, + "syncobj wait failed: %m"); } - } -done: - mtx_unlock(&queue->mutex); -} + bool first = true; + for (int i = 0; i < 3; i++) { + if (!queue->last_job_syncs.first[i]) + first = false; + } -static VkResult -gpu_queue_wait_idle(struct v3dv_queue *queue) -{ - struct v3dv_device *device = queue->device; - int render_fd = device->pdevice->render_fd; - struct v3dv_last_job_sync last_job_syncs; + /* If we're not the first job, that means we're waiting on some + * per-queue-type syncobj which transitively waited on the semaphores + * so we can skip the semaphore wait. 
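+       * (This works because set_in_syncs() folds the wait semaphores into
+       * the in_syncs of the first job submitted to each GPU queue, and
+       * every job signals its per-queue last_job_syncs entry, so the
+       * drmSyncobjWait() above already covers the semaphores transitively.)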
+ */ + if (first) { + VkResult result = vk_sync_wait_many(&queue->device->vk, + sync_info->wait_count, + sync_info->waits, + VK_SYNC_WAIT_COMPLETE, + UINT64_MAX); + if (result != VK_SUCCESS) + return result; + } - mtx_lock(&device->mutex); - memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs)); - mtx_unlock(&device->mutex); - - if (device->pdevice->caps.multisync) { - int ret = drmSyncobjWait(render_fd, (uint32_t *) &last_job_syncs.syncs, - 3, INT64_MAX, - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL); - if (ret) - return vk_queue_set_lost(&queue->vk, "Syncobj wait failed: %m"); + for (int i = 0; i < 3; i++) + queue->last_job_syncs.first[i] = false; } else { - int ret = - drmSyncobjWait(render_fd, &last_job_syncs.syncs[V3DV_QUEUE_ANY], 1, - INT64_MAX, 0, NULL); - if (ret) - return vk_queue_set_lost(&queue->vk, "Syncobj wait failed: %m"); + /* Without multisync, all the semaphores are baked into the one syncobj + * at the start of each submit so we only need to wait on the one. + */ + int ret = drmSyncobjWait(queue->device->pdevice->render_fd, + &queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], 1, + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + if (ret) { + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, + "syncobj wait failed: %m"); + } } return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueueWaitIdle(VkQueue _queue) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - - if (vk_device_is_lost(&queue->device->vk)) - return VK_ERROR_DEVICE_LOST; - - /* Check that we don't have any wait threads running in the CPU first, - * as these can spawn new GPU jobs. - */ - cpu_queue_wait_idle(queue); - - /* Check we don't have any GPU jobs running */ - return gpu_queue_wait_idle(queue); -} - static VkResult -handle_reset_query_cpu_job(struct v3dv_job *job) +handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset; assert(info->pool); @@ -165,12 +133,9 @@ handle_reset_query_cpu_job(struct v3dv_job *job) /* We are about to reset query counters so we need to make sure that * The GPU is not using them. The exception is timestamp queries, since * we handle those in the CPU. - * - * FIXME: we could avoid blocking the main thread for this if we use - * submission thread. */ if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) - v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE); v3dv_reset_query_pools(job->device, info->pool, info->first, info->count); @@ -209,10 +174,6 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a - * sync wait on the CPU for the corresponding GPU jobs to finish. We might - * want to use a submission thread to avoid blocking on the main thread. - */ uint8_t *offset = ((uint8_t *) bo->map) + info->offset + info->dst->mem_offset; v3dv_get_query_pool_results_cpu(job->device, @@ -227,7 +188,8 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult -handle_set_event_cpu_job(struct v3dv_job *job) +handle_set_event_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { /* From the Vulkan 1.0 spec: * @@ -246,13 +208,7 @@ handle_set_event_cpu_job(struct v3dv_job *job) * submission thread. 
*/ - /* If we are calling this from a wait thread it will only wait - * wait threads sspawned before it, otherwise it will wait for - * all active threads to complete. - */ - cpu_queue_wait_idle(&job->device->queue); - - VkResult result = gpu_queue_wait_idle(&job->device->queue); + VkResult result = queue_wait_idle(queue, sync_info); if (result != VK_SUCCESS) return result; @@ -262,99 +218,6 @@ handle_set_event_cpu_job(struct v3dv_job *job) return VK_SUCCESS; } -static VkResult -copy_semaphores(struct v3dv_device *device, - VkSemaphore *sems_src, uint32_t sems_src_count, - VkSemaphore **sems_dst, uint32_t *sems_dst_count) -{ - *sems_dst_count = sems_src_count; - - if (*sems_dst_count == 0) { - *sems_dst = NULL; - return VK_SUCCESS; - } - - *sems_dst = vk_alloc(&device->vk.alloc, - *sems_dst_count * sizeof(VkSemaphore), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!sems_dst) { - *sems_dst_count = 0; - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - memcpy(*sems_dst, sems_src, *sems_dst_count * sizeof(VkSemaphore)); - - return VK_SUCCESS; -} - -static struct v3dv_submit_info_semaphores * -copy_semaphores_info(struct v3dv_device *device, - struct v3dv_submit_info_semaphores *info) -{ - VkResult result; - struct v3dv_submit_info_semaphores *info_copy = - vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_submit_info_semaphores), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!info_copy) - return NULL; - - result = copy_semaphores(device, info->wait_sems, info->wait_sem_count, - &info_copy->wait_sems, &info_copy->wait_sem_count); - if (result != VK_SUCCESS) - goto fail; - - result = copy_semaphores(device, info->signal_sems, info->signal_sem_count, - &info_copy->signal_sems, - &info_copy->signal_sem_count); - if (result != VK_SUCCESS) - goto fail; - - return info_copy; - -fail: - if (info_copy->wait_sem_count > 0) - vk_free(&device->vk.alloc, info_copy->wait_sems); - vk_free(&device->vk.alloc, info_copy); - - return NULL; -} - -static struct v3dv_wait_thread_info * -create_wait_thread_info(struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) -{ - struct v3dv_wait_thread_info *info = - vk_alloc(&job->device->vk.alloc, sizeof(*info), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!info) - return NULL; - - info->job = job; - info->sems_info = copy_semaphores_info(job->device, sems_info); - if (!info->sems_info) { - vk_free(&job->device->vk.alloc, info); - return NULL; - } - - return info; -} - -static void -free_wait_thread_info(struct v3dv_device *device, - struct v3dv_wait_thread_info *info) -{ - assert(info != NULL); - - if (info->sems_info->wait_sem_count > 0) - vk_free(&device->vk.alloc, info->sems_info->wait_sems); - - if (info->sems_info->signal_sem_count > 0) - vk_free(&device->vk.alloc, info->sems_info->signal_sems); - - vk_free(&device->vk.alloc, info->sems_info); - vk_free(&device->vk.alloc, info); -} - static bool check_wait_events_complete(struct v3dv_job *job) { @@ -368,31 +231,9 @@ check_wait_events_complete(struct v3dv_job *job) return true; } -static void -wait_thread_finish(struct v3dv_queue *queue, pthread_t thread) +static VkResult +handle_wait_events_cpu_job(struct v3dv_job *job) { - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].thread == thread) { - info->wait_threads[i].finished = true; - goto done; - } - } - } - - unreachable(!"Failed to finish wait thread: not found"); - -done: - 
mtx_unlock(&queue->mutex); -} - -static void * -event_wait_thread_func(void *_info) -{ - struct v3dv_wait_thread_info *info = (struct v3dv_wait_thread_info *) _info; - struct v3dv_job *job = info->job; assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); /* Wait for events to be signaled */ @@ -400,101 +241,13 @@ event_wait_thread_func(void *_info) while (!check_wait_events_complete(job)) usleep(wait_interval_ms * 1000); - /* Now continue submitting pending jobs for the same command buffer after - * the wait job. - */ - struct v3dv_queue *queue = &job->device->queue; - list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next, - &job->cmd_buffer->jobs, list_link) { - /* We can't signal semaphores from wait threads because in this case - * we can't ensure job completion order any more (i.e. if the wait for - * events is in the first command buffer of a batch then the last job - * from the last command buffer in that batch can't signal). We always - * need to signal from the master thread in that case, when we know we - * are done submitting all jobs from all command buffers. - */ - pjob->do_sem_signal = false; - - /* We don't want to spawn more than one wait thread per command buffer. - * If this job also requires a wait for events, we will do the wait here. - */ - VkResult result = queue_submit_job(queue, pjob, info->sems_info, NULL); - if (result == VK_NOT_READY) { - while (!check_wait_events_complete(pjob)) { - usleep(wait_interval_ms * 1000); - } - result = VK_SUCCESS; - } - - if (result != VK_SUCCESS) { - fprintf(stderr, "Wait thread job execution failed.\n"); - goto done; - } - } - -done: - wait_thread_finish(queue, pthread_self()); - free_wait_thread_info(job->device, info); - return NULL; + return VK_SUCCESS; } static VkResult -spawn_event_wait_thread(struct v3dv_wait_thread_info *info, pthread_t *wait_thread) - -{ - assert(info->job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - assert(info->job->cmd_buffer); - assert(wait_thread != NULL); - - if (pthread_create(wait_thread, NULL, event_wait_thread_func, info)) - return vk_queue_set_lost(&info->job->device->queue.vk, - "Thread create failed: %m"); - - return VK_NOT_READY; -} - -static VkResult -handle_wait_events_cpu_job(struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info, - pthread_t *wait_thread) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - - /* If all events are signaled then we are done and can continue submitting - * the rest of the command buffer normally. - */ - if (check_wait_events_complete(job)) - return VK_SUCCESS; - - /* Otherwise, we put the rest of the command buffer on a wait thread until - * all events are signaled. We only spawn a new thread on the first - * wait job we see for a command buffer, any additional wait jobs in the - * same command buffer will run in that same wait thread and will get here - * with a NULL wait_thread pointer. - * - * Also, whether we spawn a wait thread or not, we always return - * VK_NOT_READY (unless an error happened), so we stop trying to submit - * any jobs in the same command buffer after the wait job. The wait thread - * will attempt to submit them after the wait completes. - */ - if (!wait_thread) - return VK_NOT_READY; - - /* As events can be signaled by the host, jobs after the event wait must - * still wait for semaphores, if any. So, whenever we spawn a wait thread, - * we keep a copy of the semaphores (info->sems_info) to be used when - * submitting pending jobs in the wait thread context. 
- */ - struct v3dv_wait_thread_info *info = - create_wait_thread_info(job, sems_info); - if (!info) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - return spawn_event_wait_thread(info, wait_thread); -} - -static VkResult -handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) +handle_copy_buffer_to_image_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE); struct v3dv_copy_buffer_to_image_cpu_job_info *info = @@ -503,7 +256,9 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) /* Wait for all GPU work to finish first, since we may be accessing * the BOs involved in the operation. */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); + VkResult result = queue_wait_idle(queue, sync_info); + if (result != VK_SUCCESS) + return result; /* Map BOs */ struct v3dv_bo *dst_bo = info->image->mem->bo; @@ -543,13 +298,16 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) } static VkResult -handle_timestamp_query_cpu_job(struct v3dv_job *job) +handle_timestamp_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY); struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp; /* Wait for completion of all work queued before the timestamp query */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); + VkResult result = queue_wait_idle(queue, sync_info); + if (result != VK_SUCCESS) + return result; mtx_lock(&job->device->query_mutex); @@ -574,7 +332,7 @@ handle_timestamp_query_cpu_job(struct v3dv_job *job) static VkResult handle_csd_indirect_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info) { assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT); struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect; @@ -604,60 +362,101 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, return VK_SUCCESS; } -static uint32_t -semaphore_get_sync(struct v3dv_semaphore *sem) +static VkResult +process_waits(struct v3dv_queue *queue, + uint32_t count, struct vk_sync_wait *waits) { - if (!sem->has_temp) - return sem->sync; + struct v3dv_device *device = queue->device; + VkResult result = VK_SUCCESS; + int err = 0; - assert(sem->temp_sync > 0); - return sem->temp_sync; -} + if (count == 0) + return VK_SUCCESS; -static uint32_t -fence_get_sync(struct v3dv_fence *fence) -{ - if (!fence->has_temp) - return fence->sync; + /* If multisync is supported, we wait on semaphores in the first job + * submitted to each of the individual queues. We don't need to + * pre-populate the syncobjs. 
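+    *
+    * Without multisync we fall through to the legacy path below, which in
+    * outline does:
+    *
+    *   drmSyncobjExportSyncFile(last_job_syncs[V3DV_QUEUE_ANY]) -> fd
+    *   for each wait:
+    *      drmSyncobjExportSyncFile(wait->syncobj) -> wait_fd
+    *      sync_accumulate(fd, wait_fd)
+    *   drmSyncobjImportSyncFile(last_job_syncs[V3DV_QUEUE_ANY], fd)
+    *
+    * so the one syncobj that every legacy submission waits on now also
+    * carries all of the incoming semaphore payloads.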
+ */ + if (queue->device->pdevice->caps.multisync) + return VK_SUCCESS; - assert(fence->temp_sync > 0); - return fence->temp_sync; + int fd = -1; + err = drmSyncobjExportSyncFile(device->pdevice->render_fd, + queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], + &fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file export failed: %m"); + goto fail; + } + + for (uint32_t i = 0; i < count; i++) { + uint32_t syncobj = vk_sync_as_drm_syncobj(waits[i].sync)->syncobj; + int wait_fd = -1; + + err = drmSyncobjExportSyncFile(device->pdevice->render_fd, + syncobj, &wait_fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file export failed: %m"); + goto fail; + } + + err = sync_accumulate("v3dv", &fd, wait_fd); + close(wait_fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file merge failed: %m"); + goto fail; + } + } + + err = drmSyncobjImportSyncFile(device->pdevice->render_fd, + queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], + fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file import failed: %m"); + } + +fail: + close(fd); + return result; } static VkResult -process_semaphores_to_signal(struct v3dv_device *device, - uint32_t count, const VkSemaphore *sems, - bool is_master_thread) +process_signals(struct v3dv_queue *queue, + uint32_t count, struct vk_sync_signal *signals) { + struct v3dv_device *device = queue->device; + if (count == 0) return VK_SUCCESS; /* If multisync is supported, we are signalling semaphores in the last job * of the last command buffer and, therefore, we do not need to process any - * semaphores here, unless we come from a wait thread, because in that case - * we never signal. + * semaphores here. */ - if (device->pdevice->caps.multisync && !is_master_thread) + if (device->pdevice->caps.multisync) return VK_SUCCESS; - int render_fd = device->pdevice->render_fd; - int fd; - mtx_lock(&device->mutex); - drmSyncobjExportSyncFile(render_fd, - device->last_job_syncs.syncs[V3DV_QUEUE_ANY], + drmSyncobjExportSyncFile(device->pdevice->render_fd, + queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], &fd); - mtx_unlock(&device->mutex); - if (fd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + if (fd == -1) { + return vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file export failed: %m"); + } VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < count; i++) { - struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]); - uint32_t sync = semaphore_get_sync(sem); - int ret = drmSyncobjImportSyncFile(render_fd, sync, fd); - if (ret) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; + uint32_t syncobj = vk_sync_as_drm_syncobj(signals[i].sync)->syncobj; + int err = drmSyncobjImportSyncFile(device->pdevice->render_fd, + syncobj, fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file import failed: %m"); break; } } @@ -668,59 +467,6 @@ process_semaphores_to_signal(struct v3dv_device *device, return result; } -static VkResult -queue_submit_noop_job(struct v3dv_queue *queue, - struct v3dv_submit_info_semaphores *sems_info, - bool do_sem_signal, bool serialize); - -static VkResult -process_fence_to_signal(struct v3dv_device *device, VkFence _fence) -{ - if (_fence == VK_NULL_HANDLE) - return VK_SUCCESS; - - struct v3dv_fence *fence = v3dv_fence_from_handle(_fence); - - int render_fd = device->pdevice->render_fd; - - if (device->pdevice->caps.multisync) { - struct v3dv_queue *queue = &device->queue; - /* We signal the fence once all submitted command buffers have completed - * execution. 
For this, we emit a noop job that waits on the completion - * of all submitted jobs and signal the fence for this submission. - * FIXME: In simpler cases (for instance, when all jobs were submitted to - * the same queue), we can just import the last out sync produced into - * the fence. - */ - struct v3dv_submit_info_semaphores sems_info = { - .wait_sem_count = 0, - .wait_sems = NULL, - .signal_sem_count = 0, - .signal_sems = NULL, - .fence = _fence, - }; - - return queue_submit_noop_job(queue, &sems_info, false, true); - } - - int fd; - mtx_lock(&device->mutex); - drmSyncobjExportSyncFile(render_fd, - device->last_job_syncs.syncs[V3DV_QUEUE_ANY], - &fd); - mtx_unlock(&device->mutex); - if (fd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - uint32_t sync = fence_get_sync(fence); - int ret = drmSyncobjImportSyncFile(render_fd, sync, fd); - - assert(fd >= 0); - close(fd); - - return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS; -} - static void multisync_free(struct v3dv_device *device, struct drm_v3d_multi_sync *ms) @@ -730,24 +476,25 @@ multisync_free(struct v3dv_device *device, } static struct drm_v3d_sem * -set_in_syncs(struct v3dv_device *device, +set_in_syncs(struct v3dv_queue *queue, struct v3dv_job *job, - enum v3dv_queue_type queue, + enum v3dv_queue_type queue_sync, uint32_t *count, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info) { - uint32_t n_sems = 0; + struct v3dv_device *device = queue->device; + uint32_t n_syncs = 0; /* If this is the first job submitted to a given GPU queue in this cmd buf * batch, it has to wait on wait semaphores (if any) before running. */ - if (device->last_job_syncs.first[queue]) - n_sems = sems_info->wait_sem_count; + if (queue->last_job_syncs.first[queue_sync]) + n_syncs = sync_info->wait_count; /* If the serialize flag is set, this job waits for completion of all GPU * jobs submitted in any queue V3DV_QUEUE_(CL/TFU/CSD) before running. */ - *count = n_sems + (job->serialize ? 3 : 0); + *count = n_syncs + (job->serialize ? 3 : 0); if (!*count) return NULL; @@ -759,51 +506,35 @@ set_in_syncs(struct v3dv_device *device, if (!syncs) return NULL; - for (int i = 0; i < n_sems; i++) { - struct v3dv_semaphore *sem = - v3dv_semaphore_from_handle(sems_info->wait_sems[i]); - syncs[i].handle = semaphore_get_sync(sem); - - /* From the Vulkan 1.0 spec: - * - * "If the import is temporary, the implementation must restore - * the semaphore to its prior permanent state after submitting - * the next semaphore wait operation." - * - * We can't destroy the temporary sync until the kernel is done - * with it, this is why we need to have this 'has_temp' flag instead - * of checking temp_sync for 0 to know if we have a temporary - * payload. The temporary sync will be destroyed if we import into - * the semaphore again or if the semaphore is destroyed by the - * client. 
- */ - sem->has_temp = false; + for (int i = 0; i < n_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj; } if (job->serialize) { for (int i = 0; i < 3; i++) - syncs[n_sems + i].handle = device->last_job_syncs.syncs[i]; + syncs[n_syncs + i].handle = queue->last_job_syncs.syncs[i]; } return syncs; } static struct drm_v3d_sem * -set_out_syncs(struct v3dv_device *device, +set_out_syncs(struct v3dv_queue *queue, struct v3dv_job *job, - enum v3dv_queue_type queue, + enum v3dv_queue_type queue_sync, uint32_t *count, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - uint32_t n_sems = job->do_sem_signal ? sems_info->signal_sem_count : 0; + struct v3dv_device *device = queue->device; + + uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0; /* We always signal the syncobj from `device->last_job_syncs` related to * this v3dv_queue_type to track the last job submitted to this queue. */ - (*count) = n_sems + 1; - - if (sems_info->fence) - (*count)++; + (*count) = n_vk_syncs + 1; struct drm_v3d_sem *syncs = vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem), @@ -812,20 +543,14 @@ set_out_syncs(struct v3dv_device *device, if (!syncs) return NULL; - if (n_sems) { - for (unsigned i = 0; i < n_sems; i++) { - struct v3dv_semaphore *sem = - v3dv_semaphore_from_handle(sems_info->signal_sems[i]); - syncs[i].handle = semaphore_get_sync(sem); + if (n_vk_syncs) { + for (unsigned i = 0; i < n_vk_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj; } } - syncs[n_sems].handle = device->last_job_syncs.syncs[queue]; - - if (sems_info->fence) { - struct v3dv_fence *fence = v3dv_fence_from_handle(sems_info->fence); - syncs[++n_sems].handle = fence_get_sync(fence); - } + syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync]; return syncs; } @@ -847,23 +572,25 @@ set_ext(struct drm_v3d_extension *ext, */ static void set_multisync(struct drm_v3d_multi_sync *ms, - struct v3dv_submit_info_semaphores *sems_info, + struct v3dv_submit_sync_info *sync_info, struct drm_v3d_extension *next, struct v3dv_device *device, struct v3dv_job *job, enum v3dv_queue_type queue_sync, - enum v3d_queue wait_stage) + enum v3d_queue wait_stage, + bool signal_syncs) { + struct v3dv_queue *queue = &device->queue; uint32_t out_sync_count = 0, in_sync_count = 0; struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL; - in_syncs = set_in_syncs(device, job, queue_sync, - &in_sync_count, sems_info); + in_syncs = set_in_syncs(queue, job, queue_sync, + &in_sync_count, sync_info); if (!in_syncs && in_sync_count) goto fail; - out_syncs = set_out_syncs(device, job, queue_sync, - &out_sync_count, sems_info); + out_syncs = set_out_syncs(queue, job, queue_sync, + &out_sync_count, sync_info, signal_syncs); assert(out_sync_count > 0); @@ -877,7 +604,7 @@ set_multisync(struct drm_v3d_multi_sync *ms, ms->in_sync_count = in_sync_count; ms->in_syncs = (uintptr_t)(void *)in_syncs; - device->last_job_syncs.first[queue_sync] = false; + queue->last_job_syncs.first[queue_sync] = false; return; @@ -892,7 +619,8 @@ fail: static VkResult handle_cl_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; @@ -949,23 +677,19 @@ handle_cl_job(struct v3dv_queue *queue, * dependencies strictly through barriers. 
*/ const bool needs_bcl_sync = - sems_info->wait_sem_count > 0 || job->needs_bcl_sync; + sync_info->wait_count > 0 || job->needs_bcl_sync; const bool needs_rcl_sync = job->serialize && !needs_bcl_sync; - mtx_lock(&queue->device->mutex); - /* Replace single semaphore settings whenever our kernel-driver supports * multiple semaphores extension. */ struct drm_v3d_multi_sync ms = { 0 }; if (device->pdevice->caps.multisync) { enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN; - set_multisync(&ms, sems_info, NULL, device, job, - V3DV_QUEUE_CL, wait_stage); - if (!ms.base.id) { - mtx_unlock(&queue->device->mutex); + set_multisync(&ms, sync_info, NULL, device, job, + V3DV_QUEUE_CL, wait_stage, signal_syncs); + if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } submit.flags |= DRM_V3D_SUBMIT_EXTENSION; submit.extensions = (uintptr_t)(void *)&ms; @@ -974,7 +698,7 @@ handle_cl_job(struct v3dv_queue *queue, submit.in_sync_bcl = 0; submit.out_sync = 0; } else { - uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY]; + uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY]; submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0; submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0; submit.out_sync = last_job_sync; @@ -983,7 +707,6 @@ handle_cl_job(struct v3dv_queue *queue, v3dv_clif_dump(device, job, &submit); int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -1004,25 +727,22 @@ handle_cl_job(struct v3dv_queue *queue, static VkResult handle_tfu_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; - const bool needs_sync = sems_info->wait_sem_count || job->serialize; - - mtx_lock(&device->mutex); + const bool needs_sync = sync_info->wait_count || job->serialize; /* Replace single semaphore settings whenever our kernel-driver supports * multiple semaphore extension. */ struct drm_v3d_multi_sync ms = { 0 }; if (device->pdevice->caps.multisync) { - set_multisync(&ms, sems_info, NULL, device, job, - V3DV_QUEUE_TFU, V3D_TFU); - if (!ms.base.id) { - mtx_unlock(&device->mutex); + set_multisync(&ms, sync_info, NULL, device, job, + V3DV_QUEUE_TFU, V3D_TFU, signal_syncs); + if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION; job->tfu.extensions = (uintptr_t)(void *)&ms; @@ -1030,13 +750,12 @@ handle_tfu_job(struct v3dv_queue *queue, job->tfu.in_sync = 0; job->tfu.out_sync = 0; } else { - uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY]; + uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY]; job->tfu.in_sync = needs_sync ? 
last_job_sync : 0; job->tfu.out_sync = last_job_sync; } int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu); - mtx_unlock(&device->mutex); multisync_free(device, &ms); @@ -1049,7 +768,8 @@ handle_tfu_job(struct v3dv_queue *queue, static VkResult handle_csd_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; @@ -1066,20 +786,17 @@ handle_csd_job(struct v3dv_queue *queue, assert(bo_idx == submit->bo_handle_count); submit->bo_handles = (uintptr_t)(void *)bo_handles; - const bool needs_sync = sems_info->wait_sem_count || job->serialize; + const bool needs_sync = sync_info->wait_count || job->serialize; - mtx_lock(&queue->device->mutex); /* Replace single semaphore settings whenever our kernel-driver supports * multiple semaphore extension. */ struct drm_v3d_multi_sync ms = { 0 }; if (device->pdevice->caps.multisync) { - set_multisync(&ms, sems_info, NULL, device, job, - V3DV_QUEUE_CSD, V3D_CSD); - if (!ms.base.id) { - mtx_unlock(&queue->device->mutex); + set_multisync(&ms, sync_info, NULL, device, job, + V3DV_QUEUE_CSD, V3D_CSD, signal_syncs); + if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } submit->flags |= DRM_V3D_SUBMIT_EXTENSION; submit->extensions = (uintptr_t)(void *)&ms; @@ -1087,13 +804,12 @@ handle_csd_job(struct v3dv_queue *queue, submit->in_sync = 0; submit->out_sync = 0; } else { - uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY]; + uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY]; submit->in_sync = needs_sync ? last_job_sync : 0; submit->out_sync = last_job_sync; } int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -1113,71 +829,34 @@ handle_csd_job(struct v3dv_queue *queue, } static VkResult -queue_submit_job(struct v3dv_queue *queue, +queue_handle_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info, - pthread_t *wait_thread) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - assert(job); - - /* CPU jobs typically execute explicit waits before they are processed. For - * example, a query reset CPU job will explicitly wait for the queries - * being unused before proceeding, etc. However, if we have any wait - * semaphores, we need to honour that too for the first CPU job we process - * in the command buffer batch. We do that by waiting for idle to ensure - * that any previous work has been completed, at which point any wait - * semaphores must be signalled, and we never need to do this again for the - * same batch. - * - * There is a corner case here when the semaphore has been imported from - * another instance/process. In that scenario, the Vulkan spec still requires - * that a signaling operation has been submitted before this semaphore wait - * but our wait for idle checks won't know about that submission (since they - * are based on the last jobs sent from our instance). To fix that we submit - * a noop job to "consume" the semaphores and then we wait for idle, which - * will ensure that our CPU job waits for the semaphores to be signaled even - * if they are signaled from another instance or process. 
- */ - if (!v3dv_job_type_is_gpu(job) && sems_info->wait_sem_count) { - queue_submit_noop_job(queue, sems_info, false, false); - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); -#ifdef DEBUG - /* Loop through wait sems and check they are all signalled */ - for (int i = 0; i < sems_info->wait_sem_count; i++) { - int render_fd = queue->device->pdevice->render_fd; - struct v3dv_semaphore *sem = - v3dv_semaphore_from_handle(sems_info->wait_sems[i]); - uint32_t sem_sync = semaphore_get_sync(sem); - int ret = drmSyncobjWait(render_fd, &sem_sync, 1, 0, 0, NULL); - assert(ret == 0); - } -#endif - sems_info->wait_sem_count = 0; - } - switch (job->type) { case V3DV_JOB_TYPE_GPU_CL: - return handle_cl_job(queue, job, sems_info); + return handle_cl_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_TFU: - return handle_tfu_job(queue, job, sems_info); + return handle_tfu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_CSD: - return handle_csd_job(queue, job, sems_info); + return handle_csd_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_RESET_QUERIES: - return handle_reset_query_cpu_job(job); + return handle_reset_query_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_END_QUERY: return handle_end_query_cpu_job(job); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: return handle_copy_query_results_cpu_job(job); case V3DV_JOB_TYPE_CPU_SET_EVENT: - return handle_set_event_cpu_job(job); + return handle_set_event_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_WAIT_EVENTS: - return handle_wait_events_cpu_job(job, sems_info, wait_thread); + return handle_wait_events_cpu_job(job); case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE: - return handle_copy_buffer_to_image_cpu_job(job); + return handle_copy_buffer_to_image_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: - return handle_csd_indirect_cpu_job(queue, job, sems_info); + return handle_csd_indirect_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY: - return handle_timestamp_query_cpu_job(job); + return handle_timestamp_query_cpu_job(queue, job, sync_info); default: unreachable("Unhandled job type"); } @@ -1195,887 +874,59 @@ queue_create_noop_job(struct v3dv_queue *queue) v3dv_X(device, job_emit_noop)(queue->noop_job); - return VK_SUCCESS; -} - -static VkResult -queue_submit_noop_job(struct v3dv_queue *queue, - struct v3dv_submit_info_semaphores *sems_info, - bool do_sem_signal, bool serialize) -{ - if (!do_sem_signal && !serialize && !sems_info->wait_sem_count) - return VK_SUCCESS; - - /* We need to protect noop_job against concurrent access. While - * the client must externally synchronize queue submissions, we - * may spawn threads that can submit noop jobs themselves. - */ - mtx_lock(&queue->noop_mutex); - if (!queue->noop_job) { - VkResult result = queue_create_noop_job(queue); - if (result != VK_SUCCESS) { - mtx_unlock(&queue->noop_mutex); - return result; - } - } - queue->noop_job->do_sem_signal = do_sem_signal; - queue->noop_job->serialize = serialize; - - VkResult result = - queue_submit_job(queue, queue->noop_job, sems_info, NULL); - - mtx_unlock(&queue->noop_mutex); - return result; -} - -/* This function takes a job type and returns True if we have - * previously submitted any jobs for the same command buffer batch - * to a queue different to the one for this job type. 
- */ -static bool -cmd_buffer_batch_is_multi_queue(struct v3dv_device *device, - enum v3dv_job_type job_type) -{ - enum v3dv_queue_type queue_type = V3DV_QUEUE_ANY; - struct v3dv_last_job_sync last_job_syncs; - - mtx_lock(&device->mutex); - memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs)); - mtx_unlock(&device->mutex); - - switch (job_type) { - case V3DV_JOB_TYPE_GPU_CL: - case V3DV_JOB_TYPE_GPU_CL_SECONDARY: - queue_type = V3DV_QUEUE_CL; - break; - case V3DV_JOB_TYPE_GPU_TFU: - queue_type = V3DV_QUEUE_TFU; - break; - case V3DV_JOB_TYPE_GPU_CSD: - queue_type = V3DV_QUEUE_CSD; - break; - default: - unreachable("Queue type is undefined"); - break; - } - - for (int i = 0; i < V3DV_QUEUE_ANY; i++) { - if (i != queue_type && !last_job_syncs.first[i]) { - return true; - } - } - - return false; -} - -static VkResult -queue_submit_cmd_buffer(struct v3dv_queue *queue, - struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_submit_info_semaphores *sems_info, - bool is_last_cmd_buffer, - pthread_t *wait_thread) -{ - struct v3dv_job *last; - bool do_sem_signal = is_last_cmd_buffer && sems_info->signal_sem_count > 0; - - assert(cmd_buffer); - assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE); - - if (list_is_empty(&cmd_buffer->jobs)) - return queue_submit_noop_job(queue, sems_info, do_sem_signal, false); - - /* When we are in the last cmd buffer and there are semaphores to signal, - * we process semaphores in the last job, following these conditions: - * - CPU-job: we can't signal until all GPU work has completed, so we - * submit a serialized noop GPU job to handle signaling when all on-going - * GPU work on all queues has completed. - * - GPU-job: can signal semaphores only if we have not submitted jobs to - * a queue other than the queue of this job. Otherwise, we submit a - * serialized noop job to handle signaling. - */ - if (do_sem_signal) { - last = list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link); - if (v3dv_job_type_is_gpu(last)) - last->do_sem_signal = true; - } - - list_for_each_entry_safe(struct v3dv_job, job, - &cmd_buffer->jobs, list_link) { - if (job->do_sem_signal && - cmd_buffer_batch_is_multi_queue(queue->device, job->type)) - job->do_sem_signal = false; - VkResult result = queue_submit_job(queue, job, sems_info, wait_thread); - if (result != VK_SUCCESS) - return result; - } - - /* If we are in the last cmd buffer batch, but the last job cannot handle - * signal semaphores, we emit a serialized noop_job for signalling. - */ - if (do_sem_signal && !(last && last->do_sem_signal)) - return queue_submit_noop_job(queue, sems_info, true, true); + queue->noop_job->serialize = true; return VK_SUCCESS; } -static void -add_wait_thread_to_list(struct v3dv_device *device, - pthread_t thread, - struct v3dv_queue_submit_wait_info **wait_info) +VkResult +v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit) { - /* If this is the first time we spawn a wait thread for this queue - * submission create a v3dv_queue_submit_wait_info to track this and - * any other threads in the same submission and add it to the global list - * in the queue. 
- */ - if (*wait_info == NULL) { - *wait_info = - vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - (*wait_info)->device = device; - } + struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk); + VkResult result; - /* And add the thread to the list of wait threads for this submission */ - const uint32_t thread_idx = (*wait_info)->wait_thread_count; - assert(thread_idx < 16); - (*wait_info)->wait_threads[thread_idx].thread = thread; - (*wait_info)->wait_threads[thread_idx].finished = false; - (*wait_info)->wait_thread_count++; -} - -static void -add_signal_semaphores_to_wait_list(struct v3dv_device *device, - const VkSubmitInfo *pSubmit, - struct v3dv_queue_submit_wait_info *wait_info) -{ - assert(wait_info); - - if (pSubmit->signalSemaphoreCount == 0) - return; - - /* Otherwise, we put all the semaphores in a list and we signal all of them - * together from the submit master thread when the last wait thread in the - * submit completes. - */ - - /* Check the size of the current semaphore list */ - const uint32_t prev_count = wait_info->signal_semaphore_count; - const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore); - VkSemaphore *prev_list = wait_info->signal_semaphores; - - /* Resize the list to hold the additional semaphores */ - const uint32_t extra_alloc_size = - pSubmit->signalSemaphoreCount * sizeof(VkSemaphore); - wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount; - wait_info->signal_semaphores = - vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - /* Copy the old list to the new allocation and free the old list */ - if (prev_count > 0) { - memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size); - vk_free(&device->vk.alloc, prev_list); - } - - /* Add the new semaphores to the list */ - memcpy(wait_info->signal_semaphores + prev_count, - pSubmit->pSignalSemaphores, extra_alloc_size); -} - -static VkResult -queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, - const VkSubmitInfo *pSubmit, - struct v3dv_queue_submit_wait_info **wait_info) -{ - VkResult result = VK_SUCCESS; - bool has_wait_threads = false; - - /* Wrap wait semaphores info from VkSubmitInfo to use it whenever we need - * the data to submit all jobs in the same command buffer batch. - */ - struct v3dv_submit_info_semaphores sems_info = { - .wait_sem_count = pSubmit->waitSemaphoreCount, - .wait_sems = (VkSemaphore *) pSubmit->pWaitSemaphores, - .signal_sem_count = pSubmit->signalSemaphoreCount, - .signal_sems = (VkSemaphore *) pSubmit->pSignalSemaphores, - .fence = 0, + struct v3dv_submit_sync_info sync_info = { + .wait_count = submit->wait_count, + .waits = submit->waits, + .signal_count = submit->signal_count, + .signals = submit->signals, }; - /* In the beginning of a cmd buffer batch, we set all last_job_syncs as - * first. It helps to determine wait semaphores conditions. - */ - for (unsigned i = 0; i < V3DV_QUEUE_COUNT; i++) - queue->device->last_job_syncs.first[i] = true; - - /* Even if we don't have any actual work to submit we still need to wait - * on the wait semaphores and signal the signal semaphores and fence, so - * in this scenario we just submit a trivial no-op job so we don't have - * to do anything special, it should not be a common case anyway. 
- */
-   if (pSubmit->commandBufferCount == 0) {
-      result = queue_submit_noop_job(queue, &sems_info,
-                                     sems_info.signal_sem_count > 0, false);
-   } else {
-      const uint32_t last_cmd_buffer_idx = pSubmit->commandBufferCount - 1;
-      for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
-         pthread_t wait_thread;
-         struct v3dv_cmd_buffer *cmd_buffer =
-            v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
-         result = queue_submit_cmd_buffer(queue, cmd_buffer, &sems_info,
-                                          (i == last_cmd_buffer_idx),
-                                          &wait_thread);
-
-         /* We get VK_NOT_READY if we had to spawn a wait thread for the
-          * command buffer. In that scenario, we want to continue submitting
-          * any pending command buffers in the batch, but we don't want to
-          * process any signal semaphores for the batch until we know we have
-          * submitted every job for every command buffer in the batch.
-          */
-         if (result == VK_NOT_READY) {
-            result = VK_SUCCESS;
-            add_wait_thread_to_list(queue->device, wait_thread, wait_info);
-            has_wait_threads = true;
-         }
-
-         if (result != VK_SUCCESS)
-            break;
-      }
-   }
+   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
+      queue->last_job_syncs.first[i] = true;
+   result = process_waits(queue, sync_info.wait_count, sync_info.waits);
    if (result != VK_SUCCESS)
       return result;

-   /* If we had to emit any wait threads in this submit, we need to wait for
-    * all of them to complete before we can signal any semaphores.
+   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
+      struct v3dv_cmd_buffer *cmd_buffer =
+         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
+      list_for_each_entry_safe(struct v3dv_job, job,
+                               &cmd_buffer->jobs, list_link) {
+
+         result = queue_handle_job(queue, job, &sync_info, false);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   }
+
+   /* Finish by submitting a no-op job that synchronizes across all queues.
+    * This will ensure that the signal semaphores don't get triggered until
+    * all work on any queue completes.
 */
-   if (!has_wait_threads) {
-      return process_semaphores_to_signal(queue->device,
-                                          pSubmit->signalSemaphoreCount,
-                                          pSubmit->pSignalSemaphores,
-                                          false);
-   } else {
-      assert(*wait_info);
-      add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info);
-      return VK_NOT_READY;
+   if (!queue->noop_job) {
+      result = queue_create_noop_job(queue);
+      if (result != VK_SUCCESS)
+         return result;
    }
-}
-
-static void *
-master_wait_thread_func(void *_wait_info)
-{
-   struct v3dv_queue_submit_wait_info *wait_info =
-      (struct v3dv_queue_submit_wait_info *) _wait_info;
-
-   struct v3dv_queue *queue = &wait_info->device->queue;
-
-   /* Wait for all command buffer wait threads to complete */
-   for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) {
-      int res = pthread_join(wait_info->wait_threads[i].thread, NULL);
-      if (res != 0)
-         fprintf(stderr, "Wait thread failed to join.\n");
-   }
-
-   /* Signal semaphores and fences */
-   VkResult result;
-   result = process_semaphores_to_signal(wait_info->device,
-                                         wait_info->signal_semaphore_count,
-                                         wait_info->signal_semaphores,
-                                         true);
+   result = queue_handle_job(queue, queue->noop_job, &sync_info, true);
    if (result != VK_SUCCESS)
-      fprintf(stderr, "Wait thread semaphore signaling failed.");
+      return result;

-   result = process_fence_to_signal(wait_info->device, wait_info->fence);
-   if (result != VK_SUCCESS)
-      fprintf(stderr, "Wait thread fence signaling failed.");
+   process_signals(queue, sync_info.signal_count, sync_info.signals);

-   /* Release wait_info */
-   mtx_lock(&queue->mutex);
-   list_del(&wait_info->list_link);
-   mtx_unlock(&queue->mutex);
-
-   vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores);
-   vk_free(&wait_info->device->vk.alloc, wait_info);
-
-   return NULL;
-}
-
-
-static VkResult
-spawn_master_wait_thread(struct v3dv_queue *queue,
-                         struct v3dv_queue_submit_wait_info *wait_info)
-
-{
-   VkResult result = VK_SUCCESS;
-
-   mtx_lock(&queue->mutex);
-   if (pthread_create(&wait_info->master_wait_thread, NULL,
-                      master_wait_thread_func, wait_info)) {
-      result = vk_queue_set_lost(&queue->vk, "Thread create failed: %m");
-      goto done;
-   }
-
-   list_addtail(&wait_info->list_link, &queue->submit_wait_list);
-
-done:
-   mtx_unlock(&queue->mutex);
-   return result;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_QueueSubmit(VkQueue _queue,
-                 uint32_t submitCount,
-                 const VkSubmitInfo* pSubmits,
-                 VkFence fence)
-{
-   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
-
-   if (vk_device_is_lost(&queue->device->vk))
-      return VK_ERROR_DEVICE_LOST;
-
-   struct v3dv_queue_submit_wait_info *wait_info = NULL;
-
-   VkResult result = VK_SUCCESS;
-   for (uint32_t i = 0; i < submitCount; i++) {
-      result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info);
-      if (result != VK_SUCCESS && result != VK_NOT_READY)
-         goto done;
-   }
-
-   if (!wait_info) {
-      assert(result != VK_NOT_READY);
-      result = process_fence_to_signal(queue->device, fence);
-      goto done;
-   }
-
-   /* We emitted wait threads, so we have to spawn a master thread for this
-    * queue submission that waits for all other threads to complete and then
-    * will signal any semaphores and fences.
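For readers following the new flow: the serialized no-op job above is what implements the cross-queue barrier. A CPU-side sketch of the equivalent wait, using the per-queue `last_job_syncs` this patch introduces (illustrative only; the real dependency is expressed to the kernel as in-syncobjs on the no-op submit rather than by blocking in userspace):

```c
#include <xf86drm.h>

/* Illustrative: block until the most recent job on every v3dv queue has
 * completed. The driver hands this same dependency to the kernel instead
 * of blocking on the CPU.
 */
static int
wait_all_queues_idle(struct v3dv_queue *queue)
{
   return drmSyncobjWait(queue->device->pdevice->render_fd,
                         queue->last_job_syncs.syncs, V3DV_QUEUE_COUNT,
                         INT64_MAX /* timeout_nsec */,
                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
}
```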
- */
-   assert(wait_info);
-   wait_info->fence = fence;
-   result = spawn_master_wait_thread(queue, wait_info);
-
-done:
-   return result;
-}
-
-static void
-destroy_syncobj(uint32_t device_fd, uint32_t *sync)
-{
-   assert(sync);
-   drmSyncobjDestroy(device_fd, *sync);
-   *sync = 0;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateSemaphore(VkDevice _device,
-                     const VkSemaphoreCreateInfo *pCreateInfo,
-                     const VkAllocationCallbacks *pAllocator,
-                     VkSemaphore *pSemaphore)
-{
-   V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);
-
-   struct v3dv_semaphore *sem =
-      vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore),
-                       VK_OBJECT_TYPE_SEMAPHORE);
-   if (sem == NULL)
-      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
-   int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync);
-   if (ret) {
-      vk_object_free(&device->vk, pAllocator, sem);
-      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-   }
-
-   *pSemaphore = v3dv_semaphore_to_handle(sem);
-
-   return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceExternalSemaphoreProperties(
-   VkPhysicalDevice physicalDevice,
-   const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
-   VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
-{
-   V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice);
-
-   switch (pExternalSemaphoreInfo->handleType) {
-   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
-   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
-      pExternalSemaphoreProperties->exportFromImportedHandleTypes =
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
-      pExternalSemaphoreProperties->compatibleHandleTypes =
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
-
-      /* We need to have multisync support in our kernel interface to support
-       * external semaphore imports because once we have an imported semaphore
-       * in our list of semaphores to wait on, we can no longer use the
-       * workaround of waiting on the last syncobj fence produced from the
-       * device, since the imported semaphore may not (and in fact, it would
-       * typically not) have been produced from the same device.
-       */
-      pExternalSemaphoreProperties->externalSemaphoreFeatures =
-         pdevice->caps.multisync ?
-         VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT : 0;
-
-      /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties
-       * for details on why we can't export to SYNC_FD.
- */ - if (pExternalSemaphoreInfo->handleType != - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { - pExternalSemaphoreProperties->externalSemaphoreFeatures |= - VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT; - } - break; - default: - pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; - pExternalSemaphoreProperties->compatibleHandleTypes = 0; - pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; - break; - } -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ImportSemaphoreFdKHR( - VkDevice _device, - const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); - - assert(pImportSemaphoreFdInfo->sType == - VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR); - - int fd = pImportSemaphoreFdInfo->fd; - int render_fd = device->pdevice->render_fd; - - bool is_temporary = - pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT || - (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT); - - uint32_t new_sync; - switch (pImportSemaphoreFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the - * special value -1 for fd is treated like a valid sync file descriptor - * referring to an object that has already signaled. The import - * operation will succeed and the VkSemaphore will have a temporarily - * imported payload as if a valid file descriptor had been provided." - */ - unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0; - if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (fd != -1) { - if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { - drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - } - break; - } - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { - if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); - break; - } - default: - return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - - if (sem->temp_sync) { - destroy_syncobj(render_fd, &sem->temp_sync); - sem->has_temp = false; - } - - if (is_temporary) { - sem->temp_sync = new_sync; - sem->has_temp = true; - } else { - destroy_syncobj(render_fd, &sem->sync); - sem->sync = new_sync; - } - - /* From the Vulkan 1.0.53 spec: - * - * "Importing a semaphore payload from a file descriptor transfers - * ownership of the file descriptor from the application to the - * Vulkan implementation. The application must not perform any - * operations on the file descriptor after a successful import." - * - * If the import fails, we leave the file descriptor open. 
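The OPAQUE_FD versus SYNC_FD asymmetry that the import code above and the FIXME below keep coming back to boils down to two different libdrm exports; a minimal sketch (error handling elided; `render_fd` and `syncobj` are assumed to come from the surrounding context):

```c
int opaque_fd = -1, sync_fd = -1;

/* OPAQUE_FD exports the syncobj container itself, so it works even while
 * no fence has been installed yet.
 */
drmSyncobjHandleToFD(render_fd, syncobj, &opaque_fd);

/* SYNC_FD exports only the syncobj's current fence, so it requires that a
 * submission has already materialized one.
 */
drmSyncobjExportSyncFile(render_fd, syncobj, &sync_fd);
```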
- */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSemaphoreFdKHR(VkDevice _device, - const VkSemaphoreGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external semaphore handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySemaphore(VkDevice _device, - VkSemaphore semaphore, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore); - - if (sem == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &sem->sync); - if (sem->temp_sync) - destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync); - - vk_object_free(&device->vk, pAllocator, sem); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateFence(VkDevice _device, - const VkFenceCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkFence *pFence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO); - - struct v3dv_fence *fence = - vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), - VK_OBJECT_TYPE_FENCE); - if (fence == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - unsigned flags = 0; - if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) - flags |= DRM_SYNCOBJ_CREATE_SIGNALED; - int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync); - if (ret) { - vk_object_free(&device->vk, pAllocator, fence); - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - *pFence = v3dv_fence_to_handle(fence); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceExternalFenceProperties( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo, - VkExternalFenceProperties *pExternalFenceProperties) - -{ - switch (pExternalFenceInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: - pExternalFenceProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->compatibleHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->externalFenceFeatures = - VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT; - - /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not - * the syncobj itself, and that fence is only created after we have - * submitted to the kernel and updated the syncobj for the fence to import - * the actual DRM fence created with the submission. 
Unfortunately, if the
-    * queue submission has a 'wait for events' we may hold any jobs after the
-    * wait in a user-space thread until the events are signaled, and in that
-    * case we don't update the out fence of the submit until the events are
-    * signaled and we can submit all the jobs involved with the vkQueueSubmit
-    * call. This means that if the application submits with an out fence and
-    * a wait for events, trying to export the out fence to a SYNC_FD right
-    * after the submission and before the events are signaled will fail,
-    * because the actual DRM fence won't exist yet. This is not a problem
-    * with OPAQUE_FD because in this case we export the entire syncobj, not
-    * the underlying DRM fence. To fix this we need to rework our kernel
-    * interface to be more flexible and accept multiple in/out syncobjs so
-    * we can implement event waits as regular fence waits on the kernel side;
-    * until then, we can only reliably export OPAQUE_FD.
-    */
-   if (pExternalFenceInfo->handleType !=
-       VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) {
-      pExternalFenceProperties->externalFenceFeatures |=
-         VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT;
-   }
-   break;
-   default:
-      pExternalFenceProperties->exportFromImportedHandleTypes = 0;
-      pExternalFenceProperties->compatibleHandleTypes = 0;
-      pExternalFenceProperties->externalFenceFeatures = 0;
-      break;
-   }
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ImportFenceFdKHR(VkDevice _device,
-                      const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
-{
-   V3DV_FROM_HANDLE(v3dv_device, device, _device);
-   V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence);
-
-   assert(pImportFenceFdInfo->sType ==
-          VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);
-
-   int fd = pImportFenceFdInfo->fd;
-   int render_fd = device->pdevice->render_fd;
-
-   bool is_temporary =
-      pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT ||
-      (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT);
-
-   uint32_t new_sync;
-   switch (pImportFenceFdInfo->handleType) {
-   case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
-      /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
-       * special value -1 for fd is treated like a valid sync file descriptor
-       * referring to an object that has already signaled. The import
-       * operation will succeed and the VkFence will have a temporarily
-       * imported payload as if a valid file descriptor had been provided."
-       */
-      unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
-      if (drmSyncobjCreate(render_fd, flags, &new_sync))
-         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
-      if (fd != -1) {
-         if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) {
-            drmSyncobjDestroy(render_fd, new_sync);
-            return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-         }
-      }
-      break;
-   }
-   case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: {
-      if (drmSyncobjFDToHandle(render_fd, fd, &new_sync))
-         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-      break;
-   }
-   default:
-      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-   }
-
-   if (fence->temp_sync) {
-      destroy_syncobj(render_fd, &fence->temp_sync);
-      fence->has_temp = false;
-   }
-
-   if (is_temporary) {
-      fence->temp_sync = new_sync;
-      fence->has_temp = true;
-   } else {
-      destroy_syncobj(render_fd, &fence->sync);
-      fence->sync = new_sync;
-   }
-
-   /* From the Vulkan 1.0.53 spec:
-    *
-    *    "Importing a fence payload from a file descriptor transfers
-    *     ownership of the file descriptor from the application to the
-    *     Vulkan implementation.
The application must not perform any - * operations on the file descriptor after a successful import." - * - * If the import fails, we leave the file descriptor open. - */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyFence(VkDevice _device, - VkFence _fence, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - if (fence == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &fence->sync); - if (fence->temp_sync) - destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync); - - vk_object_free(&device->vk, pAllocator, fence); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - uint32_t sync = fence_get_sync(fence); - int ret = drmSyncobjWait(device->pdevice->render_fd, &sync, 1, - 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL); - if (ret == -ETIME) - return VK_NOT_READY; - else if (ret) - return vk_device_set_lost(&device->vk, "Syncobj wait failed: %m"); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceFdKHR(VkDevice _device, - const VkFenceGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external fence handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - int render_fd = device->pdevice->render_fd; - uint32_t reset_count = 0; - for (uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - /* From the Vulkan spec, section 'Importing Fence Payloads': - * - * "If the import is temporary, the fence will be restored to its - * permanent state the next time that fence is passed to - * vkResetFences. - * - * Note: Restoring a fence to its prior permanent payload is a - * distinct operation from resetting a fence payload." - * - * To restore the previous state, we just need to destroy the temporary. 
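Under the common framework this patch migrates to, the temporary-versus-permanent rule spelled out above is owned by the runtime. A hedged sketch of the replacement (the wrapper function is hypothetical; `vk_fence_reset_temporary()` is the same runtime helper this patch calls in the WSI hunk below):

```c
/* Hypothetical wrapper: dropping the temporary payload restores a vk_fence
 * to its permanent state; the runtime frees the temporary vk_sync for us,
 * and the common vkResetFences entrypoint handles the rest.
 */
static void
fence_restore_permanent(struct vk_device *device, struct vk_fence *fence)
{
   vk_fence_reset_temporary(device, fence);
}
```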
- */ - if (fence->has_temp) { - assert(fence->temp_sync); - destroy_syncobj(render_fd, &fence->temp_sync); - fence->has_temp = false; - } else { - syncobjs[reset_count++] = fence->sync; - } - } - - int ret = 0; - if (reset_count > 0) - ret = drmSyncobjReset(render_fd, syncobjs, reset_count); - - vk_free(&device->vk.alloc, syncobjs); - - if (ret) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_WaitForFences(VkDevice _device, - uint32_t fenceCount, - const VkFence *pFences, - VkBool32 waitAll, - uint64_t timeout) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - const uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - for (uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - syncobjs[i] = fence_get_sync(fence); - } - - unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; - if (waitAll) - flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL; - - int ret; - do { - ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount, - timeout, flags, NULL); - } while (ret == -ETIME && os_time_get_nano() < abs_timeout); - - vk_free(&device->vk.alloc, syncobjs); - - if (ret == -ETIME) - return VK_TIMEOUT; - else if (ret) - return vk_device_set_lost(&device->vk, "Syncobj wait failed: %m"); return VK_SUCCESS; } diff --git a/src/broadcom/vulkan/v3dv_wsi.c b/src/broadcom/vulkan/v3dv_wsi.c index a7dad11cc6f..ad77d3970e9 100644 --- a/src/broadcom/vulkan/v3dv_wsi.c +++ b/src/broadcom/vulkan/v3dv_wsi.c @@ -29,6 +29,9 @@ #include "vk_util.h" #include "wsi_common.h" #include "wsi_common_drm.h" +#include "vk_fence.h" +#include "vk_semaphore.h" +#include "vk_sync_dummy.h" static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) @@ -146,26 +149,39 @@ v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) } VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImage2KHR( - VkDevice _device, - const VkAcquireNextImageInfoKHR* pAcquireInfo, - uint32_t* pImageIndex) +v3dv_AcquireNextImage2KHR(VkDevice _device, + const VkAcquireNextImageInfoKHR *pAcquireInfo, + uint32_t *pImageIndex) { V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pAcquireInfo->fence); - V3DV_FROM_HANDLE(v3dv_semaphore, semaphore, pAcquireInfo->semaphore); + VK_FROM_HANDLE(vk_fence, fence, pAcquireInfo->fence); + VK_FROM_HANDLE(vk_semaphore, semaphore, pAcquireInfo->semaphore); - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; - VkResult result; - result = wsi_common_acquire_next_image2(&pdevice->wsi_device, _device, - pAcquireInfo, pImageIndex); + VkResult result = wsi_common_acquire_next_image2( + &pdevice->wsi_device, _device, pAcquireInfo, pImageIndex); + /* signal fence/semaphore - image is available immediately */ if (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR) { - if (fence) - drmSyncobjSignal(pdevice->render_fd, &fence->sync, 1); - if (semaphore) - drmSyncobjSignal(pdevice->render_fd, &semaphore->sync, 1); + VkResult sync_res; + if (fence) { + vk_fence_reset_temporary(&device->vk, fence); + sync_res = 
vk_sync_create(&device->vk, &vk_sync_dummy_type, + 0 /* flags */, 1 /* initial_value */, + &fence->temporary); + if (sync_res != VK_SUCCESS) + return sync_res; + } + + if (semaphore) { + vk_semaphore_reset_temporary(&device->vk, semaphore); + sync_res = vk_sync_create(&device->vk, &vk_sync_dummy_type, + 0 /* flags */, 1 /* initial_value */, + &semaphore->temporary); + if (sync_res != VK_SUCCESS) + return sync_res; + } } return result;
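From the application's perspective, the dummy payload installed above makes the acquire fence and semaphore behave as already signaled by the time the call returns. A usage sketch under that assumption (handle creation and error handling elided):

```c
/* The fence passed to vkAcquireNextImage2KHR receives an already-signaled
 * dummy payload, so even a zero-timeout wait succeeds immediately.
 */
VkAcquireNextImageInfoKHR acquire_info = {
   .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
   .swapchain = swapchain,
   .timeout = UINT64_MAX,
   .semaphore = VK_NULL_HANDLE,
   .fence = fence,
   .deviceMask = 0x1,
};
uint32_t image_index;
vkAcquireNextImage2KHR(device, &acquire_info, &image_index);
vkWaitForFences(device, 1, &fence, VK_TRUE, 0 /* timeout */);
```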