diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
index f0ee1dc7797..c85007c3115 100644
--- a/src/broadcom/ci/broadcom-rpi4-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -352,7 +352,6 @@ spec@oes_texture_view@rendering-formats@clear GL_RGB10_A2 as GL_RGBA8I,Fail
 # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3510
 dEQP-VK.api.external.semaphore.opaque_fd.info_timeline,Fail
-dEQP-VK.api.external.semaphore.sync_fd.info_timeline,Fail
 dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero,Fail
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index abc7073a0b8..81d2e48203f 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -45,6 +45,7 @@
 #include "drm-uapi/v3d_drm.h"
 #include "format/u_format.h"
+#include "vk_drm_syncobj.h"
 #include "vk_util.h"
 #include "git_sha1.h"
@@ -844,6 +845,44 @@ physical_device_init(struct v3dv_physical_device *device,
    device->options.merge_jobs = getenv("V3DV_NO_MERGE_JOBS") == NULL;
+   device->drm_syncobj_type = vk_drm_syncobj_get_type(device->render_fd);
+
+   /* We don't support timelines in the uAPI yet and we don't want timeline
+    * support suddenly getting turned on by vk_drm_syncobj_get_type() without
+    * us adding v3dv code for it first.
+    */
+   device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE;
+
+   /* Sync file export is incompatible with the current model of execution
+    * where some jobs may run on the CPU. There are CTS tests which do the
+    * following:
+    *
+    *   1. Create a command buffer with a vkCmdWaitEvents()
+    *   2. Submit the command buffer
+    *   3. vkGetSemaphoreFdKHR() to try to get a sync_file
+    *   4. vkSetEvent()
+    *
+    * This deadlocks because we have to wait for the syncobj to get a real
+    * fence in vkGetSemaphoreFdKHR(), which only happens after all the work
+    * from the command buffer is complete, which in turn only happens after
+    * vkSetEvent(). No amount of CPU threading in userspace will ever fix
+    * this. Sadly, this is pretty explicitly allowed by the Vulkan spec:
+    *
+    *    VUID-vkCmdWaitEvents-pEvents-01163
+    *
+    *    "If pEvents includes one or more events that will be signaled by
+    *    vkSetEvent after commandBuffer has been submitted to a queue, then
+    *    vkCmdWaitEvents must not be called inside a render pass instance"
+    *
+    * Disable sync file support for now.
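+    *
+    * A sketch of the two halves of the deadlock (assuming the common
+    * runtime's threaded-submit model; not the exact call chain):
+    *
+    *   app thread:    vkGetSemaphoreFdKHR()
+    *                    -> blocks until the syncobj has a real fence,
+    *                       which only the submit thread can attach
+    *   submit thread: vkCmdWaitEvents() CPU job
+    *                    -> blocks until vkSetEvent(), which the app only
+    *                       calls after vkGetSemaphoreFdKHR() returns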
+ */ + device->drm_syncobj_type.import_sync_file = NULL; + device->drm_syncobj_type.export_sync_file = NULL; + + device->sync_types[0] = &device->drm_syncobj_type; + device->sync_types[1] = NULL; + device->vk.supported_sync_types = device->sync_types; + result = v3dv_wsi_init(device); if (result != VK_SUCCESS) { vk_error(instance, result); @@ -1845,6 +1884,17 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT); } +static void +destroy_queue_syncs(struct v3dv_queue *queue) +{ + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + if (queue->last_job_syncs.syncs[i]) { + drmSyncobjDestroy(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs[i]); + } + } +} + static VkResult queue_init(struct v3dv_device *device, struct v3dv_queue *queue, const VkDeviceQueueCreateInfo *create_info, @@ -1854,23 +1904,43 @@ queue_init(struct v3dv_device *device, struct v3dv_queue *queue, index_in_family); if (result != VK_SUCCESS) return result; + + result = vk_queue_enable_submit_thread(&queue->vk); + if (result != VK_SUCCESS) + goto fail_submit_thread; + queue->device = device; + queue->vk.driver_submit = v3dv_queue_driver_submit; + + for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { + queue->last_job_syncs.first[i] = true; + int ret = drmSyncobjCreate(device->pdevice->render_fd, + DRM_SYNCOBJ_CREATE_SIGNALED, + &queue->last_job_syncs.syncs[i]); + if (ret) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "syncobj create failed: %m"); + goto fail_last_job_syncs; + } + } + queue->noop_job = NULL; - list_inithead(&queue->submit_wait_list); - mtx_init(&queue->mutex, mtx_plain); - mtx_init(&queue->noop_mutex, mtx_plain); return VK_SUCCESS; + +fail_last_job_syncs: + destroy_queue_syncs(queue); +fail_submit_thread: + vk_queue_finish(&queue->vk); + return result; } static void queue_finish(struct v3dv_queue *queue) { - vk_queue_finish(&queue->vk); - assert(list_is_empty(&queue->submit_wait_list)); if (queue->noop_job) v3dv_job_destroy(queue->noop_job); - mtx_destroy(&queue->mutex); - mtx_destroy(&queue->noop_mutex); + destroy_queue_syncs(queue); + vk_queue_finish(&queue->vk); } static void @@ -1882,16 +1952,6 @@ init_device_meta(struct v3dv_device *device) v3dv_meta_texel_buffer_copy_init(device); } -static void -destroy_device_syncs(struct v3dv_device *device, - int render_fd) -{ - for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { - if (device->last_job_syncs.syncs[i]) - drmSyncobjDestroy(render_fd, device->last_job_syncs.syncs[i]); - } -} - static void destroy_device_meta(struct v3dv_device *device) { @@ -1944,10 +2004,12 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, device->instance = instance; device->pdevice = physical_device; - mtx_init(&device->mutex, mtx_plain); mtx_init(&device->query_mutex, mtx_plain); cnd_init(&device->query_ended); + vk_device_set_drm_fd(&device->vk, physical_device->render_fd); + vk_device_enable_threaded_submit(&device->vk); + result = queue_init(device, &device->queue, pCreateInfo->pQueueCreateInfos, 0); if (result != VK_SUCCESS) @@ -1973,17 +2035,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, if (device->features.robustBufferAccess) perf_debug("Device created with Robust Buffer Access enabled.\n"); - for (int i = 0; i < V3DV_QUEUE_COUNT; i++) { - device->last_job_syncs.first[i] = true; - int ret = drmSyncobjCreate(physical_device->render_fd, - DRM_SYNCOBJ_CREATE_SIGNALED, - &device->last_job_syncs.syncs[i]); - if (ret) { - result = VK_ERROR_INITIALIZATION_FAILED; - goto fail; - 
} - } - #ifdef DEBUG v3dv_X(device, device_check_prepacked_sizes)(); #endif @@ -1999,10 +2050,8 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, return VK_SUCCESS; fail: - destroy_device_syncs(device, physical_device->render_fd); cnd_destroy(&device->query_ended); mtx_destroy(&device->query_mutex); - mtx_destroy(&device->mutex); vk_device_finish(&device->vk); vk_free(&device->vk.alloc, device); @@ -2015,10 +2064,8 @@ v3dv_DestroyDevice(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); - v3dv_DeviceWaitIdle(_device); + device->vk.dispatch_table.DeviceWaitIdle(_device); queue_finish(&device->queue); - mtx_destroy(&device->mutex); - destroy_device_syncs(device, device->pdevice->render_fd); destroy_device_meta(device); v3dv_pipeline_cache_finish(&device->default_pipeline_cache); @@ -2039,17 +2086,6 @@ v3dv_DestroyDevice(VkDevice _device, vk_free2(&device->vk.alloc, pAllocator, device); } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_DeviceWaitIdle(VkDevice _device) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue)); -} - static VkResult device_alloc(struct v3dv_device *device, struct v3dv_device_memory *mem, diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index c0b313c4f77..950fd3329c8 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -43,6 +43,7 @@ #include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" +#include "vk_sync.h" #include "vk_util.h" #include "vk_command_buffer.h" @@ -140,6 +141,9 @@ struct v3dv_physical_device { uint8_t device_uuid[VK_UUID_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; + struct vk_sync_type drm_syncobj_type; + const struct vk_sync_type *sync_types[2]; + struct disk_cache *disk_cache; mtx_t mutex; @@ -219,34 +223,30 @@ struct v3dv_instance { bool default_pipeline_cache_enabled; }; -/* Tracks wait threads spawned from a single vkQueueSubmit call */ -struct v3dv_queue_submit_wait_info { - /* struct vk_object_base base; ?*/ - struct list_head list_link; +/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd, + * tfu), we still need a syncobj to track the last overall job submitted + * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can + * start expecting multisync to be present and drop the legacy implementation + * together with this V3DV_QUEUE_ANY tracker. + */ +enum v3dv_queue_type { + V3DV_QUEUE_CL = 0, + V3DV_QUEUE_CSD, + V3DV_QUEUE_TFU, + V3DV_QUEUE_ANY, + V3DV_QUEUE_COUNT, +}; - struct v3dv_device *device; - - /* List of wait threads spawned for any command buffers in a particular - * call to vkQueueSubmit. - */ - uint32_t wait_thread_count; - struct { - pthread_t thread; - bool finished; - } wait_threads[16]; - - /* The master wait thread for the entire submit. This will wait for all - * other threads in this submit to complete before processing signal - * semaphores and fences. - */ - pthread_t master_wait_thread; - - /* List of semaphores (and fence) to signal after all wait threads completed - * and all command buffer jobs in the submission have been sent to the GPU. - */ - uint32_t signal_semaphore_count; - VkSemaphore *signal_semaphores; - VkFence fence; +/* For each GPU queue, we use a syncobj to track the last job submitted. 
We + * set the flag `first` to determine when we are starting a new cmd buffer + * batch and therefore a job submitted to a given queue will be the first in a + * cmd buf batch. + */ +struct v3dv_last_job_sync { + /* If the job is the first submitted to a GPU queue in a cmd buffer batch */ + bool first[V3DV_QUEUE_COUNT]; + /* Array of syncobj to track the last job submitted to a GPU queue */ + uint32_t syncs[V3DV_QUEUE_COUNT]; }; struct v3dv_queue { @@ -254,18 +254,14 @@ struct v3dv_queue { struct v3dv_device *device; - /* A list of active v3dv_queue_submit_wait_info */ - struct list_head submit_wait_list; - - /* A mutex to prevent concurrent access to the list of wait threads */ - mtx_t mutex; - - /* A mutex to prevent concurrent noop job submissions */ - mtx_t noop_mutex; + struct v3dv_last_job_sync last_job_syncs; struct v3dv_job *noop_job; }; +VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit); + #define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) #define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \ sizeof(VkComponentMapping)) @@ -438,32 +434,6 @@ struct v3dv_pipeline_cache { bool externally_synchronized; }; -/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd, - * tfu), we still need a syncobj to track the last overall job submitted - * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can - * start expecting multisync to be present and drop the legacy implementation - * together with this V3DV_QUEUE_ANY tracker. - */ -enum v3dv_queue_type { - V3DV_QUEUE_CL = 0, - V3DV_QUEUE_CSD, - V3DV_QUEUE_TFU, - V3DV_QUEUE_ANY, - V3DV_QUEUE_COUNT, -}; - -/* For each GPU queue, we use a syncobj to track the last job submitted. We - * set the flag `first` to determine when we are starting a new cmd buffer - * batch and therefore a job submitted to a given queue will be the first in a - * cmd buf batch. 
- */ -struct v3dv_last_job_sync { - /* If the job is the first submitted to a GPU queue in a cmd buffer batch */ - bool first[V3DV_QUEUE_COUNT]; - /* Array of syncobj to track the last job submitted to a GPU queue */ - uint32_t syncs[V3DV_QUEUE_COUNT]; -}; - struct v3dv_device { struct vk_device vk; @@ -473,12 +443,6 @@ struct v3dv_device { struct v3d_device_info devinfo; struct v3dv_queue queue; - /* Syncobjs to track the last job submitted to any GPU queue */ - struct v3dv_last_job_sync last_job_syncs; - - /* A mutex to prevent concurrent access to last_job_sync from the queue */ - mtx_t mutex; - /* Guards query->maybe_available and value for timestamps */ mtx_t query_mutex; @@ -1001,17 +965,14 @@ struct v3dv_copy_query_results_cpu_job_info { VkQueryResultFlags flags; }; -struct v3dv_submit_info_semaphores { - /* List of semaphores to wait before running a job */ - uint32_t wait_sem_count; - VkSemaphore *wait_sems; +struct v3dv_submit_sync_info { + /* List of syncs to wait before running a job */ + uint32_t wait_count; + struct vk_sync_wait *waits; - /* List of semaphores to signal when all jobs complete */ - uint32_t signal_sem_count; - VkSemaphore *signal_sems; - - /* A fence to signal when all jobs complete */ - VkFence fence; + /* List of syncs to signal when all jobs complete */ + uint32_t signal_count; + struct vk_sync_signal *signals; }; struct v3dv_event_set_cpu_job_info { @@ -1122,9 +1083,6 @@ struct v3dv_job { /* Whether we need to serialize this job in our command stream */ bool serialize; - /* Whether this job is in charge of signalling semaphores */ - bool do_sem_signal; - /* If this is a CL job, whether we should sync before binning */ bool needs_bcl_sync; @@ -1156,7 +1114,7 @@ struct v3dv_wait_thread_info { struct v3dv_job *job; /* Semaphores info for any postponed jobs after a wait event */ - struct v3dv_submit_info_semaphores *sems_info; + struct v3dv_submit_sync_info *sync_info; }; void v3dv_job_init(struct v3dv_job *job, @@ -1514,28 +1472,6 @@ void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, uint64_t obj, v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb); -struct v3dv_semaphore { - struct vk_object_base base; - - /* A syncobject handle associated with this semaphore */ - uint32_t sync; - - /* A temporary syncobject handle produced from a vkImportSemaphoreFd. */ - uint32_t temp_sync; - bool has_temp; -}; - -struct v3dv_fence { - struct vk_object_base base; - - /* A syncobject handle associated with this fence */ - uint32_t sync; - - /* A temporary syncobject handle produced from a vkImportFenceFd. 
*/ - uint32_t temp_sync; - bool has_temp; -}; - struct v3dv_event { struct vk_object_base base; int state; @@ -2210,7 +2146,6 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, base, VkDescriptorUpdateTemplate, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) -VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer, VK_OBJECT_TYPE_FRAMEBUFFER) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage, @@ -2229,8 +2164,6 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass, VK_OBJECT_TYPE_RENDER_PASS) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler, VK_OBJECT_TYPE_SAMPLER) -VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, base, VkSemaphore, - VK_OBJECT_TYPE_SEMAPHORE) static inline int v3dv_ioctl(int fd, unsigned long request, void *arg) diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index d5501371388..244f0166f02 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -25,7 +25,9 @@ #include "drm-uapi/v3d_drm.h" #include "broadcom/clif/clif_dump.h" +#include "util/libsync.h" #include "util/os_time.h" +#include "vk_drm_syncobj.h" #include #include @@ -69,95 +71,61 @@ v3dv_clif_dump(struct v3dv_device *device, } static VkResult -queue_submit_job(struct v3dv_queue *queue, - struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info, - pthread_t *wait_thread); - -/* Waits for active CPU wait threads spawned before the current thread to - * complete and submit all their GPU jobs. - */ -static void -cpu_queue_wait_idle(struct v3dv_queue *queue) +queue_wait_idle(struct v3dv_queue *queue, + struct v3dv_submit_sync_info *sync_info) { - const pthread_t this_thread = pthread_self(); - -retry: - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].finished) - continue; - - /* Because we are testing this against the list of spawned threads - * it will never match for the main thread, so when we call this from - * the main thread we are effectively waiting for all active threads - * to complete, and otherwise we are only waiting for work submitted - * before the wait thread that called this (a wait thread should never - * be waiting for work submitted after it). - */ - if (info->wait_threads[i].thread == this_thread) - goto done; - - /* Wait and try again */ - mtx_unlock(&queue->mutex); - usleep(500); /* 0.5 ms */ - goto retry; + if (queue->device->pdevice->caps.multisync) { + int ret = drmSyncobjWait(queue->device->pdevice->render_fd, + queue->last_job_syncs.syncs, 3, + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + if (ret) { + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, + "syncobj wait failed: %m"); } - } -done: - mtx_unlock(&queue->mutex); -} + bool first = true; + for (int i = 0; i < 3; i++) { + if (!queue->last_job_syncs.first[i]) + first = false; + } -static VkResult -gpu_queue_wait_idle(struct v3dv_queue *queue) -{ - struct v3dv_device *device = queue->device; - int render_fd = device->pdevice->render_fd; - struct v3dv_last_job_sync last_job_syncs; + /* If we're not the first job, that means we're waiting on some + * per-queue-type syncobj which transitively waited on the semaphores + * so we can skip the semaphore wait. 
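+       * (This works because set_in_syncs() folds the wait semaphores into
+       * the in_syncs of the first job submitted to each GPU queue, and
+       * every job signals its per-queue last_job_syncs entry, so the
+       * drmSyncobjWait() above already covers the semaphores transitively.)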
+ */ + if (first) { + VkResult result = vk_sync_wait_many(&queue->device->vk, + sync_info->wait_count, + sync_info->waits, + VK_SYNC_WAIT_COMPLETE, + UINT64_MAX); + if (result != VK_SUCCESS) + return result; + } - mtx_lock(&device->mutex); - memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs)); - mtx_unlock(&device->mutex); - - if (device->pdevice->caps.multisync) { - int ret = drmSyncobjWait(render_fd, (uint32_t *) &last_job_syncs.syncs, - 3, INT64_MAX, - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL); - if (ret) - return vk_queue_set_lost(&queue->vk, "Syncobj wait failed: %m"); + for (int i = 0; i < 3; i++) + queue->last_job_syncs.first[i] = false; } else { - int ret = - drmSyncobjWait(render_fd, &last_job_syncs.syncs[V3DV_QUEUE_ANY], 1, - INT64_MAX, 0, NULL); - if (ret) - return vk_queue_set_lost(&queue->vk, "Syncobj wait failed: %m"); + /* Without multisync, all the semaphores are baked into the one syncobj + * at the start of each submit so we only need to wait on the one. + */ + int ret = drmSyncobjWait(queue->device->pdevice->render_fd, + &queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], 1, + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + if (ret) { + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, + "syncobj wait failed: %m"); + } } return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueueWaitIdle(VkQueue _queue) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - - if (vk_device_is_lost(&queue->device->vk)) - return VK_ERROR_DEVICE_LOST; - - /* Check that we don't have any wait threads running in the CPU first, - * as these can spawn new GPU jobs. - */ - cpu_queue_wait_idle(queue); - - /* Check we don't have any GPU jobs running */ - return gpu_queue_wait_idle(queue); -} - static VkResult -handle_reset_query_cpu_job(struct v3dv_job *job) +handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset; assert(info->pool); @@ -165,12 +133,9 @@ handle_reset_query_cpu_job(struct v3dv_job *job) /* We are about to reset query counters so we need to make sure that * The GPU is not using them. The exception is timestamp queries, since * we handle those in the CPU. - * - * FIXME: we could avoid blocking the main thread for this if we use - * submission thread. */ if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) - v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE); v3dv_reset_query_pools(job->device, info->pool, info->first, info->count); @@ -209,10 +174,6 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a - * sync wait on the CPU for the corresponding GPU jobs to finish. We might - * want to use a submission thread to avoid blocking on the main thread. - */ uint8_t *offset = ((uint8_t *) bo->map) + info->offset + info->dst->mem_offset; v3dv_get_query_pool_results_cpu(job->device, @@ -227,7 +188,8 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult -handle_set_event_cpu_job(struct v3dv_job *job) +handle_set_event_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { /* From the Vulkan 1.0 spec: * @@ -246,13 +208,7 @@ handle_set_event_cpu_job(struct v3dv_job *job) * submission thread. 
*/ - /* If we are calling this from a wait thread it will only wait - * wait threads sspawned before it, otherwise it will wait for - * all active threads to complete. - */ - cpu_queue_wait_idle(&job->device->queue); - - VkResult result = gpu_queue_wait_idle(&job->device->queue); + VkResult result = queue_wait_idle(queue, sync_info); if (result != VK_SUCCESS) return result; @@ -262,99 +218,6 @@ handle_set_event_cpu_job(struct v3dv_job *job) return VK_SUCCESS; } -static VkResult -copy_semaphores(struct v3dv_device *device, - VkSemaphore *sems_src, uint32_t sems_src_count, - VkSemaphore **sems_dst, uint32_t *sems_dst_count) -{ - *sems_dst_count = sems_src_count; - - if (*sems_dst_count == 0) { - *sems_dst = NULL; - return VK_SUCCESS; - } - - *sems_dst = vk_alloc(&device->vk.alloc, - *sems_dst_count * sizeof(VkSemaphore), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!sems_dst) { - *sems_dst_count = 0; - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - memcpy(*sems_dst, sems_src, *sems_dst_count * sizeof(VkSemaphore)); - - return VK_SUCCESS; -} - -static struct v3dv_submit_info_semaphores * -copy_semaphores_info(struct v3dv_device *device, - struct v3dv_submit_info_semaphores *info) -{ - VkResult result; - struct v3dv_submit_info_semaphores *info_copy = - vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_submit_info_semaphores), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!info_copy) - return NULL; - - result = copy_semaphores(device, info->wait_sems, info->wait_sem_count, - &info_copy->wait_sems, &info_copy->wait_sem_count); - if (result != VK_SUCCESS) - goto fail; - - result = copy_semaphores(device, info->signal_sems, info->signal_sem_count, - &info_copy->signal_sems, - &info_copy->signal_sem_count); - if (result != VK_SUCCESS) - goto fail; - - return info_copy; - -fail: - if (info_copy->wait_sem_count > 0) - vk_free(&device->vk.alloc, info_copy->wait_sems); - vk_free(&device->vk.alloc, info_copy); - - return NULL; -} - -static struct v3dv_wait_thread_info * -create_wait_thread_info(struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) -{ - struct v3dv_wait_thread_info *info = - vk_alloc(&job->device->vk.alloc, sizeof(*info), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!info) - return NULL; - - info->job = job; - info->sems_info = copy_semaphores_info(job->device, sems_info); - if (!info->sems_info) { - vk_free(&job->device->vk.alloc, info); - return NULL; - } - - return info; -} - -static void -free_wait_thread_info(struct v3dv_device *device, - struct v3dv_wait_thread_info *info) -{ - assert(info != NULL); - - if (info->sems_info->wait_sem_count > 0) - vk_free(&device->vk.alloc, info->sems_info->wait_sems); - - if (info->sems_info->signal_sem_count > 0) - vk_free(&device->vk.alloc, info->sems_info->signal_sems); - - vk_free(&device->vk.alloc, info->sems_info); - vk_free(&device->vk.alloc, info); -} - static bool check_wait_events_complete(struct v3dv_job *job) { @@ -368,31 +231,9 @@ check_wait_events_complete(struct v3dv_job *job) return true; } -static void -wait_thread_finish(struct v3dv_queue *queue, pthread_t thread) +static VkResult +handle_wait_events_cpu_job(struct v3dv_job *job) { - mtx_lock(&queue->mutex); - list_for_each_entry(struct v3dv_queue_submit_wait_info, info, - &queue->submit_wait_list, list_link) { - for (uint32_t i = 0; i < info->wait_thread_count; i++) { - if (info->wait_threads[i].thread == thread) { - info->wait_threads[i].finished = true; - goto done; - } - } - } - - unreachable(!"Failed to finish wait thread: not found"); - -done: - 
mtx_unlock(&queue->mutex); -} - -static void * -event_wait_thread_func(void *_info) -{ - struct v3dv_wait_thread_info *info = (struct v3dv_wait_thread_info *) _info; - struct v3dv_job *job = info->job; assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); /* Wait for events to be signaled */ @@ -400,101 +241,13 @@ event_wait_thread_func(void *_info) while (!check_wait_events_complete(job)) usleep(wait_interval_ms * 1000); - /* Now continue submitting pending jobs for the same command buffer after - * the wait job. - */ - struct v3dv_queue *queue = &job->device->queue; - list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next, - &job->cmd_buffer->jobs, list_link) { - /* We can't signal semaphores from wait threads because in this case - * we can't ensure job completion order any more (i.e. if the wait for - * events is in the first command buffer of a batch then the last job - * from the last command buffer in that batch can't signal). We always - * need to signal from the master thread in that case, when we know we - * are done submitting all jobs from all command buffers. - */ - pjob->do_sem_signal = false; - - /* We don't want to spawn more than one wait thread per command buffer. - * If this job also requires a wait for events, we will do the wait here. - */ - VkResult result = queue_submit_job(queue, pjob, info->sems_info, NULL); - if (result == VK_NOT_READY) { - while (!check_wait_events_complete(pjob)) { - usleep(wait_interval_ms * 1000); - } - result = VK_SUCCESS; - } - - if (result != VK_SUCCESS) { - fprintf(stderr, "Wait thread job execution failed.\n"); - goto done; - } - } - -done: - wait_thread_finish(queue, pthread_self()); - free_wait_thread_info(job->device, info); - return NULL; + return VK_SUCCESS; } static VkResult -spawn_event_wait_thread(struct v3dv_wait_thread_info *info, pthread_t *wait_thread) - -{ - assert(info->job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - assert(info->job->cmd_buffer); - assert(wait_thread != NULL); - - if (pthread_create(wait_thread, NULL, event_wait_thread_func, info)) - return vk_queue_set_lost(&info->job->device->queue.vk, - "Thread create failed: %m"); - - return VK_NOT_READY; -} - -static VkResult -handle_wait_events_cpu_job(struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info, - pthread_t *wait_thread) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); - - /* If all events are signaled then we are done and can continue submitting - * the rest of the command buffer normally. - */ - if (check_wait_events_complete(job)) - return VK_SUCCESS; - - /* Otherwise, we put the rest of the command buffer on a wait thread until - * all events are signaled. We only spawn a new thread on the first - * wait job we see for a command buffer, any additional wait jobs in the - * same command buffer will run in that same wait thread and will get here - * with a NULL wait_thread pointer. - * - * Also, whether we spawn a wait thread or not, we always return - * VK_NOT_READY (unless an error happened), so we stop trying to submit - * any jobs in the same command buffer after the wait job. The wait thread - * will attempt to submit them after the wait completes. - */ - if (!wait_thread) - return VK_NOT_READY; - - /* As events can be signaled by the host, jobs after the event wait must - * still wait for semaphores, if any. So, whenever we spawn a wait thread, - * we keep a copy of the semaphores (info->sems_info) to be used when - * submitting pending jobs in the wait thread context. 
- */ - struct v3dv_wait_thread_info *info = - create_wait_thread_info(job, sems_info); - if (!info) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - return spawn_event_wait_thread(info, wait_thread); -} - -static VkResult -handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) +handle_copy_buffer_to_image_cpu_job(struct v3dv_queue *queue, + struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE); struct v3dv_copy_buffer_to_image_cpu_job_info *info = @@ -503,7 +256,9 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) /* Wait for all GPU work to finish first, since we may be accessing * the BOs involved in the operation. */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); + VkResult result = queue_wait_idle(queue, sync_info); + if (result != VK_SUCCESS) + return result; /* Map BOs */ struct v3dv_bo *dst_bo = info->image->mem->bo; @@ -543,13 +298,16 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) } static VkResult -handle_timestamp_query_cpu_job(struct v3dv_job *job) +handle_timestamp_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, + struct v3dv_submit_sync_info *sync_info) { assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY); struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp; /* Wait for completion of all work queued before the timestamp query */ - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); + VkResult result = queue_wait_idle(queue, sync_info); + if (result != VK_SUCCESS) + return result; mtx_lock(&job->device->query_mutex); @@ -574,7 +332,7 @@ handle_timestamp_query_cpu_job(struct v3dv_job *job) static VkResult handle_csd_indirect_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info) { assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT); struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect; @@ -604,60 +362,101 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, return VK_SUCCESS; } -static uint32_t -semaphore_get_sync(struct v3dv_semaphore *sem) +static VkResult +process_waits(struct v3dv_queue *queue, + uint32_t count, struct vk_sync_wait *waits) { - if (!sem->has_temp) - return sem->sync; + struct v3dv_device *device = queue->device; + VkResult result = VK_SUCCESS; + int err = 0; - assert(sem->temp_sync > 0); - return sem->temp_sync; -} + if (count == 0) + return VK_SUCCESS; -static uint32_t -fence_get_sync(struct v3dv_fence *fence) -{ - if (!fence->has_temp) - return fence->sync; + /* If multisync is supported, we wait on semaphores in the first job + * submitted to each of the individual queues. We don't need to + * pre-populate the syncobjs. 
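+    *
+    * Without multisync we fall through to the legacy path below, which in
+    * outline does:
+    *
+    *   drmSyncobjExportSyncFile(last_job_syncs[V3DV_QUEUE_ANY]) -> fd
+    *   for each wait:
+    *      drmSyncobjExportSyncFile(wait->syncobj) -> wait_fd
+    *      sync_accumulate(fd, wait_fd)
+    *   drmSyncobjImportSyncFile(last_job_syncs[V3DV_QUEUE_ANY], fd)
+    *
+    * so the one syncobj that every legacy submission waits on now also
+    * carries all of the incoming semaphore payloads.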
+ */ + if (queue->device->pdevice->caps.multisync) + return VK_SUCCESS; - assert(fence->temp_sync > 0); - return fence->temp_sync; + int fd = -1; + err = drmSyncobjExportSyncFile(device->pdevice->render_fd, + queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], + &fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file export failed: %m"); + goto fail; + } + + for (uint32_t i = 0; i < count; i++) { + uint32_t syncobj = vk_sync_as_drm_syncobj(waits[i].sync)->syncobj; + int wait_fd = -1; + + err = drmSyncobjExportSyncFile(device->pdevice->render_fd, + syncobj, &wait_fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file export failed: %m"); + goto fail; + } + + err = sync_accumulate("v3dv", &fd, wait_fd); + close(wait_fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file merge failed: %m"); + goto fail; + } + } + + err = drmSyncobjImportSyncFile(device->pdevice->render_fd, + queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], + fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file import failed: %m"); + } + +fail: + close(fd); + return result; } static VkResult -process_semaphores_to_signal(struct v3dv_device *device, - uint32_t count, const VkSemaphore *sems, - bool is_master_thread) +process_signals(struct v3dv_queue *queue, + uint32_t count, struct vk_sync_signal *signals) { + struct v3dv_device *device = queue->device; + if (count == 0) return VK_SUCCESS; /* If multisync is supported, we are signalling semaphores in the last job * of the last command buffer and, therefore, we do not need to process any - * semaphores here, unless we come from a wait thread, because in that case - * we never signal. + * semaphores here. */ - if (device->pdevice->caps.multisync && !is_master_thread) + if (device->pdevice->caps.multisync) return VK_SUCCESS; - int render_fd = device->pdevice->render_fd; - int fd; - mtx_lock(&device->mutex); - drmSyncobjExportSyncFile(render_fd, - device->last_job_syncs.syncs[V3DV_QUEUE_ANY], + drmSyncobjExportSyncFile(device->pdevice->render_fd, + queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], &fd); - mtx_unlock(&device->mutex); - if (fd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + if (fd == -1) { + return vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file export failed: %m"); + } VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < count; i++) { - struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]); - uint32_t sync = semaphore_get_sync(sem); - int ret = drmSyncobjImportSyncFile(render_fd, sync, fd); - if (ret) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; + uint32_t syncobj = vk_sync_as_drm_syncobj(signals[i].sync)->syncobj; + int err = drmSyncobjImportSyncFile(device->pdevice->render_fd, + syncobj, fd); + if (err) { + result = vk_errorf(queue, VK_ERROR_UNKNOWN, + "sync file import failed: %m"); break; } } @@ -668,59 +467,6 @@ process_semaphores_to_signal(struct v3dv_device *device, return result; } -static VkResult -queue_submit_noop_job(struct v3dv_queue *queue, - struct v3dv_submit_info_semaphores *sems_info, - bool do_sem_signal, bool serialize); - -static VkResult -process_fence_to_signal(struct v3dv_device *device, VkFence _fence) -{ - if (_fence == VK_NULL_HANDLE) - return VK_SUCCESS; - - struct v3dv_fence *fence = v3dv_fence_from_handle(_fence); - - int render_fd = device->pdevice->render_fd; - - if (device->pdevice->caps.multisync) { - struct v3dv_queue *queue = &device->queue; - /* We signal the fence once all submitted command buffers have completed - * execution. 
For this, we emit a noop job that waits on the completion - * of all submitted jobs and signal the fence for this submission. - * FIXME: In simpler cases (for instance, when all jobs were submitted to - * the same queue), we can just import the last out sync produced into - * the fence. - */ - struct v3dv_submit_info_semaphores sems_info = { - .wait_sem_count = 0, - .wait_sems = NULL, - .signal_sem_count = 0, - .signal_sems = NULL, - .fence = _fence, - }; - - return queue_submit_noop_job(queue, &sems_info, false, true); - } - - int fd; - mtx_lock(&device->mutex); - drmSyncobjExportSyncFile(render_fd, - device->last_job_syncs.syncs[V3DV_QUEUE_ANY], - &fd); - mtx_unlock(&device->mutex); - if (fd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - uint32_t sync = fence_get_sync(fence); - int ret = drmSyncobjImportSyncFile(render_fd, sync, fd); - - assert(fd >= 0); - close(fd); - - return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS; -} - static void multisync_free(struct v3dv_device *device, struct drm_v3d_multi_sync *ms) @@ -730,24 +476,25 @@ multisync_free(struct v3dv_device *device, } static struct drm_v3d_sem * -set_in_syncs(struct v3dv_device *device, +set_in_syncs(struct v3dv_queue *queue, struct v3dv_job *job, - enum v3dv_queue_type queue, + enum v3dv_queue_type queue_sync, uint32_t *count, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info) { - uint32_t n_sems = 0; + struct v3dv_device *device = queue->device; + uint32_t n_syncs = 0; /* If this is the first job submitted to a given GPU queue in this cmd buf * batch, it has to wait on wait semaphores (if any) before running. */ - if (device->last_job_syncs.first[queue]) - n_sems = sems_info->wait_sem_count; + if (queue->last_job_syncs.first[queue_sync]) + n_syncs = sync_info->wait_count; /* If the serialize flag is set, this job waits for completion of all GPU * jobs submitted in any queue V3DV_QUEUE_(CL/TFU/CSD) before running. */ - *count = n_sems + (job->serialize ? 3 : 0); + *count = n_syncs + (job->serialize ? 3 : 0); if (!*count) return NULL; @@ -759,51 +506,35 @@ set_in_syncs(struct v3dv_device *device, if (!syncs) return NULL; - for (int i = 0; i < n_sems; i++) { - struct v3dv_semaphore *sem = - v3dv_semaphore_from_handle(sems_info->wait_sems[i]); - syncs[i].handle = semaphore_get_sync(sem); - - /* From the Vulkan 1.0 spec: - * - * "If the import is temporary, the implementation must restore - * the semaphore to its prior permanent state after submitting - * the next semaphore wait operation." - * - * We can't destroy the temporary sync until the kernel is done - * with it, this is why we need to have this 'has_temp' flag instead - * of checking temp_sync for 0 to know if we have a temporary - * payload. The temporary sync will be destroyed if we import into - * the semaphore again or if the semaphore is destroyed by the - * client. 
- */ - sem->has_temp = false; + for (int i = 0; i < n_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj; } if (job->serialize) { for (int i = 0; i < 3; i++) - syncs[n_sems + i].handle = device->last_job_syncs.syncs[i]; + syncs[n_syncs + i].handle = queue->last_job_syncs.syncs[i]; } return syncs; } static struct drm_v3d_sem * -set_out_syncs(struct v3dv_device *device, +set_out_syncs(struct v3dv_queue *queue, struct v3dv_job *job, - enum v3dv_queue_type queue, + enum v3dv_queue_type queue_sync, uint32_t *count, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - uint32_t n_sems = job->do_sem_signal ? sems_info->signal_sem_count : 0; + struct v3dv_device *device = queue->device; + + uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0; /* We always signal the syncobj from `device->last_job_syncs` related to * this v3dv_queue_type to track the last job submitted to this queue. */ - (*count) = n_sems + 1; - - if (sems_info->fence) - (*count)++; + (*count) = n_vk_syncs + 1; struct drm_v3d_sem *syncs = vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem), @@ -812,20 +543,14 @@ set_out_syncs(struct v3dv_device *device, if (!syncs) return NULL; - if (n_sems) { - for (unsigned i = 0; i < n_sems; i++) { - struct v3dv_semaphore *sem = - v3dv_semaphore_from_handle(sems_info->signal_sems[i]); - syncs[i].handle = semaphore_get_sync(sem); + if (n_vk_syncs) { + for (unsigned i = 0; i < n_vk_syncs; i++) { + syncs[i].handle = + vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj; } } - syncs[n_sems].handle = device->last_job_syncs.syncs[queue]; - - if (sems_info->fence) { - struct v3dv_fence *fence = v3dv_fence_from_handle(sems_info->fence); - syncs[++n_sems].handle = fence_get_sync(fence); - } + syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync]; return syncs; } @@ -847,23 +572,25 @@ set_ext(struct drm_v3d_extension *ext, */ static void set_multisync(struct drm_v3d_multi_sync *ms, - struct v3dv_submit_info_semaphores *sems_info, + struct v3dv_submit_sync_info *sync_info, struct drm_v3d_extension *next, struct v3dv_device *device, struct v3dv_job *job, enum v3dv_queue_type queue_sync, - enum v3d_queue wait_stage) + enum v3d_queue wait_stage, + bool signal_syncs) { + struct v3dv_queue *queue = &device->queue; uint32_t out_sync_count = 0, in_sync_count = 0; struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL; - in_syncs = set_in_syncs(device, job, queue_sync, - &in_sync_count, sems_info); + in_syncs = set_in_syncs(queue, job, queue_sync, + &in_sync_count, sync_info); if (!in_syncs && in_sync_count) goto fail; - out_syncs = set_out_syncs(device, job, queue_sync, - &out_sync_count, sems_info); + out_syncs = set_out_syncs(queue, job, queue_sync, + &out_sync_count, sync_info, signal_syncs); assert(out_sync_count > 0); @@ -877,7 +604,7 @@ set_multisync(struct drm_v3d_multi_sync *ms, ms->in_sync_count = in_sync_count; ms->in_syncs = (uintptr_t)(void *)in_syncs; - device->last_job_syncs.first[queue_sync] = false; + queue->last_job_syncs.first[queue_sync] = false; return; @@ -892,7 +619,8 @@ fail: static VkResult handle_cl_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; @@ -949,23 +677,19 @@ handle_cl_job(struct v3dv_queue *queue, * dependencies strictly through barriers. 
*/ const bool needs_bcl_sync = - sems_info->wait_sem_count > 0 || job->needs_bcl_sync; + sync_info->wait_count > 0 || job->needs_bcl_sync; const bool needs_rcl_sync = job->serialize && !needs_bcl_sync; - mtx_lock(&queue->device->mutex); - /* Replace single semaphore settings whenever our kernel-driver supports * multiple semaphores extension. */ struct drm_v3d_multi_sync ms = { 0 }; if (device->pdevice->caps.multisync) { enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN; - set_multisync(&ms, sems_info, NULL, device, job, - V3DV_QUEUE_CL, wait_stage); - if (!ms.base.id) { - mtx_unlock(&queue->device->mutex); + set_multisync(&ms, sync_info, NULL, device, job, + V3DV_QUEUE_CL, wait_stage, signal_syncs); + if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } submit.flags |= DRM_V3D_SUBMIT_EXTENSION; submit.extensions = (uintptr_t)(void *)&ms; @@ -974,7 +698,7 @@ handle_cl_job(struct v3dv_queue *queue, submit.in_sync_bcl = 0; submit.out_sync = 0; } else { - uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY]; + uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY]; submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0; submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0; submit.out_sync = last_job_sync; @@ -983,7 +707,6 @@ handle_cl_job(struct v3dv_queue *queue, v3dv_clif_dump(device, job, &submit); int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -1004,25 +727,22 @@ handle_cl_job(struct v3dv_queue *queue, static VkResult handle_tfu_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; - const bool needs_sync = sems_info->wait_sem_count || job->serialize; - - mtx_lock(&device->mutex); + const bool needs_sync = sync_info->wait_count || job->serialize; /* Replace single semaphore settings whenever our kernel-driver supports * multiple semaphore extension. */ struct drm_v3d_multi_sync ms = { 0 }; if (device->pdevice->caps.multisync) { - set_multisync(&ms, sems_info, NULL, device, job, - V3DV_QUEUE_TFU, V3D_TFU); - if (!ms.base.id) { - mtx_unlock(&device->mutex); + set_multisync(&ms, sync_info, NULL, device, job, + V3DV_QUEUE_TFU, V3D_TFU, signal_syncs); + if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION; job->tfu.extensions = (uintptr_t)(void *)&ms; @@ -1030,13 +750,12 @@ handle_tfu_job(struct v3dv_queue *queue, job->tfu.in_sync = 0; job->tfu.out_sync = 0; } else { - uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY]; + uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY]; job->tfu.in_sync = needs_sync ? 
last_job_sync : 0; job->tfu.out_sync = last_job_sync; } int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu); - mtx_unlock(&device->mutex); multisync_free(device, &ms); @@ -1049,7 +768,8 @@ handle_tfu_job(struct v3dv_queue *queue, static VkResult handle_csd_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { struct v3dv_device *device = queue->device; @@ -1066,20 +786,17 @@ handle_csd_job(struct v3dv_queue *queue, assert(bo_idx == submit->bo_handle_count); submit->bo_handles = (uintptr_t)(void *)bo_handles; - const bool needs_sync = sems_info->wait_sem_count || job->serialize; + const bool needs_sync = sync_info->wait_count || job->serialize; - mtx_lock(&queue->device->mutex); /* Replace single semaphore settings whenever our kernel-driver supports * multiple semaphore extension. */ struct drm_v3d_multi_sync ms = { 0 }; if (device->pdevice->caps.multisync) { - set_multisync(&ms, sems_info, NULL, device, job, - V3DV_QUEUE_CSD, V3D_CSD); - if (!ms.base.id) { - mtx_unlock(&queue->device->mutex); + set_multisync(&ms, sync_info, NULL, device, job, + V3DV_QUEUE_CSD, V3D_CSD, signal_syncs); + if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } submit->flags |= DRM_V3D_SUBMIT_EXTENSION; submit->extensions = (uintptr_t)(void *)&ms; @@ -1087,13 +804,12 @@ handle_csd_job(struct v3dv_queue *queue, submit->in_sync = 0; submit->out_sync = 0; } else { - uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY]; + uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY]; submit->in_sync = needs_sync ? last_job_sync : 0; submit->out_sync = last_job_sync; } int ret = v3dv_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit); - mtx_unlock(&queue->device->mutex); static bool warned = false; if (ret && !warned) { @@ -1113,71 +829,34 @@ handle_csd_job(struct v3dv_queue *queue, } static VkResult -queue_submit_job(struct v3dv_queue *queue, +queue_handle_job(struct v3dv_queue *queue, struct v3dv_job *job, - struct v3dv_submit_info_semaphores *sems_info, - pthread_t *wait_thread) + struct v3dv_submit_sync_info *sync_info, + bool signal_syncs) { - assert(job); - - /* CPU jobs typically execute explicit waits before they are processed. For - * example, a query reset CPU job will explicitly wait for the queries - * being unused before proceeding, etc. However, if we have any wait - * semaphores, we need to honour that too for the first CPU job we process - * in the command buffer batch. We do that by waiting for idle to ensure - * that any previous work has been completed, at which point any wait - * semaphores must be signalled, and we never need to do this again for the - * same batch. - * - * There is a corner case here when the semaphore has been imported from - * another instance/process. In that scenario, the Vulkan spec still requires - * that a signaling operation has been submitted before this semaphore wait - * but our wait for idle checks won't know about that submission (since they - * are based on the last jobs sent from our instance). To fix that we submit - * a noop job to "consume" the semaphores and then we wait for idle, which - * will ensure that our CPU job waits for the semaphores to be signaled even - * if they are signaled from another instance or process. 
- */ - if (!v3dv_job_type_is_gpu(job) && sems_info->wait_sem_count) { - queue_submit_noop_job(queue, sems_info, false, false); - v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue)); -#ifdef DEBUG - /* Loop through wait sems and check they are all signalled */ - for (int i = 0; i < sems_info->wait_sem_count; i++) { - int render_fd = queue->device->pdevice->render_fd; - struct v3dv_semaphore *sem = - v3dv_semaphore_from_handle(sems_info->wait_sems[i]); - uint32_t sem_sync = semaphore_get_sync(sem); - int ret = drmSyncobjWait(render_fd, &sem_sync, 1, 0, 0, NULL); - assert(ret == 0); - } -#endif - sems_info->wait_sem_count = 0; - } - switch (job->type) { case V3DV_JOB_TYPE_GPU_CL: - return handle_cl_job(queue, job, sems_info); + return handle_cl_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_TFU: - return handle_tfu_job(queue, job, sems_info); + return handle_tfu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_GPU_CSD: - return handle_csd_job(queue, job, sems_info); + return handle_csd_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_RESET_QUERIES: - return handle_reset_query_cpu_job(job); + return handle_reset_query_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_END_QUERY: return handle_end_query_cpu_job(job); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: return handle_copy_query_results_cpu_job(job); case V3DV_JOB_TYPE_CPU_SET_EVENT: - return handle_set_event_cpu_job(job); + return handle_set_event_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_WAIT_EVENTS: - return handle_wait_events_cpu_job(job, sems_info, wait_thread); + return handle_wait_events_cpu_job(job); case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE: - return handle_copy_buffer_to_image_cpu_job(job); + return handle_copy_buffer_to_image_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: - return handle_csd_indirect_cpu_job(queue, job, sems_info); + return handle_csd_indirect_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY: - return handle_timestamp_query_cpu_job(job); + return handle_timestamp_query_cpu_job(queue, job, sync_info); default: unreachable("Unhandled job type"); } @@ -1195,887 +874,59 @@ queue_create_noop_job(struct v3dv_queue *queue) v3dv_X(device, job_emit_noop)(queue->noop_job); - return VK_SUCCESS; -} - -static VkResult -queue_submit_noop_job(struct v3dv_queue *queue, - struct v3dv_submit_info_semaphores *sems_info, - bool do_sem_signal, bool serialize) -{ - if (!do_sem_signal && !serialize && !sems_info->wait_sem_count) - return VK_SUCCESS; - - /* We need to protect noop_job against concurrent access. While - * the client must externally synchronize queue submissions, we - * may spawn threads that can submit noop jobs themselves. - */ - mtx_lock(&queue->noop_mutex); - if (!queue->noop_job) { - VkResult result = queue_create_noop_job(queue); - if (result != VK_SUCCESS) { - mtx_unlock(&queue->noop_mutex); - return result; - } - } - queue->noop_job->do_sem_signal = do_sem_signal; - queue->noop_job->serialize = serialize; - - VkResult result = - queue_submit_job(queue, queue->noop_job, sems_info, NULL); - - mtx_unlock(&queue->noop_mutex); - return result; -} - -/* This function takes a job type and returns True if we have - * previously submitted any jobs for the same command buffer batch - * to a queue different to the one for this job type. 
- */ -static bool -cmd_buffer_batch_is_multi_queue(struct v3dv_device *device, - enum v3dv_job_type job_type) -{ - enum v3dv_queue_type queue_type = V3DV_QUEUE_ANY; - struct v3dv_last_job_sync last_job_syncs; - - mtx_lock(&device->mutex); - memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs)); - mtx_unlock(&device->mutex); - - switch (job_type) { - case V3DV_JOB_TYPE_GPU_CL: - case V3DV_JOB_TYPE_GPU_CL_SECONDARY: - queue_type = V3DV_QUEUE_CL; - break; - case V3DV_JOB_TYPE_GPU_TFU: - queue_type = V3DV_QUEUE_TFU; - break; - case V3DV_JOB_TYPE_GPU_CSD: - queue_type = V3DV_QUEUE_CSD; - break; - default: - unreachable("Queue type is undefined"); - break; - } - - for (int i = 0; i < V3DV_QUEUE_ANY; i++) { - if (i != queue_type && !last_job_syncs.first[i]) { - return true; - } - } - - return false; -} - -static VkResult -queue_submit_cmd_buffer(struct v3dv_queue *queue, - struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_submit_info_semaphores *sems_info, - bool is_last_cmd_buffer, - pthread_t *wait_thread) -{ - struct v3dv_job *last; - bool do_sem_signal = is_last_cmd_buffer && sems_info->signal_sem_count > 0; - - assert(cmd_buffer); - assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE); - - if (list_is_empty(&cmd_buffer->jobs)) - return queue_submit_noop_job(queue, sems_info, do_sem_signal, false); - - /* When we are in the last cmd buffer and there are semaphores to signal, - * we process semaphores in the last job, following these conditions: - * - CPU-job: we can't signal until all GPU work has completed, so we - * submit a serialized noop GPU job to handle signaling when all on-going - * GPU work on all queues has completed. - * - GPU-job: can signal semaphores only if we have not submitted jobs to - * a queue other than the queue of this job. Otherwise, we submit a - * serialized noop job to handle signaling. - */ - if (do_sem_signal) { - last = list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link); - if (v3dv_job_type_is_gpu(last)) - last->do_sem_signal = true; - } - - list_for_each_entry_safe(struct v3dv_job, job, - &cmd_buffer->jobs, list_link) { - if (job->do_sem_signal && - cmd_buffer_batch_is_multi_queue(queue->device, job->type)) - job->do_sem_signal = false; - VkResult result = queue_submit_job(queue, job, sems_info, wait_thread); - if (result != VK_SUCCESS) - return result; - } - - /* If we are in the last cmd buffer batch, but the last job cannot handle - * signal semaphores, we emit a serialized noop_job for signalling. - */ - if (do_sem_signal && !(last && last->do_sem_signal)) - return queue_submit_noop_job(queue, sems_info, true, true); + queue->noop_job->serialize = true; return VK_SUCCESS; } -static void -add_wait_thread_to_list(struct v3dv_device *device, - pthread_t thread, - struct v3dv_queue_submit_wait_info **wait_info) +VkResult +v3dv_queue_driver_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit) { - /* If this is the first time we spawn a wait thread for this queue - * submission create a v3dv_queue_submit_wait_info to track this and - * any other threads in the same submission and add it to the global list - * in the queue. 
- */ - if (*wait_info == NULL) { - *wait_info = - vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - (*wait_info)->device = device; - } + struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk); + VkResult result; - /* And add the thread to the list of wait threads for this submission */ - const uint32_t thread_idx = (*wait_info)->wait_thread_count; - assert(thread_idx < 16); - (*wait_info)->wait_threads[thread_idx].thread = thread; - (*wait_info)->wait_threads[thread_idx].finished = false; - (*wait_info)->wait_thread_count++; -} - -static void -add_signal_semaphores_to_wait_list(struct v3dv_device *device, - const VkSubmitInfo *pSubmit, - struct v3dv_queue_submit_wait_info *wait_info) -{ - assert(wait_info); - - if (pSubmit->signalSemaphoreCount == 0) - return; - - /* Otherwise, we put all the semaphores in a list and we signal all of them - * together from the submit master thread when the last wait thread in the - * submit completes. - */ - - /* Check the size of the current semaphore list */ - const uint32_t prev_count = wait_info->signal_semaphore_count; - const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore); - VkSemaphore *prev_list = wait_info->signal_semaphores; - - /* Resize the list to hold the additional semaphores */ - const uint32_t extra_alloc_size = - pSubmit->signalSemaphoreCount * sizeof(VkSemaphore); - wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount; - wait_info->signal_semaphores = - vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - /* Copy the old list to the new allocation and free the old list */ - if (prev_count > 0) { - memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size); - vk_free(&device->vk.alloc, prev_list); - } - - /* Add the new semaphores to the list */ - memcpy(wait_info->signal_semaphores + prev_count, - pSubmit->pSignalSemaphores, extra_alloc_size); -} - -static VkResult -queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, - const VkSubmitInfo *pSubmit, - struct v3dv_queue_submit_wait_info **wait_info) -{ - VkResult result = VK_SUCCESS; - bool has_wait_threads = false; - - /* Wrap wait semaphores info from VkSubmitInfo to use it whenever we need - * the data to submit all jobs in the same command buffer batch. - */ - struct v3dv_submit_info_semaphores sems_info = { - .wait_sem_count = pSubmit->waitSemaphoreCount, - .wait_sems = (VkSemaphore *) pSubmit->pWaitSemaphores, - .signal_sem_count = pSubmit->signalSemaphoreCount, - .signal_sems = (VkSemaphore *) pSubmit->pSignalSemaphores, - .fence = 0, + struct v3dv_submit_sync_info sync_info = { + .wait_count = submit->wait_count, + .waits = submit->waits, + .signal_count = submit->signal_count, + .signals = submit->signals, }; - /* In the beginning of a cmd buffer batch, we set all last_job_syncs as - * first. It helps to determine wait semaphores conditions. - */ - for (unsigned i = 0; i < V3DV_QUEUE_COUNT; i++) - queue->device->last_job_syncs.first[i] = true; - - /* Even if we don't have any actual work to submit we still need to wait - * on the wait semaphores and signal the signal semaphores and fence, so - * in this scenario we just submit a trivial no-op job so we don't have - * to do anything special, it should not be a common case anyway. 
- */
-   if (pSubmit->commandBufferCount == 0) {
-      result = queue_submit_noop_job(queue, &sems_info,
-                                     sems_info.signal_sem_count > 0, false);
-   } else {
-      const uint32_t last_cmd_buffer_idx = pSubmit->commandBufferCount - 1;
-      for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
-         pthread_t wait_thread;
-         struct v3dv_cmd_buffer *cmd_buffer =
-            v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
-         result = queue_submit_cmd_buffer(queue, cmd_buffer, &sems_info,
-                                          (i == last_cmd_buffer_idx),
-                                          &wait_thread);
-
-         /* We get VK_NOT_READY if we had to spawn a wait thread for the
-          * command buffer. In that scenario, we want to continue submitting
-          * any pending command buffers in the batch, but we don't want to
-          * process any signal semaphores for the batch until we know we have
-          * submitted every job for every command buffer in the batch.
-          */
-         if (result == VK_NOT_READY) {
-            result = VK_SUCCESS;
-            add_wait_thread_to_list(queue->device, wait_thread, wait_info);
-            has_wait_threads = true;
-         }
-
-         if (result != VK_SUCCESS)
-            break;
-      }
-   }
+   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
+      queue->last_job_syncs.first[i] = true;
+   result = process_waits(queue, sync_info.wait_count, sync_info.waits);
    if (result != VK_SUCCESS)
       return result;

-   /* If we had to emit any wait threads in this submit, we need to wait for
-    * all of them to complete before we can signal any semaphores.
+   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
+      struct v3dv_cmd_buffer *cmd_buffer =
+         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
+      list_for_each_entry_safe(struct v3dv_job, job,
+                               &cmd_buffer->jobs, list_link) {
+
+         result = queue_handle_job(queue, job, &sync_info, false);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   }
+
+   /* Finish by submitting a no-op job that synchronizes across all queues.
+    * This will ensure that the signal semaphores don't get triggered until
+    * all work on any queue completes.
 */
-   if (!has_wait_threads) {
-      return process_semaphores_to_signal(queue->device,
-                                          pSubmit->signalSemaphoreCount,
-                                          pSubmit->pSignalSemaphores,
-                                          false);
-   } else {
-      assert(*wait_info);
-      add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info);
-      return VK_NOT_READY;
+   if (!queue->noop_job) {
+      result = queue_create_noop_job(queue);
+      if (result != VK_SUCCESS)
+         return result;
    }
-}
-
-static void *
-master_wait_thread_func(void *_wait_info)
-{
-   struct v3dv_queue_submit_wait_info *wait_info =
-      (struct v3dv_queue_submit_wait_info *) _wait_info;
-
-   struct v3dv_queue *queue = &wait_info->device->queue;
-
-   /* Wait for all command buffer wait threads to complete */
-   for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) {
-      int res = pthread_join(wait_info->wait_threads[i].thread, NULL);
-      if (res != 0)
-         fprintf(stderr, "Wait thread failed to join.\n");
-   }
-
-   /* Signal semaphores and fences */
-   VkResult result;
-   result = process_semaphores_to_signal(wait_info->device,
-                                         wait_info->signal_semaphore_count,
-                                         wait_info->signal_semaphores,
-                                         true);
+   result = queue_handle_job(queue, queue->noop_job, &sync_info, true);
    if (result != VK_SUCCESS)
-      fprintf(stderr, "Wait thread semaphore signaling failed.");
+      return result;

-   result = process_fence_to_signal(wait_info->device, wait_info->fence);
-   if (result != VK_SUCCESS)
-      fprintf(stderr, "Wait thread fence signaling failed.");
+   process_signals(queue, sync_info.signal_count, sync_info.signals);

-   /* Release wait_info */
-   mtx_lock(&queue->mutex);
-   list_del(&wait_info->list_link);
-   mtx_unlock(&queue->mutex);
-
-   vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores);
-   vk_free(&wait_info->device->vk.alloc, wait_info);
-
-   return NULL;
-}
-
-
-static VkResult
-spawn_master_wait_thread(struct v3dv_queue *queue,
-                         struct v3dv_queue_submit_wait_info *wait_info)
-
-{
-   VkResult result = VK_SUCCESS;
-
-   mtx_lock(&queue->mutex);
-   if (pthread_create(&wait_info->master_wait_thread, NULL,
-                      master_wait_thread_func, wait_info)) {
-      result = vk_queue_set_lost(&queue->vk, "Thread create failed: %m");
-      goto done;
-   }
-
-   list_addtail(&wait_info->list_link, &queue->submit_wait_list);
-
-done:
-   mtx_unlock(&queue->mutex);
-   return result;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_QueueSubmit(VkQueue _queue,
-                 uint32_t submitCount,
-                 const VkSubmitInfo* pSubmits,
-                 VkFence fence)
-{
-   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
-
-   if (vk_device_is_lost(&queue->device->vk))
-      return VK_ERROR_DEVICE_LOST;
-
-   struct v3dv_queue_submit_wait_info *wait_info = NULL;
-
-   VkResult result = VK_SUCCESS;
-   for (uint32_t i = 0; i < submitCount; i++) {
-      result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info);
-      if (result != VK_SUCCESS && result != VK_NOT_READY)
-         goto done;
-   }
-
-   if (!wait_info) {
-      assert(result != VK_NOT_READY);
-      result = process_fence_to_signal(queue->device, fence);
-      goto done;
-   }
-
-   /* We emitted wait threads, so we have to spawn a master thread for this
-    * queue submission that waits for all other threads to complete and then
-    * will signal any semaphores and fences.
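For readers following the new flow: the serialized no-op job above is what implements the cross-queue barrier. A CPU-side sketch of the equivalent wait, using the per-queue `last_job_syncs` this patch introduces (illustrative only; the real dependency is expressed to the kernel as in-syncobjs on the no-op submit rather than by blocking in userspace):

```c
#include <xf86drm.h>

/* Illustrative: block until the most recent job on every v3dv queue has
 * completed. The driver hands this same dependency to the kernel instead
 * of blocking on the CPU.
 */
static int
wait_all_queues_idle(struct v3dv_queue *queue)
{
   return drmSyncobjWait(queue->device->pdevice->render_fd,
                         queue->last_job_syncs.syncs, V3DV_QUEUE_COUNT,
                         INT64_MAX /* timeout_nsec */,
                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
}
```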
- */
-   assert(wait_info);
-   wait_info->fence = fence;
-   result = spawn_master_wait_thread(queue, wait_info);
-
-done:
-   return result;
-}
-
-static void
-destroy_syncobj(uint32_t device_fd, uint32_t *sync)
-{
-   assert(sync);
-   drmSyncobjDestroy(device_fd, *sync);
-   *sync = 0;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_CreateSemaphore(VkDevice _device,
-                     const VkSemaphoreCreateInfo *pCreateInfo,
-                     const VkAllocationCallbacks *pAllocator,
-                     VkSemaphore *pSemaphore)
-{
-   V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);
-
-   struct v3dv_semaphore *sem =
-      vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore),
-                       VK_OBJECT_TYPE_SEMAPHORE);
-   if (sem == NULL)
-      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
-   int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync);
-   if (ret) {
-      vk_object_free(&device->vk, pAllocator, sem);
-      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-   }
-
-   *pSemaphore = v3dv_semaphore_to_handle(sem);
-
-   return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-v3dv_GetPhysicalDeviceExternalSemaphoreProperties(
-   VkPhysicalDevice physicalDevice,
-   const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
-   VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
-{
-   V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice);
-
-   switch (pExternalSemaphoreInfo->handleType) {
-   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
-   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
-      pExternalSemaphoreProperties->exportFromImportedHandleTypes =
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
-      pExternalSemaphoreProperties->compatibleHandleTypes =
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
-         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
-
-      /* We need to have multisync support in our kernel interface to support
-       * external semaphore imports because once we have an imported semaphore
-       * in our list of semaphores to wait on, we can no longer use the
-       * workaround of waiting on the last syncobj fence produced from the
-       * device, since the imported semaphore may not (and in fact, it would
-       * typically not) have been produced from the same device.
-       */
-      pExternalSemaphoreProperties->externalSemaphoreFeatures =
-         pdevice->caps.multisync ?
-         VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT : 0;
-
-      /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties
-       * for details on why we can't export to SYNC_FD.
- */ - if (pExternalSemaphoreInfo->handleType != - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { - pExternalSemaphoreProperties->externalSemaphoreFeatures |= - VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT; - } - break; - default: - pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; - pExternalSemaphoreProperties->compatibleHandleTypes = 0; - pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; - break; - } -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ImportSemaphoreFdKHR( - VkDevice _device, - const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); - - assert(pImportSemaphoreFdInfo->sType == - VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR); - - int fd = pImportSemaphoreFdInfo->fd; - int render_fd = device->pdevice->render_fd; - - bool is_temporary = - pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT || - (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT); - - uint32_t new_sync; - switch (pImportSemaphoreFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the - * special value -1 for fd is treated like a valid sync file descriptor - * referring to an object that has already signaled. The import - * operation will succeed and the VkSemaphore will have a temporarily - * imported payload as if a valid file descriptor had been provided." - */ - unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0; - if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (fd != -1) { - if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { - drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - } - break; - } - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { - if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); - break; - } - default: - return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - - if (sem->temp_sync) { - destroy_syncobj(render_fd, &sem->temp_sync); - sem->has_temp = false; - } - - if (is_temporary) { - sem->temp_sync = new_sync; - sem->has_temp = true; - } else { - destroy_syncobj(render_fd, &sem->sync); - sem->sync = new_sync; - } - - /* From the Vulkan 1.0.53 spec: - * - * "Importing a semaphore payload from a file descriptor transfers - * ownership of the file descriptor from the application to the - * Vulkan implementation. The application must not perform any - * operations on the file descriptor after a successful import." - * - * If the import fails, we leave the file descriptor open. 
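The OPAQUE_FD versus SYNC_FD asymmetry that the import code above and the FIXME below keep coming back to boils down to two different libdrm exports; a minimal sketch (error handling elided; `render_fd` and `syncobj` are assumed to come from the surrounding context):

```c
int opaque_fd = -1, sync_fd = -1;

/* OPAQUE_FD exports the syncobj container itself, so it works even while
 * no fence has been installed yet.
 */
drmSyncobjHandleToFD(render_fd, syncobj, &opaque_fd);

/* SYNC_FD exports only the syncobj's current fence, so it requires that a
 * submission has already materialized one.
 */
drmSyncobjExportSyncFile(render_fd, syncobj, &sync_fd);
```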
- */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSemaphoreFdKHR(VkDevice _device, - const VkSemaphoreGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, sem->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external semaphore handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySemaphore(VkDevice _device, - VkSemaphore semaphore, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore); - - if (sem == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &sem->sync); - if (sem->temp_sync) - destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync); - - vk_object_free(&device->vk, pAllocator, sem); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateFence(VkDevice _device, - const VkFenceCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkFence *pFence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO); - - struct v3dv_fence *fence = - vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), - VK_OBJECT_TYPE_FENCE); - if (fence == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - unsigned flags = 0; - if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) - flags |= DRM_SYNCOBJ_CREATE_SIGNALED; - int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync); - if (ret) { - vk_object_free(&device->vk, pAllocator, fence); - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - *pFence = v3dv_fence_to_handle(fence); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceExternalFenceProperties( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo, - VkExternalFenceProperties *pExternalFenceProperties) - -{ - switch (pExternalFenceInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: - pExternalFenceProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->compatibleHandleTypes = - VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT | - VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalFenceProperties->externalFenceFeatures = - VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT; - - /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not - * the syncobj itself, and that fence is only created after we have - * submitted to the kernel and updated the syncobj for the fence to import - * the actual DRM fence created with the submission. 
Unfortunately, if the
-    * queue submission has a 'wait for events' we may hold any jobs after the
-    * wait in a user-space thread until the events are signaled, and in that
-    * case we don't update the out fence of the submit until the events are
-    * signaled and we can submit all the jobs involved with the vkQueueSubmit
-    * call. This means that if the application submits with an out fence and
-    * a wait for events, trying to export the out fence to a SYNC_FD right
-    * after the submission and before the events are signaled will fail,
-    * because the actual DRM fence won't exist yet. This is not a problem
-    * with OPAQUE_FD because in this case we export the entire syncobj, not
-    * the underlying DRM fence. To fix this we need to rework our kernel
-    * interface to be more flexible and accept multiple in/out syncobjs so
-    * we can implement event waits as regular fence waits on the kernel side;
-    * until then, we can only reliably export OPAQUE_FD.
-    */
-   if (pExternalFenceInfo->handleType !=
-       VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) {
-      pExternalFenceProperties->externalFenceFeatures |=
-         VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT;
-   }
-   break;
-   default:
-      pExternalFenceProperties->exportFromImportedHandleTypes = 0;
-      pExternalFenceProperties->compatibleHandleTypes = 0;
-      pExternalFenceProperties->externalFenceFeatures = 0;
-      break;
-   }
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-v3dv_ImportFenceFdKHR(VkDevice _device,
-                      const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
-{
-   V3DV_FROM_HANDLE(v3dv_device, device, _device);
-   V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence);
-
-   assert(pImportFenceFdInfo->sType ==
-          VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);
-
-   int fd = pImportFenceFdInfo->fd;
-   int render_fd = device->pdevice->render_fd;
-
-   bool is_temporary =
-      pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT ||
-      (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT);
-
-   uint32_t new_sync;
-   switch (pImportFenceFdInfo->handleType) {
-   case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
-      /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
-       * special value -1 for fd is treated like a valid sync file descriptor
-       * referring to an object that has already signaled. The import
-       * operation will succeed and the VkFence will have a temporarily
-       * imported payload as if a valid file descriptor had been provided."
-       */
-      unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
-      if (drmSyncobjCreate(render_fd, flags, &new_sync))
-         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
-      if (fd != -1) {
-         if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) {
-            drmSyncobjDestroy(render_fd, new_sync);
-            return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-         }
-      }
-      break;
-   }
-   case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: {
-      if (drmSyncobjFDToHandle(render_fd, fd, &new_sync))
-         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-      break;
-   }
-   default:
-      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-   }
-
-   if (fence->temp_sync) {
-      destroy_syncobj(render_fd, &fence->temp_sync);
-      fence->has_temp = false;
-   }
-
-   if (is_temporary) {
-      fence->temp_sync = new_sync;
-      fence->has_temp = true;
-   } else {
-      destroy_syncobj(render_fd, &fence->sync);
-      fence->sync = new_sync;
-   }
-
-   /* From the Vulkan 1.0.53 spec:
-    *
-    *    "Importing a fence payload from a file descriptor transfers
-    *     ownership of the file descriptor from the application to the
-    *     Vulkan implementation.
The application must not perform any - * operations on the file descriptor after a successful import." - * - * If the import fails, we leave the file descriptor open. - */ - if (fd != -1) - close(fd); - - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroyFence(VkDevice _device, - VkFence _fence, - const VkAllocationCallbacks *pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - if (fence == NULL) - return; - - destroy_syncobj(device->pdevice->render_fd, &fence->sync); - if (fence->temp_sync) - destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync); - - vk_object_free(&device->vk, pAllocator, fence); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, _fence); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - uint32_t sync = fence_get_sync(fence); - int ret = drmSyncobjWait(device->pdevice->render_fd, &sync, 1, - 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL); - if (ret == -ETIME) - return VK_NOT_READY; - else if (ret) - return vk_device_set_lost(&device->vk, "Syncobj wait failed: %m"); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetFenceFdKHR(VkDevice _device, - const VkFenceGetFdInfoKHR *pGetFdInfo, - int *pFd) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence); - - assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR); - - *pFd = -1; - int render_fd = device->pdevice->render_fd; - switch (pGetFdInfo->handleType) { - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { - drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: - drmSyncobjHandleToFD(render_fd, fence->sync, pFd); - if (*pFd == -1) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - break; - } - default: - unreachable("Unsupported external fence handle type"); - } - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - int render_fd = device->pdevice->render_fd; - uint32_t reset_count = 0; - for (uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - /* From the Vulkan spec, section 'Importing Fence Payloads': - * - * "If the import is temporary, the fence will be restored to its - * permanent state the next time that fence is passed to - * vkResetFences. - * - * Note: Restoring a fence to its prior permanent payload is a - * distinct operation from resetting a fence payload." - * - * To restore the previous state, we just need to destroy the temporary. 
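Under the common framework this patch migrates to, the temporary-versus-permanent rule spelled out above is owned by the runtime. A hedged sketch of the replacement (the wrapper function is hypothetical; `vk_fence_reset_temporary()` is the same runtime helper this patch calls in the WSI hunk below):

```c
/* Hypothetical wrapper: dropping the temporary payload restores a vk_fence
 * to its permanent state; the runtime frees the temporary vk_sync for us,
 * and the common vkResetFences entrypoint handles the rest.
 */
static void
fence_restore_permanent(struct vk_device *device, struct vk_fence *fence)
{
   vk_fence_reset_temporary(device, fence);
}
```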
- */ - if (fence->has_temp) { - assert(fence->temp_sync); - destroy_syncobj(render_fd, &fence->temp_sync); - fence->has_temp = false; - } else { - syncobjs[reset_count++] = fence->sync; - } - } - - int ret = 0; - if (reset_count > 0) - ret = drmSyncobjReset(render_fd, syncobjs, reset_count); - - vk_free(&device->vk.alloc, syncobjs); - - if (ret) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_WaitForFences(VkDevice _device, - uint32_t fenceCount, - const VkFence *pFences, - VkBool32 waitAll, - uint64_t timeout) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - const uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); - - uint32_t *syncobjs = vk_alloc(&device->vk.alloc, - sizeof(*syncobjs) * fenceCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!syncobjs) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - for (uint32_t i = 0; i < fenceCount; i++) { - struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - syncobjs[i] = fence_get_sync(fence); - } - - unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; - if (waitAll) - flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL; - - int ret; - do { - ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount, - timeout, flags, NULL); - } while (ret == -ETIME && os_time_get_nano() < abs_timeout); - - vk_free(&device->vk.alloc, syncobjs); - - if (ret == -ETIME) - return VK_TIMEOUT; - else if (ret) - return vk_device_set_lost(&device->vk, "Syncobj wait failed: %m"); return VK_SUCCESS; } diff --git a/src/broadcom/vulkan/v3dv_wsi.c b/src/broadcom/vulkan/v3dv_wsi.c index a7dad11cc6f..ad77d3970e9 100644 --- a/src/broadcom/vulkan/v3dv_wsi.c +++ b/src/broadcom/vulkan/v3dv_wsi.c @@ -29,6 +29,9 @@ #include "vk_util.h" #include "wsi_common.h" #include "wsi_common_drm.h" +#include "vk_fence.h" +#include "vk_semaphore.h" +#include "vk_sync_dummy.h" static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) @@ -146,26 +149,39 @@ v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) } VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImage2KHR( - VkDevice _device, - const VkAcquireNextImageInfoKHR* pAcquireInfo, - uint32_t* pImageIndex) +v3dv_AcquireNextImage2KHR(VkDevice _device, + const VkAcquireNextImageInfoKHR *pAcquireInfo, + uint32_t *pImageIndex) { V3DV_FROM_HANDLE(v3dv_device, device, _device); - V3DV_FROM_HANDLE(v3dv_fence, fence, pAcquireInfo->fence); - V3DV_FROM_HANDLE(v3dv_semaphore, semaphore, pAcquireInfo->semaphore); + VK_FROM_HANDLE(vk_fence, fence, pAcquireInfo->fence); + VK_FROM_HANDLE(vk_semaphore, semaphore, pAcquireInfo->semaphore); - struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; + struct v3dv_physical_device *pdevice = device->pdevice; - VkResult result; - result = wsi_common_acquire_next_image2(&pdevice->wsi_device, _device, - pAcquireInfo, pImageIndex); + VkResult result = wsi_common_acquire_next_image2( + &pdevice->wsi_device, _device, pAcquireInfo, pImageIndex); + /* signal fence/semaphore - image is available immediately */ if (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR) { - if (fence) - drmSyncobjSignal(pdevice->render_fd, &fence->sync, 1); - if (semaphore) - drmSyncobjSignal(pdevice->render_fd, &semaphore->sync, 1); + VkResult sync_res; + if (fence) { + vk_fence_reset_temporary(&device->vk, fence); + sync_res = 
vk_sync_create(&device->vk, &vk_sync_dummy_type, + 0 /* flags */, 1 /* initial_value */, + &fence->temporary); + if (sync_res != VK_SUCCESS) + return sync_res; + } + + if (semaphore) { + vk_semaphore_reset_temporary(&device->vk, semaphore); + sync_res = vk_sync_create(&device->vk, &vk_sync_dummy_type, + 0 /* flags */, 1 /* initial_value */, + &semaphore->temporary); + if (sync_res != VK_SUCCESS) + return sync_res; + } } return result;
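From the application's perspective, the dummy payload installed above makes the acquire fence and semaphore behave as already signaled by the time the call returns. A usage sketch under that assumption (handle creation and error handling elided):

```c
/* The fence passed to vkAcquireNextImage2KHR receives an already-signaled
 * dummy payload, so even a zero-timeout wait succeeds immediately.
 */
VkAcquireNextImageInfoKHR acquire_info = {
   .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
   .swapchain = swapchain,
   .timeout = UINT64_MAX,
   .semaphore = VK_NULL_HANDLE,
   .fence = fence,
   .deviceMask = 0x1,
};
uint32_t image_index;
vkAcquireNextImage2KHR(device, &acquire_info, &image_index);
vkWaitForFences(device, 1, &fence, VK_TRUE, 0 /* timeout */);
```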