v3dv: only check binning sync for semaphores for the first CL job

Semaphore waits in a command buffer only affect the first jobs we execute in each hardware queue since jobs in the same queue are serialized against each other. Binning syncs in particular, only affect CL jobs. Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17594>
2022-07-18 11:12:11 +02:00
parent 2ca4a51679
commit 903a75245b
2 changed files with 36 additions and 28 deletions
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -250,9 +250,17 @@ enum v3dv_queue_type {
 * cmd buf batch.
 */
 struct v3dv_last_job_sync {
-   /* If the job is the first submitted to a GPU queue in a cmd buffer batch */
+   /* If the job is the first submitted to a GPU queue in a cmd buffer batch.
    *
    * We use V3DV_QUEUE_{CL,CSD,TFU} both with and without multisync.
    */
   bool first[V3DV_QUEUE_COUNT];
-   /* Array of syncobj to track the last job submitted to a GPU queue */
+   /* Array of syncobj to track the last job submitted to a GPU queue.
    *
    * With multisync we use V3DV_QUEUE_{CL,CSD,TFU} to track syncobjs for each
    * queue, but without multisync we only track the last job submitted to any
    * queue in V3DV_QUEUE_ANY.
    */
   uint32_t syncs[V3DV_QUEUE_COUNT];
 };
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -103,9 +103,6 @@ queue_wait_idle(struct v3dv_queue *queue,
         if (result != VK_SUCCESS)
            return result;
      }
      for (int i = 0; i < 3; i++)
         queue->last_job_syncs.first[i] = false;
   } else {
      /* Without multisync, all the semaphores are baked into the one syncobj
       * at the start of each submit so we only need to wait on the one.
@@ -120,6 +117,9 @@ queue_wait_idle(struct v3dv_queue *queue,
      }
   }
   for (int i = 0; i < 3; i++)
      queue->last_job_syncs.first[i] = false;
   return VK_SUCCESS;
 }
@@ -722,8 +722,6 @@ set_multisync(struct drm_v3d_multi_sync *ms,
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;
   queue->last_job_syncs.first[queue_sync] = false;
   return;
 fail:
@@ -797,35 +795,33 @@ handle_cl_job(struct v3dv_queue *queue,
   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
   queue->last_perfmon_id = submit.perfmon_id;
-   /* We need a binning sync if we are waiting on a semaphore with a wait stage
+   /* We need a binning sync if we are the first CL job waiting on a semaphore
-    * that involves the geometry pipeline, or if the job comes after a pipeline
+    * with a wait stage that involves the geometry pipeline, or if the job
-    * barrier that involves geometry stages (needs_bcl_sync), or if
+    * comes after a pipeline barrier that involves geometry stages
-    * performance queries are in use.
+    * (needs_bcl_sync) or when performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished so we don't really
    * need to add a fence for those, however, we might need to wait on a CSD or
    * TFU job, which are not automatically serialized with CL jobs.
    *
    * FIXME: see if we can do better and avoid bcl syncs for any jobs in the
    * command buffer after the first job where we should be able to track bcl
    * dependencies strictly through barriers.
    */
-   bool needs_bcl_sync = job->needs_bcl_sync;
+   bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
-   for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
+   if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
-      needs_bcl_sync = sync_info->waits[i].stage_mask &
+      for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
-         (VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
+         needs_bcl_sync = sync_info->waits[i].stage_mask &
-          VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
+             (VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
-          VK_PIPELINE_STAGE_ALL_COMMANDS_BIT |
+              VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
-          VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
+              VK_PIPELINE_STAGE_ALL_COMMANDS_BIT |
-          VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
+              VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
-          VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+              VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
-          VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
+              VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-          VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
+              VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
-          VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
+              VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
              VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
      }
   }
-   needs_bcl_sync |= needs_perf_sync;
+
   bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
   /* Replace single semaphore settings whenever our kernel-driver supports
@@ -866,6 +862,8 @@ handle_cl_job(struct v3dv_queue *queue,
   free(bo_handles);
   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;
   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");
@@ -906,6 +904,7 @@ handle_tfu_job(struct v3dv_queue *queue,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;
   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");
@@ -984,6 +983,7 @@ handle_csd_job(struct v3dv_queue *queue,
   free(bo_handles);
   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");