anv: Use new helper functions to pick SIMD variant for CS

Also combine the existing individual anv helpers into a single one for
all CS-related parameters.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5142>
Author:  Caio Marcelo de Oliveira Filho
Date:    2020-05-21 00:17:27 -07:00
Parent:  594374dd8d
Commit:  9b8347c988

5 changed files with 40 additions and 32 deletions
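
At a glance: the two single-value helpers (anv_cs_workgroup_size() and
anv_cs_threads()) are folded into one anv_cs_parameters() helper, and the
SIMD width now comes from the new brw_cs_simd_size_for_group_size() helper
rather than a value baked into cs_prog_data. A minimal sketch of the
consolidated call pattern, using only names that appear in the diffs below:

    /* One call now yields everything the CS setup paths need. */
    const struct anv_cs_parameters cs_params = anv_cs_parameters(pipeline);
    /* cs_params.group_size: local_size[0] * local_size[1] * local_size[2]
     * cs_params.simd_size:  SIMD variant picked for that group size
     * cs_params.threads:    DIV_ROUND_UP(group_size, simd_size)         */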

src/intel/vulkan/anv_cmd_buffer.c

@@ -838,9 +838,9 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
    const struct anv_push_range *range = &pipeline->cs->bind_map.push_ranges[0];
 
-   const uint32_t threads = anv_cs_threads(pipeline);
+   const struct anv_cs_parameters cs_params = anv_cs_parameters(pipeline);
 
    const unsigned total_push_constants_size =
-      brw_cs_push_const_total_size(cs_prog_data, threads);
+      brw_cs_push_const_total_size(cs_prog_data, cs_params.threads);
    if (total_push_constants_size == 0)
       return (struct anv_state) { .offset = 0 };
@@ -863,7 +863,7 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
    }
 
    if (cs_prog_data->push.per_thread.size > 0) {
-      for (unsigned t = 0; t < threads; t++) {
+      for (unsigned t = 0; t < cs_params.threads; t++) {
          memcpy(dst, src, cs_prog_data->push.per_thread.size);
          uint32_t *subgroup_id = dst +
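
Why the thread count feeds the size computation: per-thread push data is
replicated once for every HW thread in the group (the loop above), on top of
a single copy of the cross-thread data. A rough sketch of what
brw_cs_push_const_total_size() computes (not the verbatim brw
implementation):

    static unsigned
    push_const_total_size_sketch(const struct brw_cs_prog_data *cs_prog_data,
                                 unsigned threads)
    {
       /* One cross-thread block plus one per-thread block per HW thread. */
       return cs_prog_data->push.cross_thread.size +
              threads * cs_prog_data->push.per_thread.size;
    }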

src/intel/vulkan/anv_pipeline.c

@@ -1728,21 +1728,22 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
    return VK_SUCCESS;
 }
 
-uint32_t
-anv_cs_workgroup_size(const struct anv_compute_pipeline *pipeline)
+struct anv_cs_parameters
+anv_cs_parameters(const struct anv_compute_pipeline *pipeline)
 {
    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
-   return cs_prog_data->local_size[0] *
-          cs_prog_data->local_size[1] *
-          cs_prog_data->local_size[2];
-}
 
-uint32_t
-anv_cs_threads(const struct anv_compute_pipeline *pipeline)
-{
-   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
-   return DIV_ROUND_UP(anv_cs_workgroup_size(pipeline),
-                       cs_prog_data->simd_size);
+   struct anv_cs_parameters cs_params = {};
+
+   cs_params.group_size = cs_prog_data->local_size[0] *
+                          cs_prog_data->local_size[1] *
+                          cs_prog_data->local_size[2];
+   cs_params.simd_size =
+      brw_cs_simd_size_for_group_size(&pipeline->base.device->info,
+                                      cs_prog_data, cs_params.group_size);
+   cs_params.threads = DIV_ROUND_UP(cs_params.group_size, cs_params.simd_size);
+
+   return cs_params;
 }
 
 /**
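
The selection itself lives in the new brw helper. An illustrative sketch of
the idea only: the real brw_cs_simd_size_for_group_size() also checks which
SIMD variants were actually compiled and honors debug overrides, and the HW
thread limit is per-gen (the 64 below is an assumed figure for illustration):

    static unsigned
    pick_simd_size_sketch(unsigned group_size)
    {
       /* Prefer the narrowest width whose thread count fits the HW limit. */
       for (unsigned simd = 8; simd <= 32; simd *= 2) {
          if (DIV_ROUND_UP(group_size, simd) <= 64)
             return simd;
       }
       return 32;
    }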

src/intel/vulkan/anv_private.h

@@ -3413,11 +3413,14 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
                         const char *entrypoint,
                         const VkSpecializationInfo *spec_info);
 
-uint32_t
-anv_cs_workgroup_size(const struct anv_compute_pipeline *pipeline);
+struct anv_cs_parameters {
+   uint32_t group_size;
+   uint32_t simd_size;
+   uint32_t threads;
+};
 
-uint32_t
-anv_cs_threads(const struct anv_compute_pipeline *pipeline);
+struct anv_cs_parameters
+anv_cs_parameters(const struct anv_compute_pipeline *pipeline);
 
 struct anv_format_plane {
    enum isl_format isl_format:16;

src/intel/vulkan/genX_cmd_buffer.c

@@ -4360,12 +4360,14 @@ void genX(CmdDispatchBase)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
+   const struct anv_cs_parameters cs_params = anv_cs_parameters(pipeline);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
       ggw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
-      ggw.SIMDSize = prog_data->simd_size / 16;
+      ggw.SIMDSize = cs_params.simd_size / 16;
       ggw.ThreadDepthCounterMaximum = 0;
       ggw.ThreadHeightCounterMaximum = 0;
-      ggw.ThreadWidthCounterMaximum = anv_cs_threads(pipeline) - 1;
+      ggw.ThreadWidthCounterMaximum = cs_params.threads - 1;
       ggw.ThreadGroupIDXDimension = groupCountX;
       ggw.ThreadGroupIDYDimension = groupCountY;
       ggw.ThreadGroupIDZDimension = groupCountZ;
@@ -4474,14 +4476,16 @@ void genX(CmdDispatchIndirect)(
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 #endif
 
+   const struct anv_cs_parameters cs_params = anv_cs_parameters(pipeline);
+
    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
       ggw.IndirectParameterEnable = true;
       ggw.PredicateEnable = GEN_GEN <= 7 ||
                             cmd_buffer->state.conditional_render_enabled;
-      ggw.SIMDSize = prog_data->simd_size / 16;
+      ggw.SIMDSize = cs_params.simd_size / 16;
       ggw.ThreadDepthCounterMaximum = 0;
       ggw.ThreadHeightCounterMaximum = 0;
-      ggw.ThreadWidthCounterMaximum = anv_cs_threads(pipeline) - 1;
+      ggw.ThreadWidthCounterMaximum = cs_params.threads - 1;
       ggw.RightExecutionMask = pipeline->cs_right_mask;
       ggw.BottomExecutionMask = 0xffffffff;
    }
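
A note on the two derived fields used above: GPGPU_WALKER's SIMDSize is an
enum in genxml (SIMD8 = 0, SIMD16 = 1, SIMD32 = 2), so dividing a dispatch
width of 8/16/32 by 16 yields exactly the right encoding, and
ThreadWidthCounterMaximum is the zero-based count of HW threads per group:

    /* 8 / 16 == 0 (SIMD8), 16 / 16 == 1 (SIMD16), 32 / 16 == 2 (SIMD32) */
    assert(cs_params.simd_size == 8 || cs_params.simd_size == 16 ||
           cs_params.simd_size == 32);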

src/intel/vulkan/genX_pipeline.c

@@ -2325,19 +2325,16 @@ compute_pipeline_create(
    anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
 
-   uint32_t group_size = cs_prog_data->local_size[0] *
-      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
-   uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);
+   const struct anv_cs_parameters cs_params = anv_cs_parameters(pipeline);
+
+   uint32_t remainder = cs_params.group_size & (cs_params.simd_size - 1);
 
    if (remainder > 0)
       pipeline->cs_right_mask = ~0u >> (32 - remainder);
    else
-      pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size);
+      pipeline->cs_right_mask = ~0u >> (32 - cs_params.simd_size);
 
-   const uint32_t threads = anv_cs_threads(pipeline);
    const uint32_t vfe_curbe_allocation =
-      ALIGN(cs_prog_data->push.per_thread.regs * threads +
+      ALIGN(cs_prog_data->push.per_thread.regs * cs_params.threads +
            cs_prog_data->push.cross_thread.regs, 2);
 
    const uint32_t subslices = MAX2(device->physical->subslice_total, 1);
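
The right-mask logic above is worth unpacking: since simd_size is a power of
two, group_size & (simd_size - 1) is the number of live channels in the last,
possibly partial, SIMD thread, and the mask enables exactly those channels
(or a full thread's worth when the group divides evenly). A self-contained
restatement with hypothetical sizes:

    static uint32_t
    cs_right_mask_sketch(uint32_t group_size, uint32_t simd_size)
    {
       const uint32_t remainder = group_size & (simd_size - 1);
       /* remainder low bits set, or all simd_size bits for a full thread. */
       return remainder > 0 ? ~0u >> (32 - remainder)
                            : ~0u >> (32 - simd_size);
    }

    /* cs_right_mask_sketch(40, 16) == 0x000000ff (8 live channels)
     * cs_right_mask_sketch(64, 16) == 0x0000ffff (full SIMD16 thread) */
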
@@ -2388,7 +2385,10 @@ compute_pipeline_create(
    }
 
    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
-      .KernelStartPointer = cs_bin->kernel.offset,
+      .KernelStartPointer =
+         cs_bin->kernel.offset +
+         brw_cs_prog_data_prog_offset(cs_prog_data, cs_params.simd_size),
+
       /* WA_1606682166 */
       .SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(cs_bin),
       /* We add 1 because the CS indirect parameters buffer isn't accounted
@@ -2420,7 +2420,7 @@ compute_pipeline_create(
       .ThreadPreemptionDisable = true,
 #endif
-      .NumberofThreadsinGPGPUThreadGroup = threads,
+      .NumberofThreadsinGPGPUThreadGroup = cs_params.threads,
    };
 
    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
                                         pipeline->interface_descriptor_data,