intel/compiler: Add support for variable workgroup size
Add new builtin parameters that are used to keep track of the group
size. This will be used to implement ARB_compute_variable_group_size.

The compiler will use the maximum group size supported to pick a
suitable SIMD variant. A later improvement will be to keep all SIMD
variants (like FS does) so the driver can select the best one at
dispatch time.

When variable workgroup size is used, the small-workgroup optimization
is disabled, as we can't prove at compile time that the barriers won't
be needed.

Extracted from an original i965 patch, with additional changes by
Caio Marcelo de Oliveira Filho.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4504>
commit c77dc51203
parent c54fc0d07b
committed by Caio Marcelo de Oliveira Filho
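For context, this is the API-side behavior the change targets: a minimal sketch, assuming a context that exposes GL_ARB_compute_variable_group_size and a header providing its prototype. The shader source and the 64-wide dispatch are illustrative; program creation and buffer setup are omitted.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>   /* glDispatchComputeGroupSizeARB prototype */

/* A variable-group-size compute shader: the layout qualifier replaces
 * the usual fixed local_size_x/y/z declaration, and gl_LocalGroupSizeARB
 * exposes the size chosen at dispatch time. */
static const char *cs_source =
   "#version 430\n"
   "#extension GL_ARB_compute_variable_group_size : require\n"
   "layout(local_size_variable) in;\n"
   "layout(std430, binding = 0) buffer Data { float v[]; };\n"
   "void main() {\n"
   "   v[gl_GlobalInvocationID.x] *= float(gl_LocalGroupSizeARB.x);\n"
   "}\n";

/* The workgroup size is only known here, at dispatch time, so the same
 * compiled program may run with different sizes. */
static void dispatch_64_wide(GLuint num_items)
{
   glDispatchComputeGroupSizeARB(num_items / 64, 1, 1,  /* group counts */
                                 64, 1, 1);             /* group size */
}

Because the size is unknown when the shader is compiled, the compiler below sizes everything against the declared maximum.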
@@ -298,6 +298,7 @@ typedef struct shader_info {
    struct {
       uint16_t local_size[3];
+      uint16_t max_variable_local_size;
 
       bool local_size_variable:1;
       uint8_t user_data_components_amd:3;
@@ -615,6 +615,9 @@ enum brw_param_builtin {
    BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
    BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
    BRW_PARAM_BUILTIN_SUBGROUP_ID,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
 };
 
 #define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
@@ -901,11 +904,13 @@ struct brw_cs_prog_data {
    struct brw_stage_prog_data base;
 
    unsigned local_size[3];
+   unsigned max_variable_local_size;
    unsigned simd_size;
    unsigned threads;
    unsigned slm_size;
    bool uses_barrier;
    bool uses_num_work_groups;
+   bool uses_variable_group_size;
 
    struct {
       struct brw_push_const_block cross_thread;
@@ -1190,6 +1190,8 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->subgroup_id = v->subgroup_id;
+   for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
+      this->group_size[i] = v->group_size[i];
 }
 
 void
@@ -8866,9 +8868,16 @@ static void
 cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
 {
    cs_prog_data->simd_size = size;
-   unsigned group_size = cs_prog_data->local_size[0] *
-      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
-   cs_prog_data->threads = (group_size + size - 1) / size;
+
+   unsigned group_size;
+   if (cs_prog_data->uses_variable_group_size) {
+      group_size = cs_prog_data->max_variable_local_size;
+   } else {
+      group_size = cs_prog_data->local_size[0] *
+                   cs_prog_data->local_size[1] *
+                   cs_prog_data->local_size[2];
+   }
+   cs_prog_data->threads = DIV_ROUND_UP(group_size, size);
 }
 
 static nir_shader *
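The rewritten helper provisions the HW thread count for the worst case when the group size is variable. A standalone sketch of the arithmetic, with DIV_ROUND_UP written out explicitly (the example sizes are illustrative):

#include <stdio.h>

/* Worst-case HW thread count for one workgroup: with a variable group
 * size the compiler must assume the declared maximum, so the thread
 * count (and resources derived from it) is sized for that. */
static unsigned cs_threads(unsigned group_size, unsigned simd_size)
{
   return (group_size + simd_size - 1) / simd_size;  /* DIV_ROUND_UP */
}

int main(void)
{
   /* A fixed 7x3x1 group compiled SIMD16 needs two threads... */
   printf("%u\n", cs_threads(7 * 3 * 1, 16));  /* prints 2 */
   /* ...while a variable group with max size 1024 in SIMD32 needs 32. */
   printf("%u\n", cs_threads(1024, 32));       /* prints 32 */
   return 0;
}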
@@ -8903,13 +8912,20 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    prog_data->base.total_shared = src_shader->info.cs.shared_size;
-   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
-   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
-   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
    prog_data->slm_size = src_shader->num_shared;
-   unsigned local_workgroup_size =
-      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
-      src_shader->info.cs.local_size[2];
+
+   unsigned local_workgroup_size;
+   if (prog_data->uses_variable_group_size) {
+      prog_data->max_variable_local_size =
+         src_shader->info.cs.max_variable_local_size;
+      local_workgroup_size = src_shader->info.cs.max_variable_local_size;
+   } else {
+      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+      local_workgroup_size = src_shader->info.cs.local_size[0] *
+         src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2];
+   }
 
    /* Limit max_threads to 64 for the GPGPU_WALKER command */
    const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
@@ -370,6 +370,7 @@ public:
    int *push_constant_loc;
 
    fs_reg subgroup_id;
+   fs_reg group_size[3];
    fs_reg scratch_base;
    fs_reg frag_depth;
    fs_reg frag_stencil;
@@ -101,11 +101,23 @@ fs_visitor::nir_setup_uniforms()
    uniforms = nir->num_uniforms / 4;
 
    if (stage == MESA_SHADER_COMPUTE) {
-      /* Add a uniform for the thread local id. It must be the last uniform
-       * on the list.
-       */
+      /* Add uniforms for builtins after regular NIR uniforms. */
       assert(uniforms == prog_data->nr_params);
-      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+      uint32_t *param;
+      if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+         param = brw_stage_prog_data_add_params(prog_data, 3);
+         for (unsigned i = 0; i < 3; i++) {
+            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+         }
+      }
+
+      /* Subgroup ID must be the last uniform on the list. This will make
+       * it easier later to split between cross thread and per thread
+       * uniforms.
+       */
+      param = brw_stage_prog_data_add_params(prog_data, 1);
       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
    }
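The ordering constraint in the comment exists because the param list is later split into a cross-thread block, shared by all HW threads, and a per-thread block; keeping the one per-thread value (the subgroup ID) at the end makes the split a simple suffix. A conceptual sketch of that idea, not the brw API (all names here are hypothetical):

#include <stdint.h>

/* Conceptual split of the uniform param list: every entry except the
 * trailing subgroup ID holds the same value in all HW threads
 * (cross-thread); only the last entry varies per thread. */
struct param_split {
   const uint32_t *cross_thread;
   unsigned cross_thread_count;
   const uint32_t *per_thread;
   unsigned per_thread_count;
};

static struct param_split split_params(const uint32_t *params, unsigned n)
{
   struct param_split s;
   s.cross_thread = params;
   s.cross_thread_count = n - 1;     /* everything before the subgroup ID */
   s.per_thread = params + (n - 1);
   s.per_thread_count = 1;           /* just the subgroup ID */
   return s;
}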
@@ -3814,7 +3826,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
        * invocations are already executed lock-step. Instead of an actual
        * barrier just emit a scheduling fence, that will generate no code.
        */
-      if (workgroup_size() <= dispatch_width) {
+      if (!cs_prog_data->uses_variable_group_size &&
+          workgroup_size() <= dispatch_width) {
          bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
          break;
       }
@@ -3949,6 +3962,14 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       break;
    }
 
+   case nir_intrinsic_load_local_group_size: {
+      for (unsigned i = 0; i < 3; i++) {
+         bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+                 group_size[i]);
+      }
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -4337,7 +4358,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        *
        * TODO: Check if applies for many HW threads sharing same Data Port.
        */
-      if (slm_fence && workgroup_size() <= dispatch_width)
+      if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+          slm_fence && workgroup_size() <= dispatch_width)
          slm_fence = false;
 
       /* Prior to Gen11, there's only L3 fence, so emit that instead. */
@@ -72,8 +72,16 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          nir_ssa_def *channel = nir_load_subgroup_invocation(b);
          nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
 
-         nir_ssa_def *size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
-         nir_ssa_def *size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+         nir_ssa_def *size_x;
+         nir_ssa_def *size_y;
+         if (state->nir->info.cs.local_size_variable) {
+            nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+            size_x = nir_channel(b, size_xyz, 0);
+            size_y = nir_channel(b, size_xyz, 1);
+         } else {
+            size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
+            size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+         }
 
         /* The local invocation index and ID must respect the following
          *
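The lowering only loads the X and Y extents because of how a linearized invocation index decomposes into a 3-D local ID: the Z coordinate is whatever remains after dividing out the X and Y extents. A plain-C sketch of that standard decomposition (names are illustrative, not the pass's code):

/* Decompose a linear local invocation index into x/y/z coordinates.
 * Only the X and Y extents are needed; Z never appears as a divisor
 * of its own, which is why size_z is not loaded above. */
struct local_id { unsigned x, y, z; };

static struct local_id local_id_from_index(unsigned linear,
                                           unsigned size_x, unsigned size_y)
{
   struct local_id id;
   id.x = linear % size_x;
   id.y = (linear / size_x) % size_y;
   id.z = linear / (size_x * size_y);
   return id;
}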
@@ -152,12 +160,26 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          break;
 
       case nir_intrinsic_load_num_subgroups: {
-         unsigned local_workgroup_size =
-            nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
-            nir->info.cs.local_size[2];
-         unsigned num_subgroups =
-            DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
-         sysval = nir_imm_int(b, num_subgroups);
+         if (state->nir->info.cs.local_size_variable) {
+            nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+            nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
+            nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
+            nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
+            nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
+
+            /* Calculate the equivalent of DIV_ROUND_UP. */
+            sysval = nir_idiv(b,
+                              nir_iadd_imm(b,
+                                 nir_iadd_imm(b, size, state->dispatch_width), -1),
+                              nir_imm_int(b, state->dispatch_width));
+         } else {
+            unsigned local_workgroup_size =
+               nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
+               nir->info.cs.local_size[2];
+            unsigned num_subgroups =
+               DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
+            sysval = nir_imm_int(b, num_subgroups);
+         }
          break;
       }
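In the variable case the subgroup count can no longer be folded to an immediate, so the pass open-codes DIV_ROUND_UP as two immediate adds (+dispatch_width, then -1) followed by an integer division. A quick standalone check of the equivalence (the sizes used are illustrative):

#include <assert.h>

/* ((size + width) - 1) / width, built the same way as the NIR above:
 * two adds of immediates followed by an integer division. */
static unsigned num_subgroups(unsigned size, unsigned dispatch_width)
{
   return ((size + dispatch_width) - 1) / dispatch_width;
}

int main(void)
{
   assert(num_subgroups(1, 32) == 1);   /* a partial subgroup rounds up */
   assert(num_subgroups(32, 32) == 1);  /* exact fit */
   assert(num_subgroups(33, 32) == 2);  /* one invocation spills over */
   return 0;
}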
@@ -198,16 +220,21 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir,
       .dispatch_width = dispatch_width,
    };
 
-   assert(!nir->info.cs.local_size_variable);
-   state.local_workgroup_size = nir->info.cs.local_size[0] *
-      nir->info.cs.local_size[1] *
-      nir->info.cs.local_size[2];
+   if (!nir->info.cs.local_size_variable) {
+      state.local_workgroup_size = nir->info.cs.local_size[0] *
+                                   nir->info.cs.local_size[1] *
+                                   nir->info.cs.local_size[2];
+   } else {
+      state.local_workgroup_size = nir->info.cs.max_variable_local_size;
+   }
 
    /* Constraints from NV_compute_shader_derivatives. */
-   if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
+   if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+       !nir->info.cs.local_size_variable) {
       assert(nir->info.cs.local_size[0] % 2 == 0);
       assert(nir->info.cs.local_size[1] % 2 == 0);
-   } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
+   } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR &&
+              !nir->info.cs.local_size_variable) {
       assert(state.local_workgroup_size % 4 == 0);
    }