radv: move computing NGG info and GS info to radv_nir_shader_info_link()

It's essentially a link step, except for VS-only pipelines on GFX10+, but
keep it there anyway.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18278>
commit 2f9b30f30c (parent 13af51b2f3)
Author: Samuel Pitoiset, 2022-08-26 12:03:11 +02:00 (committed by Marge Bot)
3 changed files with 396 additions and 393 deletions

@@ -1893,142 +1893,6 @@ radv_pipeline_init_depth_stencil_state(struct radv_graphics_pipeline *pipeline,
return ds_state;
}
static void
gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
struct radv_pipeline_stage *gs_stage)
{
const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
struct radv_shader_info *gs_info = &gs_stage->info;
struct radv_shader_info *es_info = &es_stage->info;
struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info;
const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
/* All these are in dwords: */
/* We can't allow using the whole LDS, because GS waves compete with
* other shader stages for LDS space. */
const unsigned max_lds_size = 8 * 1024;
const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
unsigned esgs_lds_size;
/* All these are per subgroup: */
const unsigned max_out_prims = 32 * 1024;
const unsigned max_es_verts = 255;
const unsigned ideal_gs_prims = 64;
unsigned max_gs_prims, gs_prims;
unsigned min_es_verts, es_verts, worst_case_es_verts;
if (uses_adjacency || gs_num_invocations > 1)
max_gs_prims = 127 / gs_num_invocations;
else
max_gs_prims = 255;
/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
* Make sure we don't go over the maximum value.
*/
if (gs_info->gs.vertices_out > 0) {
max_gs_prims =
MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
}
assert(max_gs_prims > 0);
/* If the primitive has adjacency, halve the number of vertices
* that will be reused in multiple primitives.
*/
min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
/* Compute ESGS LDS size based on the worst case number of ES vertices
* needed to create the target number of GS prims per subgroup.
*/
esgs_lds_size = esgs_itemsize * worst_case_es_verts;
/* If total LDS usage is too big, refactor partitions based on ratio
* of ESGS item sizes.
*/
if (esgs_lds_size > max_lds_size) {
/* Our target GS Prims Per Subgroup was too large. Calculate
* the maximum number of GS Prims Per Subgroup that will fit
* into LDS, capped by the maximum that the hardware can support.
*/
gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
assert(gs_prims > 0);
worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
esgs_lds_size = esgs_itemsize * worst_case_es_verts;
assert(esgs_lds_size <= max_lds_size);
}
/* Now calculate remaining ESGS information. */
if (esgs_lds_size)
es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
else
es_verts = max_es_verts;
/* Vertices for adjacency primitives are not always reused, so restore
* it for ES_VERTS_PER_SUBGRP.
*/
min_es_verts = gs_info->gs.vertices_in;
/* For normal primitives, the VGT only checks if they are past the ES
* verts per subgroup after allocating a full GS primitive and if they
* are, kick off a new subgroup. But if those additional ES verts are
* unique (e.g. not reused) we need to make sure there is enough LDS
* space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
*/
es_verts -= min_es_verts - 1;
const uint32_t es_verts_per_subgroup = es_verts;
const uint32_t gs_prims_per_subgroup = gs_prims;
const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
out->lds_size = align(esgs_lds_size, 128) / 128;
out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
out->vgt_esgs_ring_itemsize = esgs_itemsize;
assert(max_prims_per_subgroup <= max_out_prims);
unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
es_verts_per_subgroup, gs_inst_prims_in_subgroup);
es_info->workgroup_size = workgroup_size;
gs_info->workgroup_size = workgroup_size;
}
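
To make the sizing arithmetic above concrete, here is a small standalone sketch that traces the same steps for assumed inputs: a triangle-list GS (3 vertices in, no adjacency), one invocation, 4 output vertices, and a 64-byte (16-dword) ESGS item. The MIN2 macro and all constants are local stand-ins, not RADV code.

#include <assert.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   const unsigned esgs_itemsize = 16;      /* dwords per ES vertex */
   const unsigned max_lds_size = 8 * 1024; /* dwords */
   const unsigned min_es_verts = 3;        /* triangle list, no adjacency */
   const unsigned gs_prims = MIN2(64, 255);            /* ideal vs. max */
   const unsigned worst_case_es_verts = MIN2(min_es_verts * gs_prims, 255);
   const unsigned esgs_lds_size = esgs_itemsize * worst_case_es_verts;

   /* 16 * 192 = 3072 dwords fits in 8K, so no repartitioning is needed. */
   assert(esgs_lds_size <= max_lds_size);

   /* Subtract min_es_verts - 1 exactly as the function above does. */
   const unsigned es_verts = MIN2(esgs_lds_size / esgs_itemsize, 255) - (min_es_verts - 1);

   /* Prints: ES_VERTS_PER_SUBGRP=190 GS_PRIMS_PER_SUBGRP=64 LDS=24 */
   printf("ES_VERTS_PER_SUBGRP=%u GS_PRIMS_PER_SUBGRP=%u LDS=%u\n",
          es_verts, gs_prims, (esgs_lds_size + 127) / 128);
   return 0;
}
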
static void
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
bool use_adjacency)
{
unsigned max_reuse = max_esverts - min_verts_per_prim;
if (use_adjacency)
max_reuse /= 2;
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
}
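
As a quick check of this clamp: with max_esverts = 128 and triangle input (min_verts_per_prim = 3, no adjacency), max_reuse is 125 and max_gsprims is capped at 126; with adjacency the reuse budget halves to 62, capping max_gsprims at 63.
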
static unsigned
radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
const struct radv_pipeline_stage *gs_stage)
{
if (gs_stage) {
return gs_stage->nir->info.gs.vertices_in;
}
if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
if (es_stage->nir->info.tess.point_mode)
return 1;
if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
return 2;
return 3;
}
return 3;
}
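
For instance, a GS consuming triangles with adjacency reports 6 input vertices, while a tessellation evaluation shader running in isolines mode reports 2.
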
static void
gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint32_t oversub_pc_lines)
@@ -2038,245 +1902,6 @@ gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
}
static unsigned
radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
const struct radv_pipeline_stage *gs_stage)
{
if (gs_stage) {
return gs_stage->nir->info.gs.input_primitive;
}
if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
if (es_stage->nir->info.tess.point_mode)
return SHADER_PRIM_POINTS;
if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
return SHADER_PRIM_LINES;
return SHADER_PRIM_TRIANGLES;
}
return SHADER_PRIM_TRIANGLES;
}
static void
gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
struct radv_pipeline_stage *gs_stage)
{
const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL;
struct radv_shader_info *es_info = &es_stage->info;
const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage);
const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1;
struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info;
const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
/* All these are in dwords: */
/* We can't allow using the whole LDS, because GS waves compete with
* other shader stages for LDS space.
*
* TODO: We should really take the shader's internal LDS use into
* account. The linker will fail if the size is greater than
* 8K dwords.
*/
const unsigned max_lds_size = 8 * 1024 - 768;
const unsigned target_lds_size = max_lds_size;
unsigned esvert_lds_size = 0;
unsigned gsprim_lds_size = 0;
/* All these are per subgroup: */
const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24;
bool max_vert_out_per_gs_instance = false;
unsigned max_esverts_base = 128;
unsigned max_gsprims_base = 128; /* default prim group size clamp */
/* Hardware has the following non-natural restrictions on the value
* of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
* the draw:
* - at most 252 for any line input primitive type
* - at most 251 for any quad input primitive type
* - at most 251 for triangle strips with adjacency (this happens to
* be the natural limit for triangle *lists* with adjacency)
*/
max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
if (gs_stage) {
unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
if (max_out_verts_per_gsprim <= 256) {
if (max_out_verts_per_gsprim) {
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
}
} else {
/* Use special multi-cycling mode in which each GS
* instance gets its own subgroup. Does not work with
* tessellation. */
max_vert_out_per_gs_instance = true;
max_gsprims_base = 1;
max_out_verts_per_gsprim = gs_info->gs.vertices_out;
}
esvert_lds_size = es_info->esgs_itemsize / 4;
gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
} else {
/* VS and TES. */
/* LDS space used to buffer streamout outputs per ES vertex. */
struct radv_streamout_info *so_info = &es_info->so;
if (so_info->num_outputs)
esvert_lds_size = 4 * so_info->num_outputs + 1;
/* GS stores Primitive IDs (one DWORD) into LDS at the address
* corresponding to the ES thread of the provoking vertex. All
* ES threads load and export PrimitiveID for their thread.
*/
if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id)
esvert_lds_size = MAX2(esvert_lds_size, 1);
}
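/* Illustrative numbers (assumed, not from this commit): a GS with
 * vertices_out = 4, one invocation, a 64-byte esgs_itemsize and a 16-byte
 * gsvs_vertex_size gives max_gsprims_base = MIN2(128, 256 / 4) = 64,
 * esvert_lds_size = 64 / 4 = 16 dwords, and gsprim_lds_size =
 * (16 / 4 + 1) * 4 = 20 dwords. */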
unsigned max_gsprims = max_gsprims_base;
unsigned max_esverts = max_esverts_base;
if (esvert_lds_size)
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
if (gsprim_lds_size)
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
if (esvert_lds_size || gsprim_lds_size) {
/* Now that we have a rough proportionality between esverts
* and gsprims based on the primitive type, scale both of them
* down simultaneously based on required LDS space.
*
* We could be smarter about this if we knew how much vertex
* reuse to expect.
*/
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
if (lds_total > target_lds_size) {
max_esverts = max_esverts * target_lds_size / lds_total;
max_gsprims = max_gsprims * target_lds_size / lds_total;
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
}
}
/* Round up towards full wave sizes for better ALU utilization. */
if (!max_vert_out_per_gs_instance) {
unsigned orig_max_esverts;
unsigned orig_max_gsprims;
unsigned wavesize;
if (gs_stage) {
wavesize = gs_info->wave_size;
} else {
wavesize = es_info->wave_size;
}
do {
orig_max_esverts = max_esverts;
orig_max_gsprims = max_gsprims;
max_esverts = align(max_esverts, wavesize);
max_esverts = MIN2(max_esverts, max_esverts_base);
if (esvert_lds_size)
max_esverts =
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
/* Hardware restriction: minimum value of max_esverts */
if (gfx_level == GFX10)
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
else
max_esverts = MAX2(max_esverts, min_esverts);
max_gsprims = align(max_gsprims, wavesize);
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
if (gsprim_lds_size) {
/* Don't count unusable vertices to the LDS
* size. Those are vertices above the maximum
* number of vertices that can occur in the
* workgroup, which is e.g. max_gsprims * 3
* for triangles.
*/
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
max_gsprims = MIN2(max_gsprims,
(max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
}
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
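/* Example convergence (assumed Wave64, triangle input): max_esverts = 100
 * rounds up to 128, is re-clamped against max_esverts_base and the LDS
 * budget, max_gsprims is aligned and clamped the same way, and the loop
 * exits once neither value changes between iterations. */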
/* Verify the restriction. */
if (gfx_level == GFX10)
assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
else
assert(max_esverts >= min_esverts);
} else {
/* Hardware restriction: minimum value of max_esverts */
if (gfx_level == GFX10)
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
else
max_esverts = MAX2(max_esverts, min_esverts);
}
unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
: gs_stage
? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
: max_esverts;
assert(max_out_vertices <= 256);
unsigned prim_amp_factor = 1;
if (gs_stage) {
/* Number of output primitives per GS input primitive after
* GS instancing. */
prim_amp_factor = gs_info->gs.vertices_out;
}
/* On Gfx10, the GE only checks against the maximum number of ES verts
* after allocating a full GS primitive. So we need to ensure that
* whenever this check passes, there is enough space for a full
* primitive without vertex reuse.
*/
if (gfx_level == GFX10)
out->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
else
out->hw_max_esverts = max_esverts;
out->max_gsprims = max_gsprims;
out->max_out_verts = max_out_vertices;
out->prim_amp_factor = prim_amp_factor;
out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
out->ngg_emit_size = max_gsprims * gsprim_lds_size;
out->enable_vertex_grouping = true;
/* Don't count unusable vertices. */
out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
if (gs_stage) {
out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
} else {
out->vgt_esgs_ring_itemsize = 1;
}
assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
unsigned workgroup_size =
ac_compute_ngg_workgroup_size(
max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
if (gs_stage) {
gs_info->workgroup_size = workgroup_size;
}
es_info->workgroup_size = workgroup_size;
}
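
A quick sanity check of the GFX10 adjustment above, with illustrative values not taken from this commit: for max_esverts = 128 and triangle input (max_verts_per_prim = 3), hw_max_esverts is programmed as 128 - 3 + 1 = 126 so that a full primitive still fits after the GE's late check, while GFX10.3+ can program 128 directly.
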
static void
radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs)
{
@@ -3328,7 +2953,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
&stages[i].info);
}
-   radv_nir_shader_info_link(device, pipeline_key, stages, last_vgt_api_stage);
+   radv_nir_shader_info_link(device, pipeline_key, stages, pipeline_has_ngg, last_vgt_api_stage);
if (stages[MESA_SHADER_TESS_CTRL].nir) {
for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) {
@@ -3359,21 +2984,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX);
}
-   if (pipeline_has_ngg) {
-      if (last_vgt_api_stage != MESA_SHADER_MESH) {
-         struct radv_pipeline_stage *es_stage =
-            stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-         struct radv_pipeline_stage *gs_stage =
-            stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL;
-         gfx10_get_ngg_info(device, es_stage, gs_stage);
-      }
-   } else if (stages[MESA_SHADER_GEOMETRY].nir) {
-      struct radv_pipeline_stage *es_stage =
-         stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-      gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]);
-   } else {
+   if (!pipeline_has_ngg && !stages[MESA_SHADER_GEOMETRY].nir) {
gl_shader_stage hw_vs_api_stage =
stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size;