ac,radeonsi: emulate GS primitive pipeline stat on gfx11 because of culling

GS culls too, so the GS primitives pipeline statistic is incorrect on gfx11 and has to be emulated. This can be exposed by forcing monolithic shader use, which makes culling shaders immediately available for tests to use.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26917>
2024-01-01 19:24:34 -05:00
parent 1d3f937142
commit b9b00a0e7a
7 changed files with 32 additions and 10 deletions
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -898,7 +898,8 @@ ac_nir_accum_ior(nir_builder *b, nir_def *accum_result, nir_def *new_term)
 bool
 ac_nir_gs_shader_query(nir_builder *b,
                       bool has_gen_prim_query,
-                       bool has_pipeline_stats_query,
+                       bool has_gs_invocations_query,
+                       bool has_gs_primitives_query,
                       unsigned num_vertices_per_primitive,
                       unsigned wave_size,
                       nir_def *vertex_count[4],
@@ -913,7 +914,7 @@ ac_nir_gs_shader_query(nir_builder *b,
      any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, prim_gen_query_enabled);
   }

-   if (has_pipeline_stats_query) {
+   if (has_gs_invocations_query || has_gs_primitives_query) {
      pipeline_query_enabled = nir_load_pipeline_stat_query_enabled_amd(b);
      any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, pipeline_query_enabled);
   }
@@ -959,7 +960,7 @@ ac_nir_gs_shader_query(nir_builder *b,
   /* Store the query result to query result using an atomic add. */
   nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
   {
-      if (has_pipeline_stats_query) {
+      if (has_gs_invocations_query || has_gs_primitives_query) {
         nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled);
         {
            nir_def *count = NULL;
@@ -974,10 +975,11 @@ ac_nir_gs_shader_query(nir_builder *b,
               }
            }

-            if (count)
+            if (has_gs_primitives_query && count)
               nir_atomic_add_gs_emit_prim_count_amd(b, count);

-            nir_atomic_add_shader_invocation_count_amd(b, num_active_threads);
+            if (has_gs_invocations_query)
+               nir_atomic_add_shader_invocation_count_amd(b, num_active_threads);
         }
         nir_pop_if(b, if_pipeline_query);
      }
@@ -1237,6 +1239,7 @@ ac_nir_lower_legacy_gs(nir_shader *nir,
   bool progress = ac_nir_gs_shader_query(b,
                                          has_gen_prim_query,
                                          has_pipeline_stats_query,
+                                          has_pipeline_stats_query,
                                          num_vertices_per_primitive,
                                          64,
                                          s.vertex_count,
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -166,6 +166,8 @@ typedef struct {
   bool disable_streamout;
   bool has_gen_prim_query;
   bool has_xfb_prim_query;
+   bool has_gs_invocations_query;
+   bool has_gs_primitives_query;
   bool kill_pointsize;
   bool kill_layer;
   bool force_vrs;
@@ -268,7 +270,8 @@ ac_nir_lower_legacy_vs(nir_shader *nir,
 bool
 ac_nir_gs_shader_query(nir_builder *b,
                       bool has_gen_prim_query,
-                       bool has_pipeline_stats_query,
+                       bool has_gs_invocations_query,
+                       bool has_gs_primitives_query,
                       unsigned num_vertices_per_primitive,
                       unsigned wave_size,
                       nir_def *vertex_count[4],
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -3573,7 +3573,8 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options)
   b->cursor = nir_after_cf_list(&if_gs_thread->then_list);
   ac_nir_gs_shader_query(b,
                          state.options->has_gen_prim_query,
-                          state.options->gfx_level < GFX11,
+                          state.options->has_gs_invocations_query,
+                          state.options->has_gs_primitives_query,
                          state.num_vertices_per_primitive,
                          state.options->wave_size,
                          state.vertex_count,
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -856,6 +856,8 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
   options.disable_streamout = !device->physical_device->use_ngg_streamout;
   options.has_gen_prim_query = info->has_prim_query;
   options.has_xfb_prim_query = info->has_xfb_query;
+   options.has_gs_invocations_query = device->physical_device->rad_info.gfx_level < GFX11;
+   options.has_gs_primitives_query = device->physical_device->rad_info.gfx_level < GFX11;
   options.force_vrs = info->force_vrs_per_vertex;

   if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL) {
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -470,12 +470,17 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
   }
   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
   case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
-      nir_def *buf =
-         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
-
      enum pipe_statistics_query_index index =
         intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ?
         PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS;
+
+      /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls,
+       * which makes the pipeline statistic incorrect.
+       */
+      assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES);
+
+      nir_def *buf =
+         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
      unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4;

      nir_def *count = intrin->src[0].ssa;
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -693,6 +693,12 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned
      if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
          sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
         query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
+
+      /* GFX11 only emulates PIPE_STAT_QUERY_GS_PRIMITIVES because the shader culls,
+       * which makes the statistic incorrect.
+       */
+      if (sscreen->info.gfx_level >= GFX11 && index == PIPE_STAT_QUERY_GS_PRIMITIVES)
+         query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
      break;
   default:
      assert(0);
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1976,6 +1976,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
      options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11;
+      options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
+      options.has_gs_primitives_query = true;

      /* For monolithic ES/GS to add vscnt wait when GS export pos0. */
      if (key->ge.part.gs.es)