radeonsi: remove indirection when loading position at the end for NGG culling

If we store the position into LDS after we know the new thread ID,
we don't need to remember the old thread ID.

The culling code only needs W, X/W, and Y/W, so those still have to be
stored into LDS up front, before compaction.
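
A minimal plain-C sketch of the idea (not the driver's LLVM-emitting code; the
LDS layout, the lds_slot/survives names, and the fake cull mask below are
invented for illustration): only W, X/W, Y/W are written to the thread's old
slot before culling, and once the compacted (new) thread ID is known,
Position.XYZW is written directly to the new slot, so no old-thread-ID VGPR has
to be carried around just to find the position later.

/* Hypothetical scalar model of the LDS traffic around NGG vertex compaction.
 * All names (lds_slot, survives, NUM_THREADS, ...) are made up for this sketch. */
#include <stdbool.h>
#include <stdio.h>

#define NUM_THREADS 8

struct lds_slot {
   float pos[4];              /* Position.XYZW, consumed after compaction */
   float w, x_div_w, y_div_w; /* the only values the culling test reads */
};

static struct lds_slot lds[NUM_THREADS];

int main(void)
{
   float pos[NUM_THREADS][4];
   bool survives[NUM_THREADS];

   /* Fake per-thread positions and a fake cull result. */
   for (int t = 0; t < NUM_THREADS; t++) {
      for (int c = 0; c < 4; c++)
         pos[t][c] = (float)(t + c + 1);
      survives[t] = (t % 2 == 0); /* pretend odd threads get culled */
   }

   /* Before culling: store only W and X/W, Y/W at the thread's own (old) slot. */
   for (int t = 0; t < NUM_THREADS; t++) {
      lds[t].w = pos[t][3];
      lds[t].x_div_w = pos[t][0] / pos[t][3];
      lds[t].y_div_w = pos[t][1] / pos[t][3];
   }

   /* After the cull mask is known: each surviving thread computes its new
    * (compacted) index and writes Position.XYZW straight to the new slot,
    * so the old thread ID never needs to be remembered for the position. */
   int new_id = 0;
   for (int t = 0; t < NUM_THREADS; t++) {
      if (!survives[t])
         continue;
      for (int c = 0; c < 4; c++)
         lds[new_id].pos[c] = pos[t][c];
      new_id++;
   }

   for (int t = 0; t < new_id; t++)
      printf("compacted vertex %d: w = %.1f\n", t, lds[t].pos[3]);
   return 0;
}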

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7172>
Marek Olšák
2020-10-15 14:21:37 -04:00
committed by Marge Bot
parent 823ee12d57
commit 1de0bf0a56
5 changed files with 40 additions and 46 deletions

@@ -755,32 +755,34 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
assert(sel->info.stage == MESA_SHADER_VERTEX ||
(sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es));
LLVMValueRef position[4] = {};
LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
unsigned pos_index = 0;
for (unsigned i = 0; i < info->num_outputs; i++) {
LLVMValueRef position[4];
switch (info->output_semantic[i]) {
case VARYING_SLOT_POS:
pos_index = i;
for (unsigned j = 0; j < 4; j++) {
position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
}
/* Store Position.W into LDS. */
LLVMBuildStore(
builder, ac_to_integer(&ctx->ac, position[3]),
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_w, 0)));
/* Store Position.XY / W into LDS. */
for (unsigned chan = 0; chan < 2; chan++) {
LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
LLVMBuildStore(
builder, ac_to_integer(&ctx->ac, val),
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
}
break;
}
}
assert(position[0]);
/* Store Position.XYZW into LDS. */
LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
for (unsigned chan = 0; chan < 4; chan++) {
LLVMBuildStore(
builder, ac_to_integer(&ctx->ac, position[chan]),
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
}
/* Store Position.XY / W into LDS. */
for (unsigned chan = 0; chan < 2; chan++) {
LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
LLVMBuildStore(
builder, ac_to_integer(&ctx->ac, val),
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
}
/* Store VertexID and InstanceID. ES threads will have to load them
* from LDS after vertex compaction and use them instead of their own
@@ -1001,12 +1003,20 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
{
LLVMValueRef old_id = get_thread_id_in_tg(ctx);
LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id);
LLVMBuildStore(
builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id));
si_build_gep_i8(ctx, new_vtx, lds_byte0_old_thread_id));
LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
/* Store Position.XYZW into LDS. */
for (unsigned chan = 0; chan < 4; chan++) {
LLVMBuildStore(
builder, ac_to_integer(&ctx->ac, LLVMBuildLoad(builder, addrs[4 * pos_index + chan], "")),
ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
}
}
ac_build_endif(&ctx->ac, 16009);
@@ -1187,9 +1197,6 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
if (num_vgprs == 3)
vgpr++;
}
/* Return the old thread ID. */
val = LLVMBuildLoad(builder, old_thread_id, "");
ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
/* These two also use LDS. */
if (sel->info.writes_edgeflag ||
@@ -1397,7 +1404,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL
*/
if (info->output_semantic[i] == VARYING_SLOT_POS &&
ctx->shader->key.opt.ngg_culling) {
vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
for (unsigned j = 0; j < 4; j++) {
tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);

@@ -289,8 +289,7 @@ static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
}
}
static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
bool ngg_cull_shader)
static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
{
struct si_shader *shader = ctx->shader;
@@ -316,10 +315,6 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_
}
if (!shader->is_gs_copy_shader) {
if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
}
/* Vertex load indices. */
if (shader->selector->info.num_inputs) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
@@ -351,16 +346,12 @@ static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_bl
}
}
static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
static void declare_tes_input_vgprs(struct si_shader_context *ctx)
{
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
}
}
enum
@@ -404,7 +395,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
/* VGPRs */
declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
break;
}
@@ -423,7 +414,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
}
/* VGPRs */
declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
@@ -480,7 +471,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
if (ctx->stage == MESA_SHADER_VERTEX) {
declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
/* LS return values are inputs to the TCS main shader part. */
for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
@@ -548,9 +539,9 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
if (ctx->stage == MESA_SHADER_VERTEX) {
declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
declare_tes_input_vgprs(ctx, ngg_cull_shader);
declare_tes_input_vgprs(ctx);
}
if ((ctx->shader->key.as_es || ngg_cull_shader) &&
@@ -572,12 +563,12 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
}
/* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
/* The NGG cull shader has to return all 9 VGPRs.
*
* The normal merged ESGS shader only has to return the 5 VGPRs
* for the GS stage.
*/
num_vgprs = ngg_cull_shader ? 10 : 5;
num_vgprs = ngg_cull_shader ? 9 : 5;
/* ES return values are inputs to GS. */
for (i = 0; i < 8 + num_user_sgprs; i++)
@@ -604,7 +595,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
}
/* VGPRs */
declare_tes_input_vgprs(ctx, ngg_cull_shader);
declare_tes_input_vgprs(ctx);
break;
case MESA_SHADER_GEOMETRY:
@@ -1560,8 +1551,6 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num
!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
key->vs_prolog.gs_fast_launch_tri_strip =
!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
} else {
key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
}
if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {

@@ -555,7 +555,6 @@ union si_shader_part_key {
unsigned as_es : 1;
unsigned as_ngg : 1;
unsigned as_prim_discard_cs : 1;
unsigned has_ngg_cull_inputs : 1; /* from the NGG cull shader */
unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
/* Prologs for monolithic shaders shouldn't set EXEC. */

@@ -109,7 +109,6 @@ struct si_shader_context {
*/
struct ac_arg vs_state_bits;
struct ac_arg vs_blit_inputs;
struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
/* HW VS */
struct ac_arg streamout_config;
struct ac_arg streamout_write_index;

@@ -779,7 +779,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
int num_returns, i;
unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
unsigned num_input_vgprs =
key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
key->vs_prolog.num_merged_next_stage_vgprs + 4;
struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
struct ac_arg input_vgpr_param[10];
LLVMValueRef input_vgprs[10];