radeonsi: move the no-AA small prim precision cull constant into an SGPR

This reduces the scalar load in the line culling path from vec4 to vec2.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17864>
2022-07-28 17:18:04 -04:00
parent 788dce46a3
commit fa46f3d40e
4 changed files with 17 additions and 9 deletions
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -960,11 +960,13 @@ static void cull_primitive(struct si_shader_context *ctx,
   options.cull_w = true;

   if (prim_is_lines) {
-      LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 2, 0));
-      terms = LLVMBuildBitCast(builder, terms, ctx->ac.v4f32, "");
+      ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
+                                 LLVMPointerType(ctx->ac.v2f32, AC_ADDR_SPACE_CONST_32BIT), "");
+      LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 4, 0));
+      terms = LLVMBuildBitCast(builder, terms, ctx->ac.v2f32, "");
      clip_half_line_width[0] = ac_llvm_extract_elem(&ctx->ac, terms, 0);
      clip_half_line_width[1] = ac_llvm_extract_elem(&ctx->ac, terms, 1);
-      small_prim_precision = ac_llvm_extract_elem(&ctx->ac, terms, 2);
+      small_prim_precision = GET_FIELD(ctx, GS_STATE_SMALL_PRIM_PRECISION_NO_AA);

      options.num_vertices = 2;
      options.cull_small_prims = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
@@ -974,11 +976,6 @@ static void cull_primitive(struct si_shader_context *ctx,
   } else {
      /* Get the small prim filter precision. */
      small_prim_precision = GET_FIELD(ctx, GS_STATE_SMALL_PRIM_PRECISION);
-      small_prim_precision =
-         LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
-      small_prim_precision =
-         LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
-      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");

      options.num_vertices = 3;
      options.cull_front = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
@@ -987,6 +984,13 @@ static void cull_primitive(struct si_shader_context *ctx,
      options.cull_zero_area = options.cull_front || options.cull_back;
   }

+   /* Extract the small prim precision. */
+   small_prim_precision =
+      LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
+   small_prim_precision =
+      LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
+   small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
+
   /* Tell ES threads whether their vertex survived. */
   LLVMValueRef params[] = {
      out_prim_accepted,
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -913,8 +913,8 @@ struct si_small_prim_cull_info {
   float scale[2], translate[2];
   float scale_no_aa[2], translate_no_aa[2];
   float clip_half_line_width[2];      /* line_width * 0.5 in clip space in X and Y directions */
-   float small_prim_precision_no_aa;   /* same as the small prim precision, but ignores MSAA */
   /* The above fields are uploaded to memory. The below fields are passed via user SGPRs. */
+   float small_prim_precision_no_aa;   /* same as the small prim precision, but ignores MSAA */
   float small_prim_precision;
 };

--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -272,6 +272,8 @@ enum
 * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
 /* bit gap */
+#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 18
+#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK  0xf
 #define GS_STATE_SMALL_PRIM_PRECISION__SHIFT    22
 #define GS_STATE_SMALL_PRIM_PRECISION__MASK     0xf
 #define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 26
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -141,6 +141,8 @@ static void si_emit_cull_state(struct si_context *sctx)
    *
    * So pass only the first 4 bits of the float exponent to the shader.
    */
+   SET_FIELD(sctx->current_gs_state, GS_STATE_SMALL_PRIM_PRECISION_NO_AA,
+             (fui(info.small_prim_precision_no_aa) >> 23) & 0xf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_SMALL_PRIM_PRECISION,
             (fui(info.small_prim_precision) >> 23) & 0xf);
 }