From fa46f3d40e7784f3d637eedd0efa7ff7e827bdf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 28 Jul 2022 17:18:04 -0400 Subject: [PATCH] radeonsi: move the no-AA small prim precision cull constant into an SGPR This reduces the scalar load from vec4 to vec2. Reviewed-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Qiang Yu Part-of: --- .../drivers/radeonsi/gfx10_shader_ngg.c | 20 +++++++++++-------- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_shader.h | 2 ++ .../drivers/radeonsi/si_state_viewport.c | 2 ++ 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index f0ddb30f800..95deba0355c 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -960,11 +960,13 @@ static void cull_primitive(struct si_shader_context *ctx, options.cull_w = true; if (prim_is_lines) { - LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 2, 0)); - terms = LLVMBuildBitCast(builder, terms, ctx->ac.v4f32, ""); + ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr, + LLVMPointerType(ctx->ac.v2f32, AC_ADDR_SPACE_CONST_32BIT), ""); + LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 4, 0)); + terms = LLVMBuildBitCast(builder, terms, ctx->ac.v2f32, ""); clip_half_line_width[0] = ac_llvm_extract_elem(&ctx->ac, terms, 0); clip_half_line_width[1] = ac_llvm_extract_elem(&ctx->ac, terms, 1); - small_prim_precision = ac_llvm_extract_elem(&ctx->ac, terms, 2); + small_prim_precision = GET_FIELD(ctx, GS_STATE_SMALL_PRIM_PRECISION_NO_AA); options.num_vertices = 2; options.cull_small_prims = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT; @@ -974,11 +976,6 @@ static void cull_primitive(struct si_shader_context *ctx, } else { /* Get the small prim filter precision. */ small_prim_precision = GET_FIELD(ctx, GS_STATE_SMALL_PRIM_PRECISION); - small_prim_precision = - LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); - small_prim_precision = - LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), ""); - small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); options.num_vertices = 3; options.cull_front = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; @@ -987,6 +984,13 @@ static void cull_primitive(struct si_shader_context *ctx, options.cull_zero_area = options.cull_front || options.cull_back; } + /* Extract the small prim precision. */ + small_prim_precision = + LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); + small_prim_precision = + LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), ""); + small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); + /* Tell ES threads whether their vertex survived. */ LLVMValueRef params[] = { out_prim_accepted, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a0aca0632e2..1515629699c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -913,8 +913,8 @@ struct si_small_prim_cull_info { float scale[2], translate[2]; float scale_no_aa[2], translate_no_aa[2]; float clip_half_line_width[2]; /* line_width * 0.5 in clip space in X and Y directions */ - float small_prim_precision_no_aa; /* same as the small prim precision, but ignores MSAA */ /* The above fields are uploaded to memory. The below fields are passed via user SGPRs. */ + float small_prim_precision_no_aa; /* same as the small prim precision, but ignores MSAA */ float small_prim_precision; }; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index dff39ae11f6..f2f63fc8312 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -272,6 +272,8 @@ enum * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader. */ /* bit gap */ +#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 18 +#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK 0xf #define GS_STATE_SMALL_PRIM_PRECISION__SHIFT 22 #define GS_STATE_SMALL_PRIM_PRECISION__MASK 0xf #define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 26 diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 1e8444663f3..0ea417ea572 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -141,6 +141,8 @@ static void si_emit_cull_state(struct si_context *sctx) * * So pass only the first 4 bits of the float exponent to the shader. */ + SET_FIELD(sctx->current_gs_state, GS_STATE_SMALL_PRIM_PRECISION_NO_AA, + (fui(info.small_prim_precision_no_aa) >> 23) & 0xf); SET_FIELD(sctx->current_gs_state, GS_STATE_SMALL_PRIM_PRECISION, (fui(info.small_prim_precision) >> 23) & 0xf); }