From 9b5b5cbc53430d39993db5fb3bcd9f99ed006f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 12 Nov 2020 22:07:56 -0500 Subject: [PATCH] radeonsi: adjust tess SGPRs to allow fully occupied 3 HS waves of triangles With triangles and 3 HS waves, 3 lanes were unoccupied. Adjust the SGPR encoding to allow 1 more triangle to fit there. Some of the fields are not large enough, but they weren't large enough before either. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_shader_internal.h | 18 +++++++++--------- .../drivers/radeonsi/si_shader_llvm_tess.c | 6 ++++-- src/gallium/drivers/radeonsi/si_state_draw.c | 7 +++++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 8649a78db5c..6722e581415 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -117,25 +117,25 @@ struct si_shader_context { /* API TCS & TES */ /* Layout of TCS outputs in the offchip buffer * # 6 bits - * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40) - * # 6 bits - * [6:11] = the number of output vertices per patch, max = 32 - * # 20 bits - * [12:31] = the offset of per patch attributes in the buffer in bytes. - * max = NUM_PATCHES*32*32*16 + * [0:5] = the number of patches per threadgroup - 1, max = 63 + * # 5 bits + * [6:10] = the number of output vertices per patch - 1, max = 31 + * # 21 bits + * [11:31] = the offset of per patch attributes in the buffer in bytes. 
+ * max = NUM_PATCHES*32*32*16 = 1M */ struct ac_arg tcs_offchip_layout; /* API TCS */ /* Offsets where TCS outputs and TCS patch outputs live in LDS: - * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 + * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 = 64K (TODO: not enough bits) * [16:31] = TCS output patch0 offset for per-patch / 16 - * max = (NUM_PATCHES + 1) * 32*32 + * max = (NUM_PATCHES + 1) * 32*32 = 66560 (TODO: not enough bits) */ struct ac_arg tcs_out_lds_offsets; /* Layout of TCS outputs / TES inputs: * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4 - * max = 32*32*4 + 32*4 + * max = 32*32*4 + 32*4 = 4224 * [13:18] = gl_PatchVerticesIn, max = 32 * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 97e5db45955..1ee6ffd547e 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -145,7 +145,8 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) if (ctx->stage == MESA_SHADER_TESS_CTRL && tcs_out_vertices) return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0); - return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6); + return LLVMBuildAdd(ctx->ac.builder, + si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 5), ctx->ac.i32_1, ""); } static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) @@ -220,6 +221,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, vertices_per_patch = get_num_tcs_out_vertices(ctx); num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6); + num_patches = LLVMBuildAdd(ctx->ac.builder, num_patches, ctx->ac.i32_1, ""); total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, ""); constant16 = LLVMConstInt(ctx->ac.i32, 16, 0); @@ -235,7 
+237,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); if (!vertex_index) { - LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20); + LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 11, 21); base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, ""); } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 4c739eeef2d..ef5eae2cc47 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -161,7 +161,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip * The hardware can do more, but the radeonsi shader constant is * limited to 6 bits. */ - *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */ + *num_patches = MIN2(*num_patches, 64); /* triangles: 3 full waves */ /* When distributed tessellation is unsupported, switch between SEs * at a higher frequency to compensate for it. @@ -214,6 +214,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip assert(((perpatch_output_offset / 16) & ~0xffff) == 0); assert(num_tcs_input_cp <= 32); assert(num_tcs_output_cp <= 32); + assert(*num_patches <= 64); + assert(((pervertex_output_patch_size * *num_patches) & ~0x1fffff) == 0); uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(sctx->gfx_cs)) ? 
si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address; @@ -224,7 +226,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va; tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16); offchip_layout = - *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12); + (*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | + ((pervertex_output_patch_size * *num_patches) << 11); /* Compute the LDS size. */ lds_size = output_patch0_offset + output_patch_size * *num_patches;