radeonsi: pass VS->TCS IO via VGPRs if VS and TCS have the same thread count
This optimization is only possible when a TCS input is accessed without indirect indexing and with gl_InvocationID as the vertex index, and when the number of VS and TCS threads is the same. It eliminates the LDS stores and loads for VS->TCS IO, reducing shader lifetime and LDS traffic.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7623>
This commit is contained in:
@@ -479,7 +479,21 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
|
|||||||
returns[num_returns++] = ctx->ac.i32; /* SGPRs */
|
returns[num_returns++] = ctx->ac.i32; /* SGPRs */
|
||||||
for (i = 0; i < 2; i++)
|
for (i = 0; i < 2; i++)
|
||||||
returns[num_returns++] = ctx->ac.f32; /* VGPRs */
|
returns[num_returns++] = ctx->ac.f32; /* VGPRs */
|
||||||
|
|
||||||
|
/* VS outputs passed via VGPRs to TCS. */
|
||||||
|
if (shader->key.opt.same_patch_vertices) {
|
||||||
|
unsigned num_outputs = util_last_bit64(shader->selector->outputs_written);
|
||||||
|
for (i = 0; i < num_outputs * 4; i++)
|
||||||
|
returns[num_returns++] = ctx->ac.f32; /* VGPRs */
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
/* TCS inputs are passed via VGPRs from VS. */
|
||||||
|
if (shader->key.opt.same_patch_vertices) {
|
||||||
|
unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written);
|
||||||
|
for (i = 0; i < num_inputs * 4; i++)
|
||||||
|
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/* TCS return values are inputs to the TCS epilog.
|
/* TCS return values are inputs to the TCS epilog.
|
||||||
*
|
*
|
||||||
* param_tcs_offchip_offset, param_tcs_factor_offset,
|
* param_tcs_offchip_offset, param_tcs_factor_offset,
|
||||||
@@ -1765,6 +1779,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
|
|||||||
parts[3] = ctx.main_fn;
|
parts[3] = ctx.main_fn;
|
||||||
|
|
||||||
/* VS as LS main part */
|
/* VS as LS main part */
|
||||||
|
ctx.next_shader_sel = ctx.shader->selector;
|
||||||
nir = get_nir_shader(ls, NULL, &free_nir);
|
nir = get_nir_shader(ls, NULL, &free_nir);
|
||||||
struct si_shader shader_ls = {};
|
struct si_shader shader_ls = {};
|
||||||
shader_ls.selector = ls;
|
shader_ls.selector = ls;
|
||||||
|
@@ -461,6 +461,7 @@ struct si_shader_selector {
|
|||||||
uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
|
uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
|
||||||
|
|
||||||
uint64_t inputs_read; /* "get_unique_index" bits */
|
uint64_t inputs_read; /* "get_unique_index" bits */
|
||||||
|
uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
|
||||||
|
|
||||||
/* bitmasks of used descriptor slots */
|
/* bitmasks of used descriptor slots */
|
||||||
uint64_t active_const_and_shader_buffers;
|
uint64_t active_const_and_shader_buffers;
|
||||||
|
@@ -46,6 +46,7 @@ struct si_shader_output_values {
|
|||||||
struct si_shader_context {
|
struct si_shader_context {
|
||||||
struct ac_llvm_context ac;
|
struct ac_llvm_context ac;
|
||||||
struct si_shader *shader;
|
struct si_shader *shader;
|
||||||
|
struct si_shader_selector *next_shader_sel;
|
||||||
struct si_screen *screen;
|
struct si_screen *screen;
|
||||||
|
|
||||||
gl_shader_stage stage;
|
gl_shader_stage stage;
|
||||||
|
@@ -395,6 +395,21 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMType
|
|||||||
semantic = info->output_semantic[driver_location];
|
semantic = info->output_semantic[driver_location];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Load the TCS input from a VGPR if possible. */
|
||||||
|
if (ctx->shader->key.opt.same_patch_vertices &&
|
||||||
|
load_input && vertex_index_is_invoc_id && !param_index) {
|
||||||
|
unsigned func_param = ctx->args.tcs_rel_ids.arg_index + 1 +
|
||||||
|
si_shader_io_get_unique_index(semantic, false) * 4;
|
||||||
|
LLVMValueRef value[4];
|
||||||
|
|
||||||
|
for (unsigned i = component; i < component + num_components; i++) {
|
||||||
|
value[i] = LLVMGetParam(ctx->main_fn, func_param + i);
|
||||||
|
value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
|
||||||
|
}
|
||||||
|
|
||||||
bool is_patch = vertex_index == NULL;
|
bool is_patch = vertex_index == NULL;
|
||||||
assert((semantic >= VARYING_SLOT_PATCH0 ||
|
assert((semantic >= VARYING_SLOT_PATCH0 ||
|
||||||
semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
|
semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
|
||||||
@@ -944,6 +959,7 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L
|
|||||||
LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
|
LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
|
||||||
LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
|
LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
|
||||||
LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, "");
|
LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, "");
|
||||||
|
unsigned ret_offset = 8 + GFX9_TCS_NUM_USER_SGPR + 2;
|
||||||
|
|
||||||
/* Write outputs to LDS. The next shader (TCS aka HS) will read
|
/* Write outputs to LDS. The next shader (TCS aka HS) will read
|
||||||
* its inputs from it. */
|
* its inputs from it. */
|
||||||
@@ -976,8 +992,16 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L
|
|||||||
if (!(info->output_usagemask[i] & (1 << chan)))
|
if (!(info->output_usagemask[i] & (1 << chan)))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
lshs_lds_store(ctx, chan, dw_addr,
|
LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
|
||||||
LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
|
|
||||||
|
if (!shader->key.opt.same_patch_vertices ||
|
||||||
|
!(ctx->next_shader_sel->tcs_vgpr_only_inputs & (1ull << semantic)))
|
||||||
|
lshs_lds_store(ctx, chan, dw_addr, value);
|
||||||
|
|
||||||
|
if (shader->key.opt.same_patch_vertices) {
|
||||||
|
ctx->return_value = LLVMBuildInsertValue(ctx->ac.builder, ctx->return_value,
|
||||||
|
value, ret_offset + param * 4 + chan, "");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -2756,6 +2756,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
|
|||||||
|
|
||||||
assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
|
assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
|
||||||
|
|
||||||
|
sel->tcs_vgpr_only_inputs = ~sel->info.base.tess.tcs_cross_invocation_inputs_read &
|
||||||
|
~sel->info.base.inputs_read_indirectly &
|
||||||
|
sel->info.base.inputs_read;
|
||||||
|
|
||||||
/* Only for TES: */
|
/* Only for TES: */
|
||||||
if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
|
if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
|
||||||
if (sel->info.base.tess.point_mode)
|
if (sel->info.base.tess.point_mode)
|
||||||
|
Reference in New Issue
Block a user