ac/llvm: skip s_barrier if tess patches don't cross a wave boundary
If tess patches are wholly in one wave, no synchronization across waves is needed, so "s_waitcnt lgkm(0)" is sufficient and s_barrier can be skipped.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16304>
This commit is contained in:
@@ -3928,6 +3928,11 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_control_barrier:
|
case nir_intrinsic_control_barrier:
|
||||||
|
/* If output patches are wholly in one wave, we don't need a barrier. */
|
||||||
|
if (ctx->stage == MESA_SHADER_TESS_CTRL &&
|
||||||
|
ctx->ac.wave_size % ctx->info->tess.tcs_vertices_out == 0)
|
||||||
|
break;
|
||||||
|
|
||||||
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_shared_atomic_add:
|
case nir_intrinsic_shared_atomic_add:
|
||||||
|
@@ -1971,6 +1971,14 @@ void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *k
|
|||||||
memset(key, 0, sizeof(*key));
|
memset(key, 0, sizeof(*key));
|
||||||
key->tcs_epilog.wave32 = shader->wave_size == 32;
|
key->tcs_epilog.wave32 = shader->wave_size == 32;
|
||||||
key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;
|
key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;
|
||||||
|
|
||||||
|
/* If output patches are wholly in one wave, we don't need a barrier.
|
||||||
|
* The fixed-func TCS doesn't set tcs_vertices_out, but it won't use a barrier
|
||||||
|
* anyway because tess levels are always defined in all invocations there.
|
||||||
|
*/
|
||||||
|
key->tcs_epilog.noop_s_barrier =
|
||||||
|
shader->selector->info.base.tess.tcs_vertices_out &&
|
||||||
|
shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@@ -601,6 +601,7 @@ union si_shader_part_key {
|
|||||||
struct {
|
struct {
|
||||||
struct si_tcs_epilog_bits states;
|
struct si_tcs_epilog_bits states;
|
||||||
unsigned wave32 : 1;
|
unsigned wave32 : 1;
|
||||||
|
unsigned noop_s_barrier : 1;
|
||||||
} tcs_epilog;
|
} tcs_epilog;
|
||||||
struct {
|
struct {
|
||||||
struct si_ps_prolog_bits states;
|
struct si_ps_prolog_bits states;
|
||||||
|
@@ -1017,6 +1017,16 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||||||
shader->selector->info.base.inputs_read &
|
shader->selector->info.base.inputs_read &
|
||||||
~shader->selector->info.tcs_vgpr_only_inputs) {
|
~shader->selector->info.tcs_vgpr_only_inputs) {
|
||||||
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
|
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
|
||||||
|
|
||||||
|
/* If both input and output patches are wholly in one wave, we don't need a barrier.
|
||||||
|
* That's true when both VS and TCS have the same number of patch vertices and
|
||||||
|
* the wave size is a multiple of the number of patch vertices.
|
||||||
|
*
|
||||||
|
* The fixed-func TCS doesn't set tcs_vertices_out.
|
||||||
|
*/
|
||||||
|
if (!shader->key.ge.opt.same_patch_vertices ||
|
||||||
|
(sel->info.base.tess.tcs_vertices_out &&
|
||||||
|
ctx->ac.wave_size % sel->info.base.tess.tcs_vertices_out != 0))
|
||||||
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
||||||
}
|
}
|
||||||
} else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
|
} else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
|
||||||
|
@@ -671,8 +671,8 @@ static void si_copy_tcs_inputs(struct si_shader_context *ctx)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
|
static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader_part_key *key,
|
||||||
LLVMValueRef invocation_id,
|
LLVMValueRef rel_patch_id, LLVMValueRef invocation_id,
|
||||||
LLVMValueRef tcs_out_current_patch_data_offset,
|
LLVMValueRef tcs_out_current_patch_data_offset,
|
||||||
LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
|
LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
|
||||||
{
|
{
|
||||||
@@ -685,6 +685,8 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re
|
|||||||
/* Add a barrier before loading tess factors from LDS. */
|
/* Add a barrier before loading tess factors from LDS. */
|
||||||
if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) {
|
if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) {
|
||||||
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
|
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
|
||||||
|
|
||||||
|
if (!key->tcs_epilog.noop_s_barrier)
|
||||||
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1075,7 +1077,7 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par
|
|||||||
for (unsigned i = 0; i < 6; i++)
|
for (unsigned i = 0; i < 6; i++)
|
||||||
invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
|
invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
|
||||||
|
|
||||||
si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
|
si_write_tess_factors(ctx, key, ac_get_arg(&ctx->ac, rel_patch_id),
|
||||||
ac_get_arg(&ctx->ac, invocation_id),
|
ac_get_arg(&ctx->ac, invocation_id),
|
||||||
ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
|
ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
|
||||||
invoc0_tess_factors, invoc0_tess_factors + 4);
|
invoc0_tess_factors, invoc0_tess_factors + 4);
|
||||||
|
Reference in New Issue
Block a user