aco: Allow explicitly removing jumps on GFX10+ when beneficial.
"Removing jumps" in ACO means skipping the jump instruction at the beginning of a divergent branch (but still modify exec). ACO already supports implicitly removing jumps when it decides that executing a branch with empty exec mask is more beneficial than a jump. This commit adds the possibility to use this explicitly through nir_selection_control. ACO will respect this setting and remove the branch instructions when this is specified, unless it decides that this would cause bugs (eg. exp instruction). There are two cases that benefit from the new change: 1. When the application requests to "flatten" a branch (ie. remove control flow), we now respect that. 2. When the compiler stack determines that a divergent branch is always taken. v2 by Georg Lehmann: fixed applying sel_ctrl to else blocks Fossil DB stats on Navi 21: Totals from 13 (0.01% of 134906) affected shaders: CodeSize: 136616 -> 136496 (-0.09%) Instrs: 26196 -> 26166 (-0.11%) Latency: 417928 -> 417889 (-0.01%) Branches: 1241 -> 1211 (-2.42%) Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Reviewed-By: Georg Lehmann <dadschoorse@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17921>
This commit is contained in:
@@ -797,6 +797,7 @@ add_branch_code(exec_ctx& ctx, Block* block)
|
||||
assert(block->linear_succs.size() == 2);
|
||||
assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
|
||||
Temp cond = block->instructions.back()->operands[0].getTemp();
|
||||
nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control;
|
||||
block->instructions.pop_back();
|
||||
|
||||
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
|
||||
@@ -812,22 +813,25 @@ add_branch_code(exec_ctx& ctx, Block* block)
|
||||
/* add next current exec to the stack */
|
||||
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
r.instr->branch().selection_control = sel_ctrl;
|
||||
return;
|
||||
}
|
||||
|
||||
if (block->kind & block_kind_invert) {
|
||||
// exec = s_andn2_b64 (original_exec, exec)
|
||||
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
|
||||
nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control;
|
||||
block->instructions.pop_back();
|
||||
assert(ctx.info[idx].exec.size() >= 2);
|
||||
Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
|
||||
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
|
||||
Operand(exec, bld.lm));
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
r.instr->branch().selection_control = sel_ctrl;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@@ -10388,7 +10388,8 @@ visit_loop(isel_context* ctx, nir_loop* loop)
|
||||
}
|
||||
|
||||
static void
|
||||
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
|
||||
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
|
||||
nir_selection_control sel_ctrl = nir_selection_control_none)
|
||||
{
|
||||
ic->cond = cond;
|
||||
|
||||
@@ -10402,6 +10403,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
|
||||
Format::PSEUDO_BRANCH, 1, 1));
|
||||
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
|
||||
branch->operands[0] = Operand(cond);
|
||||
branch->selection_control = sel_ctrl;
|
||||
ctx->block->instructions.push_back(std::move(branch));
|
||||
|
||||
ic->BB_if_idx = ctx->block->index;
|
||||
@@ -10432,7 +10434,8 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
|
||||
}
|
||||
|
||||
static void
|
||||
begin_divergent_if_else(isel_context* ctx, if_context* ic)
|
||||
begin_divergent_if_else(isel_context* ctx, if_context* ic,
|
||||
nir_selection_control sel_ctrl = nir_selection_control_none)
|
||||
{
|
||||
Block* BB_then_logical = ctx->block;
|
||||
append_logical_end(BB_then_logical);
|
||||
@@ -10470,6 +10473,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic)
|
||||
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
|
||||
Format::PSEUDO_BRANCH, 0, 1));
|
||||
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
|
||||
branch->selection_control = sel_ctrl;
|
||||
ctx->block->instructions.push_back(std::move(branch));
|
||||
|
||||
ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
|
||||
@@ -10700,10 +10704,10 @@ visit_if(isel_context* ctx, nir_if* if_stmt)
|
||||
* *) Exceptions may be due to break and continue statements within loops
|
||||
**/
|
||||
|
||||
begin_divergent_if_then(ctx, &ic, cond);
|
||||
begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
|
||||
visit_cf_list(ctx, &if_stmt->then_list);
|
||||
|
||||
begin_divergent_if_else(ctx, &ic);
|
||||
begin_divergent_if_else(ctx, &ic, if_stmt->control);
|
||||
visit_cf_list(ctx, &if_stmt->else_list);
|
||||
|
||||
end_divergent_if(ctx, &ic);
|
||||
|
@@ -1726,8 +1726,9 @@ struct Pseudo_branch_instruction : public Instruction {
|
||||
* A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
|
||||
*/
|
||||
uint32_t target[2];
|
||||
nir_selection_control selection_control;
|
||||
};
|
||||
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
|
||||
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
|
||||
|
||||
struct Pseudo_barrier_instruction : public Instruction {
|
||||
memory_sync_info sync;
|
||||
|
@@ -2388,7 +2388,15 @@ lower_to_hw_instr(Program* program)
|
||||
/* Check if the branch instruction can be removed.
|
||||
* This is beneficial when executing the next block with an empty exec mask
|
||||
* is faster than the branch instruction itself.
|
||||
*
|
||||
* Override this judgement when:
|
||||
* - The application prefers to remove control flow
|
||||
* - The compiler stack knows that it's a divergent branch always taken
|
||||
*/
|
||||
const bool prefer_remove =
|
||||
(branch->selection_control == nir_selection_control_flatten ||
|
||||
branch->selection_control == nir_selection_control_divergent_always_taken) &&
|
||||
ctx.program->gfx_level >= GFX10;
|
||||
bool can_remove = block->index < target;
|
||||
unsigned num_scalar = 0;
|
||||
unsigned num_vector = 0;
|
||||
@@ -2427,29 +2435,36 @@ lower_to_hw_instr(Program* program)
|
||||
num_scalar++;
|
||||
}
|
||||
}
|
||||
} else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
|
||||
inst->isEXP() || inst->isLDSDIR()) {
|
||||
// TODO: GFX6-9 can use vskip
|
||||
} else if (inst->isEXP()) {
|
||||
/* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */
|
||||
can_remove = false;
|
||||
} else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
|
||||
inst->isLDSDIR()) {
|
||||
// TODO: GFX6-9 can use vskip
|
||||
can_remove = prefer_remove;
|
||||
} else if (inst->isSMEM()) {
|
||||
/* SMEM are at least as expensive as branches */
|
||||
can_remove = false;
|
||||
can_remove = prefer_remove;
|
||||
} else if (inst->isBarrier()) {
|
||||
can_remove = false;
|
||||
can_remove = prefer_remove;
|
||||
} else {
|
||||
can_remove = false;
|
||||
assert(false && "Pseudo instructions should be lowered by this point.");
|
||||
}
|
||||
|
||||
/* Under these conditions, we shouldn't remove the branch */
|
||||
unsigned est_cycles;
|
||||
if (ctx.program->gfx_level >= GFX10)
|
||||
est_cycles = num_scalar * 2 + num_vector;
|
||||
else
|
||||
est_cycles = num_scalar * 4 + num_vector * 4;
|
||||
if (!prefer_remove) {
|
||||
/* Under these conditions, we shouldn't remove the branch.
|
||||
* Don't care about the estimated cycles when the shader prefers flattening.
|
||||
*/
|
||||
unsigned est_cycles;
|
||||
if (ctx.program->gfx_level >= GFX10)
|
||||
est_cycles = num_scalar * 2 + num_vector;
|
||||
else
|
||||
est_cycles = num_scalar * 4 + num_vector * 4;
|
||||
|
||||
if (est_cycles > 16)
|
||||
can_remove = false;
|
||||
if (est_cycles > 16)
|
||||
can_remove = false;
|
||||
}
|
||||
|
||||
if (!can_remove)
|
||||
break;
|
||||
|
Reference in New Issue
Block a user