aco: Allow explicitly removing jumps on GFX10+ when beneficial.

"Removing jumps" in ACO means skipping the jump instruction
at the beginning of a divergent branch (but still modify exec).

ACO already supports implicitly removing jumps when it decides
that executing a branch with empty exec mask is more beneficial
than a jump.

This commit adds the possibility to use this explicitly
through nir_selection_control. ACO will respect this
setting and remove the branch instructions when this is specified,
unless it decides that this would cause bugs (eg. exp instruction).

There are two cases that benefit from the new change:

1. When the application requests to "flatten" a branch (ie.
remove control flow), we now respect that.
2. When the compiler stack determines that a divergent branch
is always taken.

v2 by Georg Lehmann: fixed applying sel_ctrl to else blocks

Fossil DB stats on Navi 21:

Totals from 13 (0.01% of 134906) affected shaders:
CodeSize: 136616 -> 136496 (-0.09%)
Instrs: 26196 -> 26166 (-0.11%)
Latency: 417928 -> 417889 (-0.01%)
Branches: 1241 -> 1211 (-2.42%)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-By: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17921>
This commit is contained in:
Timur Kristóf
2022-02-23 10:13:54 +01:00
committed by Marge Bot
parent 00a8be3414
commit d8639b7a80
4 changed files with 46 additions and 22 deletions

View File

@@ -797,6 +797,7 @@ add_branch_code(exec_ctx& ctx, Block* block)
assert(block->linear_succs.size() == 2);
assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
Temp cond = block->instructions.back()->operands[0].getTemp();
nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control;
block->instructions.pop_back();
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
@@ -812,22 +813,25 @@ add_branch_code(exec_ctx& ctx, Block* block)
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
r.instr->branch().selection_control = sel_ctrl;
return;
}
if (block->kind & block_kind_invert) {
// exec = s_andn2_b64 (original_exec, exec)
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control;
block->instructions.pop_back();
assert(ctx.info[idx].exec.size() >= 2);
Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
Operand(exec, bld.lm));
bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
r.instr->branch().selection_control = sel_ctrl;
return;
}

View File

@@ -10388,7 +10388,8 @@ visit_loop(isel_context* ctx, nir_loop* loop)
}
static void
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
nir_selection_control sel_ctrl = nir_selection_control_none)
{
ic->cond = cond;
@@ -10402,6 +10403,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
Format::PSEUDO_BRANCH, 1, 1));
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
branch->operands[0] = Operand(cond);
branch->selection_control = sel_ctrl;
ctx->block->instructions.push_back(std::move(branch));
ic->BB_if_idx = ctx->block->index;
@@ -10432,7 +10434,8 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
}
static void
begin_divergent_if_else(isel_context* ctx, if_context* ic)
begin_divergent_if_else(isel_context* ctx, if_context* ic,
nir_selection_control sel_ctrl = nir_selection_control_none)
{
Block* BB_then_logical = ctx->block;
append_logical_end(BB_then_logical);
@@ -10470,6 +10473,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic)
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
Format::PSEUDO_BRANCH, 0, 1));
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
branch->selection_control = sel_ctrl;
ctx->block->instructions.push_back(std::move(branch));
ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
@@ -10700,10 +10704,10 @@ visit_if(isel_context* ctx, nir_if* if_stmt)
* *) Exceptions may be due to break and continue statements within loops
**/
begin_divergent_if_then(ctx, &ic, cond);
begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
visit_cf_list(ctx, &if_stmt->then_list);
begin_divergent_if_else(ctx, &ic);
begin_divergent_if_else(ctx, &ic, if_stmt->control);
visit_cf_list(ctx, &if_stmt->else_list);
end_divergent_if(ctx, &ic);

View File

@@ -1726,8 +1726,9 @@ struct Pseudo_branch_instruction : public Instruction {
* A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
*/
uint32_t target[2];
nir_selection_control selection_control;
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
struct Pseudo_barrier_instruction : public Instruction {
memory_sync_info sync;

View File

@@ -2388,7 +2388,15 @@ lower_to_hw_instr(Program* program)
/* Check if the branch instruction can be removed.
* This is beneficial when executing the next block with an empty exec mask
* is faster than the branch instruction itself.
*
* Override this judgement when:
* - The application prefers to remove control flow
* - The compiler stack knows that it's a divergent branch always taken
*/
const bool prefer_remove =
(branch->selection_control == nir_selection_control_flatten ||
branch->selection_control == nir_selection_control_divergent_always_taken) &&
ctx.program->gfx_level >= GFX10;
bool can_remove = block->index < target;
unsigned num_scalar = 0;
unsigned num_vector = 0;
@@ -2427,29 +2435,36 @@ lower_to_hw_instr(Program* program)
num_scalar++;
}
}
} else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
inst->isEXP() || inst->isLDSDIR()) {
// TODO: GFX6-9 can use vskip
} else if (inst->isEXP()) {
/* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */
can_remove = false;
} else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
inst->isLDSDIR()) {
// TODO: GFX6-9 can use vskip
can_remove = prefer_remove;
} else if (inst->isSMEM()) {
/* SMEM are at least as expensive as branches */
can_remove = false;
can_remove = prefer_remove;
} else if (inst->isBarrier()) {
can_remove = false;
can_remove = prefer_remove;
} else {
can_remove = false;
assert(false && "Pseudo instructions should be lowered by this point.");
}
/* Under these conditions, we shouldn't remove the branch */
unsigned est_cycles;
if (ctx.program->gfx_level >= GFX10)
est_cycles = num_scalar * 2 + num_vector;
else
est_cycles = num_scalar * 4 + num_vector * 4;
if (!prefer_remove) {
/* Under these conditions, we shouldn't remove the branch.
* Don't care about the estimated cycles when the shader prefers flattening.
*/
unsigned est_cycles;
if (ctx.program->gfx_level >= GFX10)
est_cycles = num_scalar * 2 + num_vector;
else
est_cycles = num_scalar * 4 + num_vector * 4;
if (est_cycles > 16)
can_remove = false;
if (est_cycles > 16)
can_remove = false;
}
if (!can_remove)
break;