From d8639b7a801808d1d09dc27c0fb12d1f5e08d87c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Wed, 23 Feb 2022 10:13:54 +0100 Subject: [PATCH] aco: Allow explicitly removing jumps on GFX10+ when beneficial. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "Removing jumps" in ACO means skipping the jump instruction at the beginning of a divergent branch (but still modify exec). ACO already supports implicitly removing jumps when it decides that executing a branch with empty exec mask is more beneficial than a jump. This commit adds the possibility to use this explicitly through nir_selection_control. ACO will respect this setting and remove the branch instructions when this is specified, unless it decides that this would cause bugs (eg. exp instruction). There are two cases that benefit from the new change: 1. When the application requests to "flatten" a branch (ie. remove control flow), we now respect that. 2. When the compiler stack determines that a divergent branch is always taken. v2 by Georg Lehmann: fixed applying sel_ctrl to else blocks Fossil DB stats on Navi 21: Totals from 13 (0.01% of 134906) affected shaders: CodeSize: 136616 -> 136496 (-0.09%) Instrs: 26196 -> 26166 (-0.11%) Latency: 417928 -> 417889 (-0.01%) Branches: 1241 -> 1211 (-2.42%) Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Reviewed-By: Georg Lehmann Part-of: --- src/amd/compiler/aco_insert_exec_mask.cpp | 12 ++++-- .../compiler/aco_instruction_selection.cpp | 12 ++++-- src/amd/compiler/aco_ir.h | 3 +- src/amd/compiler/aco_lower_to_hw_instr.cpp | 41 +++++++++++++------ 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 3486595dcbe..6e7cf49f337 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -797,6 +797,7 @@ add_branch_code(exec_ctx& ctx, Block* block) assert(block->linear_succs.size() == 2); assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z); Temp cond = block->instructions.back()->operands[0].getTemp(); + nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control; block->instructions.pop_back(); uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); @@ -812,22 +813,25 @@ add_branch_code(exec_ctx& ctx, Block* block) /* add next current exec to the stack */ ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type); - bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), - block->linear_succs[1], block->linear_succs[0]); + Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); + r.instr->branch().selection_control = sel_ctrl; return; } if (block->kind & block_kind_invert) { // exec = s_andn2_b64 (original_exec, exec) assert(block->instructions.back()->opcode == aco_opcode::p_branch); + nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control; block->instructions.pop_back(); assert(ctx.info[idx].exec.size() >= 2); Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first; bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm)); - bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), - block->linear_succs[1], block->linear_succs[0]); + Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); + r.instr->branch().selection_control = sel_ctrl; return; } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 119458e9ca7..e48f128cc03 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -10388,7 +10388,8 @@ visit_loop(isel_context* ctx, nir_loop* loop) } static void -begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) +begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond, + nir_selection_control sel_ctrl = nir_selection_control_none) { ic->cond = cond; @@ -10402,6 +10403,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->operands[0] = Operand(cond); + branch->selection_control = sel_ctrl; ctx->block->instructions.push_back(std::move(branch)); ic->BB_if_idx = ctx->block->index; @@ -10432,7 +10434,8 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) } static void -begin_divergent_if_else(isel_context* ctx, if_context* ic) +begin_divergent_if_else(isel_context* ctx, if_context* ic, + nir_selection_control sel_ctrl = nir_selection_control_none) { Block* BB_then_logical = ctx->block; append_logical_end(BB_then_logical); @@ -10470,6 +10473,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic) branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); + branch->selection_control = sel_ctrl; ctx->block->instructions.push_back(std::move(branch)); ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; @@ -10700,10 +10704,10 @@ visit_if(isel_context* ctx, nir_if* if_stmt) * *) Exceptions may be due to break and continue statements within loops **/ - begin_divergent_if_then(ctx, &ic, cond); + begin_divergent_if_then(ctx, &ic, cond, if_stmt->control); visit_cf_list(ctx, &if_stmt->then_list); - begin_divergent_if_else(ctx, &ic); + begin_divergent_if_else(ctx, &ic, if_stmt->control); visit_cf_list(ctx, &if_stmt->else_list); end_divergent_if(ctx, &ic); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 6483c2d60c1..df744c83a3b 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1726,8 +1726,9 @@ struct Pseudo_branch_instruction : public Instruction { * A value of 0 means the target has not been initialized (BB0 cannot be a branch target). */ uint32_t target[2]; + nir_selection_control selection_control; }; -static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); +static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding"); struct Pseudo_barrier_instruction : public Instruction { memory_sync_info sync; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 995be1499c7..f3a8296ebfa 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2388,7 +2388,15 @@ lower_to_hw_instr(Program* program) /* Check if the branch instruction can be removed. * This is beneficial when executing the next block with an empty exec mask * is faster than the branch instruction itself. + * + * Override this judgement when: + * - The application prefers to remove control flow + * - The compiler stack knows that it's a divergent branch always taken */ + const bool prefer_remove = + (branch->selection_control == nir_selection_control_flatten || + branch->selection_control == nir_selection_control_divergent_always_taken) && + ctx.program->gfx_level >= GFX10; bool can_remove = block->index < target; unsigned num_scalar = 0; unsigned num_vector = 0; @@ -2427,29 +2435,36 @@ lower_to_hw_instr(Program* program) num_scalar++; } } - } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || - inst->isEXP() || inst->isLDSDIR()) { - // TODO: GFX6-9 can use vskip + } else if (inst->isEXP()) { + /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */ can_remove = false; + } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || + inst->isLDSDIR()) { + // TODO: GFX6-9 can use vskip + can_remove = prefer_remove; } else if (inst->isSMEM()) { /* SMEM are at least as expensive as branches */ - can_remove = false; + can_remove = prefer_remove; } else if (inst->isBarrier()) { - can_remove = false; + can_remove = prefer_remove; } else { can_remove = false; assert(false && "Pseudo instructions should be lowered by this point."); } - /* Under these conditions, we shouldn't remove the branch */ - unsigned est_cycles; - if (ctx.program->gfx_level >= GFX10) - est_cycles = num_scalar * 2 + num_vector; - else - est_cycles = num_scalar * 4 + num_vector * 4; + if (!prefer_remove) { + /* Under these conditions, we shouldn't remove the branch. + * Don't care about the estimated cycles when the shader prefers flattening. + */ + unsigned est_cycles; + if (ctx.program->gfx_level >= GFX10) + est_cycles = num_scalar * 2 + num_vector; + else + est_cycles = num_scalar * 4 + num_vector * 4; - if (est_cycles > 16) - can_remove = false; + if (est_cycles > 16) + can_remove = false; + } if (!can_remove) break;