diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 3486595dcbe..6e7cf49f337 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -797,6 +797,7 @@ add_branch_code(exec_ctx& ctx, Block* block) assert(block->linear_succs.size() == 2); assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z); Temp cond = block->instructions.back()->operands[0].getTemp(); + nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control; block->instructions.pop_back(); uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); @@ -812,22 +813,25 @@ add_branch_code(exec_ctx& ctx, Block* block) /* add next current exec to the stack */ ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type); - bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), - block->linear_succs[1], block->linear_succs[0]); + Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); + r.instr->branch().selection_control = sel_ctrl; return; } if (block->kind & block_kind_invert) { // exec = s_andn2_b64 (original_exec, exec) assert(block->instructions.back()->opcode == aco_opcode::p_branch); + nir_selection_control sel_ctrl = block->instructions.back()->branch().selection_control; block->instructions.pop_back(); assert(ctx.info[idx].exec.size() >= 2); Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first; bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm)); - bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), - block->linear_succs[1], block->linear_succs[0]); + Builder::Result r = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); + r.instr->branch().selection_control = sel_ctrl; return; } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 119458e9ca7..e48f128cc03 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -10388,7 +10388,8 @@ visit_loop(isel_context* ctx, nir_loop* loop) } static void -begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) +begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond, + nir_selection_control sel_ctrl = nir_selection_control_none) { ic->cond = cond; @@ -10402,6 +10403,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->operands[0] = Operand(cond); + branch->selection_control = sel_ctrl; ctx->block->instructions.push_back(std::move(branch)); ic->BB_if_idx = ctx->block->index; @@ -10432,7 +10434,8 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) } static void -begin_divergent_if_else(isel_context* ctx, if_context* ic) +begin_divergent_if_else(isel_context* ctx, if_context* ic, + nir_selection_control sel_ctrl = nir_selection_control_none) { Block* BB_then_logical = ctx->block; append_logical_end(BB_then_logical); @@ -10470,6 +10473,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic) branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); + branch->selection_control = sel_ctrl; ctx->block->instructions.push_back(std::move(branch)); ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; @@ -10700,10 +10704,10 @@ visit_if(isel_context* ctx, nir_if* if_stmt) * *) Exceptions may be due to break and continue statements within loops **/ - begin_divergent_if_then(ctx, &ic, cond); + begin_divergent_if_then(ctx, &ic, cond, if_stmt->control); visit_cf_list(ctx, &if_stmt->then_list); - begin_divergent_if_else(ctx, &ic); + begin_divergent_if_else(ctx, &ic, if_stmt->control); visit_cf_list(ctx, &if_stmt->else_list); end_divergent_if(ctx, &ic); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 6483c2d60c1..df744c83a3b 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1726,8 +1726,9 @@ struct Pseudo_branch_instruction : public Instruction { * A value of 0 means the target has not been initialized (BB0 cannot be a branch target). */ uint32_t target[2]; + nir_selection_control selection_control; }; -static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); +static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding"); struct Pseudo_barrier_instruction : public Instruction { memory_sync_info sync; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 995be1499c7..f3a8296ebfa 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2388,7 +2388,15 @@ lower_to_hw_instr(Program* program) /* Check if the branch instruction can be removed. * This is beneficial when executing the next block with an empty exec mask * is faster than the branch instruction itself. + * + * Override this judgement when: + * - The application prefers to remove control flow + * - The compiler stack knows that it's a divergent branch always taken */ + const bool prefer_remove = + (branch->selection_control == nir_selection_control_flatten || + branch->selection_control == nir_selection_control_divergent_always_taken) && + ctx.program->gfx_level >= GFX10; bool can_remove = block->index < target; unsigned num_scalar = 0; unsigned num_vector = 0; @@ -2427,29 +2435,36 @@ lower_to_hw_instr(Program* program) num_scalar++; } } - } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || - inst->isEXP() || inst->isLDSDIR()) { - // TODO: GFX6-9 can use vskip + } else if (inst->isEXP()) { + /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */ can_remove = false; + } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || + inst->isLDSDIR()) { + // TODO: GFX6-9 can use vskip + can_remove = prefer_remove; } else if (inst->isSMEM()) { /* SMEM are at least as expensive as branches */ - can_remove = false; + can_remove = prefer_remove; } else if (inst->isBarrier()) { - can_remove = false; + can_remove = prefer_remove; } else { can_remove = false; assert(false && "Pseudo instructions should be lowered by this point."); } - /* Under these conditions, we shouldn't remove the branch */ - unsigned est_cycles; - if (ctx.program->gfx_level >= GFX10) - est_cycles = num_scalar * 2 + num_vector; - else - est_cycles = num_scalar * 4 + num_vector * 4; + if (!prefer_remove) { + /* Under these conditions, we shouldn't remove the branch. + * Don't care about the estimated cycles when the shader prefers flattening. + */ + unsigned est_cycles; + if (ctx.program->gfx_level >= GFX10) + est_cycles = num_scalar * 2 + num_vector; + else + est_cycles = num_scalar * 4 + num_vector * 4; - if (est_cycles > 16) - can_remove = false; + if (est_cycles > 16) + can_remove = false; + } if (!can_remove) break;