aco: always terminate quads if they have been demoted entirely

Previously, quads got only terminated in top-level control flow.
This patch makes the behavior consistent.

Totals from 7811 (9.86% of 79242) affected shaders: (GFX11)

Instrs: 7859667 -> 7850757 (-0.11%); split: -0.18%, +0.07%
CodeSize: 41642280 -> 41611836 (-0.07%); split: -0.13%, +0.06%
Latency: 73692815 -> 73707588 (+0.02%); split: -0.02%, +0.04%
InvThroughput: 10672160 -> 10672323 (+0.00%); split: -0.01%, +0.01%
VClause: 137478 -> 137469 (-0.01%); split: -0.02%, +0.02%
SClause: 314905 -> 314924 (+0.01%); split: -0.19%, +0.20%
Copies: 587014 -> 576039 (-1.87%); split: -2.10%, +0.23%
Branches: 213101 -> 213123 (+0.01%); split: -0.01%, +0.02%
PreSGPRs: 313588 -> 313355 (-0.07%); split: -0.09%, +0.01%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27112>
This commit is contained in:
Daniel Schürmann
2024-01-17 11:46:16 +01:00
committed by Marge Bot
parent a42b83e3fb
commit e89977ff71

View File

@@ -516,57 +516,60 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
instr->definitions[1] = bld.def(s1, scc);
}
} else if (instr->opcode == aco_opcode::p_demote_to_helper) {
/* turn demote into discard_if with only exact masks */
assert((info.exec[0].second & mask_type_exact) &&
(info.exec[0].second & mask_type_global));
int num;
Operand src;
Temp exit_cond;
if (instr->operands[0].isConstant() && !(block->kind & block_kind_top_level)) {
assert(instr->operands[0].constantValue() == -1u);
/* transition to exact and set exec to zero */
exit_cond = bld.tmp(s1);
src = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
num = info.exec.size() - 2;
if (!(info.exec.back().second & mask_type_exact)) {
info.exec.back().first = src;
info.exec.emplace_back(Operand(bld.lm), mask_type_exact);
}
} else {
/* demote_if: transition to exact */
if (block->kind & block_kind_top_level && info.exec.size() == 2 &&
info.exec.back().second & mask_type_global) {
/* We don't need to actually copy anything into exec, since the s_andn2
* instructions later will do that.
*/
info.exec.pop_back();
} else {
transition_to_Exact(ctx, bld, block->index);
}
src = instr->operands[0];
num = info.exec.size() - 1;
const bool nested_cf = !(info.exec.back().second & mask_type_global);
if (ctx.handle_wqm && state == Exact && nested_cf) {
/* Transition back to WQM without extra instruction. */
info.exec.pop_back();
state = WQM;
} else if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) {
/* Transition to Exact without extra instruction. */
info.exec.resize(1);
state = Exact;
} else if (nested_cf) {
/* Save curent exec temporarily. */
info.exec.back().first = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
}
for (int i = num; i >= 0; i--) {
if (info.exec[i].second & mask_type_exact) {
Instruction* andn2 =
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
get_exec_op(info.exec[i].first), src);
if (i == (int)info.exec.size() - 1)
andn2->definitions[0] = Definition(exec, bld.lm);
/* Remove invocations from global exact mask. */
Definition def = state == Exact ? Definition(exec, bld.lm) : bld.def(bld.lm);
Operand src = instr->operands[0].isConstant() ? Operand(exec, bld.lm) : instr->operands[0];
info.exec[i].first = Operand(andn2->definitions[0].getTemp());
exit_cond = andn2->definitions[1].getTemp();
} else {
assert(i != 0);
}
Definition exit_cond =
bld.sop2(Builder::s_andn2, def, bld.def(s1, scc), get_exec_op(info.exec[0].first), src)
.def(1);
info.exec[0].first = Operand(def.getTemp());
/* Update global WQM mask and store in exec. */
if (state == WQM) {
assert(info.exec.size() > 1);
exit_cond =
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), def.getTemp())
.def(1);
}
/* End shader if global mask is zero. */
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
instr->operands[0] = bld.scc(exit_cond.getTemp());
bld.insert(std::move(instr));
/* Update all other exec masks. */
if (nested_cf) {
const unsigned global_idx = state == WQM ? 1 : 0;
for (unsigned i = global_idx + 1; i < info.exec.size() - 1; i++) {
info.exec[i].first =
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
get_exec_op(info.exec[i].first), Operand(exec, bld.lm));
}
/* Update current exec and save WQM mask. */
info.exec[global_idx].first =
bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
Definition(exec, bld.lm), info.exec.back().first, Operand(exec, bld.lm));
info.exec.back().first = Operand(bld.lm);
}
continue;
} else if (instr->opcode == aco_opcode::p_elect) {
bool all_lanes_enabled = info.exec.back().first.constantEquals(-1u);