aco/ra: set Pseudo_instruction::scratch_sgpr to SCC if it doesn't need to be preserved

Also ensure that 'needs_scratch_reg' is always true if SCC might be overwritten.
Few changes, because some p_split_vector instructions now get SCC assigned as
their scratch register, which can inhibit some post-RA optimizations.

Totals from 3 (0.00% of 79395) affected shaders: (Navi31)
Instrs: 10501 -> 10500 (-0.01%); split: -0.02%, +0.01%
CodeSize: 51580 -> 51520 (-0.12%); split: -0.12%, +0.01%
Latency: 84166 -> 84174 (+0.01%)
InvThroughput: 13109 -> 13111 (+0.02%)
SALU: 859 -> 860 (+0.12%)

Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32217>
(cherry picked from commit a04e096339)
This commit is contained in:
Daniel Schürmann
2024-11-19 09:13:45 +01:00
committed by Dylan Baker
parent 09ad1fbdf2
commit b1f8e15781
4 changed files with 23 additions and 16 deletions

View File

@@ -1124,7 +1124,7 @@
"description": "aco/ra: set Pseudo_instruction::scratch_sgpr to SCC if it doesn't need to be preserved", "description": "aco/ra: set Pseudo_instruction::scratch_sgpr to SCC if it doesn't need to be preserved",
"nominated": true, "nominated": true,
"nomination_type": 1, "nomination_type": 1,
"resolution": 0, "resolution": 1,
"main_sha": null, "main_sha": null,
"because_sha": null, "because_sha": null,
"notes": null "notes": null

View File

@@ -1936,8 +1936,11 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
continue; continue;
} }
if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr) if (it->second.def.getTemp().type() == RegType::sgpr) {
assert(!(it->second.def.physReg() == pi->scratch_sgpr)); assert(it->second.def.physReg() != pi->scratch_sgpr);
assert(pi->needs_scratch_reg);
assert(!preserve_scc || pi->scratch_sgpr != scc);
}
/* to resolve the cycle, we have to swap the src reg with the dst reg */ /* to resolve the cycle, we have to swap the src reg with the dst reg */
copy_operation swap = it->second; copy_operation swap = it->second;

View File

@@ -2039,12 +2039,17 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
reads_linear = true; reads_linear = true;
} }
if (!writes_linear || !reads_linear || !reg_file[scc]) if (!writes_linear || !reads_linear)
return; return;
instr->pseudo().needs_scratch_reg = true; instr->pseudo().needs_scratch_reg = true;
instr->pseudo().tmp_in_scc = reg_file[scc]; instr->pseudo().tmp_in_scc = reg_file[scc];
if (!reg_file[scc]) {
instr->pseudo().scratch_sgpr = scc;
return;
}
int reg = ctx.max_used_sgpr; int reg = ctx.max_used_sgpr;
for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--) for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
; ;
@@ -2933,18 +2938,16 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definiti
for (unsigned i = 0; i < parallelcopy.size(); i++) { for (unsigned i = 0; i < parallelcopy.size(); i++) {
linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
if (temp_in_scc && parallelcopy[i].first.isTemp() && if (!sgpr_operands_alias_defs && parallelcopy[i].first.isTemp() &&
parallelcopy[i].first.getTemp().type() == RegType::sgpr) { parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
if (!sgpr_operands_alias_defs) { unsigned reg = parallelcopy[i].first.physReg().reg();
unsigned reg = parallelcopy[i].first.physReg().reg(); unsigned size = parallelcopy[i].first.getTemp().size();
unsigned size = parallelcopy[i].first.getTemp().size(); sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
reg = parallelcopy[i].second.physReg().reg(); reg = parallelcopy[i].second.physReg().reg();
size = parallelcopy[i].second.getTemp().size(); size = parallelcopy[i].second.getTemp().size();
if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
sgpr_operands_alias_defs = true; sgpr_operands_alias_defs = true;
}
} }
pc->operands[i] = parallelcopy[i].first; pc->operands[i] = parallelcopy[i].first;
@@ -2974,6 +2977,7 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definiti
} else { } else {
pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr; pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
pc->pseudo().tmp_in_scc = false; pc->pseudo().tmp_in_scc = false;
pc->pseudo().scratch_sgpr = scc;
} }
instructions.emplace_back(std::move(pc)); instructions.emplace_back(std::move(pc));
@@ -3064,7 +3068,6 @@ register_allocation(Program* program, ra_test_policy policy)
for (; instr_it != block.instructions.end(); ++instr_it) { for (; instr_it != block.instructions.end(); ++instr_it) {
aco_ptr<Instruction>& instr = *instr_it; aco_ptr<Instruction>& instr = *instr_it;
std::vector<std::pair<Operand, Definition>> parallelcopy; std::vector<std::pair<Operand, Definition>> parallelcopy;
bool temp_in_scc = register_file[scc];
if (instr->opcode == aco_opcode::p_branch) { if (instr->opcode == aco_opcode::p_branch) {
/* unconditional branches are handled after phis of the target */ /* unconditional branches are handled after phis of the target */
@@ -3121,6 +3124,7 @@ register_allocation(Program* program, ra_test_policy policy)
ctx.war_hint.set(operand.physReg().reg() + j); ctx.war_hint.set(operand.physReg().reg() + j);
} }
} }
bool temp_in_scc = register_file[scc];
/* remove dead vars from register file */ /* remove dead vars from register file */
for (const Operand& op : instr->operands) { for (const Operand& op : instr->operands) {

View File

@@ -555,7 +555,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def)
finish_ra_test(ra_test_policy()); finish_ra_test(ra_test_policy());
//~gfx8_cbranch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:1 scratch:s1 //~gfx8_cbranch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:1 scratch:s1
//~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s0 //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s253
aco_ptr<Instruction>& parallelcopy = program->blocks[0].instructions[6]; aco_ptr<Instruction>& parallelcopy = program->blocks[0].instructions[6];
aco_print_instr(program->gfx_level, parallelcopy.get(), output); aco_print_instr(program->gfx_level, parallelcopy.get(), output);
if (parallelcopy->isPseudo()) { if (parallelcopy->isPseudo()) {