diff --git a/.pick_status.json b/.pick_status.json index 9d23c51eb89..8014c8beb4f 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -1124,7 +1124,7 @@ "description": "aco/ra: set Pseudo_instruction::scratch_sgpr to SCC if it doesn't need to be preserved", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 93bc4450118..68fd1ef9b50 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1936,8 +1936,11 @@ handle_operands(std::map& copy_map, lower_context* ctx, continue; } - if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr) - assert(!(it->second.def.physReg() == pi->scratch_sgpr)); + if (it->second.def.getTemp().type() == RegType::sgpr) { + assert(it->second.def.physReg() != pi->scratch_sgpr); + assert(pi->needs_scratch_reg); + assert(!preserve_scc || pi->scratch_sgpr != scc); + } /* to resolve the cycle, we have to swap the src reg with the dst reg */ copy_operation swap = it->second; diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 7ff35c079e2..78f13f8a79a 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -2039,12 +2039,17 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) reads_linear = true; } - if (!writes_linear || !reads_linear || !reg_file[scc]) + if (!writes_linear || !reads_linear) return; instr->pseudo().needs_scratch_reg = true; instr->pseudo().tmp_in_scc = reg_file[scc]; + if (!reg_file[scc]) { + instr->pseudo().scratch_sgpr = scc; + return; + } + int reg = ctx.max_used_sgpr; for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--) ; @@ -2933,18 +2938,16 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectoroperands[i] = parallelcopy[i].first; @@ -2974,6 +2977,7 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectorpseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr; pc->pseudo().tmp_in_scc = false; + pc->pseudo().scratch_sgpr = scc; } instructions.emplace_back(std::move(pc)); @@ -3064,7 +3068,6 @@ register_allocation(Program* program, ra_test_policy policy) for (; instr_it != block.instructions.end(); ++instr_it) { aco_ptr& instr = *instr_it; std::vector> parallelcopy; - bool temp_in_scc = register_file[scc]; if (instr->opcode == aco_opcode::p_branch) { /* unconditional branches are handled after phis of the target */ @@ -3121,6 +3124,7 @@ register_allocation(Program* program, ra_test_policy policy) ctx.war_hint.set(operand.physReg().reg() + j); } } + bool temp_in_scc = register_file[scc]; /* remove dead vars from register file */ for (const Operand& op : instr->operands) { diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp index 5902b780ca7..ad9cd311592 100644 --- a/src/amd/compiler/tests/test_regalloc.cpp +++ b/src/amd/compiler/tests/test_regalloc.cpp @@ -555,7 +555,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def) finish_ra_test(ra_test_policy()); //~gfx8_cbranch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:1 scratch:s1 - //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s0 + //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s253 aco_ptr& parallelcopy = program->blocks[0].instructions[6]; aco_print_instr(program->gfx_level, parallelcopy.get(), output); if (parallelcopy->isPseudo()) {