aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR

fossil-db (gfx1201):
Totals from 38908 (49.02% of 79377) affected shaders:
Instrs: 30268107 -> 30268131 (+0.00%); split: -0.00%, +0.00%
CodeSize: 180843648 -> 180843640 (-0.00%); split: -0.00%, +0.00%
Latency: 224905962 -> 224906072 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 44322988 -> 44323004 (+0.00%)
VALU: 15124145 -> 15124167 (+0.00%)
VOPD: 4018504 -> 4018482 (-0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Backport-to: 25.0
Backport-to: 25.1
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34246>
(cherry picked from commit 408fa33c092810155baac342de90fd712231aa89)
This commit is contained in:
Rhys Perry
2025-03-27 17:21:09 +00:00
committed by Eric Engestrom
parent 2f1fd84e4d
commit dd304bfd80
3 changed files with 79 additions and 15 deletions

View File

@@ -1574,7 +1574,7 @@
"description": "aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR",
"nominated": true,
"nomination_type": 4,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View File

@@ -213,7 +213,7 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
}
bool
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b, bool* swap)
{
if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
return false;
@@ -222,6 +222,8 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
if (a.has_literal && b.has_literal && a.literal != b.literal)
return false;
*swap = false;
/* The rest is checking src VGPR bank compatibility. */
if ((a.src_banks & b.src_banks) == 0)
return true;
@@ -244,11 +246,13 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
return false;
*swap = true;
return true;
}
bool
can_use_vopd(const SchedILPContext& ctx, unsigned idx)
can_use_vopd(const SchedILPContext& ctx, unsigned idx, bool* prev_can_be_opx)
{
VOPDInfo cur_vopd = ctx.vopd[idx];
Instruction* first = ctx.nodes[idx].instr;
@@ -260,9 +264,14 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
return false;
if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
bool swap = false;
if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd, &swap))
return false;
/* If we have to swap a v_mov_b32, it will become an OPY-only opcode. */
if (swap && !ctx.prev_vopd_info.is_commutative && cur_vopd.op == aco_opcode::v_dual_mov_b32)
cur_vopd.is_opy_only = true;
assert(first->definitions.size() == 1);
assert(first->definitions[0].size() == 1);
assert(second->definitions.size() == 1);
@@ -279,8 +288,23 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
return false;
}
/* WaR dependencies are not a concern. */
return true;
/* WaR dependencies are not a concern before GFX12. */
*prev_can_be_opx = true;
if (ctx.program->gfx_level >= GFX12) {
/* From RDNA4 ISA doc:
* "The OPX instruction must not overwrite sources of the OPY instruction".
*/
bool war = false;
for (Operand op : first->operands) {
assert(op.size() == 1);
if (second->definitions[0].physReg() == op.physReg())
war = true;
}
if (war)
*prev_can_be_opx = false;
}
return *prev_can_be_opx || !cur_vopd.is_opy_only;
}
Instruction_cycle_info
@@ -619,9 +643,9 @@ select_instruction_ilp(const SchedILPContext& ctx)
bool
compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
unsigned current, unsigned candidate)
bool* prev_can_be_opx, unsigned current, unsigned candidate)
{
if (can_use_vopd(ctx, candidate)) {
if (can_use_vopd(ctx, candidate, prev_can_be_opx)) {
/* If we can form a VOPD instruction, always prefer to do so. */
if (!*use_vopd) {
*use_vopd = true;
@@ -657,7 +681,7 @@ compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool
}
unsigned
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd, bool* prev_can_be_opx)
{
*use_vopd = false;
@@ -679,11 +703,14 @@ select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
if (candidate.dependency_mask)
continue;
bool prev_can_be_opx_for_i;
if (cur == -1u) {
cur = i;
*use_vopd = can_use_vopd(ctx, i);
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
*use_vopd = can_use_vopd(ctx, i, prev_can_be_opx);
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, &prev_can_be_opx_for_i,
cur, i)) {
cur = i;
*prev_can_be_opx = prev_can_be_opx_for_i;
}
}
@@ -719,12 +746,13 @@ get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const V
}
Instruction*
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx, bool prev_can_be_opx)
{
Instruction* x = ctx.prev_info.instr;
Instruction* y = ctx.nodes[idx].instr;
VOPDInfo x_info = ctx.prev_vopd_info;
VOPDInfo y_info = ctx.vopd[idx];
x_info.is_opy_only |= !prev_can_be_opx;
bool swap_x = false, swap_y = false;
if (x_info.src_banks & y_info.src_banks) {
@@ -744,6 +772,7 @@ create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
std::swap(x_info, y_info);
std::swap(swap_x, swap_y);
}
assert(!x_info.is_opy_only);
aco_opcode x_op, y_op;
unsigned num_operands = 0;
@@ -774,14 +803,15 @@ do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_
ctx.prev_info.instr = NULL;
bool use_vopd = false;
bool prev_can_be_opx;
while (ctx.active_mask) {
unsigned next_idx =
ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
unsigned next_idx = ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd, &prev_can_be_opx)
: select_instruction_ilp(ctx);
Instruction* next_instr = ctx.nodes[next_idx].instr;
if (use_vopd) {
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx, prev_can_be_opx));
ctx.prev_info.instr = NULL;
} else {
(insert_it++)->reset(next_instr);

View File

@@ -153,3 +153,37 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
finish_schedule_vopd_test();
END_TEST
BEGIN_TEST(vopd_sched.war)
/* Exercises VOPD formation when the two candidate instructions have a write-after-read (WaR)
 * relationship: in both cases below, the second instruction writes v1, which the first
 * instruction reads. The expected output is given separately for GFX11 and GFX12. */
for (amd_gfx_level gfx : {GFX11, GFX12}) {
/* Skip this gfx level when setup_cs() returns false. */
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
continue;
/* Fixed physical registers; they appear as v[0], v[1] and v[3] in the expected output. */
PhysReg reg_v0{256};
PhysReg reg_v1{257};
PhysReg reg_v3{259};
/* Case 0: v_mul_f32 reads v1 and v_add_f32 overwrites v1. A VOPD is expected on both levels,
 * but GFX12 lists the v_mul_f32 half first and the v_add_f32 half second, which is the
 * opposite order from GFX11. */
//>> p_unit_test 0
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3]
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
Operand(reg_v1, v1));
/* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the
 * v_add_u32 because that opcode is OPY-only. */
/* Case 1: consequently GFX12 is expected to keep the two instructions separate, while GFX11
 * is still expected to combine them into a single VOPD. */
//>> p_unit_test 1
//~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3]
//~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3]
//~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
Operand(reg_v1, v1));
finish_schedule_vopd_test();
}
END_TEST