aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR
fossil-db (gfx1201): Totals from 38908 (49.02% of 79377) affected shaders: Instrs: 30268107 -> 30268131 (+0.00%); split: -0.00%, +0.00% CodeSize: 180843648 -> 180843640 (-0.00%); split: -0.00%, +0.00% Latency: 224905962 -> 224906072 (+0.00%); split: -0.00%, +0.00% InvThroughput: 44322988 -> 44323004 (+0.00%) VALU: 15124145 -> 15124167 (+0.00%) VOPD: 4018504 -> 4018482 (-0.00%) Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Backport-to: 25.0 Backport-to: 25.1 Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34246> (cherry picked from commit 408fa33c092810155baac342de90fd712231aa89)
This commit is contained in:

committed by
Eric Engestrom

parent
2f1fd84e4d
commit
dd304bfd80
@@ -1574,7 +1574,7 @@
|
||||
"description": "aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR",
|
||||
"nominated": true,
|
||||
"nomination_type": 4,
|
||||
"resolution": 0,
|
||||
"resolution": 1,
|
||||
"main_sha": null,
|
||||
"because_sha": null,
|
||||
"notes": null
|
||||
|
@@ -213,7 +213,7 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
|
||||
}
|
||||
|
||||
bool
|
||||
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
|
||||
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b, bool* swap)
|
||||
{
|
||||
if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
|
||||
return false;
|
||||
@@ -222,6 +222,8 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
|
||||
if (a.has_literal && b.has_literal && a.literal != b.literal)
|
||||
return false;
|
||||
|
||||
*swap = false;
|
||||
|
||||
/* The rest is checking src VGPR bank compatibility. */
|
||||
if ((a.src_banks & b.src_banks) == 0)
|
||||
return true;
|
||||
@@ -244,11 +246,13 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
|
||||
if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
|
||||
return false;
|
||||
|
||||
*swap = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
||||
can_use_vopd(const SchedILPContext& ctx, unsigned idx, bool* prev_can_be_opx)
|
||||
{
|
||||
VOPDInfo cur_vopd = ctx.vopd[idx];
|
||||
Instruction* first = ctx.nodes[idx].instr;
|
||||
@@ -260,9 +264,14 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
||||
if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
|
||||
return false;
|
||||
|
||||
if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
|
||||
bool swap = false;
|
||||
if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd, &swap))
|
||||
return false;
|
||||
|
||||
/* If we have to swap a v_mov_b32, it will become an OPY-only opcode. */
|
||||
if (swap && !ctx.prev_vopd_info.is_commutative && cur_vopd.op == aco_opcode::v_dual_mov_b32)
|
||||
cur_vopd.is_opy_only = true;
|
||||
|
||||
assert(first->definitions.size() == 1);
|
||||
assert(first->definitions[0].size() == 1);
|
||||
assert(second->definitions.size() == 1);
|
||||
@@ -279,8 +288,23 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* WaR dependencies are not a concern. */
|
||||
return true;
|
||||
/* WaR dependencies are not a concern before GFX12. */
|
||||
*prev_can_be_opx = true;
|
||||
if (ctx.program->gfx_level >= GFX12) {
|
||||
/* From RDNA4 ISA doc:
|
||||
* The OPX instruction must not overwrite sources of the OPY instruction".
|
||||
*/
|
||||
bool war = false;
|
||||
for (Operand op : first->operands) {
|
||||
assert(op.size() == 1);
|
||||
if (second->definitions[0].physReg() == op.physReg())
|
||||
war = true;
|
||||
}
|
||||
if (war)
|
||||
*prev_can_be_opx = false;
|
||||
}
|
||||
|
||||
return *prev_can_be_opx || !cur_vopd.is_opy_only;
|
||||
}
|
||||
|
||||
Instruction_cycle_info
|
||||
@@ -619,9 +643,9 @@ select_instruction_ilp(const SchedILPContext& ctx)
|
||||
|
||||
bool
|
||||
compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
|
||||
unsigned current, unsigned candidate)
|
||||
bool* prev_can_be_opx, unsigned current, unsigned candidate)
|
||||
{
|
||||
if (can_use_vopd(ctx, candidate)) {
|
||||
if (can_use_vopd(ctx, candidate, prev_can_be_opx)) {
|
||||
/* If we can form a VOPD instruction, always prefer to do so. */
|
||||
if (!*use_vopd) {
|
||||
*use_vopd = true;
|
||||
@@ -657,7 +681,7 @@ compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool
|
||||
}
|
||||
|
||||
unsigned
|
||||
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
|
||||
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd, bool* prev_can_be_opx)
|
||||
{
|
||||
*use_vopd = false;
|
||||
|
||||
@@ -679,11 +703,14 @@ select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
|
||||
if (candidate.dependency_mask)
|
||||
continue;
|
||||
|
||||
bool prev_can_be_opx_for_i;
|
||||
if (cur == -1u) {
|
||||
cur = i;
|
||||
*use_vopd = can_use_vopd(ctx, i);
|
||||
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
|
||||
*use_vopd = can_use_vopd(ctx, i, prev_can_be_opx);
|
||||
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, &prev_can_be_opx_for_i,
|
||||
cur, i)) {
|
||||
cur = i;
|
||||
*prev_can_be_opx = prev_can_be_opx_for_i;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -719,12 +746,13 @@ get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const V
|
||||
}
|
||||
|
||||
Instruction*
|
||||
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
|
||||
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx, bool prev_can_be_opx)
|
||||
{
|
||||
Instruction* x = ctx.prev_info.instr;
|
||||
Instruction* y = ctx.nodes[idx].instr;
|
||||
VOPDInfo x_info = ctx.prev_vopd_info;
|
||||
VOPDInfo y_info = ctx.vopd[idx];
|
||||
x_info.is_opy_only |= !prev_can_be_opx;
|
||||
|
||||
bool swap_x = false, swap_y = false;
|
||||
if (x_info.src_banks & y_info.src_banks) {
|
||||
@@ -744,6 +772,7 @@ create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
|
||||
std::swap(x_info, y_info);
|
||||
std::swap(swap_x, swap_y);
|
||||
}
|
||||
assert(!x_info.is_opy_only);
|
||||
|
||||
aco_opcode x_op, y_op;
|
||||
unsigned num_operands = 0;
|
||||
@@ -774,14 +803,15 @@ do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_
|
||||
|
||||
ctx.prev_info.instr = NULL;
|
||||
bool use_vopd = false;
|
||||
bool prev_can_be_opx;
|
||||
|
||||
while (ctx.active_mask) {
|
||||
unsigned next_idx =
|
||||
ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
|
||||
unsigned next_idx = ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd, &prev_can_be_opx)
|
||||
: select_instruction_ilp(ctx);
|
||||
Instruction* next_instr = ctx.nodes[next_idx].instr;
|
||||
|
||||
if (use_vopd) {
|
||||
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
|
||||
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx, prev_can_be_opx));
|
||||
ctx.prev_info.instr = NULL;
|
||||
} else {
|
||||
(insert_it++)->reset(next_instr);
|
||||
|
@@ -153,3 +153,37 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
||||
|
||||
finish_schedule_vopd_test();
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(vopd_sched.war)
|
||||
for (amd_gfx_level gfx : {GFX11, GFX12}) {
|
||||
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
|
||||
continue;
|
||||
|
||||
PhysReg reg_v0{256};
|
||||
PhysReg reg_v1{257};
|
||||
PhysReg reg_v3{259};
|
||||
|
||||
//>> p_unit_test 0
|
||||
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1),
|
||||
Operand(reg_v3, v1));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
|
||||
Operand(reg_v1, v1));
|
||||
|
||||
/* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the
|
||||
* v_add_u32 because that opcode is OPY-only. */
|
||||
//>> p_unit_test 1
|
||||
//~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3]
|
||||
//~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3]
|
||||
//~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1),
|
||||
Operand(reg_v3, v1));
|
||||
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
|
||||
Operand(reg_v1, v1));
|
||||
|
||||
finish_schedule_vopd_test();
|
||||
}
|
||||
END_TEST
|
||||
|
Reference in New Issue
Block a user