aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR
fossil-db (gfx1201):
Totals from 38908 (49.02% of 79377) affected shaders:
Instrs: 30268107 -> 30268131 (+0.00%); split: -0.00%, +0.00%
CodeSize: 180843648 -> 180843640 (-0.00%); split: -0.00%, +0.00%
Latency: 224905962 -> 224906072 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 44322988 -> 44323004 (+0.00%)
VALU: 15124145 -> 15124167 (+0.00%)
VOPD: 4018504 -> 4018482 (-0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Backport-to: 25.0
Backport-to: 25.1
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34246>
(cherry picked from commit 408fa33c092810155baac342de90fd712231aa89)
This commit is contained in:

committed by
Eric Engestrom

parent
2f1fd84e4d
commit
dd304bfd80
@@ -1574,7 +1574,7 @@
|
|||||||
"description": "aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR",
|
"description": "aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR",
|
||||||
"nominated": true,
|
"nominated": true,
|
||||||
"nomination_type": 4,
|
"nomination_type": 4,
|
||||||
"resolution": 0,
|
"resolution": 1,
|
||||||
"main_sha": null,
|
"main_sha": null,
|
||||||
"because_sha": null,
|
"because_sha": null,
|
||||||
"notes": null
|
"notes": null
|
||||||
|
@@ -213,7 +213,7 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
|
is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b, bool* swap)
|
||||||
{
|
{
|
||||||
if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
|
if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
|
||||||
return false;
|
return false;
|
||||||
@@ -222,6 +222,8 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
|
|||||||
if (a.has_literal && b.has_literal && a.literal != b.literal)
|
if (a.has_literal && b.has_literal && a.literal != b.literal)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
*swap = false;
|
||||||
|
|
||||||
/* The rest is checking src VGPR bank compatibility. */
|
/* The rest is checking src VGPR bank compatibility. */
|
||||||
if ((a.src_banks & b.src_banks) == 0)
|
if ((a.src_banks & b.src_banks) == 0)
|
||||||
return true;
|
return true;
|
||||||
@@ -244,11 +246,13 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
|
|||||||
if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
|
if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
*swap = true;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
can_use_vopd(const SchedILPContext& ctx, unsigned idx, bool* prev_can_be_opx)
|
||||||
{
|
{
|
||||||
VOPDInfo cur_vopd = ctx.vopd[idx];
|
VOPDInfo cur_vopd = ctx.vopd[idx];
|
||||||
Instruction* first = ctx.nodes[idx].instr;
|
Instruction* first = ctx.nodes[idx].instr;
|
||||||
@@ -260,9 +264,14 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
|||||||
if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
|
if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
|
bool swap = false;
|
||||||
|
if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd, &swap))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
/* If we have to swap a v_mov_b32, it will become an OPY-only opcode. */
|
||||||
|
if (swap && !ctx.prev_vopd_info.is_commutative && cur_vopd.op == aco_opcode::v_dual_mov_b32)
|
||||||
|
cur_vopd.is_opy_only = true;
|
||||||
|
|
||||||
assert(first->definitions.size() == 1);
|
assert(first->definitions.size() == 1);
|
||||||
assert(first->definitions[0].size() == 1);
|
assert(first->definitions[0].size() == 1);
|
||||||
assert(second->definitions.size() == 1);
|
assert(second->definitions.size() == 1);
|
||||||
@@ -279,8 +288,23 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* WaR dependencies are not a concern. */
|
/* WaR dependencies are not a concern before GFX12. */
|
||||||
return true;
|
*prev_can_be_opx = true;
|
||||||
|
if (ctx.program->gfx_level >= GFX12) {
|
||||||
|
/* From RDNA4 ISA doc:
|
||||||
|
* "The OPX instruction must not overwrite sources of the OPY instruction".
|
||||||
|
*/
|
||||||
|
bool war = false;
|
||||||
|
for (Operand op : first->operands) {
|
||||||
|
assert(op.size() == 1);
|
||||||
|
if (second->definitions[0].physReg() == op.physReg())
|
||||||
|
war = true;
|
||||||
|
}
|
||||||
|
if (war)
|
||||||
|
*prev_can_be_opx = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return *prev_can_be_opx || !cur_vopd.is_opy_only;
|
||||||
}
|
}
|
||||||
|
|
||||||
Instruction_cycle_info
|
Instruction_cycle_info
|
||||||
@@ -619,9 +643,9 @@ select_instruction_ilp(const SchedILPContext& ctx)
|
|||||||
|
|
||||||
bool
|
bool
|
||||||
compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
|
compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
|
||||||
unsigned current, unsigned candidate)
|
bool* prev_can_be_opx, unsigned current, unsigned candidate)
|
||||||
{
|
{
|
||||||
if (can_use_vopd(ctx, candidate)) {
|
if (can_use_vopd(ctx, candidate, prev_can_be_opx)) {
|
||||||
/* If we can form a VOPD instruction, always prefer to do so. */
|
/* If we can form a VOPD instruction, always prefer to do so. */
|
||||||
if (!*use_vopd) {
|
if (!*use_vopd) {
|
||||||
*use_vopd = true;
|
*use_vopd = true;
|
||||||
@@ -657,7 +681,7 @@ compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool
|
|||||||
}
|
}
|
||||||
|
|
||||||
unsigned
|
unsigned
|
||||||
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
|
select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd, bool* prev_can_be_opx)
|
||||||
{
|
{
|
||||||
*use_vopd = false;
|
*use_vopd = false;
|
||||||
|
|
||||||
@@ -679,11 +703,14 @@ select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
|
|||||||
if (candidate.dependency_mask)
|
if (candidate.dependency_mask)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
bool prev_can_be_opx_for_i;
|
||||||
if (cur == -1u) {
|
if (cur == -1u) {
|
||||||
cur = i;
|
cur = i;
|
||||||
*use_vopd = can_use_vopd(ctx, i);
|
*use_vopd = can_use_vopd(ctx, i, prev_can_be_opx);
|
||||||
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
|
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, &prev_can_be_opx_for_i,
|
||||||
|
cur, i)) {
|
||||||
cur = i;
|
cur = i;
|
||||||
|
*prev_can_be_opx = prev_can_be_opx_for_i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -719,12 +746,13 @@ get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const V
|
|||||||
}
|
}
|
||||||
|
|
||||||
Instruction*
|
Instruction*
|
||||||
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
|
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx, bool prev_can_be_opx)
|
||||||
{
|
{
|
||||||
Instruction* x = ctx.prev_info.instr;
|
Instruction* x = ctx.prev_info.instr;
|
||||||
Instruction* y = ctx.nodes[idx].instr;
|
Instruction* y = ctx.nodes[idx].instr;
|
||||||
VOPDInfo x_info = ctx.prev_vopd_info;
|
VOPDInfo x_info = ctx.prev_vopd_info;
|
||||||
VOPDInfo y_info = ctx.vopd[idx];
|
VOPDInfo y_info = ctx.vopd[idx];
|
||||||
|
x_info.is_opy_only |= !prev_can_be_opx;
|
||||||
|
|
||||||
bool swap_x = false, swap_y = false;
|
bool swap_x = false, swap_y = false;
|
||||||
if (x_info.src_banks & y_info.src_banks) {
|
if (x_info.src_banks & y_info.src_banks) {
|
||||||
@@ -744,6 +772,7 @@ create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
|
|||||||
std::swap(x_info, y_info);
|
std::swap(x_info, y_info);
|
||||||
std::swap(swap_x, swap_y);
|
std::swap(swap_x, swap_y);
|
||||||
}
|
}
|
||||||
|
assert(!x_info.is_opy_only);
|
||||||
|
|
||||||
aco_opcode x_op, y_op;
|
aco_opcode x_op, y_op;
|
||||||
unsigned num_operands = 0;
|
unsigned num_operands = 0;
|
||||||
@@ -774,14 +803,15 @@ do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_
|
|||||||
|
|
||||||
ctx.prev_info.instr = NULL;
|
ctx.prev_info.instr = NULL;
|
||||||
bool use_vopd = false;
|
bool use_vopd = false;
|
||||||
|
bool prev_can_be_opx;
|
||||||
|
|
||||||
while (ctx.active_mask) {
|
while (ctx.active_mask) {
|
||||||
unsigned next_idx =
|
unsigned next_idx = ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd, &prev_can_be_opx)
|
||||||
ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
|
: select_instruction_ilp(ctx);
|
||||||
Instruction* next_instr = ctx.nodes[next_idx].instr;
|
Instruction* next_instr = ctx.nodes[next_idx].instr;
|
||||||
|
|
||||||
if (use_vopd) {
|
if (use_vopd) {
|
||||||
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
|
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx, prev_can_be_opx));
|
||||||
ctx.prev_info.instr = NULL;
|
ctx.prev_info.instr = NULL;
|
||||||
} else {
|
} else {
|
||||||
(insert_it++)->reset(next_instr);
|
(insert_it++)->reset(next_instr);
|
||||||
|
@@ -153,3 +153,37 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||||||
|
|
||||||
finish_schedule_vopd_test();
|
finish_schedule_vopd_test();
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
BEGIN_TEST(vopd_sched.war)
|
||||||
|
for (amd_gfx_level gfx : {GFX11, GFX12}) {
|
||||||
|
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
PhysReg reg_v0{256};
|
||||||
|
PhysReg reg_v1{257};
|
||||||
|
PhysReg reg_v3{259};
|
||||||
|
|
||||||
|
//>> p_unit_test 0
|
||||||
|
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3]
|
||||||
|
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1]
|
||||||
|
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||||
|
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1),
|
||||||
|
Operand(reg_v3, v1));
|
||||||
|
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
|
||||||
|
Operand(reg_v1, v1));
|
||||||
|
|
||||||
|
/* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the
|
||||||
|
* v_add_u32 because that opcode is OPY-only. */
|
||||||
|
//>> p_unit_test 1
|
||||||
|
//~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3]
|
||||||
|
//~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3]
|
||||||
|
//~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1]
|
||||||
|
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||||
|
bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1),
|
||||||
|
Operand(reg_v3, v1));
|
||||||
|
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
|
||||||
|
Operand(reg_v1, v1));
|
||||||
|
|
||||||
|
finish_schedule_vopd_test();
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
Reference in New Issue
Block a user