diff --git a/.pick_status.json b/.pick_status.json index dc8fc22c97e..6666c0cb4e3 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -1574,7 +1574,7 @@ "description": "aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR", "nominated": true, "nomination_type": 4, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp index cc177f81e79..4d3ab25ef7b 100644 --- a/src/amd/compiler/aco_scheduler_ilp.cpp +++ b/src/amd/compiler/aco_scheduler_ilp.cpp @@ -213,7 +213,7 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr) } bool -is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b) +is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b, bool* swap) { if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd)) return false; @@ -222,6 +222,8 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b) if (a.has_literal && b.has_literal && a.literal != b.literal) return false; + *swap = false; + /* The rest is checking src VGPR bank compatibility. */ if ((a.src_banks & b.src_banks) == 0) return true; @@ -244,11 +246,13 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b) if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only) return false; + *swap = true; + return true; } bool -can_use_vopd(const SchedILPContext& ctx, unsigned idx) +can_use_vopd(const SchedILPContext& ctx, unsigned idx, bool* prev_can_be_opx) { VOPDInfo cur_vopd = ctx.vopd[idx]; Instruction* first = ctx.nodes[idx].instr; @@ -260,9 +264,14 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx) if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes) return false; - if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd)) + bool swap = false; + if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd, &swap)) return false; + /* If we have to swap a v_mov_b32, it will become an OPY-only opcode. */ + if (swap && !ctx.prev_vopd_info.is_commutative && cur_vopd.op == aco_opcode::v_dual_mov_b32) + cur_vopd.is_opy_only = true; + assert(first->definitions.size() == 1); assert(first->definitions[0].size() == 1); assert(second->definitions.size() == 1); @@ -279,8 +288,23 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx) return false; } - /* WaR dependencies are not a concern. */ - return true; + /* WaR dependencies are not a concern before GFX12. */ + *prev_can_be_opx = true; + if (ctx.program->gfx_level >= GFX12) { + /* From RDNA4 ISA doc: + * The OPX instruction must not overwrite sources of the OPY instruction". + */ + bool war = false; + for (Operand op : first->operands) { + assert(op.size() == 1); + if (second->definitions[0].physReg() == op.physReg()) + war = true; + } + if (war) + *prev_can_be_opx = false; + } + + return *prev_can_be_opx || !cur_vopd.is_opy_only; } Instruction_cycle_info @@ -619,9 +643,9 @@ select_instruction_ilp(const SchedILPContext& ctx) bool compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd, - unsigned current, unsigned candidate) + bool* prev_can_be_opx, unsigned current, unsigned candidate) { - if (can_use_vopd(ctx, candidate)) { + if (can_use_vopd(ctx, candidate, prev_can_be_opx)) { /* If we can form a VOPD instruction, always prefer to do so. */ if (!*use_vopd) { *use_vopd = true; @@ -657,7 +681,7 @@ compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool } unsigned -select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd) +select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd, bool* prev_can_be_opx) { *use_vopd = false; @@ -679,11 +703,14 @@ select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd) if (candidate.dependency_mask) continue; + bool prev_can_be_opx_for_i; if (cur == -1u) { cur = i; - *use_vopd = can_use_vopd(ctx, i); - } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) { + *use_vopd = can_use_vopd(ctx, i, prev_can_be_opx); + } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, &prev_can_be_opx_for_i, + cur, i)) { cur = i; + *prev_can_be_opx = prev_can_be_opx_for_i; } } @@ -719,12 +746,13 @@ get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const V } Instruction* -create_vopd_instruction(const SchedILPContext& ctx, unsigned idx) +create_vopd_instruction(const SchedILPContext& ctx, unsigned idx, bool prev_can_be_opx) { Instruction* x = ctx.prev_info.instr; Instruction* y = ctx.nodes[idx].instr; VOPDInfo x_info = ctx.prev_vopd_info; VOPDInfo y_info = ctx.vopd[idx]; + x_info.is_opy_only |= !prev_can_be_opx; bool swap_x = false, swap_y = false; if (x_info.src_banks & y_info.src_banks) { @@ -744,6 +772,7 @@ create_vopd_instruction(const SchedILPContext& ctx, unsigned idx) std::swap(x_info, y_info); std::swap(swap_x, swap_y); } + assert(!x_info.is_opy_only); aco_opcode x_op, y_op; unsigned num_operands = 0; @@ -774,14 +803,15 @@ do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_ ctx.prev_info.instr = NULL; bool use_vopd = false; + bool prev_can_be_opx; while (ctx.active_mask) { - unsigned next_idx = - ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx); + unsigned next_idx = ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd, &prev_can_be_opx) + : select_instruction_ilp(ctx); Instruction* next_instr = ctx.nodes[next_idx].instr; if (use_vopd) { - std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx)); + std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx, prev_can_be_opx)); ctx.prev_info.instr = NULL; } else { (insert_it++)->reset(next_instr); diff --git a/src/amd/compiler/tests/test_scheduler.cpp b/src/amd/compiler/tests/test_scheduler.cpp index 7d5842658d3..bdb75243d1f 100644 --- a/src/amd/compiler/tests/test_scheduler.cpp +++ b/src/amd/compiler/tests/test_scheduler.cpp @@ -153,3 +153,37 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev) finish_schedule_vopd_test(); END_TEST + +BEGIN_TEST(vopd_sched.war) + for (amd_gfx_level gfx : {GFX11, GFX12}) { + if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32)) + continue; + + PhysReg reg_v0{256}; + PhysReg reg_v1{257}; + PhysReg reg_v3{259}; + + //>> p_unit_test 0 + //~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] + //~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); + bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1), + Operand(reg_v3, v1)); + bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1), + Operand(reg_v1, v1)); + + /* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the + * v_add_u32 because that opcode is OPY-only. */ + //>> p_unit_test 1 + //~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3] + //~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3] + //~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1), + Operand(reg_v3, v1)); + bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1), + Operand(reg_v1, v1)); + + finish_schedule_vopd_test(); + } +END_TEST