From 75a76ec3fdb85a3b518cddbdba4ce9bef4e460dc Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 17 Jan 2024 20:47:27 +0000
Subject: [PATCH] aco: implement VOPD scheduler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Georg Lehmann
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_ir.h              |   1 +
 src/amd/compiler/aco_scheduler_ilp.cpp | 317 ++++++++++++++++++++++++-
 2 files changed, 312 insertions(+), 6 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 107961fa524..7c74d2fc73e 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2228,6 +2228,7 @@ void ssa_elimination(Program* program);
 void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
 void schedule_ilp(Program* program);
+void schedule_vopd(Program* program);
 void spill(Program* program, live& live_vars);
 void insert_wait_states(Program* program);
 bool dealloc_vgprs(Program* program);
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index 4bcd1d80e8c..c6a288fe5ef 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -27,6 +27,21 @@ constexpr unsigned num_nodes = 16;
 using mask_t = uint16_t;
 static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
 
+/**
+ * Per-instruction summary used to decide whether two instructions can be merged into a single
+ * VOPD instruction. "op" stays aco_opcode::num_opcodes when get_vopd_info() finds that the
+ * instruction cannot be part of one.
+ */
+struct VOPDInfo {
+   VOPDInfo() : is_opy_only(0), is_dst_odd(0), src_banks(0), has_literal(0) {}
+   uint16_t is_opy_only : 1; /* must end up as OPY (see create_vopd_instruction) */
+   uint16_t is_dst_odd : 1;  /* lowest bit of the destination register index */
+   uint16_t src_banks : 10;  /* 0-3: src0, 4-7: src1, 8-9: src2 */
+   uint16_t has_literal : 1; /* whether "literal" is valid */
+   aco_opcode op = aco_opcode::num_opcodes; /* v_dual_* opcode; num_opcodes: not VOPD-capable */
+   uint32_t literal = 0;
+};
+
 struct InstrInfo {
    Instruction* instr;
    int32_t priority;
@@ -46,12 +61,21 @@ struct RegisterInfo {
 
 struct SchedILPContext {
    Program* program;
+   bool is_vopd = false;
    InstrInfo nodes[num_nodes];
    RegisterInfo regs[512];
    mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
    mask_t active_mask = 0;      /* bitmask of valid instruction nodes. */
    uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
    uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
+
+   /* VOPD scheduler: */
+   VOPDInfo vopd[num_nodes];
+   VOPDInfo prev_vopd_info;
+   InstrInfo prev_info;
+
+   mask_t vopd_odd_mask = 0;
+   mask_t vopd_even_mask = 0;
 };
 
 /**
@@ -98,6 +122,131 @@ can_reorder(const Instruction* const instr)
    return true;
 }
 
+/**
+ * Collects the VOPD-related properties of a single instruction.
+ *
+ * Returns a default-constructed VOPDInfo (op == aco_opcode::num_opcodes) if the instruction is
+ * not VOP1/VOP2, has no v_dual_* counterpart in the switch below, is a v_cndmask_b32 whose first
+ * operand is an SGPR, or uses both a literal and an SGPR.
+ */
+VOPDInfo
+get_vopd_info(const Instruction* instr)
+{
+   if (instr->format != Format::VOP1 && instr->format != Format::VOP2)
+      return VOPDInfo();
+
+   VOPDInfo info;
+   switch (instr->opcode) {
+   case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
+   case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
+   case aco_opcode::v_fmamk_f32: info.op = aco_opcode::v_dual_fmamk_f32; break;
+   case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
+   case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
+   case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
+   case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
+   case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
+   case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
+   case aco_opcode::v_cndmask_b32: info.op = aco_opcode::v_dual_cndmask_b32; break;
+   case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
+   case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
+   case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
+   case aco_opcode::v_add_u32:
+      info.op = aco_opcode::v_dual_add_nc_u32;
+      info.is_opy_only = true;
+      break;
+   case aco_opcode::v_lshlrev_b32:
+      info.op = aco_opcode::v_dual_lshlrev_b32;
+      info.is_opy_only = true;
+      break;
+   case aco_opcode::v_and_b32:
+      info.op = aco_opcode::v_dual_and_b32;
+      info.is_opy_only = true;
+      break;
+   default: return VOPDInfo();
+   }
+
+   /* Each instruction may use at most one SGPR. */
+   if (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[0].isOfType(RegType::sgpr))
+      return VOPDInfo();
+
+   info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;
+
+   static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
+   bool has_sgpr = false;
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
+      if (instr->operands[i].isOfType(RegType::vgpr))
+         info.src_banks |= 1 << (port * 4 + (instr->operands[i].physReg().reg() & bank_mask[port]));
+
+      /* Check all operands because of fmaak/fmamk. */
+      if (instr->operands[i].isLiteral()) {
+         assert(!info.has_literal || info.literal == instr->operands[i].constantValue());
+         info.has_literal = true;
+         info.literal = instr->operands[i].constantValue();
+      }
+
+      /* Check all operands because of cndmask. */
+      has_sgpr |= !instr->operands[i].isConstant() && instr->operands[i].isOfType(RegType::sgpr);
+   }
+
+   /* An instruction can't use both a literal and an SGPR. */
+   if (has_sgpr && info.has_literal)
+      return VOPDInfo();
+
+   return info;
+}
+
+/**
+ * Returns whether node "idx" and the previously selected instruction (ctx.prev_info) can be
+ * merged into a single VOPD instruction.
+ *
+ * schedule_vopd() walks each block in reverse, so the previously selected instruction is the
+ * "second" one and node "idx" the "first" one in the WaW/RaW checks below.
+ */
+bool
+can_use_vopd(const SchedILPContext& ctx, unsigned idx)
+{
+   VOPDInfo cur_vopd = ctx.vopd[idx];
+   Instruction* first = ctx.nodes[idx].instr;
+   Instruction* second = ctx.prev_info.instr;
+
+   if (!second)
+      return false;
+
+   if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
+      return false;
+
+   if ((ctx.prev_vopd_info.src_banks & cur_vopd.src_banks) ||
+       (ctx.prev_vopd_info.is_opy_only & cur_vopd.is_opy_only) ||
+       (ctx.prev_vopd_info.is_dst_odd == cur_vopd.is_dst_odd)) {
+      return false;
+   }
+
+   /* Both can use a literal, but it must be the same literal. */
+   if (ctx.prev_vopd_info.has_literal && cur_vopd.has_literal &&
+       ctx.prev_vopd_info.literal != cur_vopd.literal)
+      return false;
+
+   assert(first->definitions.size() == 1);
+   assert(first->definitions[0].size() == 1);
+   assert(second->definitions.size() == 1);
+   assert(second->definitions[0].size() == 1);
+
+   /* Check for WaW dependency. */
+   if (first->definitions[0].physReg() == second->definitions[0].physReg())
+      return false;
+
+   /* Check for RaW dependency. */
+   for (Operand op : second->operands) {
+      assert(op.size() == 1);
+      if (first->definitions[0].physReg() == op.physReg())
+         return false;
+   }
+
+   /* WaR dependencies are not a concern. */
+   return true;
+}
+
 unsigned
 get_latency(const Instruction* const instr)
 {
@@ -138,6 +287,16 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
    bool reorder = can_reorder(instr);
    ctx.active_mask |= mask;
 
+   if (ctx.is_vopd) {
+      VOPDInfo vopd = get_vopd_info(entry.instr);
+
+      ctx.vopd[idx] = vopd;
+      ctx.vopd_odd_mask &= ~mask;
+      ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
+      ctx.vopd_even_mask &= ~mask;
+      ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
+   }
+
    for (const Operand& op : instr->operands) {
       assert(op.isFixed());
       unsigned reg = op.physReg();
@@ -206,8 +365,10 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
          reg_info.has_direct_dependency = 1;
          reg_info.direct_dependency = idx;
 
-         /* Add latency information for the next register read. */
-         reg_info.latency = get_latency(instr);
+         if (!ctx.is_vopd) {
+            /* Add latency information for the next register read. */
+            reg_info.latency = get_latency(instr);
+         }
       }
    }
 
@@ -225,7 +386,7 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
 
       /* Just don't reorder these at all. */
       if (!is_memory_instr(instr) || instr->definitions.empty() ||
-          get_sync_info(instr).semantics & semantic_volatile) {
+          get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
          /* Add all previous instructions as dependencies. */
          entry.dependency_mask = ctx.active_mask;
       }
@@ -343,7 +504,7 @@ collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask
  * Returns the index of the next instruction to be selected.
  */
 unsigned
-select_instruction(const SchedILPContext& ctx)
+select_instruction_ilp(const SchedILPContext& ctx)
 {
    mask_t mask = ctx.active_mask;
 
@@ -377,6 +538,118 @@ select_instruction(const SchedILPContext& ctx)
    return idx;
 }
 
+/**
+ * Returns true if "candidate" should be selected instead of "current". On entry, "*use_vopd"
+ * tells whether "current" can form a VOPD instruction with the previously selected instruction;
+ * it is set to true if "candidate" is picked because it can form one and "current" cannot.
+ */
+bool
+compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
+                   unsigned current, unsigned candidate)
+{
+   if (can_use_vopd(ctx, candidate)) {
+      /* If we can form a VOPD instruction, always prefer to do so. */
+      if (!*use_vopd) {
+         *use_vopd = true;
+         return true;
+      }
+   } else {
+      if (*use_vopd)
+         return false;
+
+      /* Neither current nor candidate can form a VOPD instruction with the previously scheduled
+       * instruction. */
+      VOPDInfo current_vopd = ctx.vopd[current];
+      VOPDInfo candidate_vopd = ctx.vopd[candidate];
+
+      /* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
+      bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
+      bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
+      if (current_vopd_capable != candidate_vopd_capable)
+         return !candidate_vopd_capable;
+
+      /* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
+       * odd/even instructions, in case selecting this instruction fails to make a pair.
+       */
+      if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
+         assert(candidate_vopd_capable);
+         bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
+         if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
+            return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
+      }
+   }
+
+   return ctx.nodes[candidate].priority > ctx.nodes[current].priority;
+}
+
+/**
+ * VOPD counterpart of select_instruction_ilp().
+ * Returns the index of the next instruction to be selected and sets "*use_vopd" to whether it
+ * should be merged with the previously selected instruction into a VOPD instruction.
+ */
+unsigned
+select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
+{
+   *use_vopd = false;
+
+   mask_t mask = ctx.active_mask;
+   if (ctx.next_non_reorderable != UINT8_MAX)
+      mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;
+
+   if (mask == 0)
+      return ctx.next_non_reorderable;
+
+   int num_vopd_odd_minus_even =
+      (int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);
+
+   unsigned cur = -1u;
+   u_foreach_bit (i, mask) {
+      const InstrInfo& candidate = ctx.nodes[i];
+
+      /* Check if the candidate has pending dependencies. */
+      if (candidate.dependency_mask)
+         continue;
+
+      if (cur == -1u) {
+         cur = i;
+         *use_vopd = can_use_vopd(ctx, i);
+      } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
+         cur = i;
+      }
+   }
+
+   assert(cur != -1u);
+   return cur;
+}
+
+/**
+ * Creates a VOPD instruction from the previously selected instruction (ctx.prev_info) and node
+ * "idx". If the previous instruction is OPY-only, the two are swapped so that it becomes OPY.
+ */
+Instruction*
+create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
+{
+   Instruction* x = ctx.prev_info.instr;
+   Instruction* y = ctx.nodes[idx].instr;
+   aco_opcode opx = ctx.prev_vopd_info.op;
+   aco_opcode opy = ctx.vopd[idx].op;
+   if (ctx.prev_vopd_info.is_opy_only) {
+      std::swap(x, y);
+      std::swap(opx, opy);
+   }
+
+   VOPD_instruction* instr = create_instruction<VOPD_instruction>(
+      opx, Format::VOPD, x->operands.size() + y->operands.size(), 2);
+   instr->opy = opy;
+   instr->definitions[0] = x->definitions[0];
+   instr->definitions[1] = y->definitions[0];
+   std::copy(x->operands.begin(), x->operands.end(), instr->operands.begin());
+   std::copy(y->operands.begin(), y->operands.end(),
+             std::next(instr->operands.begin(), x->operands.size()));
+
+   return instr;
+}
+
 template <typename It>
 void
 do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
@@ -389,11 +662,22 @@ do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_
       add_entry(ctx, (remove_it++)->get(), i);
    }
 
+   ctx.prev_info.instr = NULL;
+   bool use_vopd = false;
+
    while (ctx.active_mask) {
-      unsigned next_idx = select_instruction(ctx);
+      unsigned next_idx =
+         ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
       Instruction* next_instr = ctx.nodes[next_idx].instr;
 
-      (insert_it++)->reset(next_instr);
+      if (use_vopd) {
+         std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
+         ctx.prev_info.instr = NULL;
+      } else {
+         (insert_it++)->reset(next_instr);
+         ctx.prev_info = ctx.nodes[next_idx];
+         ctx.prev_vopd_info = ctx.vopd[next_idx];
+      }
 
       remove_entry(ctx, next_instr, next_idx);
       ctx.nodes[next_idx].instr = NULL;
@@ -422,4 +706,25 @@ schedule_ilp(Program* program)
    }
 }
 
+/**
+ * Schedules every block in reverse order with VOPD mode enabled, merging compatible instruction
+ * pairs into VOPD instructions. Does nothing before GFX11 or when the wave size is not 32.
+ */
+void
+schedule_vopd(Program* program)
+{
+   if (program->gfx_level < GFX11 || program->wave_size != 32)
+      return;
+
+   SchedILPContext ctx = {program};
+   ctx.is_vopd = true;
+
+   for (Block& block : program->blocks) {
+      auto it = block.instructions.rbegin();
+      auto insert_it = block.instructions.rbegin();
+      do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
+      block.instructions.erase(block.instructions.begin(), insert_it.base());
+   }
+}
+
 } // namespace aco