aco: resolve all possible hazards at the end of shader parts
fossil-db (vega10): Totals from 1266 (2.01% of 63055) affected shaders: Instrs: 707116 -> 708382 (+0.18%) CodeSize: 3512452 -> 3517516 (+0.14%) Latency: 6661724 -> 6666788 (+0.08%) InvThroughput: 4393626 -> 4393904 (+0.01%); split: -0.00%, +0.01% fossil-db (navi10): Totals from 1305 (2.07% of 63015) affected shaders: Instrs: 719699 -> 722009 (+0.32%) CodeSize: 3650836 -> 3660076 (+0.25%) Latency: 5691633 -> 5693933 (+0.04%) InvThroughput: 1532010 -> 1532024 (+0.00%); split: -0.00%, +0.00% fossil-db (navi31): Totals from 1580 (1.99% of 79332) affected shaders: Instrs: 1678242 -> 1679879 (+0.10%) CodeSize: 8463464 -> 8470168 (+0.08%) Latency: 14273661 -> 14275298 (+0.01%) InvThroughput: 3668049 -> 3668080 (+0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25374>
This commit is contained in:
@@ -722,6 +722,97 @@ handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
|
||||||
|
{
|
||||||
|
if (pred->isVINTRP())
|
||||||
|
global_state = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool Salu, bool Sgpr>
|
||||||
|
bool
|
||||||
|
handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
|
||||||
|
{
|
||||||
|
if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
|
||||||
|
for (Definition dst : pred->definitions) {
|
||||||
|
if ((dst.physReg().reg() < 256) == Sgpr) {
|
||||||
|
global_state = MAX2(global_state, block_state);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
block_state -= get_wait_states(pred);
|
||||||
|
return block_state <= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool Salu, bool Sgpr>
|
||||||
|
void
|
||||||
|
handle_wr_hazard(State& state, int* NOPs, int min_states)
|
||||||
|
{
|
||||||
|
if (*NOPs >= min_states)
|
||||||
|
return;
|
||||||
|
|
||||||
|
int global = 0;
|
||||||
|
int block = min_states;
|
||||||
|
search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
|
||||||
|
*NOPs = MAX2(*NOPs, global);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
|
||||||
|
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||||
|
{
|
||||||
|
int NOPs = 0;
|
||||||
|
|
||||||
|
/* SGPR->SMEM hazards */
|
||||||
|
if (state.program->gfx_level == GFX6) {
|
||||||
|
handle_wr_hazard<true, true>(state, &NOPs, 4);
|
||||||
|
handle_wr_hazard<false, true>(state, &NOPs, 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Break up SMEM clauses */
|
||||||
|
if (ctx.smem_clause || ctx.smem_write)
|
||||||
|
NOPs = MAX2(NOPs, 1);
|
||||||
|
|
||||||
|
/* SALU/GDS hazards */
|
||||||
|
NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
|
||||||
|
if (state.program->gfx_level == GFX9)
|
||||||
|
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
|
||||||
|
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
|
||||||
|
|
||||||
|
/* VALU hazards */
|
||||||
|
NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz);
|
||||||
|
NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz);
|
||||||
|
NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
|
||||||
|
if (state.program->gfx_level >= GFX8)
|
||||||
|
handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
|
||||||
|
NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
|
||||||
|
if (state.program->gfx_level == GFX6) {
|
||||||
|
/* VINTRP->v_readlane_b32/etc */
|
||||||
|
bool vintrp = false;
|
||||||
|
search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
|
||||||
|
if (vintrp)
|
||||||
|
NOPs = MAX2(NOPs, 1);
|
||||||
|
}
|
||||||
|
NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
|
||||||
|
|
||||||
|
/* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
|
||||||
|
handle_wr_hazard<false, true>(state, &NOPs, 5);
|
||||||
|
|
||||||
|
NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
|
||||||
|
|
||||||
|
if (state.program->gfx_level == GFX9)
|
||||||
|
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
|
||||||
|
|
||||||
|
ctx.add_wait_states(NOPs);
|
||||||
|
if (NOPs) {
|
||||||
|
Builder bld(state.program, &new_instructions);
|
||||||
|
bld.sopp(aco_opcode::s_nop, -1, NOPs - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <std::size_t N>
|
template <std::size_t N>
|
||||||
bool
|
bool
|
||||||
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
|
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
|
||||||
@@ -1004,6 +1095,66 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
|
||||||
|
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||||
|
{
|
||||||
|
Builder bld(state.program, &new_instructions);
|
||||||
|
|
||||||
|
size_t prev_count = new_instructions.size();
|
||||||
|
|
||||||
|
/* VcmpxPermlaneHazard */
|
||||||
|
if (ctx.has_VOPC_write_exec) {
|
||||||
|
ctx.has_VOPC_write_exec = false;
|
||||||
|
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
|
||||||
|
|
||||||
|
/* VALU mitigates VMEMtoScalarWriteHazard. */
|
||||||
|
ctx.sgprs_read_by_VMEM.reset();
|
||||||
|
ctx.sgprs_read_by_DS.reset();
|
||||||
|
ctx.sgprs_read_by_VMEM_store.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned waitcnt_depctr = 0xffff;
|
||||||
|
|
||||||
|
/* VMEMtoScalarWriteHazard */
|
||||||
|
if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
|
||||||
|
ctx.sgprs_read_by_VMEM_store.any()) {
|
||||||
|
ctx.sgprs_read_by_VMEM.reset();
|
||||||
|
ctx.sgprs_read_by_DS.reset();
|
||||||
|
ctx.sgprs_read_by_VMEM_store.reset();
|
||||||
|
waitcnt_depctr &= 0xffe3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* VcmpxExecWARHazard */
|
||||||
|
if (ctx.has_nonVALU_exec_read) {
|
||||||
|
ctx.has_nonVALU_exec_read = false;
|
||||||
|
waitcnt_depctr &= 0xfffe;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (waitcnt_depctr != 0xffff)
|
||||||
|
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, waitcnt_depctr);
|
||||||
|
|
||||||
|
/* SMEMtoVectorWriteHazard */
|
||||||
|
if (ctx.sgprs_read_by_SMEM.any()) {
|
||||||
|
ctx.sgprs_read_by_SMEM.reset();
|
||||||
|
bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
|
||||||
|
}
|
||||||
|
|
||||||
|
/* LdsBranchVmemWARHazard */
|
||||||
|
if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
|
||||||
|
bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
|
||||||
|
ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* NSAToVMEMBug/waNsaCannotFollowWritelane */
|
||||||
|
if (ctx.has_NSA_MIMG || ctx.has_writelane) {
|
||||||
|
ctx.has_NSA_MIMG = ctx.has_writelane = false;
|
||||||
|
/* Any instruction resolves these hazards. */
|
||||||
|
if (new_instructions.size() == prev_count)
|
||||||
|
bld.sopp(aco_opcode::s_nop, -1, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
|
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
|
||||||
{
|
{
|
||||||
@@ -1436,11 +1587,91 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
|
||||||
|
{
|
||||||
|
if (parse_vdst_wait(pred) == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (--block_state == 0) {
|
||||||
|
global_state = false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pred->isVALU()) {
|
||||||
|
bool vgpr_rd_or_wr = false;
|
||||||
|
for (Definition def : pred->definitions) {
|
||||||
|
if (def.physReg().reg() >= 256)
|
||||||
|
vgpr_rd_or_wr = true;
|
||||||
|
}
|
||||||
|
for (Operand op : pred->operands) {
|
||||||
|
if (op.physReg().reg() >= 256)
|
||||||
|
vgpr_rd_or_wr = true;
|
||||||
|
}
|
||||||
|
if (vgpr_rd_or_wr) {
|
||||||
|
global_state = false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
|
||||||
|
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||||
|
{
|
||||||
|
Builder bld(state.program, &new_instructions);
|
||||||
|
|
||||||
|
unsigned waitcnt_depctr = 0xffff;
|
||||||
|
|
||||||
|
/* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
|
||||||
|
bool has_vdst0_since_valu = true;
|
||||||
|
unsigned depth = 16;
|
||||||
|
search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
|
||||||
|
state, has_vdst0_since_valu, depth);
|
||||||
|
if (!has_vdst0_since_valu) {
|
||||||
|
waitcnt_depctr &= 0x0fff;
|
||||||
|
ctx.valu_since_wr_by_trans.reset();
|
||||||
|
ctx.trans_since_wr_by_trans.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* VcmpxPermlaneHazard */
|
||||||
|
if (ctx.has_Vcmpx) {
|
||||||
|
ctx.has_Vcmpx = false;
|
||||||
|
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* VALUMaskWriteHazard */
|
||||||
|
if (state.program->wave_size == 64 &&
|
||||||
|
(ctx.sgpr_read_by_valu_as_lanemask.any() ||
|
||||||
|
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) {
|
||||||
|
waitcnt_depctr &= 0xfffe;
|
||||||
|
ctx.sgpr_read_by_valu_as_lanemask.reset();
|
||||||
|
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* LdsDirectVMEMHazard */
|
||||||
|
if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
|
||||||
|
ctx.vgpr_used_by_ds.any()) {
|
||||||
|
waitcnt_depctr &= 0xffe3;
|
||||||
|
ctx.vgpr_used_by_vmem_load.reset();
|
||||||
|
ctx.vgpr_used_by_vmem_store.reset();
|
||||||
|
ctx.vgpr_used_by_ds.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (waitcnt_depctr != 0xffff)
|
||||||
|
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, waitcnt_depctr);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Ctx>
|
template <typename Ctx>
|
||||||
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
|
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
|
||||||
std::vector<aco_ptr<Instruction>>&);
|
std::vector<aco_ptr<Instruction>>&);
|
||||||
|
|
||||||
template <typename Ctx, HandleInstr<Ctx> Handle>
|
template <typename Ctx>
|
||||||
|
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);
|
||||||
|
|
||||||
|
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
|
||||||
void
|
void
|
||||||
handle_block(Program* program, Ctx& ctx, Block& block)
|
handle_block(Program* program, Ctx& ctx, Block& block)
|
||||||
{
|
{
|
||||||
@@ -1455,13 +1686,34 @@ handle_block(Program* program, Ctx& ctx, Block& block)
|
|||||||
block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
|
block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
|
||||||
block.instructions.reserve(state.old_instructions.size());
|
block.instructions.reserve(state.old_instructions.size());
|
||||||
|
|
||||||
|
bool found_end = false;
|
||||||
for (aco_ptr<Instruction>& instr : state.old_instructions) {
|
for (aco_ptr<Instruction>& instr : state.old_instructions) {
|
||||||
Handle(state, ctx, instr, block.instructions);
|
Handle(state, ctx, instr, block.instructions);
|
||||||
|
|
||||||
|
/* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
|
||||||
|
if (instr->opcode == aco_opcode::s_setpc_b64) {
|
||||||
|
block.instructions.emplace_back(std::move(instr));
|
||||||
|
|
||||||
|
std::vector<aco_ptr<Instruction>> resolve_instrs;
|
||||||
|
Resolve(state, ctx, resolve_instrs);
|
||||||
|
block.instructions.insert(std::prev(block.instructions.end()),
|
||||||
|
std::move_iterator(resolve_instrs.begin()),
|
||||||
|
std::move_iterator(resolve_instrs.end()));
|
||||||
|
|
||||||
|
found_end = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
found_end |= instr->opcode == aco_opcode::s_endpgm;
|
||||||
block.instructions.emplace_back(std::move(instr));
|
block.instructions.emplace_back(std::move(instr));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Resolve all possible hazards (we don't know what the shader is concatenated with). */
|
||||||
|
if (block.linear_succs.empty() && !found_end)
|
||||||
|
Resolve(state, ctx, block.instructions);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Ctx, HandleInstr<Ctx> Handle>
|
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
|
||||||
void
|
void
|
||||||
mitigate_hazards(Program* program)
|
mitigate_hazards(Program* program)
|
||||||
{
|
{
|
||||||
@@ -1481,7 +1733,7 @@ mitigate_hazards(Program* program)
|
|||||||
for (unsigned b : program->blocks[idx].linear_preds)
|
for (unsigned b : program->blocks[idx].linear_preds)
|
||||||
loop_block_ctx.join(all_ctx[b]);
|
loop_block_ctx.join(all_ctx[b]);
|
||||||
|
|
||||||
handle_block<Ctx, Handle>(program, loop_block_ctx, program->blocks[idx]);
|
handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);
|
||||||
|
|
||||||
/* We only need to continue if the loop header context changed */
|
/* We only need to continue if the loop header context changed */
|
||||||
if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
|
if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
|
||||||
@@ -1496,7 +1748,7 @@ mitigate_hazards(Program* program)
|
|||||||
for (unsigned b : block.linear_preds)
|
for (unsigned b : block.linear_preds)
|
||||||
ctx.join(all_ctx[b]);
|
ctx.join(all_ctx[b]);
|
||||||
|
|
||||||
handle_block<Ctx, Handle>(program, ctx, block);
|
handle_block<Ctx, Handle, Resolve>(program, ctx, block);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1506,13 +1758,13 @@ void
|
|||||||
insert_NOPs(Program* program)
|
insert_NOPs(Program* program)
|
||||||
{
|
{
|
||||||
if (program->gfx_level >= GFX11)
|
if (program->gfx_level >= GFX11)
|
||||||
mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11>(program);
|
mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program);
|
||||||
else if (program->gfx_level >= GFX10_3)
|
else if (program->gfx_level >= GFX10_3)
|
||||||
; /* no hazards/bugs to mitigate */
|
; /* no hazards/bugs to mitigate */
|
||||||
else if (program->gfx_level >= GFX10)
|
else if (program->gfx_level >= GFX10)
|
||||||
mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program);
|
mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
|
||||||
else
|
else
|
||||||
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program);
|
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace aco
|
} // namespace aco
|
||||||
|
Reference in New Issue
Block a user