diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a86307220db..d66ac31a448 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -12098,6 +12098,30 @@ load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned return count; } +void +wait_for_smem_loads(Builder& bld) +{ + if (bld.program->gfx_level >= GFX12) { + bld.sopp(aco_opcode::s_wait_kmcnt, 0); + } else { + wait_imm lgkm_imm; + lgkm_imm.lgkm = 0; + bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(bld.program->gfx_level)); + } +} + +void +wait_for_vmem_loads(Builder& bld) +{ + if (bld.program->gfx_level >= GFX12) { + bld.sopp(aco_opcode::s_wait_loadcnt, 0); + } else { + wait_imm vm_imm; + vm_imm.vm = 0; + bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(bld.program->gfx_level)); + } +} + Operand calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args, const struct aco_vs_prolog_info* pinfo, unsigned index, @@ -12107,9 +12131,7 @@ calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args, bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2), get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u)); - wait_imm lgkm_imm; - lgkm_imm.lgkm = 0; - bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(bld.program->gfx_level)); + wait_for_smem_loads(bld); Definition fetch_index_def(tmp_vgpr0, v1); Operand fetch_index(tmp_vgpr0, v1); @@ -12489,9 +12511,7 @@ convert_current_unaligned_vs_attribs(Builder& bld, UnalignedVsAttribLoadState* s if (state->current_loads.empty()) return; - wait_imm vm_imm; - vm_imm.vm = 0; - bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(bld.program->gfx_level)); + wait_for_vmem_loads(bld); for (UnalignedVsAttribLoad load : state->current_loads) convert_unaligned_vs_attrib(bld, load); @@ -12577,9 +12597,6 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes); bool has_nontrivial_divisors = pinfo->nontrivial_divisors; - wait_imm lgkm_imm; - lgkm_imm.lgkm = 0; - /* choose sgprs */ PhysReg vertex_buffers(align(max_user_sgprs + 14, 2)); PhysReg prolog_input = vertex_buffers.advance(8); @@ -12682,7 +12699,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance); } - bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(program->gfx_level)); + wait_for_smem_loads(bld); for (unsigned i = 0; i < num_descs;) { PhysReg dest(attributes_start.reg() + loc * 4u); @@ -12773,11 +12790,8 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh convert_current_unaligned_vs_attribs(bld, &unaligned_state); - if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi) { - wait_imm vm_imm; - vm_imm.vm = 0; - bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(program->gfx_level)); - } + if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi) + wait_for_vmem_loads(bld); /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. * so we may need to fix it up. */ @@ -12817,7 +12831,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh if (has_nontrivial_divisors) { bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2), get_arg_fixed(args, pinfo->inputs), Operand::c32(0u)); - bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(program->gfx_level)); + wait_for_smem_loads(bld); continue_pc = Operand(prolog_input, s2); }