aco/waitcnt: support GFX12 in waitcnt pass
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29225>
This commit is contained in:
@@ -39,7 +39,7 @@ namespace {
|
||||
|
||||
/* Instructions of the same event will finish in-order except for smem
|
||||
* and maybe flat. Instructions of different events may not finish in-order. */
|
||||
enum wait_event : uint16_t {
|
||||
enum wait_event : uint32_t {
|
||||
event_smem = 1 << 0,
|
||||
event_lds = 1 << 1,
|
||||
event_gds = 1 << 2,
|
||||
@@ -53,10 +53,12 @@ enum wait_event : uint16_t {
|
||||
event_vmem_gpr_lock = 1 << 10,
|
||||
event_sendmsg = 1 << 11,
|
||||
event_ldsdir = 1 << 12,
|
||||
event_valu = 1 << 13,
|
||||
event_trans = 1 << 14,
|
||||
event_salu = 1 << 15,
|
||||
num_events = 16,
|
||||
event_vmem_sample = 1 << 13, /* GFX12+ */
|
||||
event_vmem_bvh = 1 << 14, /* GFX12+ */
|
||||
event_valu = 1 << 15,
|
||||
event_trans = 1 << 16,
|
||||
event_salu = 1 << 17,
|
||||
num_events = 18,
|
||||
};
|
||||
|
||||
enum counter_type : uint8_t {
|
||||
@@ -64,6 +66,9 @@ enum counter_type : uint8_t {
|
||||
counter_lgkm = 1 << wait_type_lgkm,
|
||||
counter_vm = 1 << wait_type_vm,
|
||||
counter_vs = 1 << wait_type_vs,
|
||||
counter_sample = 1 << wait_type_sample,
|
||||
counter_bvh = 1 << wait_type_bvh,
|
||||
counter_km = 1 << wait_type_km,
|
||||
counter_alu = 1 << wait_type_num,
|
||||
num_counters = wait_type_num + 1,
|
||||
wait_counters = BITFIELD_MASK(wait_type_num),
|
||||
@@ -162,11 +167,11 @@ struct alu_delay_info {
|
||||
struct wait_entry {
|
||||
wait_imm imm;
|
||||
alu_delay_info delay;
|
||||
uint16_t events; /* use wait_event notion */
|
||||
uint32_t events; /* use wait_event notion */
|
||||
uint8_t counters; /* use counter_type notion */
|
||||
bool wait_on_read : 1;
|
||||
bool logical : 1;
|
||||
uint8_t vmem_types : 4;
|
||||
uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */
|
||||
|
||||
wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_,
|
||||
bool logical_, bool wait_on_read_)
|
||||
@@ -243,6 +248,12 @@ struct target_info {
|
||||
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
|
||||
events[wait_type_vm] = event_vmem | event_flat;
|
||||
events[wait_type_vs] = event_vmem_store;
|
||||
if (gfx_level >= GFX12) {
|
||||
events[wait_type_sample] = event_vmem_sample;
|
||||
events[wait_type_bvh] = event_vmem_bvh;
|
||||
events[wait_type_km] = event_smem | event_sendmsg;
|
||||
events[wait_type_lgkm] &= ~events[wait_type_km];
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < wait_type_num; i++) {
|
||||
u_foreach_bit (j, events[i])
|
||||
@@ -339,10 +350,12 @@ struct wait_ctx {
|
||||
};
|
||||
|
||||
uint8_t
|
||||
get_vmem_type(Instruction* instr)
|
||||
get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
|
||||
return vmem_bvh;
|
||||
else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
|
||||
return vmem_sampler;
|
||||
else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
|
||||
instr->operands[1].regClass() == s4)
|
||||
return vmem_sampler;
|
||||
@@ -351,6 +364,17 @@ get_vmem_type(Instruction* instr)
|
||||
return 0;
|
||||
}
|
||||
|
||||
wait_event
|
||||
get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type)
|
||||
{
|
||||
if (instr->definitions.empty() && ctx.gfx_level >= GFX10)
|
||||
return event_vmem_store;
|
||||
wait_event ev = event_vmem;
|
||||
if (ctx.gfx_level >= GFX12 && type != vmem_nosampler)
|
||||
ev = type == vmem_bvh ? event_vmem_bvh : event_vmem_sample;
|
||||
return ev;
|
||||
}
|
||||
|
||||
void
|
||||
check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
|
||||
{
|
||||
@@ -383,10 +407,11 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
|
||||
wait_imm reg_imm = it->second.imm;
|
||||
|
||||
/* Vector Memory reads and writes return in the order they were issued */
|
||||
uint8_t vmem_type = get_vmem_type(instr);
|
||||
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
|
||||
if (vmem_type) {
|
||||
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event_vmem)) - 1);
|
||||
if ((it->second.events & ctx.info->events[type]) == event_vmem &&
|
||||
uint32_t event = get_vmem_event(ctx, instr, vmem_type);
|
||||
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
|
||||
if ((it->second.events & ctx.info->events[type]) == event &&
|
||||
(type != wait_type_vm || it->second.vmem_types == vmem_type))
|
||||
reg_imm[type] = wait_imm::unset_counter;
|
||||
}
|
||||
@@ -701,7 +726,8 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo
|
||||
|
||||
wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear,
|
||||
wait_on_read);
|
||||
new_entry.vmem_types |= vmem_types;
|
||||
if (counters & counter_vm)
|
||||
new_entry.vmem_types |= vmem_types;
|
||||
|
||||
for (unsigned i = 0; i < rc.size(); i++) {
|
||||
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
|
||||
@@ -835,12 +861,13 @@ gen(Instruction* instr, wait_ctx& ctx)
|
||||
case Format::MIMG:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH: {
|
||||
wait_event ev =
|
||||
!instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
|
||||
uint8_t type = get_vmem_type(ctx.gfx_level, instr);
|
||||
wait_event ev = get_vmem_event(ctx, instr, type);
|
||||
|
||||
update_counters(ctx, ev, get_sync_info(instr));
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));
|
||||
insert_wait_entry(ctx, instr->definitions[0], ev, type);
|
||||
|
||||
if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
|
||||
update_counters(ctx, event_vmem_gpr_lock);
|
||||
@@ -872,18 +899,42 @@ gen(Instruction* instr, wait_ctx& ctx)
|
||||
void
|
||||
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
|
||||
{
|
||||
if (imm.vs != wait_imm::unset_counter) {
|
||||
assert(ctx.gfx_level >= GFX10);
|
||||
Instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 1, 0);
|
||||
waitcnt_vs->operands[0] = Operand(sgpr_null, s1);
|
||||
waitcnt_vs->salu().imm = imm.vs;
|
||||
instructions.emplace_back(waitcnt_vs);
|
||||
imm.vs = wait_imm::unset_counter;
|
||||
}
|
||||
if (!imm.empty()) {
|
||||
Instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
|
||||
waitcnt->salu().imm = imm.pack(ctx.gfx_level);
|
||||
instructions.emplace_back(waitcnt);
|
||||
Builder bld(ctx.program, &instructions);
|
||||
|
||||
if (ctx.gfx_level >= GFX12) {
|
||||
if (imm.vm != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) {
|
||||
bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (imm.vm << 8) | imm.lgkm);
|
||||
imm.vm = wait_imm::unset_counter;
|
||||
imm.lgkm = wait_imm::unset_counter;
|
||||
}
|
||||
|
||||
if (imm.vs != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) {
|
||||
bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (imm.vs << 8) | imm.lgkm);
|
||||
imm.vs = wait_imm::unset_counter;
|
||||
imm.lgkm = wait_imm::unset_counter;
|
||||
}
|
||||
|
||||
aco_opcode op[wait_type_num];
|
||||
op[wait_type_exp] = aco_opcode::s_wait_expcnt;
|
||||
op[wait_type_lgkm] = aco_opcode::s_wait_dscnt;
|
||||
op[wait_type_vm] = aco_opcode::s_wait_loadcnt;
|
||||
op[wait_type_vs] = aco_opcode::s_wait_storecnt;
|
||||
op[wait_type_sample] = aco_opcode::s_wait_samplecnt;
|
||||
op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt;
|
||||
op[wait_type_km] = aco_opcode::s_wait_kmcnt;
|
||||
|
||||
for (unsigned i = 0; i < wait_type_num; i++) {
|
||||
if (imm[i] != wait_imm::unset_counter)
|
||||
bld.sopp(op[i], imm[i]);
|
||||
}
|
||||
} else {
|
||||
if (imm.vs != wait_imm::unset_counter) {
|
||||
assert(ctx.gfx_level >= GFX10);
|
||||
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), imm.vs);
|
||||
imm.vs = wait_imm::unset_counter;
|
||||
}
|
||||
if (!imm.empty())
|
||||
bld.sopp(aco_opcode::s_waitcnt, imm.pack(ctx.gfx_level));
|
||||
}
|
||||
imm = wait_imm();
|
||||
}
|
||||
|
@@ -179,3 +179,356 @@ BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.lds)
|
||||
|
||||
finish_waitcnt_test();
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.waw.vmem_types)
|
||||
for (amd_gfx_level gfx : {GFX11, GFX12}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
Definition def_v4(PhysReg(260), v1);
|
||||
Operand op_v0(PhysReg(256), v1);
|
||||
Operand desc_s4(PhysReg(0), s4);
|
||||
Operand desc_s8(PhysReg(8), s8);
|
||||
|
||||
//>> p_unit_test 0
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
//>> p_unit_test 1
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_loadcnt imm:0
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
|
||||
//>> p_unit_test 2
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_loadcnt imm:0
|
||||
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
|
||||
|
||||
//>> p_unit_test 3
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
|
||||
//>> p_unit_test 4
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_samplecnt imm:0
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
//>> p_unit_test 5
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_samplecnt imm:0
|
||||
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
|
||||
|
||||
//>> p_unit_test 6
|
||||
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
||||
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
|
||||
|
||||
//>> p_unit_test 7
|
||||
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_bvhcnt imm:0
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
//>> p_unit_test 8
|
||||
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_bvhcnt imm:0
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
|
||||
//>> BB9
|
||||
//! /* logical preds: / linear preds: / kind: */
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
//>> BB10
|
||||
//! /* logical preds: / linear preds: / kind: */
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
bld.reset(program->create_and_insert_block());
|
||||
program->blocks[11].linear_preds.push_back(9);
|
||||
program->blocks[11].linear_preds.push_back(10);
|
||||
program->blocks[11].logical_preds.push_back(9);
|
||||
program->blocks[11].logical_preds.push_back(10);
|
||||
|
||||
//>> BB11
|
||||
//! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */
|
||||
//! p_unit_test 9
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
//>> BB12
|
||||
//! /* logical preds: / linear preds: / kind: */
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
|
||||
//>> BB13
|
||||
//! /* logical preds: / linear preds: / kind: */
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
bld.reset(program->create_and_insert_block());
|
||||
program->blocks[14].linear_preds.push_back(12);
|
||||
program->blocks[14].linear_preds.push_back(13);
|
||||
program->blocks[14].logical_preds.push_back(12);
|
||||
program->blocks[14].logical_preds.push_back(13);
|
||||
|
||||
//>> BB14
|
||||
//! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */
|
||||
//! p_unit_test 10
|
||||
//~gfx11! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_samplecnt imm:0
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.vmem)
|
||||
if (!setup_cs(NULL, GFX12))
|
||||
return;
|
||||
|
||||
Definition def_v4(PhysReg(260), v1);
|
||||
Definition def_v5(PhysReg(261), v1);
|
||||
Definition def_v6(PhysReg(262), v1);
|
||||
Definition def_v7(PhysReg(263), v1);
|
||||
Definition def_v8(PhysReg(264), v1);
|
||||
Definition def_v9(PhysReg(265), v1);
|
||||
Operand op_v0(PhysReg(256), v1);
|
||||
Operand op_v4(PhysReg(260), v1);
|
||||
Operand op_v5(PhysReg(261), v1);
|
||||
Operand op_v6(PhysReg(262), v1);
|
||||
Operand op_v7(PhysReg(263), v1);
|
||||
Operand op_v8(PhysReg(264), v1);
|
||||
Operand op_v9(PhysReg(265), v1);
|
||||
Operand desc_s4(PhysReg(0), s4);
|
||||
Operand desc_s8(PhysReg(8), s8);
|
||||
|
||||
//>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
//! v1: %0:v[5] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
//! v1: %0:v[6] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d unrm r128
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
bld.mimg(aco_opcode::image_sample, def_v5, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
Instruction* instr =
|
||||
bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v6, desc_s4, Operand(s4), Operand(v1),
|
||||
Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)))
|
||||
.instr;
|
||||
instr->mimg().unrm = true;
|
||||
instr->mimg().r128 = true;
|
||||
|
||||
//! v1: %0:v[7] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
|
||||
//! v1: %0:v[8] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
//! v1: %0:v[9] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d unrm r128
|
||||
bld.mimg(aco_opcode::image_load, def_v7, desc_s8, Operand(s4), Operand(v1), op_v0, 0x1);
|
||||
bld.mimg(aco_opcode::image_sample, def_v8, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
instr = bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v9, desc_s4, Operand(s4),
|
||||
Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)))
|
||||
.instr;
|
||||
instr->mimg().unrm = true;
|
||||
instr->mimg().r128 = true;
|
||||
|
||||
//! s_wait_loadcnt imm:1
|
||||
//! p_unit_test 0, %0:v[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4);
|
||||
//! s_wait_samplecnt imm:1
|
||||
//! p_unit_test 1, %0:v[5]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_v5);
|
||||
//! s_wait_bvhcnt imm:1
|
||||
//! p_unit_test 2, %0:v[6]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v6);
|
||||
//! s_wait_loadcnt imm:0
|
||||
//! p_unit_test 3, %0:v[7]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_v7);
|
||||
//! s_wait_samplecnt imm:0
|
||||
//! p_unit_test 4, %0:v[8]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4), op_v8);
|
||||
//! s_wait_bvhcnt imm:0
|
||||
//! p_unit_test 5, %0:v[9]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5), op_v9);
|
||||
|
||||
/* Despite not using a sampler, this uses samplecnt. */
|
||||
//! v1: %0:v[5] = image_msaa_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
|
||||
//! s_wait_samplecnt imm:0
|
||||
//! p_unit_test 6, %0:v[5]
|
||||
bld.mimg(aco_opcode::image_msaa_load, def_v5, desc_s8, Operand(s4), Operand(v1), op_v0);
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6), op_v5);
|
||||
|
||||
finish_waitcnt_test();
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.lds_smem)
|
||||
for (amd_gfx_level gfx : {GFX11, GFX12}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
Definition def_v4(PhysReg(260), v1);
|
||||
Definition def_v5(PhysReg(261), v1);
|
||||
Definition def_s4(PhysReg(4), s1);
|
||||
Definition def_s5(PhysReg(5), s1);
|
||||
Operand op_s0(PhysReg(0), s1);
|
||||
Operand op_s4(PhysReg(4), s1);
|
||||
Operand op_s5(PhysReg(5), s1);
|
||||
Operand op_v0(PhysReg(256), v1);
|
||||
Operand op_v4(PhysReg(260), v1);
|
||||
Operand op_v5(PhysReg(261), v1);
|
||||
Operand desc_s4(PhysReg(0), s4);
|
||||
|
||||
//>> v1: %0:v[4] = ds_read_b32 %0:v[0]
|
||||
//! s1: %0:s[4] = s_buffer_load_dword %0:s[0-3], %0:s[0]
|
||||
//! v1: %0:v[5] = ds_read_b32 %0:v[0]
|
||||
//! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0]
|
||||
bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
|
||||
bld.smem(aco_opcode::s_buffer_load_dword, def_s4, desc_s4, op_s0);
|
||||
bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0);
|
||||
bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0);
|
||||
|
||||
//~gfx11! s_waitcnt lgkmcnt(1)
|
||||
//~gfx12! s_wait_dscnt imm:1
|
||||
//! p_unit_test 0, %0:v[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4);
|
||||
//~gfx11! s_waitcnt lgkmcnt(0)
|
||||
//~gfx12! s_wait_kmcnt imm:0
|
||||
//! p_unit_test 1, %0:s[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s4);
|
||||
//~gfx12! s_wait_dscnt imm:0
|
||||
//! p_unit_test 2, %0:v[5]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v5);
|
||||
//! p_unit_test 3, %0:s[5]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s5);
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.sendmsg_smem)
|
||||
for (amd_gfx_level gfx : {GFX11, GFX12}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
Definition def_s4(PhysReg(4), s1);
|
||||
Definition def_s5(PhysReg(5), s1);
|
||||
Definition def_s6(PhysReg(6), s1);
|
||||
Definition def_s7(PhysReg(7), s1);
|
||||
Operand op_s0(PhysReg(0), s1);
|
||||
Operand op_s4(PhysReg(4), s1);
|
||||
Operand op_s5(PhysReg(5), s1);
|
||||
Operand op_s6(PhysReg(6), s1);
|
||||
Operand op_s7(PhysReg(7), s1);
|
||||
Operand desc_s4(PhysReg(0), s4);
|
||||
|
||||
//>> s1: %0:s[4] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime)
|
||||
//! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0]
|
||||
//! s1: %0:s[6] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime)
|
||||
//! s1: %0:s[7] = s_buffer_load_dword %0:s[0-3], %0:s[0]
|
||||
bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s4, Operand::c32(sendmsg_rtn_get_realtime));
|
||||
bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0);
|
||||
bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s6, Operand::c32(sendmsg_rtn_get_realtime));
|
||||
bld.smem(aco_opcode::s_buffer_load_dword, def_s7, desc_s4, op_s0);
|
||||
|
||||
//~gfx12! s_wait_kmcnt imm:1
|
||||
//~gfx11! s_waitcnt lgkmcnt(1)
|
||||
//! p_unit_test 0, %0:s[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_s4);
|
||||
//~gfx12! s_wait_kmcnt imm:0
|
||||
//~gfx11! s_waitcnt lgkmcnt(0)
|
||||
//! p_unit_test 1, %0:s[5]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s5);
|
||||
//! p_unit_test 2, %0:s[6]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_s6);
|
||||
//! p_unit_test 3, %0:s[7]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s7);
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.vmem_ds)
|
||||
if (!setup_cs(NULL, GFX12))
|
||||
return;
|
||||
|
||||
Definition def_v4(PhysReg(260), v1);
|
||||
Definition def_v5(PhysReg(261), v1);
|
||||
Operand op_v0(PhysReg(256), v1);
|
||||
Operand op_v1(PhysReg(257), v1);
|
||||
Operand op_v4(PhysReg(260), v1);
|
||||
Operand op_v5(PhysReg(261), v1);
|
||||
Operand desc_s4(PhysReg(0), s4);
|
||||
|
||||
program->workgroup_size = 128;
|
||||
program->wgp_mode = true;
|
||||
|
||||
//>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
//! v1: %0:v[5] = ds_read_b32 %0:v[0]
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
||||
bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0);
|
||||
|
||||
//! s_wait_loadcnt_dscnt dscnt(0) loadcnt(0)
|
||||
//! p_unit_test 0, %0:v[4], %0:v[5]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4, op_v5);
|
||||
|
||||
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[1] storage:buffer
|
||||
//! v1: %0:v[5] = ds_write_b32 %0:v[0], %0:v[1] storage:shared
|
||||
Instruction* instr =
|
||||
bld.mubuf(aco_opcode::buffer_store_dword, desc_s4, op_v0, Operand::zero(), op_v1, 0, false)
|
||||
.instr;
|
||||
instr->mubuf().sync = memory_sync_info(storage_buffer);
|
||||
instr = bld.ds(aco_opcode::ds_write_b32, def_v5, op_v0, op_v1).instr;
|
||||
instr->ds().sync = memory_sync_info(storage_shared);
|
||||
|
||||
//! s_wait_storecnt_dscnt dscnt(0) storecnt(0)
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup));
|
||||
|
||||
finish_waitcnt_test();
|
||||
END_TEST
|
||||
|
Reference in New Issue
Block a user