From 4579586c66a041a191fa4d45f77ba7bb7413d5ab Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Thu, 1 Aug 2024 14:46:45 +0100
Subject: [PATCH] aco/tests: add tests for VALUReadSGPRHazard

Signed-off-by: Rhys Perry
Reviewed-by: Georg Lehmann
Part-of:
---
 src/amd/compiler/tests/test_insert_nops.cpp | 227 ++++++++++++++++++++
 1 file changed, 227 insertions(+)

diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp
index 90cce653152..f487d6484c1 100644
--- a/src/amd/compiler/tests/test_insert_nops.cpp
+++ b/src/amd/compiler/tests/test_insert_nops.cpp
@@ -1505,6 +1505,196 @@ BEGIN_TEST(insert_nops.export_priority.set_prio)
    finish_insert_nops_test();
 END_TEST
 
+BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
+   if (!setup_cs(NULL, GFX12))
+      return;
+   /* VALU reads of s4, s7, null, exec_lo, m0 and scc, issued before all cases below */
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(7), s1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(sgpr_null, s1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1));
+
+   /* no hazard: s4 is read by SALU, but the SALU write is missing */
+   //>> p_unit_test 0
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   /* no hazard: s16 is written and read by SALU, but never read by VALU */
+   //! p_unit_test 1
+   //! s1: %0:s[16] = s_mov_b32 0
+   //! s1: %0:s[64] = s_mov_b32 %0:s[16]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(16), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(16), s1));
+
+   /* basic case: SALU write to s4 followed by a SALU read of s4 */
+   //! p_unit_test 2
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   /* basic case again: VALU reads never expire, so a wait is needed again */
+   //! p_unit_test 3
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   /* sa_sdst(0) resolves the hazard: for later reads (4), and when already present (5) */
+   //! p_unit_test 4
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   //! p_unit_test 5
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   /* basic case: SALU write to s4 followed by a VALU read of s4 */
+   //! p_unit_test 6
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! v1: %0:v[0] = v_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+
+   /* the SALU write (s6) is in the same SGPR pair as the earlier VALU read (s7) */
+   //! p_unit_test 7
+   //! s1: %0:s[6] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[6]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(6), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
+
+   /* no hazard: null, exec_lo, m0 and scc are not problematic (cases 8-11) */
+   //! p_unit_test 8
+   //! s1: %0:null = s_mov_b32 0
+   //! s1: %0:s[64] = s_mov_b32 %0:null
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(sgpr_null, s1));
+
+   //! p_unit_test 9
+   //! s1: %0:exec_lo = s_mov_b32 0
+   //! s1: %0:s[64] = s_mov_b32 %0:exec_lo
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(exec_lo, s1));
+
+   //! p_unit_test 10
+   //! s1: %0:m0 = s_mov_b32 0
+   //! s1: %0:s[64] = s_mov_b32 %0:m0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(m0, s1));
+
+   //! p_unit_test 11
+   //! s1: %0:scc = s_cmp_lg_i32 0, 0
+   //! s1: %0:s[64] = s_mov_b32 %0:scc
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
+   bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand::zero(4), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(scc, s1));
+
+   /* 11 SALU between the write and a VALU read expire the hazard (12), 10 do not (13) */
+   //! p_unit_test 12
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(11): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! v1: %0:v[0] = v_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 11; i++)
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+
+   //! p_unit_test 13
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! v1: %0:v[0] = v_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 10; i++)
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+
+   /* 10 SALU between the write and a SALU read expire the hazard (14), 9 do not (15) */
+   //! p_unit_test 14
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 10; i++)
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   //! p_unit_test 15
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 9; i++)
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   /* SOPP in-between the write and the read do not count: 9 SALU + s_nop still need a wait */
+   //! p_unit_test 16
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! s_nop
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 9; i++)
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.sopp(aco_opcode::s_nop, 0);
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   finish_insert_nops_test();
+END_TEST
+
+BEGIN_TEST(insert_nops.valu_read_sgpr.previous_part)
+   if (!setup_cs(NULL, GFX12))
+      return;
+
+   /* Raytracing shaders have a prolog and may also be split into several parts. */
+   program->stage = raytracing_cs;
+
+   /* Despite the SGPR never being read by a VALU in this shader, a sa_sdst(0) is needed. */
+   //>> p_unit_test 0
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
+
+   finish_insert_nops_test();
+END_TEST
+
 BEGIN_TEST(insert_nops.setpc_gfx6)
    if (!setup_cs(NULL, GFX6))
       return;
@@ -1894,5 +2084,42 @@ BEGIN_TEST(insert_nops.setpc_gfx12)
    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
 
+   /* VALUReadSGPRHazard: resolved before s_setpc_b64 unless it has expired */
+   //! p_unit_test 4
+   //! v1: %0:v[0] = v_mov_b32 %0:s[4]
+   //! s1: %0:s[4] = s_mov_b32 0
+   //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
+   //! s_setpc_b64 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
+
+   //! p_unit_test 5
+   //! v1: %0:v[0] = v_mov_b32 %0:s[4]
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! s_waitcnt_depctr va_vdst(0)
+   //! s_setpc_b64 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 10; i++) /* the s_setpc_b64 counts */
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
+
+   //! p_unit_test 6
+   //! v1: %0:v[0] = v_mov_b32 %0:s[4]
+   //! s1: %0:s[4] = s_mov_b32 0
+   //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
+   //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
+   //! s_setpc_b64 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
+   for (unsigned i = 0; i < 9; i++)
+      bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
+   bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
+
    finish_insert_nops_test(true);
 END_TEST