freedreno/computerator: Fix remaining issues with A7XX

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23217>
This commit is contained in:
Danylo Piliaiev
2023-04-17 15:39:09 +02:00
committed by Marge Bot
parent b0ea4883f0
commit 7e10a175c7
3 changed files with 61 additions and 10 deletions

View File

@@ -119,11 +119,13 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
struct ir3_shader_variant *v = ir3_kernel->v;
const unsigned *local_size = kernel->local_size;
const struct ir3_info *i = &v->info;
enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1);
OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
OUT_REG(ring, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
.isammode = ISAMMODE_GL,
.shared_consts_enable = false));
OUT_PKT4(ring, REG_A6XX_SP_PERFCTR_ENABLE, 1);
OUT_RING(ring, A6XX_SP_PERFCTR_ENABLE_CS);
@@ -168,6 +170,14 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
COND(ir3_kernel->info.early_preamble, A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE) |
A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
if (CHIP == A7XX) {
OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
OUT_REG(ring, HLSQ_CONTROL_2_REG(CHIP, .dword = 0xfcfcfcfc),
HLSQ_CONTROL_3_REG(CHIP, .dword = 0xfcfcfcfc),
HLSQ_CONTROL_4_REG(CHIP, .dword = 0xfcfcfcfc),
HLSQ_CONTROL_5_REG(CHIP, .dword = 0x0000fc00), );
}
OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(1) |
@@ -192,16 +202,28 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
} else {
enum a7xx_cs_yalign yalign = (local_size[1] % 8 == 0) ? CS_YALIGN_8
: (local_size[1] % 4 == 0) ? CS_YALIGN_4
: (local_size[1] % 2 == 0) ? CS_YALIGN_2
: CS_YALIGN_1;
OUT_REG(ring, A7XX_HLSQ_CS_CNTL_1(.linearlocalidregid = regid(63, 0),
.threadsize = thrsz,
.unk11 = true,
.unk22 = true,
.yalign = yalign, ));
}
if (CHIP == A7XX || a6xx_backend->info->a6xx.has_lpac) {
OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 1);
OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
OUT_REG(ring,
SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz, ));
}
OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
@@ -463,6 +485,12 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
.localsizey = local_size[1] - 1,
.localsizez = local_size[2] - 1,
));
if (CHIP == A7XX) {
OUT_REG(ring, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = local_size[0] - 1,
.localsizey = local_size[1] - 1,
.localsizez = local_size[2] - 1, ));
}
OUT_REG(ring, HLSQ_CS_NDRANGE_1(CHIP,
.globalsize_x = local_size[0] * num_groups[0],
));

View File

@@ -29,8 +29,13 @@ computerator_files = [
computerator_cpp_args = cpp.get_supported_arguments([
'-Wno-sign-compare',
'-Wno-array-bounds',
])
if meson.is_cross_build()
computerator_cpp_args += '-Wno-array-bounds'
endif
computerator = executable(
'computerator',
computerator_files,

View File

@@ -3684,7 +3684,7 @@ to upconvert to 32b float internally?
<bitfield name="LOCALIDREGID" low="24" high="31" type="a3xx_regid"/>
</reg32>
<!-- new in a6xx gen4, matches HLSQ_CS_CNTL_1 -->
<reg32 offset="0xa9c3" name="SP_CS_CNTL_1" usage="cmd">
<reg32 offset="0xa9c3" name="SP_CS_CNTL_1" variants="A6XX" usage="cmd">
<!-- gl_LocalInvocationIndex -->
<bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
<!-- a650 has 6 "SP cores" (but 3 "SP"). this makes it use only
@@ -3694,11 +3694,20 @@ to upconvert to 32b float internally?
<bitfield name="THREADSIZE" pos="9" type="a6xx_threadsize"/>
<!-- 1 thread per wave (ignored if bit9 set) -->
<bitfield name="THREADSIZE_SCALAR" pos="10" type="boolean"/>
</reg32>
<reg32 offset="0xa9c3" name="SP_CS_CNTL_1" variants="A7XX-" usage="cmd">
<!-- gl_LocalInvocationIndex -->
<bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
<!-- Must match SP_CS_CTRL -->
<bitfield name="THREADSIZE" pos="8" type="a6xx_threadsize"/>
<!-- 1 thread per wave (would hang if THREAD128 is also set) -->
<bitfield name="THREADSIZE_SCALAR" pos="9" type="boolean"/>
<!-- Affects getone. If enabled, getone is sometimes executed one (?) fewer
time than there are subgroups.
-->
<bitfield name="UNK15" pos="15" type="boolean" variants="A7XX"/>
<bitfield name="UNK15" pos="15" type="boolean"/>
</reg32>
<!-- TODO: two 64kb aligned addresses at a9d0/a9d2 -->
@@ -4146,13 +4155,22 @@ to upconvert to 32b float internally?
<reg32 offset="0xa9dd" name="HLSQ_CS_KERNEL_GROUP_Y" variants="A7XX-" usage="rp_blit"/>
<reg32 offset="0xa9de" name="HLSQ_CS_KERNEL_GROUP_Z" variants="A7XX-" usage="rp_blit"/>
<reg32 offset="0xa9db" name="HLSQ_CS_UNKNOWN_A9DB" variants="A7XX-" usage="rp_blit">
<!-- Encodings for the YALIGN bitfield of HLSQ_CS_CNTL_1. Note that the
numeric suffix of each name and its encoded value run in opposite
directions: CS_YALIGN_1 is encoded as 8, CS_YALIGN_8 as 1. -->
<enum name="a7xx_cs_yalign">
<value name="CS_YALIGN_1" value="8"/>
<value name="CS_YALIGN_2" value="4"/>
<value name="CS_YALIGN_4" value="2"/>
<value name="CS_YALIGN_8" value="1"/>
</enum>
<reg32 offset="0xa9db" name="HLSQ_CS_CNTL_1" variants="A7XX-" usage="rp_blit">
<!-- gl_LocalInvocationIndex -->
<bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
<!-- Must match SP_CS_CTRL -->
<bitfield name="THREADSIZE" pos="9" type="a6xx_threadsize"/>
<bitfield name="UNK11" pos="11" type="boolean"/>
<bitfield name="UNK22" pos="22" type="boolean"/>
<bitfield name="UNK27" low="27" high="30" type="uint" variants="A7XX"/>
<!-- TODO: other bits -->
<bitfield name="UNK26" pos="26" type="boolean"/>
<bitfield name="YALIGN" low="27" high="30" type="a7xx_cs_yalign"/>
</reg32>
<reg32 offset="0xa9df" name="HLSQ_CS_LOCAL_SIZE" variants="A7XX-" usage="cmd">