freedreno: Add compute_lb_size device info

This is really a guess except for a6xx and later; however, it shouldn't
change behavior from before.

Fixes: 5879eaac18 ("ir3: Increase compute const size on a7xx")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34746>
(cherry picked from commit 156ab5839d045ea291a47789014ce61ddbad0804)
This commit is contained in:
Connor Abbott
2025-04-28 18:15:55 -04:00
committed by Eric Engestrom
parent 84f1dcdc2a
commit f6450df88f
5 changed files with 31 additions and 3 deletions

View File

@@ -84,7 +84,7 @@
"description": "freedreno: Add compute_lb_size device info",
"nominated": true,
"nomination_type": 2,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": "5879eaac185ed1c167fd01aff9b91c7cbe43ab0a",
"notes": null

View File

@@ -51,6 +51,18 @@ struct fd_dev_info {
uint32_t max_waves;
/* Local Memory (i.e. shared memory in GL/Vulkan) and compute shader
* const registers, as well as other things not relevant here, share the
* same storage space, called the Local Buffer or LB. This is the size of
* the part of the LB used for consts and LM. Consts are duplicated
* wavesize_granularity times, and the size of duplicated consts + local
* memory must not exceed it. If it is left 0, assume that it is
* wavesize_granularity * compute constlen + cs_shared_mem_size, which is
* enough to hold both the maximum possible compute consts and local
* memory at the same time.
*/
uint32_t compute_lb_size;
/* number of CCU is always equal to the number of SP */
union {
uint32_t num_sp_cores;

View File

@@ -103,7 +103,7 @@ class GPUInfo(Struct):
tile_max_w, tile_max_h, num_vsc_pipes,
cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp,
highest_bank_bit = 0, ubwc_swizzle = 0x7, macrotile_mode = 0,
threadsize_base = 64, max_waves = 16):
threadsize_base = 64, max_waves = 16, compute_lb_size = 0):
self.chip = chip.value
self.gmem_align_w = gmem_align_w
self.gmem_align_h = gmem_align_h
@@ -139,9 +139,13 @@ class A6xxGPUInfo(GPUInfo):
if chip == CHIP.A6XX:
tile_max_w = 1024 # max_bitfield_val(5, 0, 5)
tile_max_h = max_bitfield_val(14, 8, 4) # 1008
compute_lb_size = 0
else:
tile_max_w = 1728
tile_max_h = 1728
# on a7xx the compute_lb_size is 40KB for all known parts for now.
# We have a parameter for it in case some low-end parts cut it down.
compute_lb_size = 40 * 1024
super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
tile_align_w = tile_align_w,
@@ -157,7 +161,8 @@ class A6xxGPUInfo(GPUInfo):
ubwc_swizzle = ubwc_swizzle,
macrotile_mode = macrotile_mode,
threadsize_base = threadsize_base,
max_waves = max_waves)
max_waves = max_waves,
compute_lb_size = compute_lb_size)
self.num_ccu = num_ccu

View File

@@ -263,6 +263,14 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->has_early_preamble = false;
}
if (dev_info->compute_lb_size) {
compiler->compute_lb_size = dev_info->compute_lb_size;
} else {
compiler->compute_lb_size =
compiler->max_const_compute * 16 /* bytes/vec4 */ *
compiler->wave_granularity + compiler->local_mem_size;
}
/* This is just a guess for a4xx. */
compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
/* TODO: implement private memory on earlier gens */

View File

@@ -129,6 +129,9 @@ struct ir3_compiler {
/* The maximum number of constants, in vec4's, for compute shaders. */
uint16_t max_const_compute;
/* See fd_dev_info::compute_lb_size. */
uint32_t compute_lb_size;
/* Number of instructions that the shader's base address and length
* (instrlen divides instruction count by this) must be aligned to.
*/