freedreno: Add compute_lb_size device info

This is really a guess except for a6xx and later, however it shouldn't change behavior from before. Fixes: 5879eaac18 ("ir3: Increase compute const size on a7xx") Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34746> (cherry picked from commit 156ab5839d045ea291a47789014ce61ddbad0804)
2025-04-28 18:15:55 -04:00
parent 84f1dcdc2a
commit f6450df88f
5 changed files with 31 additions and 3 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -84,7 +84,7 @@
        "description": "freedreno: Add compute_lb_size device info",
        "nominated": true,
        "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": "5879eaac185ed1c167fd01aff9b91c7cbe43ab0a",
        "notes": null
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -51,6 +51,18 @@ struct fd_dev_info {

   uint32_t max_waves;

+   /* Local Memory (i.e. shared memory in GL/Vulkan) and compute shader
+    * const registers, as well as other things not relevant here, share the
+    * same storage space, called the Local Buffer or LB. This is the size of
+    * the part of the LB used for consts and LM. Consts are duplicated
+    * wavesize_granularity times, and the size of duplicated consts + local
+    * memory must not exceed it. If it is left 0, assume that it is
+    * wavesize_granularity * max compute constlen + cs_shared_mem_size, which
+    * is enough to hold both the maximum possible compute consts and local
+    * memory at the same time.
+    */
+   uint32_t compute_lb_size;
+
   /* number of CCU is always equal to the number of SP */
   union {
      uint32_t num_sp_cores;
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -103,7 +103,7 @@ class GPUInfo(Struct):
                 tile_max_w, tile_max_h, num_vsc_pipes,
                 cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp,
                 highest_bank_bit = 0, ubwc_swizzle = 0x7, macrotile_mode = 0,
-                 threadsize_base = 64, max_waves = 16):
+                 threadsize_base = 64, max_waves = 16, compute_lb_size = 0):
        self.chip          = chip.value
        self.gmem_align_w  = gmem_align_w
        self.gmem_align_h  = gmem_align_h
@@ -139,9 +139,13 @@ class A6xxGPUInfo(GPUInfo):
        if chip == CHIP.A6XX:
            tile_max_w   = 1024 # max_bitfield_val(5, 0, 5)
            tile_max_h   = max_bitfield_val(14, 8, 4) # 1008
+            compute_lb_size = 0
        else:
            tile_max_w   = 1728
            tile_max_h   = 1728
+            # on a7xx the compute_lb_size is 40KB for all known parts for now.
+            # We have a parameter for it in case some low-end parts cut it down.
+            compute_lb_size = 40 * 1024

        super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
                         tile_align_w = tile_align_w,
@@ -157,7 +161,8 @@ class A6xxGPUInfo(GPUInfo):
                         ubwc_swizzle = ubwc_swizzle,
                         macrotile_mode = macrotile_mode,
                         threadsize_base    = threadsize_base,
-                         max_waves    = max_waves)
+                         max_waves    = max_waves,
+                         compute_lb_size = compute_lb_size)

        self.num_ccu = num_ccu

--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -263,6 +263,14 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
      compiler->has_early_preamble = false;
   }

+   if (dev_info->compute_lb_size) {
+      compiler->compute_lb_size = dev_info->compute_lb_size;
+   } else {
+      compiler->compute_lb_size =
+         compiler->max_const_compute * 16 /* bytes/vec4 */ *
+         compiler->wave_granularity + compiler->local_mem_size;
+   }
+
   /* This is just a guess for a4xx. */
   compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
   /* TODO: implement private memory on earlier gen's */
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -129,6 +129,9 @@ struct ir3_compiler {
   /* The maximum number of constants, in vec4's, for compute shaders. */
   uint16_t max_const_compute;

+   /* See freedreno_dev_info::compute_lb_size. */
+   uint32_t compute_lb_size;
+
   /* Number of instructions that the shader's base address and length
    * (instrlen divides instruction count by this) must be aligned to.
    */