freedreno,tu,ir3: Move threadsize_base and max_waves to fd_dev_info

Signed-off-by: Valentine Burley <valentine.burley@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29277>
2024-05-17 21:12:49 +00:00
parent 692e1ab2c1
commit c4da848a1a
4 changed files with 30 additions and 21 deletions
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -56,6 +56,10 @@ struct fd_dev_info {
   /* Information for private memory calculations */
   uint32_t fibers_per_sp;

+   uint32_t threadsize_base;
+
+   uint32_t max_waves;
+
   /* number of CCU is always equal to the number of SP */
   union {
      uint32_t num_sp_cores;
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -118,7 +118,8 @@ class GPUInfo(Struct):
    def __init__(self, chip, gmem_align_w, gmem_align_h,
                 tile_align_w, tile_align_h,
                 tile_max_w, tile_max_h, num_vsc_pipes,
-                 cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp):
+                 cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp,
+                 threadsize_base = 64, max_waves = 16):
        self.chip          = chip.value
        self.gmem_align_w  = gmem_align_w
        self.gmem_align_h  = gmem_align_h
@@ -131,6 +132,8 @@ class GPUInfo(Struct):
        self.num_sp_cores  = num_sp_cores
        self.wave_granularity = wave_granularity
        self.fibers_per_sp = fibers_per_sp
+        self.threadsize_base = threadsize_base
+        self.max_waves     = max_waves

        s.gpu_infos.append(self)

@@ -143,7 +146,8 @@ class A6xxGPUInfo(GPUInfo):
    def __init__(self, chip, template, num_ccu,
                 tile_align_w, tile_align_h, num_vsc_pipes,
                 cs_shared_mem_size, wave_granularity, fibers_per_sp,
-                 magic_regs, raw_magic_regs = None):
+                 magic_regs, raw_magic_regs = None, threadsize_base = 64,
+                 max_waves = 16):
        if chip == CHIP.A6XX:
            tile_max_w   = 1024 # max_bitfield_val(5, 0, 5)
            tile_max_h   = max_bitfield_val(14, 8, 4) # 1008
@@ -160,7 +164,9 @@ class A6xxGPUInfo(GPUInfo):
                         cs_shared_mem_size = cs_shared_mem_size,
                         num_sp_cores = num_ccu, # The # of SP cores seems to always match # of CCU
                         wave_granularity   = wave_granularity,
-                         fibers_per_sp      = fibers_per_sp)
+                         fibers_per_sp      = fibers_per_sp,
+                         threadsize_base    = threadsize_base,
+                         max_waves    = max_waves)

        self.num_ccu = num_ccu

@@ -202,6 +208,7 @@ add_gpus([
        num_sp_cores = 0, # TODO
        wave_granularity = 2,
        fibers_per_sp = 0, # TODO
+        threadsize_base = 8, # TODO: Confirm this
    ))

 add_gpus([
@@ -221,6 +228,7 @@ add_gpus([
        num_sp_cores = 0, # TODO
        wave_granularity = 2,
        fibers_per_sp = 0, # TODO
+        threadsize_base = 8,
    ))

 add_gpus([
@@ -238,6 +246,7 @@ add_gpus([
        num_sp_cores = 0, # TODO
        wave_granularity = 2,
        fibers_per_sp = 0, # TODO
+        threadsize_base = 32, # TODO: Confirm this
    ))

 add_gpus([
@@ -255,6 +264,7 @@ add_gpus([
        num_sp_cores = 1,
        wave_granularity = 2,
        fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
+        threadsize_base = 32,
    ))

 add_gpus([
@@ -271,6 +281,7 @@ add_gpus([
        num_sp_cores = 2,
        wave_granularity = 2,
        fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
+        threadsize_base = 32,
    ))

 add_gpus([
@@ -287,6 +298,7 @@ add_gpus([
        num_sp_cores = 4,
        wave_granularity = 2,
        fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
+        threadsize_base = 32,
    ))


--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -155,7 +155,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
   /* TODO see if older GPU's were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = dev_info->wave_granularity;
-   compiler->max_waves = 16;
+   compiler->max_waves = dev_info->max_waves;

   compiler->max_variable_workgroup_size = 1024;

@@ -266,16 +266,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
      compiler->reg_size_vec4 = 96;
   }

-   if (compiler->gen >= 6) {
-      compiler->threadsize_base = 64;
-   } else if (compiler->gen >= 4) {
-      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
-       * 1.1 subgroupSize which is 32.
-       */
-      compiler->threadsize_base = 32;
-   } else {
-      compiler->threadsize_base = 8;
-   }
+   compiler->threadsize_base = dev_info->threadsize_base;

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -661,7 +661,8 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
   p->deviceNodeMask = 0;
   p->deviceLUIDValid = false;

-   p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
+   p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ?
+      pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
   p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
   p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
                                    VK_SUBGROUP_FEATURE_VOTE_BIT |
@@ -778,11 +779,10 @@ static void
 tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
                                      struct vk_properties *p)
 {
-   /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
-   p->minSubgroupSize = 64; /* threadsize_base */
-   p->maxSubgroupSize =
-      pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
-   p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
+   p->minSubgroupSize = pdevice->info->threadsize_base;
+   p->maxSubgroupSize = pdevice->info->a6xx.supports_double_threadsize ?
+      pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
+   p->maxComputeWorkgroupSubgroups = pdevice->info->max_waves;
   p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;

   p->maxInlineUniformBlockSize = MAX_INLINE_UBO_RANGE;
@@ -902,7 +902,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
   props->maxComputeWorkGroupCount[0] =
      props->maxComputeWorkGroupCount[1] =
      props->maxComputeWorkGroupCount[2] = 65535;
-   props->maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024;
+   props->maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ?
+      pdevice->info->threadsize_base * 2 * pdevice->info->max_waves :
+      pdevice->info->threadsize_base * pdevice->info->max_waves;
   props->maxComputeWorkGroupSize[0] =
      props->maxComputeWorkGroupSize[1] =
      props->maxComputeWorkGroupSize[2] = 1024;