diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 4ba6ef75be0..3a54d63b85d 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -56,6 +56,10 @@ struct fd_dev_info {
    /* Information for private memory calculations */
    uint32_t fibers_per_sp;
 
+   uint32_t threadsize_base;
+
+   uint32_t max_waves;
+
    /* number of CCU is always equal to the number of SP */
    union {
       uint32_t num_sp_cores;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index 42e1aaad2cf..2d38d01862b 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -118,7 +118,8 @@ class GPUInfo(Struct):
     def __init__(self, chip, gmem_align_w, gmem_align_h,
                  tile_align_w, tile_align_h,
                  tile_max_w, tile_max_h, num_vsc_pipes,
-                 cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp):
+                 cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp,
+                 threadsize_base = 64, max_waves = 16):
         self.chip = chip.value
         self.gmem_align_w = gmem_align_w
         self.gmem_align_h = gmem_align_h
@@ -131,6 +132,8 @@ class GPUInfo(Struct):
         self.num_sp_cores = num_sp_cores
         self.wave_granularity = wave_granularity
         self.fibers_per_sp = fibers_per_sp
+        self.threadsize_base = threadsize_base
+        self.max_waves = max_waves
 
         s.gpu_infos.append(self)
 
@@ -143,7 +146,8 @@ class A6xxGPUInfo(GPUInfo):
     def __init__(self, chip, template, num_ccu,
                  tile_align_w, tile_align_h, num_vsc_pipes,
                  cs_shared_mem_size, wave_granularity, fibers_per_sp,
-                 magic_regs, raw_magic_regs = None):
+                 magic_regs, raw_magic_regs = None, threadsize_base = 64,
+                 max_waves = 16):
         if chip == CHIP.A6XX:
             tile_max_w = 1024 # max_bitfield_val(5, 0, 5)
             tile_max_h = max_bitfield_val(14, 8, 4) # 1008
@@ -160,7 +164,9 @@ class A6xxGPUInfo(GPUInfo):
                          cs_shared_mem_size = cs_shared_mem_size,
                          num_sp_cores = num_ccu, # The # of SP cores seems to always match # of CCU
                          wave_granularity = wave_granularity,
-                         fibers_per_sp = fibers_per_sp)
+                         fibers_per_sp = fibers_per_sp,
+                         threadsize_base = threadsize_base,
+                         max_waves = max_waves)
 
         self.num_ccu = num_ccu
 
@@ -202,6 +208,7 @@ add_gpus([
         num_sp_cores = 0, # TODO
         wave_granularity = 2,
         fibers_per_sp = 0, # TODO
+        threadsize_base = 8, # TODO: Confirm this
     ))
 
 add_gpus([
@@ -221,6 +228,7 @@ add_gpus([
         num_sp_cores = 0, # TODO
         wave_granularity = 2,
         fibers_per_sp = 0, # TODO
+        threadsize_base = 8,
     ))
 
 add_gpus([
@@ -238,6 +246,7 @@ add_gpus([
         num_sp_cores = 0, # TODO
         wave_granularity = 2,
         fibers_per_sp = 0, # TODO
+        threadsize_base = 32, # TODO: Confirm this
     ))
 
 add_gpus([
@@ -255,6 +264,7 @@ add_gpus([
         num_sp_cores = 1,
         wave_granularity = 2,
         fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
+        threadsize_base = 32,
     ))
 
 add_gpus([
@@ -271,6 +281,7 @@ add_gpus([
         num_sp_cores = 2,
         wave_granularity = 2,
         fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
+        threadsize_base = 32,
     ))
 
 add_gpus([
@@ -287,6 +298,7 @@ add_gpus([
         num_sp_cores = 4,
         wave_granularity = 2,
         fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
+        threadsize_base = 32,
     ))
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 23c033d0972..f01ec9a1813 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -155,7 +155,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
    /* TODO see if older GPU's were different here */
    compiler->branchstack_size = 64;
    compiler->wave_granularity = dev_info->wave_granularity;
-   compiler->max_waves = 16;
+   compiler->max_waves = dev_info->max_waves;
 
    compiler->max_variable_workgroup_size = 1024;
 
@@ -266,16 +266,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->reg_size_vec4 = 96;
    }
 
-   if (compiler->gen >= 6) {
-      compiler->threadsize_base = 64;
-   } else if (compiler->gen >= 4) {
-      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
-       * 1.1 subgroupSize which is 32.
-       */
-      compiler->threadsize_base = 32;
-   } else {
-      compiler->threadsize_base = 8;
-   }
+   compiler->threadsize_base = dev_info->threadsize_base;
 
    if (compiler->gen >= 4) {
       /* need special handling for "flat" */
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 78dfa778fea..cb6819987a8 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -661,7 +661,8 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
    p->deviceNodeMask = 0;
    p->deviceLUIDValid = false;
 
-   p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
+   p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ?
+      pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
    p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
    p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
                                    VK_SUBGROUP_FEATURE_VOTE_BIT |
@@ -778,11 +779,10 @@ static void
 tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
                                       struct vk_properties *p)
 {
-   /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
-   p->minSubgroupSize = 64; /* threadsize_base */
-   p->maxSubgroupSize =
-      pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
-   p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
+   p->minSubgroupSize = pdevice->info->threadsize_base;
+   p->maxSubgroupSize = pdevice->info->a6xx.supports_double_threadsize ?
+      pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
+   p->maxComputeWorkgroupSubgroups = pdevice->info->max_waves;
    p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
 
    p->maxInlineUniformBlockSize = MAX_INLINE_UBO_RANGE;
@@ -902,7 +902,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
    props->maxComputeWorkGroupCount[0] = props->maxComputeWorkGroupCount[1] =
       props->maxComputeWorkGroupCount[2] = 65535;
-   props->maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ?
+   props->maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ?
+      pdevice->info->threadsize_base * 2 * pdevice->info->max_waves :
+      pdevice->info->threadsize_base * pdevice->info->max_waves;
    props->maxComputeWorkGroupSize[0] = props->maxComputeWorkGroupSize[1] =
       props->maxComputeWorkGroupSize[2] = 1024;