freedreno,tu,ir3: Move threadsize_base and max_waves to fd_dev_info

Signed-off-by: Valentine Burley <valentine.burley@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29277>
This commit is contained in:
Valentine Burley
2024-05-17 21:12:49 +00:00
committed by Marge Bot
parent 692e1ab2c1
commit c4da848a1a
4 changed files with 30 additions and 21 deletions

View File

@@ -56,6 +56,10 @@ struct fd_dev_info {
/* Information for private memory calculations */
uint32_t fibers_per_sp;
uint32_t threadsize_base;
uint32_t max_waves;
/* number of CCU is always equal to the number of SP */
union {
uint32_t num_sp_cores;

View File

@@ -118,7 +118,8 @@ class GPUInfo(Struct):
def __init__(self, chip, gmem_align_w, gmem_align_h,
tile_align_w, tile_align_h,
tile_max_w, tile_max_h, num_vsc_pipes,
cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp):
cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp,
threadsize_base = 64, max_waves = 16):
self.chip = chip.value
self.gmem_align_w = gmem_align_w
self.gmem_align_h = gmem_align_h
@@ -131,6 +132,8 @@ class GPUInfo(Struct):
self.num_sp_cores = num_sp_cores
self.wave_granularity = wave_granularity
self.fibers_per_sp = fibers_per_sp
self.threadsize_base = threadsize_base
self.max_waves = max_waves
s.gpu_infos.append(self)
@@ -143,7 +146,8 @@ class A6xxGPUInfo(GPUInfo):
def __init__(self, chip, template, num_ccu,
tile_align_w, tile_align_h, num_vsc_pipes,
cs_shared_mem_size, wave_granularity, fibers_per_sp,
magic_regs, raw_magic_regs = None):
magic_regs, raw_magic_regs = None, threadsize_base = 64,
max_waves = 16):
if chip == CHIP.A6XX:
tile_max_w = 1024 # max_bitfield_val(5, 0, 5)
tile_max_h = max_bitfield_val(14, 8, 4) # 1008
@@ -160,7 +164,9 @@ class A6xxGPUInfo(GPUInfo):
cs_shared_mem_size = cs_shared_mem_size,
num_sp_cores = num_ccu, # The # of SP cores seems to always match # of CCU
wave_granularity = wave_granularity,
fibers_per_sp = fibers_per_sp)
fibers_per_sp = fibers_per_sp,
threadsize_base = threadsize_base,
max_waves = max_waves)
self.num_ccu = num_ccu
@@ -202,6 +208,7 @@ add_gpus([
num_sp_cores = 0, # TODO
wave_granularity = 2,
fibers_per_sp = 0, # TODO
threadsize_base = 8, # TODO: Confirm this
))
add_gpus([
@@ -221,6 +228,7 @@ add_gpus([
num_sp_cores = 0, # TODO
wave_granularity = 2,
fibers_per_sp = 0, # TODO
threadsize_base = 8,
))
add_gpus([
@@ -238,6 +246,7 @@ add_gpus([
num_sp_cores = 0, # TODO
wave_granularity = 2,
fibers_per_sp = 0, # TODO
threadsize_base = 32, # TODO: Confirm this
))
add_gpus([
@@ -255,6 +264,7 @@ add_gpus([
num_sp_cores = 1,
wave_granularity = 2,
fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
threadsize_base = 32,
))
add_gpus([
@@ -271,6 +281,7 @@ add_gpus([
num_sp_cores = 2,
wave_granularity = 2,
fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
threadsize_base = 32,
))
add_gpus([
@@ -287,6 +298,7 @@ add_gpus([
num_sp_cores = 4,
wave_granularity = 2,
fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
threadsize_base = 32,
))

View File

@@ -155,7 +155,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
/* TODO see if older GPU's were different here */
compiler->branchstack_size = 64;
compiler->wave_granularity = dev_info->wave_granularity;
compiler->max_waves = 16;
compiler->max_waves = dev_info->max_waves;
compiler->max_variable_workgroup_size = 1024;
@@ -266,16 +266,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->reg_size_vec4 = 96;
}
if (compiler->gen >= 6) {
compiler->threadsize_base = 64;
} else if (compiler->gen >= 4) {
/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
* 1.1 subgroupSize which is 32.
*/
compiler->threadsize_base = 32;
} else {
compiler->threadsize_base = 8;
}
compiler->threadsize_base = dev_info->threadsize_base;
if (compiler->gen >= 4) {
/* need special handling for "flat" */

View File

@@ -661,7 +661,8 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
p->deviceNodeMask = 0;
p->deviceLUIDValid = false;
p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ?
pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
VK_SUBGROUP_FEATURE_VOTE_BIT |
@@ -778,11 +779,10 @@ static void
tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
struct vk_properties *p)
{
/* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
p->minSubgroupSize = 64; /* threadsize_base */
p->maxSubgroupSize =
pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
p->minSubgroupSize = pdevice->info->threadsize_base;
p->maxSubgroupSize = pdevice->info->a6xx.supports_double_threadsize ?
pdevice->info->threadsize_base * 2 : pdevice->info->threadsize_base;
p->maxComputeWorkgroupSubgroups = pdevice->info->max_waves;
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
p->maxInlineUniformBlockSize = MAX_INLINE_UBO_RANGE;
@@ -902,7 +902,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
props->maxComputeWorkGroupCount[0] =
props->maxComputeWorkGroupCount[1] =
props->maxComputeWorkGroupCount[2] = 65535;
props->maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024;
props->maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ?
pdevice->info->threadsize_base * 2 * pdevice->info->max_waves :
pdevice->info->threadsize_base * pdevice->info->max_waves;
props->maxComputeWorkGroupSize[0] =
props->maxComputeWorkGroupSize[1] =
props->maxComputeWorkGroupSize[2] = 1024;