freedreno/devices: Move fibers_per_sp to the common info struct.

We'll need it for pvt mem on other GPUs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24358>
Emma Anholt, 2023-07-27 11:11:26 -07:00, committed by Marge Bot
parent b468286937
commit f6ea7c3a99
5 changed files with 30 additions and 17 deletions
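
A note for context (not part of the commit message): fibers_per_sp appears to be the number of shader fibers that can be resident on one shader processor (SP), and the drivers multiply it by a shader's per-fiber private-memory footprint to size the scratch buffer. The sketch below restates that calculation using ALIGN and the field names from the hunks further down; pvtmem_bo_size and its parameters are placeholders, not code from this commit.

/* Sketch only: private-memory ("pvt mem") sizing as done in files 3-5 of this
 * commit. With this change the fibers_per_sp lookup no longer goes through the
 * a6xx sub-struct, so the same math can be reused on other GPU generations. */
static uint32_t
pvtmem_bo_size(const struct fd_dev_info *dev_info, uint32_t pvtmem_size)
{
   uint32_t per_fiber_size = pvtmem_size;                        /* bytes per fiber */
   uint32_t per_sp_size =
      ALIGN(per_fiber_size * dev_info->fibers_per_sp, 1 << 12);  /* 4 KiB aligned */
   return per_sp_size * dev_info->num_sp_cores;                  /* whole-GPU BO size */
}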

View File

@@ -53,6 +53,9 @@ struct fd_dev_info {
int wave_granularity;
+ /* Information for private memory calculations */
+ uint32_t fibers_per_sp;
/* number of CCU is always equal to the number of SP */
union {
uint32_t num_sp_cores;
@@ -61,9 +64,6 @@ struct fd_dev_info {
union {
struct {
- /* Information for private memory calculations */
- uint32_t fibers_per_sp;
uint32_t reg_size_vec4;
/* The size (in instrlen units (128 bytes)) of instruction cache where
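
For reference, after the two hunks above the common struct reads roughly as follows (only the fields visible in this diff are shown; the rest of the struct is elided):

struct fd_dev_info {
   /* ... */
   int wave_granularity;

   /* Information for private memory calculations */
   uint32_t fibers_per_sp;

   /* number of CCU is always equal to the number of SP */
   union {
      uint32_t num_sp_cores;
      /* ... */
   };
   /* ... */
};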

View File

@@ -109,7 +109,7 @@ class GPUInfo(Struct):
def __init__(self, chip, gmem_align_w, gmem_align_h,
tile_align_w, tile_align_h,
tile_max_w, tile_max_h, num_vsc_pipes,
- cs_shared_mem_size, wave_granularity):
+ cs_shared_mem_size, wave_granularity, fibers_per_sp):
self.chip = chip.value
self.gmem_align_w = gmem_align_w
self.gmem_align_h = gmem_align_h
@@ -120,6 +120,7 @@ class GPUInfo(Struct):
self.num_vsc_pipes = num_vsc_pipes
self.cs_shared_mem_size = cs_shared_mem_size
self.wave_granularity = wave_granularity
+ self.fibers_per_sp = fibers_per_sp
s.gpu_infos.append(self)
@@ -129,7 +130,7 @@ class A6xxGPUInfo(GPUInfo):
into distinct sub-generations. The template parameter avoids
duplication of parameters that are unique to the sub-generation.
"""
- def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, wave_granularity, magic_regs):
+ def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, wave_granularity, fibers_per_sp, magic_regs):
super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
tile_align_w = tile_align_w,
tile_align_h = tile_align_h,
@@ -137,7 +138,8 @@ class A6xxGPUInfo(GPUInfo):
tile_max_h = max_bitfield_val(14, 8, 4),
num_vsc_pipes = num_vsc_pipes,
cs_shared_mem_size = cs_shared_mem_size,
- wave_granularity = wave_granularity)
+ wave_granularity = wave_granularity,
+ fibers_per_sp = fibers_per_sp)
# The # of SP cores seems to always match # of CCU
self.num_sp_cores = num_ccu
self.num_ccu = num_ccu
@@ -185,7 +187,8 @@ add_gpus([
tile_max_h = ~0, # TODO
num_vsc_pipes = 8,
cs_shared_mem_size = 0,
- wave_granularity = 2
+ wave_granularity = 2,
+ fibers_per_sp = 0, # TODO
))
add_gpus([
@@ -201,7 +204,8 @@ add_gpus([
tile_max_h = max_bitfield_val(9, 5, 5),
num_vsc_pipes = 8,
cs_shared_mem_size = 32 * 1024,
- wave_granularity = 2
+ wave_granularity = 2,
+ fibers_per_sp = 0, # TODO
))
add_gpus([
@@ -216,7 +220,8 @@ add_gpus([
tile_max_h = max_bitfield_val(9, 5, 5),
num_vsc_pipes = 8,
cs_shared_mem_size = 32 * 1024,
- wave_granularity = 2
+ wave_granularity = 2,
+ fibers_per_sp = 0, # TODO
))
add_gpus([
@@ -235,7 +240,8 @@ add_gpus([
tile_max_h = max_bitfield_val(16, 9, 5),
num_vsc_pipes = 16,
cs_shared_mem_size = 32 * 1024,
- wave_granularity = 2
+ wave_granularity = 2,
+ fibers_per_sp = 0, # TODO
))
# a6xx can be divided into distinct sub-generations, where certain device-
@@ -244,7 +250,6 @@ add_gpus([
# a615, a616, a618, a619, a620 and a630:
a6xx_gen1 = dict(
- fibers_per_sp = 128 * 16,
reg_size_vec4 = 96,
instr_cache_size = 64,
concurrent_resolve = False,
@@ -268,7 +273,6 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
# a640, a680:
a6xx_gen2 = dict(
- fibers_per_sp = 128 * 4 * 16,
reg_size_vec4 = 96,
instr_cache_size = 64, # TODO
supports_multiview_mask = True,
@@ -282,7 +286,6 @@ a6xx_gen2 = dict(
# a650:
a6xx_gen3 = dict(
- fibers_per_sp = 128 * 2 * 16,
reg_size_vec4 = 64,
# Blob limits it to 128 but we hang with 128
instr_cache_size = 127,
@@ -304,7 +307,6 @@ a6xx_gen3 = dict(
# a635, a660:
a6xx_gen4 = dict(
- fibers_per_sp = 128 * 2 * 16,
reg_size_vec4 = 64,
# Blob limits it to 128 but we hang with 128
instr_cache_size = 127,
@@ -342,6 +344,7 @@ add_gpus([
num_vsc_pipes = 16,
cs_shared_mem_size = 16 * 1024,
wave_granularity = 1,
+ fibers_per_sp = 128 * 16,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0,
@@ -373,6 +376,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 16,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -401,6 +405,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 16,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0x01008000,
@@ -429,6 +434,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 16,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -457,6 +463,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 4 * 16,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x00008000,
@@ -485,6 +492,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 4 * 16,
magic_regs = dict(
PC_POWER_CNTL = 3,
TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -513,6 +521,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 2 * 16,
magic_regs = dict(
PC_POWER_CNTL = 2,
# this seems to be a chicken bit that fixes cubic filtering:
@@ -546,6 +555,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 2 * 16,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x05008000,
@@ -574,6 +584,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 2 * 16,
magic_regs = dict(
PC_POWER_CNTL = 2,
TPL1_DBG_ECO_CNTL = 0x05008000,
@@ -602,6 +613,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 2 * 16,
magic_regs = dict(
PC_POWER_CNTL = 7,
TPL1_DBG_ECO_CNTL = 0x01008000,
@@ -632,6 +644,7 @@ add_gpus([
num_vsc_pipes = 32,
cs_shared_mem_size = 32 * 1024,
wave_granularity = 2,
+ fibers_per_sp = 128 * 2 * 16,
magic_regs = dict()
))

View File

@@ -217,7 +217,7 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
if (v->pvtmem_size > 0) {
uint32_t per_fiber_size = v->pvtmem_size;
uint32_t per_sp_size =
- ALIGN(per_fiber_size * a6xx_backend->info->a6xx.fibers_per_sp, 1 << 12);
+ ALIGN(per_fiber_size * a6xx_backend->info->fibers_per_sp, 1 << 12);
uint32_t total_size = per_sp_size * a6xx_backend->info->num_sp_cores;
struct fd_bo *pvtmem = fd_bo_new(a6xx_backend->dev, total_size, 0, "pvtmem");
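
A hypothetical worked example of the sizing above (illustrative numbers, not from the commit): with fibers_per_sp = 128 * 2 * 16 = 4096, three SP cores, and a kernel needing 512 bytes of private memory per fiber:

per_fiber_size = 512
per_sp_size    = ALIGN(512 * 4096, 1 << 12) = 2 MiB (already 4 KiB aligned)
total_size     = 2 MiB * 3 = 6 MiB for the pvtmem BO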

View File

@@ -1983,7 +1983,7 @@ tu_setup_pvtmem(struct tu_device *dev,
util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
pvtmem_bo->per_sp_size =
ALIGN(pvtmem_bo->per_fiber_size *
- dev->physical_device->info->a6xx.fibers_per_sp,
+ dev->physical_device->info->fibers_per_sp,
1 << 12);
uint32_t total_size =
dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
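
Note that turnip additionally rounds the per-fiber size up to a power of two of at least 512 bytes before this multiply (the util_next_power_of_two(ALIGN(pvtmem_bytes, 512)) in the context line above), so, for example, a shader asking for 600 bytes of private memory per fiber is charged 1024 bytes.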

View File

@@ -104,7 +104,7 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
fd_emit_string5(ring, name, strlen(name));
#endif
- uint32_t fibers_per_sp = ctx->screen->info->a6xx.fibers_per_sp;
+ uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;
uint32_t per_fiber_size = so->pvtmem_size;