panfrost: Add support for AFBC packing

When the GPU is converting a texture from linear/tiled to compressed
AFBC, it uses a sparse memory layout. That means that the
superblocks are stored starting at intervals equal to the size of an
uncompressed superblock. When memory usage needs to be optimized, it
is possible to pack the resource by trimming each superblock as much
as possible. The GPU will still be able to read from these packed
textures, but won't be able to write directly to them. If the
layout is AFBC-tiled, the packing process will also de-tile as
tiled+packed is not supported by Mali GPUs.

No new modifier flag has been added as the absence of the
`AFBC_FORMAT_MOD_SPARSE` flag means the resource will be packed.

Signed-off-by: Louis-Francis Ratté-Boulianne <lfrb@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25012>
This commit is contained in:
Louis-Francis Ratté-Boulianne
2023-08-31 23:33:45 -04:00
committed by Marge Bot
parent 32fbd38889
commit bc55d150a9
6 changed files with 286 additions and 0 deletions

View File

@@ -50,6 +50,34 @@ read_afbc_header(nir_builder *b, nir_def *buf, nir_def *idx)
AFBC_HEADER_BYTES_PER_TILE / 4, 32);
}
/* Emit NIR that stores the header vector `hdr` into the header slot selected
 * by `idx`. The destination address is `buf` plus `idx` times
 * AFBC_HEADER_BYTES_PER_TILE. */
static void
write_afbc_header(nir_builder *b, nir_def *buf, nir_def *idx, nir_def *hdr)
{
   nir_def *byte_offset = nir_imul_imm(b, idx, AFBC_HEADER_BYTES_PER_TILE);
   nir_def *slot_addr = nir_iadd(b, buf, nir_u2u64(b, byte_offset));

   /* Four components are written (mask 0xF). */
   nir_store_global(b, slot_addr, 16, hdr, 0xF);
}
/* Emit NIR that keeps the three low bits of `v` and moves bit n to bit 2n
 * (result fits in mask 0x15), leaving the odd bits free for interleaving. */
static nir_def *
spread_low3_bits(nir_builder *b, nir_def *v)
{
   nir_def *bits = nir_iand_imm(b, v, 0x7);
   nir_def *merged = nir_ior(b, bits, nir_ishl_imm(b, bits, 2));

   bits = nir_iand_imm(b, merged, 0x13);
   merged = nir_ior(b, bits, nir_ishl_imm(b, bits, 1));

   return nir_iand_imm(b, merged, 0x15);
}

/* Emit NIR that converts a scan-order block index `idx` (rows of
 * `dst_stride` blocks) into the index of the same block when blocks are
 * stored in Z-order inside 8x8 tiles, with rows of tiles spaced by
 * `src_stride` blocks. */
static nir_def *
get_morton_index(nir_builder *b, nir_def *idx, nir_def *src_stride,
                 nir_def *dst_stride)
{
   nir_def *col = nir_umod(b, idx, dst_stride);
   nir_def *row = nir_udiv(b, idx, dst_stride);

   /* Index of the first block of the 8x8 tile containing (col, row):
    * whole tile rows first, then 64 blocks per tile to the left. */
   nir_def *tile_row = nir_iand_imm(b, row, ~0x7);
   nir_def *tile_base = nir_imul(b, tile_row, src_stride);
   nir_def *tile_col = nir_ushr_imm(b, col, 3);
   tile_base = nir_iadd(b, tile_base, nir_ishl_imm(b, tile_col, 6));

   /* Position inside the tile: column bits on even positions, row bits on
    * odd positions. */
   nir_def *col_bits = spread_low3_bits(b, col);
   nir_def *row_bits = spread_low3_bits(b, row);
   nir_def *in_tile = nir_ior(b, col_bits, nir_ishl_imm(b, row_bits, 1));

   return nir_iadd(b, tile_base, in_tile);
}
static nir_def *
get_superblock_size(nir_builder *b, unsigned arch, nir_def *hdr,
nir_def *uncompressed_size)
@@ -99,6 +127,71 @@ get_superblock_size(nir_builder *b, unsigned arch, nir_def *hdr,
: size;
}
/* Emit NIR that loads entry number `idx` of the `pan_afbc_block_info` array
 * located at `metadata`.
 *
 * Returns the entry's `offset` field widened to 64 bits. When `out_size` is
 * non-NULL it additionally receives the entry's 32-bit `size` field. */
static nir_def *
get_packed_offset(nir_builder *b, nir_def *metadata, nir_def *idx,
                  nir_def **out_size)
{
   const unsigned entry_bytes = sizeof(struct pan_afbc_block_info);

   nir_def *entry_offset = nir_u2u64(b, nir_imul_imm(b, idx, entry_bytes));
   nir_def *entry_addr = nir_iadd(b, metadata, entry_offset);

   /* Load the whole entry as a vector of 32-bit words. */
   nir_def *info = nir_load_global(b, entry_addr, 4, entry_bytes / 4, 32);

   nir_def *packed_offset =
      nir_channel(b, info, offsetof(struct pan_afbc_block_info, offset) / 4);

   if (out_size != NULL) {
      *out_size =
         nir_channel(b, info, offsetof(struct pan_afbc_block_info, size) / 4);
   }

   return nir_u2u64(b, packed_offset);
}
/* Upper bound, in bytes, on the size of a single load/store emitted by
 * copy_superblock(). */
#define MAX_LINE_SIZE 16
/* Emit NIR that copies one superblock from `src` to `dst`.
 *
 * - `src_idx` selects the source header, `dst_idx` the destination header
 *   slot.
 * - `metadata`/`meta_idx` select the pan_afbc_block_info entry providing the
 *   packed body offset and the number of bytes to copy.
 * - `hdr_sz` is added to the packed offset to form the destination body
 *   offset relative to `dst`.
 * - `align` is the number of bytes copied per loop iteration; it is split
 *   into accesses of at most MAX_LINE_SIZE bytes.
 */
static void
copy_superblock(nir_builder *b, nir_def *dst, nir_def *dst_idx, nir_def *hdr_sz,
nir_def *src, nir_def *src_idx, nir_def *metadata,
nir_def *meta_idx, unsigned align)
{
/* Channel 0 of the source header is used as the body offset relative to
 * `src`. */
nir_def *hdr = read_afbc_header(b, src, src_idx);
nir_def *src_body_base_ptr = nir_u2u64(b, nir_channel(b, hdr, 0));
nir_def *src_bodyptr = nir_iadd(b, src, src_body_base_ptr);
/* Destination body location: packed offset from the metadata entry, placed
 * after `hdr_sz` bytes. `size` is the byte count to copy. */
nir_def *size;
nir_def *dst_offset = get_packed_offset(b, metadata, meta_idx, &size);
nir_def *dst_body_base_ptr = nir_iadd(b, dst_offset, hdr_sz);
nir_def *dst_bodyptr = nir_iadd(b, dst, dst_body_base_ptr);
/* Replace the `base_body_ptr` field if not zero (solid color) */
nir_def *hdr2 =
nir_vector_insert_imm(b, hdr, nir_u2u32(b, dst_body_base_ptr), 0);
hdr = nir_bcsel(b, nir_ieq_imm(b, src_body_base_ptr, 0), hdr, hdr2);
write_afbc_header(b, dst, dst_idx, hdr);
/* Byte offset into the body, carried across loop iterations through a
 * local variable. */
nir_variable *offset_var =
nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
nir_store_var(b, offset_var, nir_imm_int(b, 0), 1);
nir_loop *loop = nir_push_loop(b);
{
nir_def *offset = nir_load_var(b, offset_var);
/* Leave the loop once `size` bytes (or more) have been copied. */
nir_if *loop_check = nir_push_if(b, nir_uge(b, offset, size));
nir_jump(b, nir_jump_break);
nir_push_else(b, loop_check);
/* Copy `align` bytes per iteration as align / line_sz accesses of line_sz
 * bytes each (line_sz / 4 32-bit components per access). */
unsigned line_sz = align <= MAX_LINE_SIZE ? align : MAX_LINE_SIZE;
for (unsigned i = 0; i < align / line_sz; ++i) {
nir_def *src_line = nir_iadd(b, src_bodyptr, nir_u2u64(b, offset));
nir_def *dst_line = nir_iadd(b, dst_bodyptr, nir_u2u64(b, offset));
nir_store_global(
b, dst_line, line_sz,
nir_load_global(b, src_line, line_sz, line_sz / 4, 32), ~0);
offset = nir_iadd_imm(b, offset, line_sz);
}
nir_store_var(b, offset_var, offset, 0x1);
nir_pop_if(b, loop_check);
}
nir_pop_loop(b, loop);
}
#define panfrost_afbc_size_get_info_field(b, field) \
panfrost_afbc_get_info_field(size, b, field)
@@ -135,6 +228,37 @@ panfrost_afbc_create_size_shader(struct panfrost_screen *screen, unsigned bpp,
return b.shader;
}
/* Read `field` of the pack shader's info block (see struct
 * panfrost_afbc_pack_info for the fields read below). */
#define panfrost_afbc_pack_get_info_field(b, field) \
panfrost_afbc_get_info_field(pack, b, field)
/* Build the "panfrost_afbc_pack" compute shader.
 *
 * Each invocation handles the superblock whose destination index is the X
 * component of the global invocation ID. `align` is forwarded to
 * copy_superblock(). When `tiled` is true, the source index is derived from
 * the destination index with get_morton_index(); otherwise both indices are
 * equal.
 */
static nir_shader *
panfrost_afbc_create_pack_shader(struct panfrost_screen *screen, unsigned align,
bool tiled)
{
nir_builder b = nir_builder_init_simple_shader(
MESA_SHADER_COMPUTE, screen->vtbl.get_compiler_options(),
"panfrost_afbc_pack");
panfrost_afbc_add_info_ubo(pack, b);
nir_def *coord = nir_load_global_invocation_id(&b, 32);
nir_def *src_stride = panfrost_afbc_pack_get_info_field(&b, src_stride);
nir_def *dst_stride = panfrost_afbc_pack_get_info_field(&b, dst_stride);
nir_def *dst_idx = nir_channel(&b, coord, 0);
nir_def *src_idx =
tiled ? get_morton_index(&b, dst_idx, src_stride, dst_stride) : dst_idx;
nir_def *src = panfrost_afbc_pack_get_info_field(&b, src);
nir_def *dst = panfrost_afbc_pack_get_info_field(&b, dst);
nir_def *header_size =
nir_u2u64(&b, panfrost_afbc_pack_get_info_field(&b, header_size));
nir_def *metadata = panfrost_afbc_pack_get_info_field(&b, metadata);
/* The metadata entry is looked up with the source index (last index
 * argument), not the destination index. */
copy_superblock(&b, dst, dst_idx, header_size, src, src_idx, metadata,
src_idx, align);
return b.shader;
}
struct pan_afbc_shader_data *
panfrost_afbc_get_shaders(struct panfrost_context *ctx,
struct panfrost_resource *rsrc, unsigned align)
@@ -171,6 +295,7 @@ panfrost_afbc_get_shaders(struct panfrost_context *ctx,
}
COMPILE_SHADER(size, key.bpp, key.align);
COMPILE_SHADER(pack, key.align, key.tiled);
#undef COMPILE_SHADER

View File

@@ -42,6 +42,7 @@ struct pan_afbc_shader_key {
/* Compiled AFBC helper shader states stored together with the key they
 * belong to. */
struct pan_afbc_shader_data {
struct pan_afbc_shader_key key;
/* State object for the "size" shader. */
void *size_cso;
/* State object for the "pack" shader -- presumably filled in by
 * COMPILE_SHADER(pack, ...); confirm against that macro. */
void *pack_cso;
};
struct pan_afbc_shaders {
@@ -59,6 +60,16 @@ struct panfrost_afbc_size_info {
mali_ptr metadata;
} PACKED;
/* Constants handed to the AFBC pack shader; filled by panfrost_afbc_pack()
 * and read by name through panfrost_afbc_pack_get_info_field(). */
struct panfrost_afbc_pack_info {
/* GPU address of the source slice. */
mali_ptr src;
/* GPU address of the destination slice. */
mali_ptr dst;
/* GPU address of the per-superblock metadata entries for the level. */
mali_ptr metadata;
/* Header size of the destination slice; added to each packed offset to
 * locate the body in the destination. */
uint32_t header_size;
/* AFBC stride of the source slice (used by the shader's Morton
 * computation for tiled sources). */
uint32_t src_stride;
/* AFBC stride of the destination slice. */
uint32_t dst_stride;
/* NOTE(review): purpose of this padding is not visible here; it may be
 * rounding the struct up to a required size -- confirm before removing. */
uint32_t padding[3]; // FIXME
} PACKED;
void panfrost_afbc_context_init(struct panfrost_context *ctx);
void panfrost_afbc_context_destroy(struct panfrost_context *ctx);

View File

@@ -3953,6 +3953,31 @@ panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
LAUNCH_AFBC_SHADER(size, batch, src, consts, slice->afbc.nr_blocks);
}
/* Queue the AFBC pack shader on `batch` for one mip level.
 *
 * - `src` is the resource being packed; `level` selects its slice.
 * - `dst` is the destination BO and `dst_slice` the layout of the packed
 *   slice inside it.
 * - `metadata` + `metadata_offset` locate the metadata entries passed to the
 *   shader.
 *
 * The launch size passed to the shader is `dst_slice->afbc.nr_blocks`.
 */
static void
panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
struct panfrost_bo *dst,
struct pan_image_slice_layout *dst_slice,
struct panfrost_bo *metadata, unsigned metadata_offset,
unsigned level)
{
struct pan_image_slice_layout *src_slice = &src->image.layout.slices[level];
struct panfrost_afbc_pack_info consts = {
.src = src->image.data.bo->ptr.gpu + src->image.data.offset +
src_slice->offset,
.dst = dst->ptr.gpu + dst_slice->offset,
.metadata = metadata->ptr.gpu + metadata_offset,
.header_size = dst_slice->afbc.header_size,
.src_stride = src_slice->afbc.stride,
.dst_stride = dst_slice->afbc.stride,
};
/* NOTE(review): `src` is registered through the write helper although the
 * pack shader only appears to load from it -- confirm this is intended. */
panfrost_batch_write_rsrc(batch, src, PIPE_SHADER_COMPUTE);
panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);
LAUNCH_AFBC_SHADER(pack, batch, src, consts, dst_slice->afbc.nr_blocks);
}
static void *
panfrost_create_rasterizer_state(struct pipe_context *pctx,
const struct pipe_rasterizer_state *cso)
@@ -4570,6 +4595,7 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
screen->vtbl.compile_shader = GENX(pan_shader_compile);
screen->vtbl.afbc_size = panfrost_afbc_size;
screen->vtbl.afbc_pack = panfrost_afbc_pack;
GENX(pan_blitter_init)
(dev, &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);

View File

@@ -907,6 +907,16 @@ panfrost_load_tiled_images(struct panfrost_transfer *transfer,
}
}
/* Map a block position (x, y) to its index in a layout where blocks are
 * stored in Z-order inside 8x8 tiles. `stride` is multiplied by the row of
 * the first block of the tile (y rounded down to a multiple of 8) to reach
 * the tile row; each tile to the left contributes 64 blocks. */
static unsigned
get_morton_index(unsigned x, unsigned y, unsigned stride)
{
   unsigned in_tile = 0;

   /* Interleave the three low bits of x (even positions) and y (odd
    * positions). */
   for (unsigned bit = 0; bit < 3; ++bit) {
      in_tile |= ((x >> bit) & 1u) << (2 * bit);
      in_tile |= ((y >> bit) & 1u) << (2 * bit + 1);
   }

   unsigned tile_row_base = (y & ~7u) * stride;
   unsigned tile_col_base = (x & ~7u) << 3;

   return tile_row_base + tile_col_base + in_tile;
}
static void
panfrost_store_tiled_images(struct panfrost_transfer *transfer,
struct panfrost_resource *rsrc)
@@ -1303,6 +1313,110 @@ panfrost_get_afbc_superblock_sizes(struct panfrost_context *ctx,
return bo;
}
/* Repack an AFBC resource into a packed layout.
 *
 * The destination modifier is the source modifier with the TILED and SPARSE
 * flags cleared. For every mip level the per-superblock sizes are fetched,
 * turned into start offsets (in destination scan order), and a compute job
 * copies each superblock to its packed location in a newly allocated BO,
 * which then replaces the resource's BO and slice layouts.
 *
 * Nothing is done when a level has no valid data, when the packed size
 * equals the current BO size, or when a required BO cannot be obtained.
 */
void
panfrost_pack_afbc(struct panfrost_context *ctx,
                   struct panfrost_resource *prsrc)
{
   struct panfrost_screen *screen = pan_screen(ctx->base.screen);
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct panfrost_bo *metadata_bo;
   unsigned metadata_offsets[PIPE_MAX_TEXTURE_LEVELS];

   uint64_t src_modifier = prsrc->image.layout.modifier;
   uint64_t dst_modifier =
      src_modifier & ~(AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE);
   bool is_tiled = src_modifier & AFBC_FORMAT_MOD_TILED;
   unsigned last_level = prsrc->base.last_level;
   struct pan_image_slice_layout slice_infos[PIPE_MAX_TEXTURE_LEVELS] = {0};
   unsigned total_size = 0;

   /* It doesn't make sense to pack everything if we need to unpack right
    * away to upload data to another level */
   for (unsigned i = 0; i <= last_level; i++) {
      if (!BITSET_TEST(prsrc->valid.data, i))
         return;
   }

   metadata_bo = panfrost_get_afbc_superblock_sizes(ctx, prsrc, 0, last_level,
                                                    metadata_offsets);
   if (!metadata_bo)
      return;

   /* The sizes are read on the CPU below, so wait for them to be written. */
   panfrost_bo_wait(metadata_bo, INT64_MAX, false);

   for (unsigned level = 0; level <= last_level; ++level) {
      struct pan_image_slice_layout *src_slice =
         &prsrc->image.layout.slices[level];
      struct pan_image_slice_layout *dst_slice = &slice_infos[level];

      unsigned width = u_minify(prsrc->base.width0, level);
      unsigned height = u_minify(prsrc->base.height0, level);
      unsigned src_stride =
         pan_afbc_stride_blocks(src_modifier, src_slice->row_stride);
      unsigned dst_stride =
         DIV_ROUND_UP(width, panfrost_afbc_superblock_width(dst_modifier));
      unsigned dst_height =
         DIV_ROUND_UP(height, panfrost_afbc_superblock_height(dst_modifier));

      uint32_t offset = 0;
      struct pan_afbc_block_info *meta =
         metadata_bo->ptr.cpu + metadata_offsets[level];

      /* Walk the blocks in destination (scan) order and assign each one the
       * running sum of the preceding block sizes. Entries are indexed by
       * source position, hence the Morton lookup for tiled sources. */
      for (unsigned y = 0, i = 0; y < dst_height; ++y) {
         for (unsigned x = 0; x < dst_stride; ++x, ++i) {
            unsigned idx = is_tiled ? get_morton_index(x, y, src_stride) : i;
            uint32_t size = meta[idx].size;
            meta[idx].offset = offset; /* write the start offset */
            offset += size;
         }
      }

      total_size = ALIGN_POT(total_size, pan_slice_align(dst_modifier));
      {
         dst_slice->afbc.stride = dst_stride;
         dst_slice->afbc.nr_blocks = dst_stride * dst_height;
         dst_slice->afbc.header_size =
            ALIGN_POT(dst_stride * dst_height * AFBC_HEADER_BYTES_PER_TILE,
                      pan_afbc_body_align(dst_modifier));
         dst_slice->afbc.body_size = offset;
         dst_slice->afbc.surface_stride = dst_slice->afbc.header_size + offset;

         dst_slice->offset = total_size;
         dst_slice->row_stride = dst_stride * AFBC_HEADER_BYTES_PER_TILE;
         dst_slice->surface_stride = dst_slice->afbc.surface_stride;
         dst_slice->size = dst_slice->afbc.surface_stride;
      }
      total_size += dst_slice->afbc.surface_stride;
   }

   unsigned new_size = ALIGN_POT(total_size, 4096); // FIXME
   unsigned old_size = prsrc->image.data.bo->size;

   if (new_size == old_size) {
      /* Nothing to gain: drop our metadata reference before bailing out. */
      panfrost_bo_unreference(metadata_bo);
      return;
   }

   if (dev->debug & PAN_DBG_PERF) {
      /* Compute the ratio in 64 bits: 100 * new_size overflows 32 bits for
       * buffers larger than ~42 MiB. */
      unsigned ratio = (unsigned)((uint64_t)100 * new_size / old_size);

      printf("%u%%: %u KB -> %u KB\n", ratio, old_size / 1024,
             new_size / 1024);
   }

   struct panfrost_bo *dst =
      panfrost_bo_create(dev, new_size, 0, "AFBC compact texture");
   if (!dst) {
      /* Keep the resource untouched if the packed BO cannot be created. */
      panfrost_bo_unreference(metadata_bo);
      return;
   }

   struct panfrost_batch *batch =
      panfrost_get_fresh_batch_for_fbo(ctx, "AFBC compaction");

   for (unsigned level = 0; level <= last_level; ++level) {
      struct pan_image_slice_layout *slice = &slice_infos[level];

      /* The pack job reads the source slice layout, so the resource's slice
       * is only overwritten after the job for this level is queued. */
      screen->vtbl.afbc_pack(batch, prsrc, dst, slice, metadata_bo,
                             metadata_offsets[level], level);
      prsrc->image.layout.slices[level] = *slice;
   }

   panfrost_flush_batches_accessing_rsrc(ctx, prsrc, "AFBC compaction flush");

   /* Switch the resource over to the packed BO and release the old one. */
   prsrc->image.layout.modifier = dst_modifier;
   panfrost_bo_unreference(prsrc->image.data.bo);
   prsrc->image.data.bo = dst;
   panfrost_bo_unreference(metadata_bo);
}
static void
panfrost_ptr_unmap(struct pipe_context *pctx, struct pipe_transfer *transfer)
{

View File

@@ -181,6 +181,9 @@ struct panfrost_bo *panfrost_get_afbc_superblock_sizes(
struct panfrost_context *ctx, struct panfrost_resource *rsrc,
unsigned first_level, unsigned last_level, unsigned *out_offsets);
/* Repack the AFBC resource `prsrc` into a new BO using a layout whose
 * modifier has the TILED and SPARSE flags cleared. Returns without changes
 * when a mip level has no valid data or when the packed size equals the
 * current BO size. */
void panfrost_pack_afbc(struct panfrost_context *ctx,
struct panfrost_resource *prsrc);
void pan_resource_modifier_convert(struct panfrost_context *ctx,
struct panfrost_resource *rsrc,
uint64_t modifier, const char *reason);

View File

@@ -105,6 +105,13 @@ struct panfrost_vtable {
struct panfrost_resource *src,
struct panfrost_bo *metadata, unsigned offset,
unsigned level);
/* Run a compute shader to compact a sparse layout AFBC resource.
 * `src` and `level` select the source slice, `dst` and `slice` describe
 * the destination BO and the packed slice layout inside it, and
 * `metadata` + `metadata_offset` locate the per-superblock metadata
 * entries for that level. */
void (*afbc_pack)(struct panfrost_batch *batch,
struct panfrost_resource *src, struct panfrost_bo *dst,
struct pan_image_slice_layout *slice,
struct panfrost_bo *metadata, unsigned metadata_offset,
unsigned level);
};
struct panfrost_screen {