panfrost: Add support for AFBC packing

When the GPU converts a texture from linear/tiled to compressed AFBC, it uses a sparse memory layout. That means that the superblocks are stored starting at intervals equal to the size of an uncompressed superblock.

When memory usage needs to be optimized, it is possible to pack the resource by trimming each superblock as much as possible. The GPU will still be able to read from these packed textures, but won't be able to write directly to them.

If the layout is AFBC-tiled, the packing process will also de-tile, as tiled+packed is not supported by Mali GPUs.

No new modifier flag has been added, as the absence of the `AFBC_FORMAT_MOD_SPARSE` flag means the resource will be packed.

Signed-off-by: Louis-Francis Ratté-Boulianne <lfrb@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25012>
2023-08-31 23:33:45 -04:00
parent 32fbd38889
commit bc55d150a9
6 changed files with 286 additions and 0 deletions
--- a/src/gallium/drivers/panfrost/pan_afbc_cso.c
+++ b/src/gallium/drivers/panfrost/pan_afbc_cso.c
@@ -50,6 +50,34 @@ read_afbc_header(nir_builder *b, nir_def *buf, nir_def *idx)
                          AFBC_HEADER_BYTES_PER_TILE / 4, 32);
 }

+/* Emit a store of the four-component header `hdr` into slot `idx` of the
+ * AFBC header array that starts at the 64-bit address `buf`. Slots are
+ * AFBC_HEADER_BYTES_PER_TILE bytes apart. */
+static void
+write_afbc_header(nir_builder *b, nir_def *buf, nir_def *idx, nir_def *hdr)
+{
+   nir_def *byte_offset = nir_imul_imm(b, idx, AFBC_HEADER_BYTES_PER_TILE);
+   nir_def *slot_addr = nir_iadd(b, buf, nir_u2u64(b, byte_offset));
+
+   /* Write mask 0xF: all four components of the header are stored */
+   nir_store_global(b, slot_addr, 16, hdr, 0xF);
+}
+
+/* Keep the three low bits of `v` (b2 b1 b0) and move them to bit positions
+ * 4, 2 and 0, leaving the odd positions clear so that a second coordinate
+ * can be interleaved by shifting it left by one. */
+static nir_def *
+spread_low3_bits(nir_builder *b, nir_def *v)
+{
+   v = nir_iand_imm(b, v, 0x7);
+   v = nir_iand_imm(b, nir_ior(b, v, nir_ishl_imm(b, v, 2)), 0x13);
+   v = nir_iand_imm(b, nir_ior(b, v, nir_ishl_imm(b, v, 1)), 0x15);
+   return v;
+}
+
+/* Translate the scan-order index `idx` (row-major with `dst_stride` entries
+ * per row) into the index of the same (x, y) position when entries are
+ * arranged in z-order inside 8x8 tiles with `src_stride` entries per row. */
+static nir_def *
+get_morton_index(nir_builder *b, nir_def *idx, nir_def *src_stride,
+                 nir_def *dst_stride)
+{
+   nir_def *col = nir_umod(b, idx, dst_stride);
+   nir_def *row = nir_udiv(b, idx, dst_stride);
+
+   /* Index of the first entry of the enclosing 8x8 tile: whole rows of
+    * tiles above it, plus 64 entries for every tile to its left */
+   nir_def *row_base = nir_imul(b, nir_iand_imm(b, row, ~0x7), src_stride);
+   nir_def *col_base = nir_ishl_imm(b, nir_ushr_imm(b, col, 3), 6);
+   nir_def *tile_base = nir_iadd(b, row_base, col_base);
+
+   /* Interleave the low 3 bits: x on even bit positions, y on odd ones */
+   nir_def *col_bits = spread_low3_bits(b, col);
+   nir_def *row_bits = spread_low3_bits(b, row);
+   nir_def *in_tile = nir_ior(b, col_bits, nir_ishl_imm(b, row_bits, 1));
+
+   return nir_iadd(b, tile_base, in_tile);
+}
+
 static nir_def *
 get_superblock_size(nir_builder *b, unsigned arch, nir_def *hdr,
                    nir_def *uncompressed_size)
@@ -99,6 +127,71 @@ get_superblock_size(nir_builder *b, unsigned arch, nir_def *hdr,
             : size;
 }

+/* Load entry `idx` of the pan_afbc_block_info array located at the 64-bit
+ * address `metadata`.
+ *
+ * Returns the entry's `offset` field zero-extended to 64 bits. When
+ * `out_size` is non-NULL, the entry's 32-bit `size` field is returned
+ * through it as well. */
+static nir_def *
+get_packed_offset(nir_builder *b, nir_def *metadata, nir_def *idx,
+                  nir_def **out_size)
+{
+   const unsigned entry_bytes = sizeof(struct pan_afbc_block_info);
+   const unsigned offset_chan =
+      offsetof(struct pan_afbc_block_info, offset) / 4;
+   const unsigned size_chan = offsetof(struct pan_afbc_block_info, size) / 4;
+
+   nir_def *entry_addr =
+      nir_iadd(b, metadata, nir_u2u64(b, nir_imul_imm(b, idx, entry_bytes)));
+
+   /* The whole entry is fetched as a vector of 32-bit words */
+   nir_def *info = nir_load_global(b, entry_addr, 4, entry_bytes / 4, 32);
+   nir_def *packed_offset = nir_channel(b, info, offset_chan);
+
+   if (out_size != NULL)
+      *out_size = nir_channel(b, info, size_chan);
+
+   return nir_u2u64(b, packed_offset);
+}
+
+/* Largest single load/store, in bytes, emitted by copy_superblock() */
+#define MAX_LINE_SIZE 16
+
+/* Emit the NIR that copies one AFBC superblock from `src` to `dst`.
+ *
+ * The header at index `src_idx` of `src` is read and written to index
+ * `dst_idx` of `dst`, with its first channel (the body offset) rewritten to
+ * the packed location unless it is zero. The body is then copied from its
+ * source location to `dst + hdr_sz + metadata[meta_idx].offset`.
+ *
+ * `metadata` is the address of the pan_afbc_block_info array read by
+ * get_packed_offset(); `hdr_sz` is a 64-bit value added to the packed offset.
+ * `align` is a build-time constant: the body is copied `align` bytes per loop
+ * iteration, so up to `size` rounded up to `align` bytes are transferred.
+ * Assumes `align` is a non-zero multiple of 4 (the code divides by line_sz
+ * and loads line_sz / 4 words) -- TODO confirm against callers. */
+static void
+copy_superblock(nir_builder *b, nir_def *dst, nir_def *dst_idx, nir_def *hdr_sz,
+                nir_def *src, nir_def *src_idx, nir_def *metadata,
+                nir_def *meta_idx, unsigned align)
+{
+   /* Channel 0 of the header is used as the body's byte offset from `src` */
+   nir_def *hdr = read_afbc_header(b, src, src_idx);
+   nir_def *src_body_base_ptr = nir_u2u64(b, nir_channel(b, hdr, 0));
+   nir_def *src_bodyptr = nir_iadd(b, src, src_body_base_ptr);
+
+   /* Packed body offset from `dst`: the metadata offset plus `hdr_sz` */
+   nir_def *size;
+   nir_def *dst_offset = get_packed_offset(b, metadata, meta_idx, &size);
+   nir_def *dst_body_base_ptr = nir_iadd(b, dst_offset, hdr_sz);
+   nir_def *dst_bodyptr = nir_iadd(b, dst, dst_body_base_ptr);
+
+   /* Replace the `base_body_ptr` field if not zero (solid color) */
+   nir_def *hdr2 =
+      nir_vector_insert_imm(b, hdr, nir_u2u32(b, dst_body_base_ptr), 0);
+   hdr = nir_bcsel(b, nir_ieq_imm(b, src_body_base_ptr, 0), hdr, hdr2);
+   write_afbc_header(b, dst, dst_idx, hdr);
+
+   /* Shader-side byte counter for the body copy loop, starting at 0 */
+   nir_variable *offset_var =
+      nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
+   nir_store_var(b, offset_var, nir_imm_int(b, 0), 1);
+   nir_loop *loop = nir_push_loop(b);
+   {
+      /* Leave the loop once `offset` has reached the block's `size` */
+      nir_def *offset = nir_load_var(b, offset_var);
+      nir_if *loop_check = nir_push_if(b, nir_uge(b, offset, size));
+      nir_jump(b, nir_jump_break);
+      nir_push_else(b, loop_check);
+      /* This C loop runs while building the shader: each shader iteration
+       * gets align / line_sz copies of line_sz bytes, emitted back to back */
+      unsigned line_sz = align <= MAX_LINE_SIZE ? align : MAX_LINE_SIZE;
+      for (unsigned i = 0; i < align / line_sz; ++i) {
+         nir_def *src_line = nir_iadd(b, src_bodyptr, nir_u2u64(b, offset));
+         nir_def *dst_line = nir_iadd(b, dst_bodyptr, nir_u2u64(b, offset));
+         nir_store_global(
+            b, dst_line, line_sz,
+            nir_load_global(b, src_line, line_sz, line_sz / 4, 32), ~0);
+         offset = nir_iadd_imm(b, offset, line_sz);
+      }
+      nir_store_var(b, offset_var, offset, 0x1);
+      nir_pop_if(b, loop_check);
+   }
+   nir_pop_loop(b, loop);
+}
+
 #define panfrost_afbc_size_get_info_field(b, field)                            \
   panfrost_afbc_get_info_field(size, b, field)

@@ -135,6 +228,37 @@ panfrost_afbc_create_size_shader(struct panfrost_screen *screen, unsigned bpp,
   return b.shader;
 }

+/* Field accessor for the "pack" variant of the info block */
+#define panfrost_afbc_pack_get_info_field(b, field)                            \
+   panfrost_afbc_get_info_field(pack, b, field)
+
+/* Build the compute shader that packs an AFBC resource.
+ *
+ * Each invocation handles one superblock: the x component of the global
+ * invocation ID is the destination index. When `tiled` is set, the source
+ * index is derived from it with get_morton_index(); otherwise source and
+ * destination indices are equal. The source index also selects the metadata
+ * entry. `align` is forwarded to copy_superblock() as its copy granule. */
+static nir_shader *
+panfrost_afbc_create_pack_shader(struct panfrost_screen *screen, unsigned align,
+                                 bool tiled)
+{
+   nir_builder b = nir_builder_init_simple_shader(
+      MESA_SHADER_COMPUTE, screen->vtbl.get_compiler_options(),
+      "panfrost_afbc_pack");
+
+   panfrost_afbc_add_info_ubo(pack, b);
+
+   nir_def *invocation = nir_load_global_invocation_id(&b, 32);
+   nir_def *stride_in = panfrost_afbc_pack_get_info_field(&b, src_stride);
+   nir_def *stride_out = panfrost_afbc_pack_get_info_field(&b, dst_stride);
+   nir_def *out_idx = nir_channel(&b, invocation, 0);
+
+   nir_def *in_idx = out_idx;
+   if (tiled)
+      in_idx = get_morton_index(&b, out_idx, stride_in, stride_out);
+
+   nir_def *src_addr = panfrost_afbc_pack_get_info_field(&b, src);
+   nir_def *dst_addr = panfrost_afbc_pack_get_info_field(&b, dst);
+   nir_def *hdr_bytes =
+      nir_u2u64(&b, panfrost_afbc_pack_get_info_field(&b, header_size));
+   nir_def *meta_addr = panfrost_afbc_pack_get_info_field(&b, metadata);
+
+   copy_superblock(&b, dst_addr, out_idx, hdr_bytes, src_addr, in_idx,
+                   meta_addr, in_idx, align);
+
+   return b.shader;
+}
+
 struct pan_afbc_shader_data *
 panfrost_afbc_get_shaders(struct panfrost_context *ctx,
                          struct panfrost_resource *rsrc, unsigned align)
@@ -171,6 +295,7 @@ panfrost_afbc_get_shaders(struct panfrost_context *ctx,
   }

   COMPILE_SHADER(size, key.bpp, key.align);
+   COMPILE_SHADER(pack, key.align, key.tiled);

 #undef COMPILE_SHADER

--- a/src/gallium/drivers/panfrost/pan_afbc_cso.h
+++ b/src/gallium/drivers/panfrost/pan_afbc_cso.h
@@ -42,6 +42,7 @@ struct pan_afbc_shader_key {
 struct pan_afbc_shader_data {
   struct pan_afbc_shader_key key;
   void *size_cso;
+   /* Opaque handle for the AFBC "pack" shader; presumably filled in by
+    * COMPILE_SHADER(pack, ...) in panfrost_afbc_get_shaders() -- TODO confirm */
+   void *pack_cso;
 };

 struct pan_afbc_shaders {
@@ -59,6 +60,16 @@ struct panfrost_afbc_size_info {
   mali_ptr metadata;
 } PACKED;

+/* Parameters of the AFBC pack shader. panfrost_afbc_pack() fills them in and
+ * the shader fetches fields of the same names through
+ * panfrost_afbc_pack_get_info_field(). */
+struct panfrost_afbc_pack_info {
+   /* GPU address of the source slice (BO base + data offset + slice offset) */
+   mali_ptr src;
+   /* GPU address of the destination slice inside the new BO */
+   mali_ptr dst;
+   /* GPU address of the per-superblock pan_afbc_block_info entries */
+   mali_ptr metadata;
+   /* Destination header size in bytes; added to each packed body offset */
+   uint32_t header_size;
+   /* Source and destination row strides, passed to get_morton_index() */
+   uint32_t src_stride;
+   uint32_t dst_stride;
+   /* FIXME (kept from the original). NOTE(review): presumably pads the struct
+    * from 36 to 48 bytes, assuming mali_ptr is 8 bytes -- confirm why */
+   uint32_t padding[3];
+} PACKED;
+
 void panfrost_afbc_context_init(struct panfrost_context *ctx);
 void panfrost_afbc_context_destroy(struct panfrost_context *ctx);

--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -3953,6 +3953,31 @@ panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
   LAUNCH_AFBC_SHADER(size, batch, src, consts, slice->afbc.nr_blocks);
 }

+/* Queue the AFBC pack compute shader for mip level `level` of `src`.
+ *
+ * The shader parameters are derived from the source slice of `src` and from
+ * `dst_slice`, which describes where the packed level lives inside `dst`.
+ * `metadata` + `metadata_offset` locate the per-superblock entries for this
+ * level. One invocation is launched per destination superblock.
+ *
+ * NOTE(review): `src` is registered as written although the pack shader only
+ * loads from it -- confirm whether read tracking would be sufficient. */
+static void
+panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
+                   struct panfrost_bo *dst,
+                   struct pan_image_slice_layout *dst_slice,
+                   struct panfrost_bo *metadata, unsigned metadata_offset,
+                   unsigned level)
+{
+   const struct pan_image_slice_layout *src_slice =
+      &src->image.layout.slices[level];
+   mali_ptr src_base = src->image.data.bo->ptr.gpu + src->image.data.offset;
+
+   struct panfrost_afbc_pack_info consts = {
+      .src = src_base + src_slice->offset,
+      .dst = dst->ptr.gpu + dst_slice->offset,
+      .metadata = metadata->ptr.gpu + metadata_offset,
+      .header_size = dst_slice->afbc.header_size,
+      .src_stride = src_slice->afbc.stride,
+      .dst_stride = dst_slice->afbc.stride,
+   };
+
+   /* Attach every buffer the job touches to the batch */
+   panfrost_batch_write_rsrc(batch, src, PIPE_SHADER_COMPUTE);
+   panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
+   panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);
+
+   LAUNCH_AFBC_SHADER(pack, batch, src, consts, dst_slice->afbc.nr_blocks);
+}
+
 static void *
 panfrost_create_rasterizer_state(struct pipe_context *pctx,
                                 const struct pipe_rasterizer_state *cso)
@@ -4570,6 +4595,7 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
   screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
   screen->vtbl.compile_shader = GENX(pan_shader_compile);
   screen->vtbl.afbc_size = panfrost_afbc_size;
+   screen->vtbl.afbc_pack = panfrost_afbc_pack;

   GENX(pan_blitter_init)
   (dev, &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);
--- a/src/gallium/drivers/panfrost/pan_resource.c
+++ b/src/gallium/drivers/panfrost/pan_resource.c
@@ -907,6 +907,16 @@ panfrost_load_tiled_images(struct panfrost_transfer *transfer,
   }
 }

+/* Map a block position (x, y) to its index in a buffer whose blocks are
+ * stored in z-order inside 8x8 tiles, with `stride` blocks per row. */
+static unsigned
+get_morton_index(unsigned x, unsigned y, unsigned stride)
+{
+   /* Interleave the low three bits: x on even positions, y on odd ones */
+   unsigned in_tile = 0;
+   for (unsigned bit = 0; bit < 3; ++bit) {
+      in_tile |= ((x >> bit) & 1u) << (2 * bit);
+      in_tile |= ((y >> bit) & 1u) << (2 * bit + 1);
+   }
+
+   /* First index of the enclosing tile: full tile rows, then 64 per tile */
+   unsigned tile_row_base = (y & ~7u) * stride;
+   unsigned tile_col_base = (x & ~7u) << 3;
+
+   return tile_row_base + tile_col_base + in_tile;
+}
+
 static void
 panfrost_store_tiled_images(struct panfrost_transfer *transfer,
                            struct panfrost_resource *rsrc)
@@ -1303,6 +1313,110 @@ panfrost_get_afbc_superblock_sizes(struct panfrost_context *ctx,
   return bo;
 }

+/* Repack the AFBC data of `prsrc` into a tightly packed, non-tiled layout.
+ *
+ * Steps:
+ *  1. Bail out if any mip level has no valid data.
+ *  2. Fetch the per-superblock sizes with
+ *     panfrost_get_afbc_superblock_sizes() and wait for them on the CPU.
+ *  3. For every level, walk the superblocks in destination scan order and
+ *     store each block's running start offset in the metadata, then derive
+ *     the packed slice layout.
+ *  4. If the packed size differs from the current BO size, allocate a new BO,
+ *     run the pack shader for every level, and swap the resource over to the
+ *     new BO, slice layouts, and modifier (TILED and SPARSE cleared).
+ *
+ * The resource is left untouched when packing is skipped or when the new BO
+ * cannot be allocated. The metadata BO reference obtained in step 2 is
+ * dropped on every path that follows it.
+ *
+ * NOTE(review): image.data.offset is not reset although the new BO is laid
+ * out from offset 0 -- confirm it is always 0 for resources reaching here. */
+void
+panfrost_pack_afbc(struct panfrost_context *ctx,
+                   struct panfrost_resource *prsrc)
+{
+   struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+   struct panfrost_device *dev = pan_device(ctx->base.screen);
+   struct panfrost_bo *metadata_bo;
+   unsigned metadata_offsets[PIPE_MAX_TEXTURE_LEVELS];
+
+   uint64_t src_modifier = prsrc->image.layout.modifier;
+   uint64_t dst_modifier =
+      src_modifier & ~(AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE);
+   bool is_tiled = src_modifier & AFBC_FORMAT_MOD_TILED;
+   unsigned last_level = prsrc->base.last_level;
+   struct pan_image_slice_layout slice_infos[PIPE_MAX_TEXTURE_LEVELS] = {0};
+   unsigned total_size = 0;
+
+   /* It doesn't make sense to pack everything if we need to unpack right
+    * away to upload data to another level */
+   for (unsigned i = 0; i <= last_level; i++) {
+      if (!BITSET_TEST(prsrc->valid.data, i))
+         return;
+   }
+
+   metadata_bo = panfrost_get_afbc_superblock_sizes(ctx, prsrc, 0, last_level,
+                                                    metadata_offsets);
+   /* The sizes are read on the CPU below, so wait for them */
+   panfrost_bo_wait(metadata_bo, INT64_MAX, false);
+
+   for (unsigned level = 0; level <= last_level; ++level) {
+      struct pan_image_slice_layout *src_slice =
+         &prsrc->image.layout.slices[level];
+      struct pan_image_slice_layout *dst_slice = &slice_infos[level];
+
+      unsigned width = u_minify(prsrc->base.width0, level);
+      unsigned height = u_minify(prsrc->base.height0, level);
+      unsigned src_stride =
+         pan_afbc_stride_blocks(src_modifier, src_slice->row_stride);
+      unsigned dst_stride =
+         DIV_ROUND_UP(width, panfrost_afbc_superblock_width(dst_modifier));
+      unsigned dst_height =
+         DIV_ROUND_UP(height, panfrost_afbc_superblock_height(dst_modifier));
+
+      uint32_t offset = 0;
+      struct pan_afbc_block_info *meta =
+         metadata_bo->ptr.cpu + metadata_offsets[level];
+
+      /* Metadata entries are indexed by source position, while offsets are
+       * handed out in destination scan order */
+      for (unsigned y = 0, i = 0; y < dst_height; ++y) {
+         for (unsigned x = 0; x < dst_stride; ++x, ++i) {
+            unsigned idx = is_tiled ? get_morton_index(x, y, src_stride) : i;
+            uint32_t size = meta[idx].size;
+            meta[idx].offset = offset; /* write the start offset */
+            offset += size;
+         }
+      }
+
+      total_size = ALIGN_POT(total_size, pan_slice_align(dst_modifier));
+      {
+         dst_slice->afbc.stride = dst_stride;
+         dst_slice->afbc.nr_blocks = dst_stride * dst_height;
+         dst_slice->afbc.header_size =
+            ALIGN_POT(dst_stride * dst_height * AFBC_HEADER_BYTES_PER_TILE,
+                      pan_afbc_body_align(dst_modifier));
+         dst_slice->afbc.body_size = offset;
+         dst_slice->afbc.surface_stride = dst_slice->afbc.header_size + offset;
+
+         dst_slice->offset = total_size;
+         dst_slice->row_stride = dst_stride * AFBC_HEADER_BYTES_PER_TILE;
+         dst_slice->surface_stride = dst_slice->afbc.surface_stride;
+         dst_slice->size = dst_slice->afbc.surface_stride;
+      }
+      total_size += dst_slice->afbc.surface_stride;
+   }
+
+   unsigned new_size = ALIGN_POT(total_size, 4096); // FIXME
+   unsigned old_size = prsrc->image.data.bo->size;
+
+   /* Nothing to gain: release the metadata BO held here before bailing */
+   if (new_size == old_size) {
+      panfrost_bo_unreference(metadata_bo);
+      return;
+   }
+
+   if (dev->debug & PAN_DBG_PERF) {
+      /* Widen before scaling: 100 * new_size wraps in 32 bits past ~41 MiB */
+      unsigned ratio = (unsigned)((uint64_t)100 * new_size / old_size);
+      printf("%u%%: %u KB -> %u KB\n", ratio, old_size / 1024,
+             new_size / 1024);
+   }
+
+   struct panfrost_bo *dst =
+      panfrost_bo_create(dev, new_size, 0, "AFBC compact texture");
+
+   /* Keep the current layout if the packed BO could not be allocated */
+   if (!dst) {
+      panfrost_bo_unreference(metadata_bo);
+      return;
+   }
+
+   struct panfrost_batch *batch =
+      panfrost_get_fresh_batch_for_fbo(ctx, "AFBC compaction");
+
+   /* The source slice of each level is consumed by afbc_pack() before the
+    * packed layout overwrites it */
+   for (unsigned level = 0; level <= last_level; ++level) {
+      struct pan_image_slice_layout *slice = &slice_infos[level];
+      screen->vtbl.afbc_pack(batch, prsrc, dst, slice, metadata_bo,
+                             metadata_offsets[level], level);
+      prsrc->image.layout.slices[level] = *slice;
+   }
+
+   panfrost_flush_batches_accessing_rsrc(ctx, prsrc, "AFBC compaction flush");
+
+   prsrc->image.layout.modifier = dst_modifier;
+   panfrost_bo_unreference(prsrc->image.data.bo);
+   prsrc->image.data.bo = dst;
+   panfrost_bo_unreference(metadata_bo);
+}
+
 static void
 panfrost_ptr_unmap(struct pipe_context *pctx, struct pipe_transfer *transfer)
 {
--- a/src/gallium/drivers/panfrost/pan_resource.h
+++ b/src/gallium/drivers/panfrost/pan_resource.h
@@ -181,6 +181,9 @@ struct panfrost_bo *panfrost_get_afbc_superblock_sizes(
   struct panfrost_context *ctx, struct panfrost_resource *rsrc,
   unsigned first_level, unsigned last_level, unsigned *out_offsets);

+/* Repack the AFBC data of `prsrc` into a packed, non-tiled layout. Returns
+ * without changes if any mip level lacks valid data or if the packed size
+ * equals the current BO size; otherwise the resource's BO, slice layouts and
+ * modifier are replaced. */
+void panfrost_pack_afbc(struct panfrost_context *ctx,
+                        struct panfrost_resource *prsrc);
+
 void pan_resource_modifier_convert(struct panfrost_context *ctx,
                                   struct panfrost_resource *rsrc,
                                   uint64_t modifier, const char *reason);
--- a/src/gallium/drivers/panfrost/pan_screen.h
+++ b/src/gallium/drivers/panfrost/pan_screen.h
@@ -105,6 +105,13 @@ struct panfrost_vtable {
                     struct panfrost_resource *src,
                     struct panfrost_bo *metadata, unsigned offset,
                     unsigned level);
+
+   /* Run a compute shader to compact a sparse layout afbc resource.
+    * Packs mip level `level` of `src` into `dst` using the destination layout
+    * `slice`; `metadata` + `metadata_offset` locate the per-superblock
+    * size/offset entries for that level. */
+   void (*afbc_pack)(struct panfrost_batch *batch,
+                     struct panfrost_resource *src, struct panfrost_bo *dst,
+                     struct pan_image_slice_layout *slice,
+                     struct panfrost_bo *metadata, unsigned metadata_offset,
+                     unsigned level);
 };

 struct panfrost_screen {