asahi,agx: Use new tilebuffer infrastructure

Flag day change to replace the previous hardcoded background/end-of-tile shaders
and the API-style load/store_output in fragment shaders with the generated
shaders and lowered *_agx intrinsics. This gets us working non-UNORM8 render
targets and working MRT. It's also a step in the direction of working MSAA but
that needs a lot more work, since the multisampling programming model on AGX is
quite different from any of the APIs (including Metal).

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19871>
Author:     Alyssa Rosenzweig
Date:       2022-11-19 13:48:14 -05:00
Committed:  Marge Bot
Parent:     c5c0ea39f6
Commit:     74e92274af

11 changed files with 167 additions and 335 deletions
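Conceptually, the lowering that replaces the API-style fragment outputs rewrites store_output/load_output into the new *_agx tilebuffer intrinsics that the backend consumes in this diff. The following is a rough, hypothetical nir_builder-style sketch of that rewrite, not the actual agx_nir_lower_tilebuffer pass: the operand layout (src[0] = colour value, src[1] = sample mask, base = render target, format = tilebuffer pipe format, plus a write mask) is taken from what agx_emit_local_store_pixel reads below, while the function name, the unsigned format parameter, and the 16-bit all-samples mask are illustrative assumptions.

   /* Hypothetical sketch only: rewrite a fragment store_output into the
    * store_local_pixel_agx form consumed by the backend in this commit.
    * Helper name and details are assumptions; only the intrinsic name and
    * its indices/sources are taken from the diff itself.
    */
   #include "nir.h"
   #include "nir_builder.h"

   static void
   lower_fs_store_to_local_pixel(nir_builder *b, nir_intrinsic_instr *store,
                                 unsigned rt, unsigned tib_format /* enum pipe_format */)
   {
      b->cursor = nir_before_instr(&store->instr);

      nir_intrinsic_instr *pix = nir_intrinsic_instr_create(
         b->shader, nir_intrinsic_store_local_pixel_agx);

      pix->num_components = store->num_components;
      pix->src[0] = nir_src_for_ssa(store->src[0].ssa);           /* colour value */
      pix->src[1] = nir_src_for_ssa(nir_imm_intN_t(b, 0xFF, 16)); /* sample mask (all samples) */

      nir_intrinsic_set_base(pix, rt);            /* render target index */
      nir_intrinsic_set_format(pix, tib_format);  /* tilebuffer format */
      nir_intrinsic_set_write_mask(pix, nir_intrinsic_write_mask(store));

      nir_builder_instr_insert(b, &pix->instr);
      nir_instr_remove(&store->instr);
   }

A matching rewrite turns load_output into load_local_pixel_agx, which agx_emit_local_load_pixel handles in the first hunk below.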


@@ -504,14 +504,8 @@ agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
 }
 static agx_instr *
-agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
+agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
 {
-   nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
-   unsigned loc = sem.location;
-   assert(sem.dual_source_blend_index == 0 && "todo: dual-source blending");
-   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
-   unsigned rt = (loc - FRAG_RESULT_DATA0);
-   /* TODO: Reverse-engineer interactions with MRT */
    if (b->shader->key->fs.ignore_tib_dependencies) {
       assert(b->shader->nir->info.internal && "only for clear shaders");
@@ -532,19 +526,15 @@ agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
    b->shader->did_writeout = true;
    return agx_st_tile(b, agx_src_index(&instr->src[0]),
-                      b->shader->key->fs.tib_formats[rt],
-                      nir_intrinsic_write_mask(instr));
+                      agx_src_index(&instr->src[1]),
+                      agx_format_for_pipe(nir_intrinsic_format(instr)),
+                      nir_intrinsic_write_mask(instr),
+                      nir_intrinsic_base(instr));
 }
 static void
-agx_emit_load_tile(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
+agx_emit_local_load_pixel(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
 {
-   nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
-   unsigned loc = sem.location;
-   assert(sem.dual_source_blend_index == 0 && "dual src ld_tile is nonsense");
-   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
-   unsigned rt = (loc - FRAG_RESULT_DATA0);
-   /* TODO: Reverse-engineer interactions with MRT */
    assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
    agx_writeout(b, 0x0008);
@@ -552,8 +542,10 @@ agx_emit_load_tile(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
    b->shader->out->reads_tib = true;
    unsigned nr_comps = nir_dest_num_components(instr->dest);
-   agx_ld_tile_to(b, dest, b->shader->key->fs.tib_formats[rt],
-                  BITFIELD_MASK(nr_comps));
+   agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]),
+                  agx_format_for_pipe(nir_intrinsic_format(instr)),
+                  BITFIELD_MASK(nr_comps),
+                  nir_intrinsic_base(instr));
    agx_emit_cached_split(b, dest, nr_comps);
 }
@@ -770,16 +762,16 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
       return NULL;
    case nir_intrinsic_store_output:
-      if (stage == MESA_SHADER_FRAGMENT)
-         return agx_emit_fragment_out(b, instr);
-      else if (stage == MESA_SHADER_VERTEX)
-         return agx_emit_store_vary(b, instr);
-      else
-         unreachable("Unsupported shader stage");
+      assert(stage == MESA_SHADER_VERTEX);
+      return agx_emit_store_vary(b, instr);
-   case nir_intrinsic_load_output:
+   case nir_intrinsic_store_local_pixel_agx:
       assert(stage == MESA_SHADER_FRAGMENT);
-      agx_emit_load_tile(b, dst, instr);
+      return agx_emit_local_store_pixel(b, instr);
+   case nir_intrinsic_load_local_pixel_agx:
+      assert(stage == MESA_SHADER_FRAGMENT);
+      agx_emit_local_load_pixel(b, dst, instr);
       return NULL;
    case nir_intrinsic_load_ubo:


@@ -254,8 +254,6 @@ struct agx_vs_shader_key {
 };
 struct agx_fs_shader_key {
-   enum agx_format tib_formats[AGX_MAX_RTS];
    /* Normally, access to the tilebuffer must be guarded by appropriate fencing
    * instructions to ensure correct results in the presence of out-of-order
    * hardware optimizations. However, specially dispatched clear shaders are


@@ -300,15 +300,18 @@ typedef struct {
       uint32_t component;
       uint32_t channels;
       uint32_t bfi_mask;
+      uint16_t pixel_offset;
       enum agx_sr sr;
       enum agx_icond icond;
       enum agx_fcond fcond;
-      enum agx_format format;
       enum agx_round round;
       enum agx_lod_mode lod_mode;
       struct agx_block *target;
    };
+   /* For local access */
+   enum agx_format format;
    /* For load varying */
    bool perspective : 1;


@@ -94,6 +94,7 @@ SHIFT = immediate("shift")
 MASK = immediate("mask")
 BFI_MASK = immediate("bfi_mask")
 LOD_MODE = immediate("lod_mode", "enum agx_lod_mode")
+PIXEL_OFFSET = immediate("pixel_offset")
 DIM = enum("dim", {
     0: '1d',
@@ -250,11 +251,12 @@ op("get_sr", (0x72, 0x7F | L, 4, _), dests = 1, imms = [SR])
 op("sample_mask", (0x7fc1, 0xffff, 6, _), dests = 0, srcs = 1, can_eliminate = False)
-# Essentially same encoding
-op("ld_tile", (0x49, 0x7F, 8, _), dests = 1, srcs = 0, imms = [FORMAT, MASK], can_reorder = False)
+# Essentially same encoding. Last source is the sample mask
+op("ld_tile", (0x49, 0x7F, 8, _), dests = 1, srcs = 1,
+   imms = [FORMAT, MASK, PIXEL_OFFSET], can_reorder = False)
-op("st_tile", (0x09, 0x7F, 8, _), dests = 0, srcs = 1,
-   can_eliminate = False, imms = [FORMAT, MASK])
+op("st_tile", (0x09, 0x7F, 8, _), dests = 0, srcs = 2,
+   can_eliminate = False, imms = [FORMAT, MASK, PIXEL_OFFSET])
 for (name, exact) in [("any", 0xC000), ("none", 0xC200)]:
     op("jmp_exec_" + name, (exact, (1 << 16) - 1, 6, _), dests = 0, srcs = 0,


@@ -122,6 +122,7 @@ agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I,
       /* cmpselsrc takes integer immediates only */
       if (s >= 2 && I->op == AGX_OPCODE_FCMPSEL) float_src = false;
+      if (I->op == AGX_OPCODE_ST_TILE && s == 0) continue;
       if (float_src) {
          bool fp16 = (def->dest[0].size == AGX_SIZE_16);
@@ -215,9 +216,11 @@ agx_optimizer_forward(agx_context *ctx)
       agx_optimizer_fmov(defs, I);
       /* Inline immediates if we can. TODO: systematic */
-      if (I->op != AGX_OPCODE_ST_VARY && I->op != AGX_OPCODE_ST_TILE &&
-          I->op != AGX_OPCODE_COLLECT && I->op != AGX_OPCODE_TEXTURE_SAMPLE &&
-          I->op != AGX_OPCODE_TEXTURE_LOAD && I->op != AGX_OPCODE_UNIFORM_STORE &&
+      if (I->op != AGX_OPCODE_ST_VARY &&
+          I->op != AGX_OPCODE_COLLECT &&
+          I->op != AGX_OPCODE_TEXTURE_SAMPLE &&
+          I->op != AGX_OPCODE_TEXTURE_LOAD &&
+          I->op != AGX_OPCODE_UNIFORM_STORE &&
           I->op != AGX_OPCODE_BLOCK_IMAGE_STORE)
          agx_optimizer_inline_imm(defs, I, info.nr_srcs, info.is_float);
    }


@@ -426,18 +426,28 @@ agx_pack_instr(struct util_dynarray *emission, struct util_dynarray *fixups, agx
    {
       bool load = (I->op == AGX_OPCODE_LD_TILE);
       unsigned D = agx_pack_alu_dst(load ? I->dest[0] : I->src[0]);
-      unsigned rt = 0; /* TODO */
       assert(I->mask < 0x10);
+      assert(I->pixel_offset < 0x200);
+      agx_index sample_index = load ? I->src[0] : I->src[1];
+      assert(sample_index.type == AGX_INDEX_REGISTER ||
+             sample_index.type == AGX_INDEX_IMMEDIATE);
+      assert(sample_index.size == AGX_SIZE_16);
+      unsigned St = (sample_index.type == AGX_INDEX_REGISTER) ? 1 : 0;
+      unsigned S = sample_index.value;
+      assert(S < 0x100);
       uint64_t raw =
-         0x09 |
-         (load ? (1 << 6) : 0) |
+         agx_opcodes_info[I->op].encoding.exact |
          ((uint64_t) (D & BITFIELD_MASK(8)) << 7) |
+         (St << 22) |
          ((uint64_t) (I->format) << 24) |
-         ((uint64_t) (rt) << 32) |
+         ((uint64_t) (I->pixel_offset & BITFIELD_MASK(7)) << 28) |
          (load ? (1ull << 35) : 0) |
          ((uint64_t) (I->mask) << 36) |
-         ((uint64_t) 0x0380FC << 40) |
+         ((uint64_t) (I->pixel_offset >> 7) << 40) |
+         ((uint64_t) (S & BITFIELD_MASK(6)) << 42) |
+         ((uint64_t) (S >> 6) << 56) |
          (((uint64_t) (D >> 8)) << 60);
       unsigned size = 8;


@@ -61,18 +61,6 @@ struct agx_device {
    pthread_mutex_t bo_map_lock;
    struct util_sparse_array bo_map;
-   /* Fixed shaders */
-   struct {
-      struct agx_bo *bo;
-      uint32_t clear;
-      uint32_t store;
-   } internal;
-   struct {
-      struct agx_bo *bo;
-      uint32_t format[AGX_NUM_FORMATS];
-   } reload;
 };
bool


@@ -1,4 +1,4 @@
-/*
+/*
  * Copyright (C) 2021 Alyssa Rosenzweig
  * Copyright (C) 2020-2021 Collabora, Ltd.
  * Copyright (C) 2014 Broadcom
@@ -28,61 +28,6 @@
 #include "asahi/compiler/agx_compile.h"
 #include "gallium/auxiliary/util/u_blitter.h"
-static void
-agx_build_reload_shader(struct agx_device *dev)
-{
-   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
-         &agx_nir_options, "agx_reload");
-   nir_variable *out = nir_variable_create(b.shader, nir_var_shader_out,
-         glsl_vector_type(GLSL_TYPE_FLOAT, 4), "output");
-   out->data.location = FRAG_RESULT_DATA0;
-   nir_ssa_def *fragcoord = nir_load_frag_coord(&b);
-   nir_ssa_def *coord = nir_channels(&b, fragcoord, 0x3);
-   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
-   tex->dest_type = nir_type_float32;
-   tex->sampler_dim = GLSL_SAMPLER_DIM_RECT;
-   tex->op = nir_texop_tex;
-   tex->src[0].src_type = nir_tex_src_coord;
-   tex->src[0].src = nir_src_for_ssa(coord);
-   tex->coord_components = 2;
-   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
-   nir_builder_instr_insert(&b, &tex->instr);
-   nir_store_var(&b, out, &tex->dest.ssa, 0xFF);
-   unsigned offset = 0;
-   unsigned bo_size = 4096;
-   struct agx_bo *bo = agx_bo_create(dev, bo_size, AGX_MEMORY_TYPE_SHADER);
-   dev->reload.bo = bo;
-   for (unsigned i = 0; i < AGX_NUM_FORMATS; ++i) {
-      struct util_dynarray binary;
-      util_dynarray_init(&binary, NULL);
-      nir_shader *s = nir_shader_clone(NULL, b.shader);
-      struct agx_shader_info info;
-      struct agx_shader_key key = {
-         .fs.tib_formats[0] = i,
-         .fs.ignore_tib_dependencies = true,
-      };
-      agx_preprocess_nir(s);
-      agx_compile_shader_nir(s, &key, NULL, &binary, &info);
-      assert(offset + binary.size < bo_size);
-      memcpy(((uint8_t *) bo->ptr.cpu) + offset, binary.data, binary.size);
-      dev->reload.format[i] = bo->ptr.gpu + offset;
-      offset += ALIGN_POT(binary.size, 128);
-      util_dynarray_fini(&binary);
-   }
-}
 void
 agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                  bool render_cond)
@@ -133,51 +78,3 @@ agx_blit(struct pipe_context *pipe,
    agx_blitter_save(ctx, ctx->blitter, info->render_condition_enable);
    util_blitter_blit(ctx->blitter, info);
 }
-/* We need some fixed shaders for common rendering tasks. When colour buffer
- * reload is not in use, a shader is used to clear a particular colour. At the
- * end of rendering a tile, a shader is used to write it out. These shaders are
- * too trivial to go through the compiler at this stage. */
-#define AGX_STOP \
-   0x88, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, \
-   0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00 \
-#define AGX_BLEND \
-   0x09, 0x00, 0x00, 0x04, 0xf0, 0xfc, 0x80, 0x03
-/* Clears the tilebuffer, where u6-u7 are preloaded with the FP16 clear colour
-   0: 7e018c098040 bitop_mov r0, u6
-   6: 7e058e098000 bitop_mov r1, u7
-   c: 09000004f0fc8003 TODO.blend
-*/
-static uint8_t shader_clear[] = {
-   0x7e, 0x01, 0x8c, 0x09, 0x80, 0x40,
-   0x7e, 0x05, 0x8e, 0x09, 0x80, 0x00,
-   AGX_BLEND,
-   AGX_STOP
-};
-static uint8_t shader_store[] = {
-   0x7e, 0x00, 0x04, 0x09, 0x80, 0x00,
-   0xb1, 0x80, 0x00, 0x80, 0x00, 0x4a, 0x00, 0x00, 0x0a, 0x00,
-   AGX_STOP
-};
-void
-agx_internal_shaders(struct agx_device *dev)
-{
-   unsigned clear_offset = 0;
-   unsigned store_offset = 1024;
-   struct agx_bo *bo = agx_bo_create(dev, 4096, AGX_MEMORY_TYPE_SHADER);
-   memcpy(((uint8_t *) bo->ptr.cpu) + clear_offset, shader_clear, sizeof(shader_clear));
-   memcpy(((uint8_t *) bo->ptr.cpu) + store_offset, shader_store, sizeof(shader_store));
-   dev->internal.bo = bo;
-   dev->internal.clear = bo->ptr.gpu + clear_offset;
-   dev->internal.store = bo->ptr.gpu + store_offset;
-   agx_build_reload_shader(dev);
-}


@@ -668,8 +668,15 @@ agx_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor
    assert(scissor_state == NULL && "we don't support PIPE_CAP_CLEAR_SCISSORED");
    /* Fast clears configure the batch */
-   if (fastclear & PIPE_CLEAR_COLOR0)
-      memcpy(batch->clear_color, color->f, sizeof(color->f));
+   for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
+      if (!(fastclear & (PIPE_CLEAR_COLOR0 << rt)))
+         continue;
+      static_assert(sizeof(color->f) == 16, "mismatched structure");
+      batch->uploaded_clear_color[rt] =
+         agx_pool_upload_aligned(&batch->pool, color->f, sizeof(color->f), 16);
+   }
    if (fastclear & PIPE_CLEAR_DEPTH)
       batch->clear_depth = depth;
@@ -731,49 +738,21 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
    uint8_t stop[5 + 64] = { 0x00, 0x00, 0x00, 0xc0, 0x00 };
    memcpy(batch->encoder_current, stop, sizeof(stop));
    /* Emit the commandbuffer */
-   uint64_t pipeline_clear = 0, pipeline_reload = 0;
+   uint64_t pipeline_background = agx_build_meta(batch, false, false);
+   uint64_t pipeline_background_partial = agx_build_meta(batch, false, true);
+   uint64_t pipeline_store = agx_build_meta(batch, true, false);
    bool clear_pipeline_textures = false;
-   uint16_t clear_colour[4] = {
-      _mesa_float_to_half(batch->clear_color[0]),
-      _mesa_float_to_half(batch->clear_color[1]),
-      _mesa_float_to_half(batch->clear_color[2]),
-      _mesa_float_to_half(batch->clear_color[3])
-   };
-   pipeline_clear = agx_build_clear_pipeline(batch,
-                       dev->internal.clear,
-                       agx_pool_upload(&batch->pool, clear_colour, sizeof(clear_colour)));
-   if (batch->key.cbufs[0]) {
-      enum agx_format internal = AGX_FORMAT_U8NORM /* other formats broken */;
-      uint32_t shader = dev->reload.format[internal];
-      pipeline_reload = agx_build_reload_pipeline(batch, shader,
-                           batch->key.cbufs[0]);
-   }
-   if (batch->key.cbufs[0] && !(batch->clear & PIPE_CLEAR_COLOR0)) {
-      clear_pipeline_textures = true;
-      pipeline_clear = pipeline_reload;
-   }
-   uint64_t pipeline_store = 0;
-   if (batch->key.cbufs[0]) {
-      pipeline_store =
-         agx_build_store_pipeline(batch,
-                                  dev->internal.store,
-                                  agx_batch_upload_pbe(batch, 0));
-   }
    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
       struct pipe_surface *surf = batch->key.cbufs[i];
       if (surf && surf->texture) {
          struct agx_resource *rt = agx_resource(surf->texture);
          BITSET_SET(rt->data_valid, surf->u.tex.level);
+         if (!(batch->clear & (PIPE_CLEAR_COLOR0 << i)))
+            clear_pipeline_textures = true;
       }
    }
@@ -797,8 +776,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
    agx_batch_add_bo(batch, batch->encoder);
    agx_batch_add_bo(batch, batch->scissor.bo);
    agx_batch_add_bo(batch, batch->depth_bias.bo);
-   agx_batch_add_bo(batch, dev->internal.bo);
-   agx_batch_add_bo(batch, dev->reload.bo);
    unsigned handle_count =
       agx_batch_num_bo(batch) +
@@ -832,8 +809,8 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
       encoder_id,
       batch->scissor.bo->ptr.gpu,
       batch->depth_bias.bo->ptr.gpu,
-      pipeline_clear,
-      pipeline_reload,
+      pipeline_background,
+      pipeline_background_partial,
       pipeline_store,
       clear_pipeline_textures,
       batch->clear,
@@ -846,8 +823,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
    free(handles);
    agx_submit_cmdbuf(dev, dev->cmdbuf.handle, dev->memmap.handle, dev->queue.id);
    agx_wait_queue(dev->queue);
    if (dev->debug & AGX_DBG_TRACE) {
@@ -929,6 +904,7 @@ agx_create_context(struct pipe_screen *screen,
    pctx->invalidate_resource = agx_invalidate_resource;
    agx_init_state_functions(pctx);
+   agx_meta_init(&ctx->meta, agx_device(screen), ctx);
    ctx->blitter = util_blitter_create(pctx);
@@ -1529,7 +1505,5 @@ agx_screen_create(int fd, struct renderonly *ro, struct sw_winsys *winsys)
                            U_TRANSFER_HELPER_MSAA_MAP |
                            U_TRANSFER_HELPER_Z24_IN_Z32F);
-   agx_internal_shaders(&agx_screen->dev);
    return screen;
 }


@@ -1140,6 +1140,14 @@ agx_compile_variant(struct agx_device *dev,
    }
    agx_preprocess_nir(nir);
+   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+      struct agx_tilebuffer_layout tib =
+         agx_build_tilebuffer_layout(key->rt_formats, key->nr_cbufs, 1);
+      agx_nir_lower_tilebuffer(nir, &tib);
+   }
    agx_compile_shader_nir(nir, &key->base, debug, &binary, &compiled->info);
    if (binary.size) {
@@ -1205,7 +1213,7 @@ agx_create_shader_state(struct pipe_context *pctx,
    }
    case MESA_SHADER_FRAGMENT:
       key.nr_cbufs = 1;
-      key.base.fs.tib_formats[0] = AGX_FORMAT_U8NORM;
+      key.rt_formats[0] = PIPE_FORMAT_R8G8B8A8_UNORM;
       break;
    default:
       unreachable("Unknown shader stage in shader-db precompile");
@@ -1275,13 +1283,7 @@ agx_update_fs(struct agx_batch *batch)
    for (unsigned i = 0; i < key.nr_cbufs; ++i) {
       struct pipe_surface *surf = batch->key.cbufs[i];
-      if (surf) {
-         enum pipe_format fmt = surf->format;
-         key.rt_formats[i] = fmt;
-         key.base.fs.tib_formats[i] = AGX_FORMAT_U8NORM /* other formats broken */;
-      } else {
-         key.rt_formats[i] = PIPE_FORMAT_NONE;
-      }
+      key.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
    }
    memcpy(&key.blend, ctx->blend, sizeof(key.blend));
@@ -1417,42 +1419,91 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs, enum
    return agx_usc_fini(&b);
 }
-/* Internal pipelines (TODO: refactor?) */
 uint64_t
-agx_build_clear_pipeline(struct agx_batch *batch, uint32_t code, uint64_t clear_buf)
+agx_build_meta(struct agx_batch *batch, bool store, bool partial_render)
 {
+   struct agx_context *ctx = batch->ctx;
+   /* Construct the key */
+   struct agx_meta_key key = {
+      .tib = batch->tilebuffer_layout
+   };
+   for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
+      struct pipe_surface *surf = batch->key.cbufs[rt];
+      if (surf == NULL)
+         continue;
+      if (store) {
+         /* TODO: Suppress stores to discarded render targets */
+         key.op[rt] = AGX_META_OP_STORE;
+      } else {
+         bool load = !(batch->clear & (PIPE_CLEAR_COLOR0 << rt));
+         /* The background program used for partial renders must always load
+          * whatever was stored in the mid-frame end-of-tile program.
+          */
+         load |= partial_render;
+         key.op[rt] = load ? AGX_META_OP_LOAD : AGX_META_OP_CLEAR;
+      }
+   }
+   /* Get the shader */
+   struct agx_meta_shader *shader = agx_get_meta_shader(&ctx->meta, &key);
+   agx_batch_add_bo(batch, shader->bo);
+   /* Begin building the pipeline */
    struct agx_usc_builder b =
-      agx_alloc_usc_control(&batch->pipeline_pool, 1);
+      agx_alloc_usc_control(&batch->pipeline_pool, 1 + PIPE_MAX_COLOR_BUFS);
-   agx_usc_pack(&b, UNIFORM, cfg) {
-      cfg.start_halfs = (6 * 2);
-      cfg.size_halfs = 4;
-      cfg.buffer = clear_buf;
+   for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
+      if (key.op[rt] == AGX_META_OP_LOAD) {
+         /* Each reloaded render target is textured */
+         struct agx_ptr texture = agx_pool_alloc_aligned(&batch->pool, AGX_TEXTURE_LENGTH, 64);
+         struct pipe_surface *surf = batch->key.cbufs[rt];
+         assert(surf != NULL && "cannot load nonexistant attachment");
+         struct agx_resource *rsrc = agx_resource(surf->texture);
+         agx_pack_texture(texture.cpu, rsrc, surf->format, &(struct pipe_sampler_view) {
+            /* To reduce shader variants, we always use a 2D texture. For
+             * reloads of arrays and cube maps, we map a single layer as a 2D
+             * image.
+             */
+            .target = PIPE_TEXTURE_2D,
+            .swizzle_r = PIPE_SWIZZLE_X,
+            .swizzle_g = PIPE_SWIZZLE_Y,
+            .swizzle_b = PIPE_SWIZZLE_Z,
+            .swizzle_a = PIPE_SWIZZLE_W,
+            .u.tex = {
+               .first_layer = surf->u.tex.first_layer,
+               .last_layer = surf->u.tex.last_layer,
+               .first_level = surf->u.tex.level,
+               .last_level = surf->u.tex.level
+            }
+         });
+         agx_usc_pack(&b, TEXTURE, cfg) {
+            cfg.start = rt;
+            cfg.count = 1;
+            cfg.buffer = texture.gpu;
+         }
+      } else if (key.op[rt] == AGX_META_OP_CLEAR) {
+         assert(batch->uploaded_clear_color[rt] && "set when cleared");
+         agx_usc_uniform(&b, 8 * rt, 8, batch->uploaded_clear_color[rt]);
+      } else if (key.op[rt] == AGX_META_OP_STORE) {
+         agx_usc_pack(&b, TEXTURE, cfg) {
+            cfg.start = rt;
+            cfg.count = 1;
+            cfg.buffer = agx_batch_upload_pbe(batch, rt);
+         }
+      }
+   }
-   agx_usc_pack(&b, SHARED, cfg) {
-      cfg.uses_shared_memory = true;
-      cfg.layout = AGX_SHARED_LAYOUT_32X32;
-      cfg.sample_stride_in_8_bytes = 1;
-      cfg.bytes_per_threadgroup = 32 * 256;
-   }
-   agx_usc_pack(&b, SHADER, cfg) {
-      cfg.code = code;
-      cfg.unk_2 = 3;
-   }
-   agx_usc_pack(&b, REGISTERS, cfg) cfg.register_count = 8;
-   agx_usc_pack(&b, NO_PRESHADER, cfg);
-   return agx_usc_fini(&b);
-}
-uint64_t
-agx_build_reload_pipeline(struct agx_batch *batch, uint32_t code, struct pipe_surface *surf)
-{
+   /* All render targets share a sampler */
    struct agx_ptr sampler = agx_pool_alloc_aligned(&batch->pool, AGX_SAMPLER_LENGTH, 64);
-   struct agx_ptr texture = agx_pool_alloc_aligned(&batch->pool, AGX_TEXTURE_LENGTH, 64);
    agx_pack(sampler.cpu, SAMPLER, cfg) {
       cfg.magnify_linear = true;
@@ -1466,63 +1517,17 @@ agx_build_reload_pipeline(struct agx_batch *batch, uint32_t code, struct pipe_su
       cfg.unk_3 = 0;
    }
-   agx_pack(texture.cpu, TEXTURE, cfg) {
-      struct agx_resource *rsrc = agx_resource(surf->texture);
-      unsigned layer = surf->u.tex.first_layer;
-      const struct util_format_description *desc =
-         util_format_description(surf->format);
-      /* To reduce shader variants, we always use a 2D texture. For reloads of
-       * arrays and cube maps, we map a single layer as a 2D image.
-       */
-      cfg.dimension = AGX_TEXTURE_DIMENSION_2D;
-      cfg.layout = agx_translate_layout(rsrc->layout.tiling);
-      cfg.channels = agx_pixel_format[surf->format].channels;
-      cfg.type = agx_pixel_format[surf->format].type;
-      cfg.swizzle_r = agx_channel_from_pipe(desc->swizzle[0]);
-      cfg.swizzle_g = agx_channel_from_pipe(desc->swizzle[1]);
-      cfg.swizzle_b = agx_channel_from_pipe(desc->swizzle[2]);
-      cfg.swizzle_a = agx_channel_from_pipe(desc->swizzle[3]);
-      cfg.width = surf->width;
-      cfg.height = surf->height;
-      cfg.first_level = surf->u.tex.level;
-      cfg.last_level = surf->u.tex.level;
-      cfg.unk_mipmapped = rsrc->mipmapped;
-      cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
-      cfg.address = agx_map_texture_gpu(rsrc, layer);
-      if (rsrc->layout.tiling == AIL_TILING_LINEAR)
-         cfg.stride = ail_get_linear_stride_B(&rsrc->layout, surf->u.tex.level) - 16;
-      else
-         cfg.unk_tiled = true;
-   }
-   struct agx_usc_builder b =
-      agx_alloc_usc_control(&batch->pipeline_pool, 2);
-   agx_usc_pack(&b, TEXTURE, cfg) {
-      cfg.start = 0;
-      cfg.count = 1;
-      cfg.buffer = texture.gpu;
-   }
    agx_usc_pack(&b, SAMPLER, cfg) {
       cfg.start = 0;
       cfg.count = 1;
       cfg.buffer = sampler.gpu;
    }
-   agx_usc_pack(&b, SHARED, cfg) {
-      cfg.uses_shared_memory = true;
-      cfg.layout = AGX_SHARED_LAYOUT_32X32;
-      cfg.sample_stride_in_8_bytes = 1;
-      cfg.sample_count = 1;
-      cfg.bytes_per_threadgroup = 8 * 32 * 32;
-   }
+   agx_usc_tilebuffer(&b, &batch->tilebuffer_layout);
    agx_usc_pack(&b, SHADER, cfg) {
-      cfg.code = code;
-      cfg.unk_2 = 3;
+      cfg.code = shader->ptr;
+      cfg.unk_2 = 0;
    }
    agx_usc_pack(&b, REGISTERS, cfg) cfg.register_count = 256;
@@ -1531,40 +1536,6 @@ agx_build_reload_pipeline(struct agx_batch *batch, uint32_t code, struct pipe_su
    return agx_usc_fini(&b);
 }
-uint64_t
-agx_build_store_pipeline(struct agx_batch *batch, uint32_t code,
-                         uint64_t render_target)
-{
-   struct agx_usc_builder b = agx_alloc_usc_control(&batch->pipeline_pool, 2);
-   agx_usc_pack(&b, TEXTURE, cfg) {
-      cfg.start = 0;
-      cfg.count = 1;
-      cfg.buffer = render_target;
-   }
-   uint32_t unk[] = { 0, ~0 };
-   agx_usc_pack(&b, UNIFORM, cfg) {
-      cfg.start_halfs = 4;
-      cfg.size_halfs = 4;
-      cfg.buffer = agx_pool_upload_aligned(&batch->pool, unk, sizeof(unk), 16);
-   }
-   agx_usc_pack(&b, SHARED, cfg) {
-      cfg.uses_shared_memory = true;
-      cfg.layout = AGX_SHARED_LAYOUT_32X32;
-      cfg.sample_stride_in_8_bytes = 1;
-      cfg.bytes_per_threadgroup = 32 * 256;
-   }
-   agx_usc_pack(&b, SHADER, cfg) cfg.code = code;
-   agx_usc_pack(&b, REGISTERS, cfg) cfg.register_count = 8;
-   agx_usc_pack(&b, NO_PRESHADER, cfg);
-   return agx_usc_fini(&b);
-}
 void
 agx_batch_init_state(struct agx_batch *batch)
 {


@@ -39,6 +39,7 @@
#include "compiler/nir/nir_lower_blend.h"
#include "util/hash_table.h"
#include "util/bitset.h"
#include "agx_meta.h"
struct agx_streamout_target {
struct pipe_stream_output_target base;
@@ -106,7 +107,7 @@ struct agx_batch {
/* Base of uploaded texture descriptors */
uint64_t textures;
float clear_color[4];
uint64_t uploaded_clear_color[PIPE_MAX_COLOR_BUFS];
double clear_depth;
unsigned clear_stencil;
@@ -225,6 +226,8 @@ struct agx_context {
/* Map of agx_resource to agx_batch that writes that resource */
struct hash_table *writer;
struct agx_meta_cache meta;
};
static inline struct agx_context *
@@ -368,16 +371,6 @@ agx_push_location(struct agx_batch *batch, struct agx_push push,
bool
agx_batch_is_active(struct agx_batch *batch);
uint64_t
agx_build_clear_pipeline(struct agx_batch *batch, uint32_t code, uint64_t clear_buf);
uint64_t
agx_build_store_pipeline(struct agx_batch *batch, uint32_t code,
uint64_t render_target);
uint64_t
agx_build_reload_pipeline(struct agx_batch *batch, uint32_t code, struct pipe_surface *surf);
uint64_t
agx_batch_upload_pbe(struct agx_batch *batch, unsigned rt);
@@ -448,11 +441,12 @@ agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
void agx_blit(struct pipe_context *pipe,
const struct pipe_blit_info *info);
void agx_internal_shaders(struct agx_device *dev);
/* Batch logic */
void
agx_batch_init_state(struct agx_batch *batch);
uint64_t
agx_build_meta(struct agx_batch *batch, bool store, bool partial_render);
#endif