agx: introduce "memory variables"

in order to spill, we need to be able to assign spill slots. this is equivalent
to register allocation, so we borrow NAK's neat solution: add "memory variables"
into the IR which get RA'd to "memory registers" that correspond directly to
stack slots. then we can just spill by inserting moves.

this requires extensions to RA, parallel copy lowering, and the IR itself. but
it makes competent spilling so much more straightforward.

to exercise this code path, the commit also includes a trivial spiller that
simply moves every source/destination between memory variables and general
variables, which has the effect of spilling everything. to be replaced by
Braun-Hack eventually; this is just the foundations.
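
as an illustration (not part of the commit), with the helpers added below a spill
and a fill of a 32-bit value are just moves against a memory register; given an
agx_builder b, with slot and register numbers made up:

   /* hypothetical sketch: spill a 32-bit GPR to memory register 0, then reload it */
   agx_index slot = agx_memory_register(0, AGX_SIZE_32); /* spill slot, in 16-bit units */
   agx_mov_to(&b, slot, agx_register(4, AGX_SIZE_32));   /* spill: GPR -> memory */
   agx_mov_to(&b, agx_register(4, AGX_SIZE_32), slot);   /* fill: memory -> GPR */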

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
Alyssa Rosenzweig
2024-01-27 14:14:23 -04:00
committed by Marge Bot
parent 06eb552baa
commit 40da539d01
5 changed files with 449 additions and 125 deletions


@@ -26,6 +26,9 @@ extern "C" {
/* u0-u255 inclusive, as pairs of 16-bits */
#define AGX_NUM_UNIFORMS (512)
/* Semi-arbitrary limit for spill slot allocation */
#define AGX_NUM_MODELED_REGS (2048)
enum agx_index_type {
AGX_INDEX_NULL = 0,
AGX_INDEX_NORMAL = 1,
@@ -71,6 +74,9 @@ typedef struct {
bool abs : 1;
bool neg : 1;
/* Register class */
bool memory : 1;
unsigned channels_m1 : 3;
enum agx_size size : 2;
enum agx_index_type type : 3;
@@ -139,12 +145,22 @@ agx_register(uint32_t imm, enum agx_size size)
}
static inline agx_index
agx_register_like(uint32_t imm, agx_index like)
agx_memory_register(uint32_t imm, enum agx_size size)
{
assert(imm < AGX_NUM_REGS);
return (agx_index){
.value = imm,
.memory = true,
.size = size,
.type = AGX_INDEX_REGISTER,
};
}
static inline agx_index
agx_register_like(uint32_t imm, agx_index like)
{
return (agx_index){
.value = imm,
.memory = like.memory,
.channels_m1 = like.channels_m1,
.size = like.size,
.type = AGX_INDEX_REGISTER,
@@ -398,7 +414,7 @@ typedef struct agx_block {
/* For visited blocks during register assignment and live-out registers, the
* mapping of SSA names to registers at the end of the block.
*/
uint8_t *ssa_to_reg_out;
uint16_t *ssa_to_reg_out;
/* Is this block a loop header? If not, all of its predecessors precede it in
* source order.
@@ -464,6 +480,15 @@ typedef struct {
*/
agx_index vertex_id, instance_id;
/* Beginning of our stack allocation used for spilling, below that is
* NIR-level scratch.
*/
unsigned spill_base;
/* Beginning of stack allocation used for parallel copy lowering */
bool has_spill_pcopy_reserved;
unsigned spill_pcopy_base;
/* Stats for shader-db */
unsigned loop_count;
unsigned spills;
@@ -883,6 +908,9 @@ struct agx_copy {
/* Base register destination of the copy */
unsigned dest;
/* Destination is memory */
bool dest_mem;
/* Source of the copy */
agx_index src;
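
As a hedged sketch of how the new flag is meant to be used (the field names are from
this commit, the concrete values are illustrative), a copy whose destination is a spill
slot sets dest_mem and interprets dest as a memory register index:

   /* illustrative: copy a 32-bit GPR into memory register index 8 */
   struct agx_copy copy = {
      .dest = 8,                           /* memory register index, 16-bit units */
      .dest_mem = true,                    /* destination is a spill slot */
      .src = agx_register(2, AGX_SIZE_32), /* source stays a GPR */
   };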


@@ -22,15 +22,47 @@
* We only handle register-register copies, not general agx_index sources. This
* suffices for its internal use for register allocation.
*/
static agx_index
scratch_slot(agx_context *ctx, enum agx_size size)
{
/* Reserve scratch slots. scratch_size is in bytes, spill_pcopy_base and
* agx_memory_register are in memory registers (16-bit elements).
*/
if (!ctx->has_spill_pcopy_reserved) {
ctx->scratch_size = align(ctx->scratch_size, 16);
ctx->spill_pcopy_base = ctx->scratch_size / 2;
ctx->scratch_size += 8;
ctx->has_spill_pcopy_reserved = true;
}
return agx_memory_register(ctx->spill_pcopy_base, size);
}
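
To make the unit conversion concrete (my numbers, not the commit's): if scratch_size
were 20 bytes, it is first aligned up to 32, spill_pcopy_base becomes 32 / 2 = 16 in
16-bit memory registers, and the 8 reserved bytes give four 16-bit elements, enough
for the 2x32-bit scratch vector that the swap lowering below saves and restores.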
static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
agx_index dst = agx_register(copy->dest, copy->src.size);
agx_index dst = copy->dest_mem
? agx_memory_register(copy->dest, copy->src.size)
: agx_register(copy->dest, copy->src.size);
if (copy->src.type == AGX_INDEX_IMMEDIATE)
if (copy->dest_mem && copy->src.memory) {
/* Memory-memory copies need to be lowered to memory-register and
* register-memory, spilling a GPR to an auxiliary memory slot. This
* avoids needing to reserve a scratch register for this edge case.
*/
agx_index scratch_reg = agx_register(0, copy->src.size);
agx_index scratch_mem = scratch_slot(b->shader, copy->src.size);
agx_mov_to(b, scratch_mem, scratch_reg);
agx_mov_to(b, scratch_reg, copy->src);
agx_mov_to(b, dst, scratch_reg);
agx_mov_to(b, scratch_reg, scratch_mem);
} else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
agx_mov_imm_to(b, dst, copy->src.value);
else
} else {
agx_mov_to(b, dst, copy->src);
}
}
static void
@@ -43,7 +75,7 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
/* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
if (copy->src.size == AGX_SIZE_16 &&
(copy->dest >> 1) == (copy->src.value >> 1)) {
(copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) {
assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
"no trivial swaps, and only 2 halves of a register");
@@ -58,9 +90,32 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
return;
}
agx_index x = agx_register(copy->dest, copy->src.size);
agx_index x = copy->dest_mem
? agx_memory_register(copy->dest, copy->src.size)
: agx_register(copy->dest, copy->src.size);
agx_index y = copy->src;
/* Memory-memory swaps need to be lowered */
assert(x.memory == y.memory);
if (x.memory) {
agx_index temp1 = agx_register(0, copy->src.size);
agx_index temp2 = agx_register(2, copy->src.size);
agx_index scratch_reg2 = agx_register(0, copy->src.size);
agx_index scratch_mem2 = scratch_slot(b->shader, copy->src.size);
scratch_reg2.channels_m1++;
scratch_mem2.channels_m1++;
agx_mov_to(b, scratch_mem2, scratch_reg2);
agx_mov_to(b, temp1, x);
agx_mov_to(b, temp2, y);
agx_mov_to(b, y, temp1);
agx_mov_to(b, x, temp2);
agx_mov_to(b, scratch_reg2, scratch_mem2);
return;
}
/* Otherwise, we're swapping GPRs and fall back on an XOR swap. */
agx_xor_to(b, x, x, y);
agx_xor_to(b, y, x, y);
agx_xor_to(b, x, x, y);
@@ -74,12 +129,12 @@ struct copy_ctx {
* source. Once this drops to zero, then the physreg is unblocked and can
* be moved to.
*/
unsigned physreg_use_count[AGX_NUM_REGS];
unsigned physreg_use_count[AGX_NUM_MODELED_REGS];
/* For each physreg, the pending copy_entry that uses it as a dest. */
struct agx_copy *physreg_dest[AGX_NUM_REGS];
struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];
struct agx_copy entries[AGX_NUM_REGS];
struct agx_copy entries[AGX_NUM_MODELED_REGS];
};
static bool
@@ -96,7 +151,8 @@ entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
static bool
is_real(struct agx_copy *entry)
{
return entry->src.type == AGX_INDEX_REGISTER;
return entry->src.type == AGX_INDEX_REGISTER &&
entry->dest_mem == entry->src.memory;
}
/* TODO: Generalize to other bit sizes */
@@ -109,6 +165,7 @@ split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];
new_entry->dest = entry->dest + 1;
new_entry->dest_mem = entry->dest_mem;
new_entry->src = entry->src;
new_entry->src.value += 1;
new_entry->done = false;
@@ -117,9 +174,9 @@ split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
ctx->physreg_dest[entry->dest + 1] = new_entry;
}
void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
unsigned num_copies)
static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
unsigned num_copies, bool cls)
{
/* First, lower away 64-bit copies to smaller chunks, since we don't have
* 64-bit ALU so we always want to split.
@@ -130,6 +187,12 @@ agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
for (unsigned i = 0; i < num_copies; ++i) {
struct agx_copy copy = copies[i];
/* Filter by class */
if (copy.dest_mem != cls)
continue;
assert(copy.dest < AGX_NUM_MODELED_REGS);
if (copy.src.size == AGX_SIZE_64) {
copy.src.size = AGX_SIZE_32;
copies2[num_copies2++] = copy;
@@ -354,3 +417,14 @@ agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
free(copies2);
}
void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
unsigned num_copies)
{
/* Emit copies for each register class separately because we don't have
* register class awareness in the parallel copy lowering data structure.
*/
agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}
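
A hedged usage sketch (illustrative values, given an agx_builder *b): one call lowers
both classes, handling GPR-destination copies in the first pass and memory-destination
copies in the second.

   /* illustrative: one GPR-class copy and one memory-class copy in a single batch */
   struct agx_copy copies[] = {
      {.dest = 0, .src = agx_register(2, AGX_SIZE_16)},                   /* GPR class */
      {.dest = 4, .dest_mem = true, .src = agx_register(6, AGX_SIZE_16)}, /* memory class */
   };
   agx_emit_parallel_copies(b, copies, 2);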


@@ -30,6 +30,9 @@ agx_print_sized(char prefix, unsigned value, enum agx_size size, FILE *fp)
static void
agx_print_index(agx_index index, bool is_float, FILE *fp)
{
if (index.memory)
fprintf(fp, "m");
switch (index.type) {
case AGX_INDEX_NULL:
fprintf(fp, "_");
@@ -75,6 +78,8 @@ agx_print_index(agx_index index, bool is_float, FILE *fp)
fprintf(fp, "...");
if (index.memory)
fprintf(fp, "m");
agx_print_sized('r', last, index.size, fp);
}
break;


@@ -3,6 +3,8 @@
* SPDX-License-Identifier: MIT
*/
#include "util/bitset.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_qsort.h"
#include "agx_builder.h"
@@ -10,21 +12,41 @@
#include "agx_compiler.h"
#include "agx_debug.h"
#include "agx_opcodes.h"
#include "agx_pack.h"
#include "shader_enums.h"
/* SSA-based register allocator */
enum ra_class {
/* General purpose register */
RA_GPR,
/* Memory, used to assign stack slots */
RA_MEM,
/* Keep last */
RA_CLASSES,
};
static inline enum ra_class
ra_class_for_index(agx_index idx)
{
return idx.memory ? RA_MEM : RA_GPR;
}
struct ra_ctx {
agx_context *shader;
agx_block *block;
agx_instr *instr;
uint8_t *ssa_to_reg;
uint16_t *ssa_to_reg;
uint8_t *ncomps;
enum agx_size *sizes;
enum ra_class *classes;
BITSET_WORD *visited;
BITSET_WORD *used_regs;
BITSET_WORD *used_regs[RA_CLASSES];
/* Maintained while assigning registers */
unsigned *max_reg;
unsigned *max_reg[RA_CLASSES];
/* For affinities */
agx_instr **src_to_collect_phi;
@@ -32,11 +54,13 @@ struct ra_ctx {
/* If bit i of used_regs is set, and register i is the first consecutive
* register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
* value currently in register i.
*
* Only for GPRs. We can add reg classes later if we have a use case.
*/
uint32_t reg_to_ssa[AGX_NUM_REGS];
/* Maximum number of registers that RA is allowed to use */
unsigned bound;
unsigned bound[RA_CLASSES];
};
enum agx_size
@@ -58,12 +82,26 @@ agx_split_width(const agx_instr *I)
}
/*
* Calculate register demand in 16-bit registers. Because we allocate in SSA,
* this calculation is exact in linear-time. Depends on liveness information.
* Calculate register demand in 16-bit registers, while gathering widths and
* classes. Because we allocate in SSA, this calculation is exact in
* linear-time. Depends on liveness information.
*/
static unsigned
agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
agx_calc_register_demand(agx_context *ctx)
{
uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
agx_foreach_instr_global(ctx, I) {
agx_foreach_ssa_dest(I, d) {
unsigned v = I->dest[d].value;
assert(widths[v] == 0 && "broken SSA");
/* Round up vectors for easier live range splitting */
widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
classes[v] = ra_class_for_index(I->dest[d]);
}
}
/* Calculate demand at the start of each block based on live-in, then update
* for each instruction processed. Calculate rolling maximum.
*/
@@ -82,7 +120,8 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
{
int i;
BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
demand += widths[i];
if (classes[i] == RA_GPR)
demand += widths[i];
}
}
@@ -110,6 +149,8 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
if (!I->src[s].kill)
continue;
assert(I->src[s].type == AGX_INDEX_NORMAL);
if (ra_class_for_index(I->src[s]) != RA_GPR)
continue;
bool skip = false;
@@ -125,10 +166,9 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
}
/* Make destinations live */
agx_foreach_dest(I, d) {
if (agx_is_null(I->dest[d]))
agx_foreach_ssa_dest(I, d) {
if (ra_class_for_index(I->dest[d]) != RA_GPR)
continue;
assert(I->dest[d].type == AGX_INDEX_NORMAL);
/* Live range splits allocate at power-of-two granularity. Round up
* destination sizes (temporarily) to powers-of-two.
@@ -146,15 +186,17 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
demand -= late_kill_count;
}
free(widths);
free(classes);
return max_demand;
}
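
As a concrete reading of the units (my example, not the commit's): demand is counted in
16-bit registers, so a program point where two 32-bit values and one 3-component 16-bit
vector (rounded up to 4 for live range splitting) are live in the GPR class contributes
2 + 2 + 4 = 8, while RA_MEM values contribute nothing to the GPR demand.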
static bool
find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
unsigned *out)
find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
unsigned align, unsigned *out)
{
for (unsigned reg = 0; reg + count <= rctx->bound; reg += align) {
if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1)) {
for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
*out = reg;
return true;
}
@@ -177,17 +219,18 @@ find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
* Postcondition: at least one register in the returned region is already free.
*/
static unsigned
find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
BITSET_WORD *already_evicted, BITSET_WORD *killed)
{
assert(util_is_power_of_two_or_zero(size) && "precondition");
assert((rctx->bound % size) == 0 &&
assert((rctx->bound[cls] % size) == 0 &&
"register file size must be aligned to the maximum vector size");
assert(cls == RA_GPR);
unsigned best_base = ~0;
unsigned best_moves = ~0;
for (unsigned base = 0; base + size <= rctx->bound; base += size) {
for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
/* r0l is unevictable, skip it. By itself, this does not pose a problem.
* We are allocating n registers, but the region containing r0l has at
* most n-1 free. Since there are at least n free registers total, there
@@ -215,7 +258,7 @@ find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
/* We need a move for each blocked register (TODO: we only need a
* single move for 32-bit pairs, could optimize to use that instead.)
*/
if (BITSET_TEST(rctx->used_regs, reg))
if (BITSET_TEST(rctx->used_regs[cls], reg))
moves++;
else
any_free = true;
@@ -239,7 +282,7 @@ find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
}
}
assert(best_base < rctx->bound &&
assert(best_base < rctx->bound[cls] &&
"not enough registers (should have spilled already)");
return best_base;
}
@@ -247,15 +290,22 @@ find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
static void
set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
{
*(rctx->max_reg) = MAX2(*(rctx->max_reg), reg + rctx->ncomps[ssa] - 1);
enum ra_class cls = rctx->classes[ssa];
*(rctx->max_reg[cls]) =
MAX2(*(rctx->max_reg[cls]), reg + rctx->ncomps[ssa] - 1);
rctx->ssa_to_reg[ssa] = reg;
}
static unsigned
assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
const agx_instr *I, struct util_dynarray *copies,
BITSET_WORD *clobbered, BITSET_WORD *killed)
BITSET_WORD *clobbered, BITSET_WORD *killed,
enum ra_class cls)
{
assert(cls == RA_GPR);
/* XXX: This needs some special handling but so far it has been prohibitively
* difficult to hit the case
*/
@@ -273,14 +323,15 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
* shuffle some variables around. Look for a range of the register file
* that is partially blocked.
*/
unsigned base = find_best_region_to_evict(rctx, count, clobbered, killed);
unsigned base =
find_best_region_to_evict(rctx, cls, count, clobbered, killed);
assert(count <= 16 && "max allocation size (conservative)");
BITSET_DECLARE(evict_set, 16) = {0};
/* Store the set of blocking registers that need to be evicted */
for (unsigned i = 0; i < count; ++i) {
if (BITSET_TEST(rctx->used_regs, base + i)) {
if (BITSET_TEST(rctx->used_regs[cls], base + i)) {
BITSET_SET(evict_set, i);
}
}
@@ -288,7 +339,7 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
/* We are going to allocate the destination to this range, so it is now fully
* used. Mark it as such so we don't reassign here later.
*/
BITSET_SET_RANGE(rctx->used_regs, base, base + count - 1);
BITSET_SET_RANGE(rctx->used_regs[cls], base, base + count - 1);
/* Before overwriting the range, we need to evict blocked variables */
for (unsigned i = 0; i < 16; ++i) {
@@ -315,11 +366,13 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
* recursion because nr is decreasing because of the gap.
*/
assert(nr < count && "fully contained in range that's not full");
unsigned new_reg =
assign_regs_by_copying(rctx, nr, align, I, copies, clobbered, killed);
unsigned new_reg = assign_regs_by_copying(rctx, nr, align, I, copies,
clobbered, killed, cls);
/* Copy the variable over, register by register */
for (unsigned i = 0; i < nr; i += align) {
assert(cls == RA_GPR);
struct agx_copy copy = {
.dest = new_reg + i,
.src = agx_register(reg + i, rctx->sizes[ssa]),
@@ -338,6 +391,7 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);
/* Update bookkeeping for this variable */
assert(cls == rctx->classes[ssa]);
set_ssa_to_reg(rctx, ssa, new_reg);
rctx->reg_to_ssa[new_reg] = ssa;
@@ -348,8 +402,10 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
/* We overallocated for non-power-of-two vectors. Free up the excess now.
* This is modelled as late kill in demand calculation.
*/
if (npot_count != count)
BITSET_CLEAR_RANGE(rctx->used_regs, base + npot_count, base + count - 1);
if (npot_count != count) {
BITSET_CLEAR_RANGE(rctx->used_regs[cls], base + npot_count,
base + count - 1);
}
return base;
}
@@ -401,7 +457,9 @@ insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
agx_foreach_ssa_src(I, s) {
unsigned reg = rctx->ssa_to_reg[I->src[s].value];
if (I->src[s].kill && BITSET_TEST(clobbered, reg)) {
if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
BITSET_TEST(clobbered, reg)) {
assert(nr_vars < ARRAY_SIZE(vars) &&
"cannot clobber more than max variable size");
@@ -433,6 +491,7 @@ insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
unsigned var_count = rctx->ncomps[var];
unsigned var_align = agx_size_align_16(rctx->sizes[var]);
assert(rctx->classes[var] == RA_GPR && "construction");
assert((base % var_align) == 0 && "induction");
assert((var_count % var_align) == 0 && "no partial variables");
@@ -461,9 +520,13 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
unsigned reg;
assert(count == align);
if (find_regs_simple(rctx, count, align, &reg)) {
enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);
if (find_regs_simple(rctx, cls, count, align, &reg)) {
return reg;
} else {
assert(cls == RA_GPR && "no memory live range splits");
BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
struct util_dynarray copies = {0};
@@ -481,7 +544,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
}
reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
killed);
killed, cls);
insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
clobbered);
@@ -491,7 +554,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
&b, copies.data, util_dynarray_num_elements(&copies, struct agx_copy));
/* assign_regs asserts this is cleared, so clear to be reassigned */
BITSET_CLEAR_RANGE(rctx->used_regs, reg, reg + count - 1);
BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
return reg;
}
}
@@ -545,10 +608,13 @@ reserve_live_in(struct ra_ctx *rctx)
*/
assert(rctx->block->loop_header);
phi->src[pred_idx] = agx_get_index(i, size);
phi->src[pred_idx].memory = rctx->classes[i] == RA_MEM;
} else {
/* Otherwise, we can build the phi now */
unsigned reg = (*pred)->ssa_to_reg_out[i];
phi->src[pred_idx] = agx_register(reg, size);
phi->src[pred_idx] = rctx->classes[i] == RA_MEM
? agx_memory_register(reg, size)
: agx_register(reg, size);
}
}
@@ -567,11 +633,14 @@ reserve_live_in(struct ra_ctx *rctx)
base = (*pred)->ssa_to_reg_out[i];
}
enum ra_class cls = rctx->classes[i];
set_ssa_to_reg(rctx, i, base);
for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
BITSET_SET(rctx->used_regs, base + j);
rctx->reg_to_ssa[base + j] = i;
BITSET_SET(rctx->used_regs[cls], base + j);
if (cls == RA_GPR)
rctx->reg_to_ssa[base + j] = i;
}
}
}
@@ -579,7 +648,8 @@ reserve_live_in(struct ra_ctx *rctx)
static void
assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
{
assert(reg < rctx->bound && "must not overflow register file");
enum ra_class cls = ra_class_for_index(v);
assert(reg < rctx->bound[cls] && "must not overflow register file");
assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
set_ssa_to_reg(rctx, v.value, reg);
@@ -589,10 +659,12 @@ assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
assert(rctx->ncomps[v.value] >= 1);
unsigned end = reg + rctx->ncomps[v.value] - 1;
assert(!BITSET_TEST_RANGE(rctx->used_regs, reg, end) && "no interference");
BITSET_SET_RANGE(rctx->used_regs, reg, end);
assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
"no interference");
BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);
rctx->reg_to_ssa[reg] = v.value;
if (cls == RA_GPR)
rctx->reg_to_ssa[reg] = v.value;
}
static void
@@ -641,10 +713,12 @@ try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
}
unsigned base = rctx->ssa_to_reg[ssa.value];
if (BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
enum ra_class cls = ra_class_for_index(ssa);
if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
return false;
assert(base + count <= rctx->bound && "invariant");
assert(base + count <= rctx->bound[cls] && "invariant");
*out = base;
return true;
}
@@ -653,6 +727,7 @@ static unsigned
pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
{
agx_index idx = I->dest[d];
enum ra_class cls = ra_class_for_index(idx);
assert(idx.type == AGX_INDEX_NORMAL);
unsigned count = rctx->ncomps[idx.value];
@@ -680,14 +755,14 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
"and this is not a phi node, so we have assigned a register");
unsigned base = affinity_base_of_collect(rctx, I, s);
if (base >= rctx->bound || (base + count) > rctx->bound)
if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
continue;
/* Unaligned destinations can happen when dest size > src size */
if (base % align)
continue;
if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
return base;
}
}
@@ -719,17 +794,18 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
/* Determine where the collect should start relative to the source */
unsigned base = affinity_base_of_collect(rctx, collect, s);
if (base >= rctx->bound)
if (base >= rctx->bound[cls])
continue;
unsigned our_reg = base + (our_source * align);
/* Don't allocate past the end of the register file */
if ((our_reg + align) > rctx->bound)
if ((our_reg + align) > rctx->bound[cls])
continue;
/* If those registers are free, then choose them */
if (!BITSET_TEST_RANGE(rctx->used_regs, our_reg, our_reg + align - 1))
if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
our_reg + align - 1))
return our_reg;
}
@@ -739,9 +815,10 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
/* Prefer ranges of the register file that leave room for all sources of
* the collect contiguously.
*/
for (unsigned base = 0; base + (collect->nr_srcs * align) <= rctx->bound;
for (unsigned base = 0;
base + (collect->nr_srcs * align) <= rctx->bound[cls];
base += collect_align) {
if (!BITSET_TEST_RANGE(rctx->used_regs, base,
if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
base + (collect->nr_srcs * align) - 1))
return base + offset;
}
@@ -751,9 +828,9 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
* for a register for the source such that the collect base is aligned.
*/
if (collect_align > align) {
for (unsigned reg = offset; reg + collect_align <= rctx->bound;
for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
reg += collect_align) {
if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1))
if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
return reg;
}
}
@@ -773,7 +850,7 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
if (phi->dest[0].type == AGX_INDEX_REGISTER) {
unsigned base = phi->dest[0].value;
if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
return base;
}
}
@@ -787,12 +864,14 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
static void
agx_ra_assign_local(struct ra_ctx *rctx)
{
BITSET_DECLARE(used_regs, AGX_NUM_REGS) = {0};
uint8_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint8_t));
BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
agx_block *block = rctx->block;
uint8_t *ncomps = rctx->ncomps;
rctx->used_regs = used_regs;
rctx->used_regs[RA_GPR] = used_regs_gpr;
rctx->used_regs[RA_MEM] = used_regs_mem;
rctx->ssa_to_reg = ssa_to_reg;
reserve_live_in(rctx);
@@ -801,7 +880,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
* This could be optimized (sync with agx_calc_register_demand).
*/
if (rctx->shader->any_cf)
BITSET_SET(used_regs, 0);
BITSET_SET(used_regs_gpr, 0);
agx_foreach_instr_in_block(block, I) {
rctx->instr = I;
@@ -810,13 +889,17 @@ agx_ra_assign_local(struct ra_ctx *rctx)
* can be removed by assigning the destinations overlapping the source.
*/
if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
assert(ra_class_for_index(I->src[0]) == RA_GPR);
unsigned reg = ssa_to_reg[I->src[0].value];
unsigned width = agx_size_align_16(agx_split_width(I));
agx_foreach_dest(I, d) {
assert(ra_class_for_index(I->dest[0]) == RA_GPR);
/* Free up the source */
unsigned offset_reg = reg + (d * width);
BITSET_CLEAR_RANGE(used_regs, offset_reg, offset_reg + width - 1);
BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
offset_reg + width - 1);
/* Assign the destination where the source was */
if (!agx_is_null(I->dest[d]))
@@ -826,7 +909,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
unsigned excess =
rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
if (excess) {
BITSET_CLEAR_RANGE(used_regs, reg + (I->nr_dests * width),
BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
reg + rctx->ncomps[I->src[0].value] - 1);
}
@@ -846,11 +929,12 @@ agx_ra_assign_local(struct ra_ctx *rctx)
/* First, free killed sources */
agx_foreach_ssa_src(I, s) {
if (I->src[s].kill) {
enum ra_class cls = ra_class_for_index(I->src[s]);
unsigned reg = ssa_to_reg[I->src[s].value];
unsigned count = ncomps[I->src[s].value];
assert(count >= 1);
BITSET_CLEAR_RANGE(used_regs, reg, reg + count - 1);
BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
}
}
@@ -883,7 +967,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
agx_replace_src(
phi, pred_idx,
agx_register(rctx->ssa_to_reg[value], phi->src[pred_idx].size));
agx_register_like(rctx->ssa_to_reg[value], phi->src[pred_idx]));
}
}
}
@@ -941,6 +1025,7 @@ agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
copies[i++] = (struct agx_copy){
.dest = dest.value,
.dest_mem = dest.memory,
.src = src,
};
}
@@ -951,49 +1036,89 @@ agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
}
}
void
agx_ra(agx_context *ctx)
static inline agx_index
agx_index_as_mem(agx_index idx, unsigned mem_base)
{
agx_compute_liveness(ctx);
uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
unsigned max_ncomps = 1;
assert(idx.type == AGX_INDEX_NORMAL);
assert(!idx.memory);
idx.memory = true;
idx.value = mem_base + idx.value;
return idx;
}
agx_foreach_instr_global(ctx, I) {
/* Record collects/phis so we can coalesce when assigning */
if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
agx_foreach_ssa_src(I, s) {
src_to_collect_phi[I->src[s].value] = I;
/*
* Spill everything to the stack, trivially. For debugging spilling.
*
* Only phis and stack moves can access memory variables.
*/
static void
agx_spill_everything(agx_context *ctx)
{
/* Immediates and uniforms are not allowed to be spilled, so they cannot
* appear in phi webs. Lower them first.
*/
agx_foreach_block(ctx, block) {
agx_block **preds = util_dynarray_begin(&block->predecessors);
agx_foreach_phi_in_block(block, phi) {
agx_foreach_src(phi, s) {
if (phi->src[s].type == AGX_INDEX_IMMEDIATE ||
phi->src[s].type == AGX_INDEX_UNIFORM) {
agx_builder b =
agx_init_builder(ctx, agx_after_block_logical(preds[s]));
agx_index temp = agx_temp(ctx, phi->dest[0].size);
if (phi->src[s].type == AGX_INDEX_IMMEDIATE)
agx_mov_imm_to(&b, temp, phi->src[s].value);
else
agx_mov_to(&b, temp, phi->src[s]);
agx_replace_src(phi, s, temp);
}
}
}
agx_foreach_ssa_dest(I, d) {
unsigned v = I->dest[d].value;
assert(ncomps[v] == 0 && "broken SSA");
/* Round up vectors for easier live range splitting */
ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
sizes[v] = I->dest[d].size;
max_ncomps = MAX2(max_ncomps, ncomps[v]);
}
}
/* For live range splitting to work properly, ensure the register file is
* aligned to the larger vector size. Most of the time, this is a no-op since
* the largest vector size is usually 128-bit and the register file is
* naturally 128-bit aligned. However, this is required for correctness with
* 3D textureGrad, which can have a source vector of length 6x32-bit,
* rounding up to 256-bit and requiring special accounting here.
*/
unsigned reg_file_alignment = MAX2(max_ncomps, 8);
assert(util_is_power_of_two_nonzero(reg_file_alignment));
/* Now we can spill everything */
unsigned mem_base = ctx->alloc;
ctx->alloc = mem_base + ctx->alloc;
/* Calculate the demand and use it to bound register assignment */
unsigned demand =
ALIGN_POT(agx_calc_register_demand(ctx, ncomps), reg_file_alignment);
agx_foreach_instr_global_safe(ctx, I) {
if (I->op == AGX_OPCODE_PHI) {
agx_foreach_ssa_dest(I, d) {
I->dest[d] = agx_replace_index(
I->dest[d], agx_index_as_mem(I->dest[d], mem_base));
}
agx_foreach_ssa_src(I, s) {
agx_replace_src(I, s, agx_index_as_mem(I->src[s], mem_base));
}
} else {
agx_builder b = agx_init_builder(ctx, agx_before_instr(I));
agx_foreach_ssa_src(I, s) {
agx_index fill =
agx_vec_temp(ctx, I->src[s].size, agx_channels(I->src[s]));
agx_mov_to(&b, fill, agx_index_as_mem(I->src[s], mem_base));
agx_replace_src(I, s, fill);
}
agx_foreach_ssa_dest(I, d) {
agx_builder b = agx_init_builder(ctx, agx_after_instr(I));
agx_mov_to(&b, agx_index_as_mem(I->dest[d], mem_base), I->dest[d]);
}
}
}
agx_validate(ctx, "Trivial spill");
}
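
To pin down the renumbering (assuming, for illustration, that ctx->alloc was N before
spilling): agx_index_as_mem gives SSA value i a memory twin at index N + i, so each
pre-existing value gets exactly one memory variable, and the fills and spills inserted
above are plain moves between a value's fresh GPR temporary and its memory twin.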
void
agx_ra(agx_context *ctx)
{
/* Determine maximum possible registers. We won't exceed this! */
unsigned max_possible_regs = AGX_NUM_REGS;
/* Compute shaders need to have their entire workgroup together, so our
@@ -1017,19 +1142,71 @@ agx_ra(agx_context *ctx)
agx_max_registers_for_occupancy(threads_per_workgroup);
}
/* TODO: Spilling. Abort so we don't smash the stack in release builds. */
if (demand > max_possible_regs) {
fprintf(stderr, "\n");
fprintf(stderr, "------------------------------------------------\n");
fprintf(stderr, "Asahi Linux shader compiler limitation!\n");
fprintf(stderr, "We ran out of registers! Nyaaaa 😿\n");
fprintf(stderr, "Do not report this as a bug.\n");
fprintf(stderr, "We know -- we're working on it!\n");
fprintf(stderr, "------------------------------------------------\n");
fprintf(stderr, "\n");
abort();
/* Calculate the demand. We'll use it to determine if we need to spill and to
* bound register assignment.
*/
agx_compute_liveness(ctx);
unsigned effective_demand = agx_calc_register_demand(ctx);
bool spilling = (effective_demand > max_possible_regs);
if (spilling) {
assert(ctx->key->has_scratch && "internal shaders are unspillable");
agx_spill_everything(ctx);
/* After spilling, recalculate liveness and demand */
agx_compute_liveness(ctx);
effective_demand = agx_calc_register_demand(ctx);
/* The resulting program can now be assigned registers */
assert(effective_demand <= max_possible_regs && "spiller post-condition");
}
uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
unsigned max_ncomps = 1;
agx_foreach_instr_global(ctx, I) {
/* Record collects/phis so we can coalesce when assigning */
if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
agx_foreach_ssa_src(I, s) {
src_to_collect_phi[I->src[s].value] = I;
}
}
agx_foreach_ssa_dest(I, d) {
unsigned v = I->dest[d].value;
assert(ncomps[v] == 0 && "broken SSA");
/* Round up vectors for easier live range splitting */
ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
sizes[v] = I->dest[d].size;
classes[v] = ra_class_for_index(I->dest[d]);
max_ncomps = MAX2(max_ncomps, ncomps[v]);
}
}
/* For live range splitting to work properly, ensure the register file is
* aligned to the larger vector size. Most of the time, this is a no-op since
* the largest vector size is usually 128-bit and the register file is
* naturally 128-bit aligned. However, this is required for correctness with
* 3D textureGrad, which can have a source vector of length 6x32-bit,
* rounding up to 256-bit and requiring special accounting here.
*/
unsigned reg_file_alignment = MAX2(max_ncomps, 8);
assert(util_is_power_of_two_nonzero(reg_file_alignment));
if (spilling) {
/* We need to allocate scratch registers for lowering spilling later */
effective_demand = MAX2(effective_demand, 6 * 2 /* preloading */);
effective_demand += reg_file_alignment;
}
unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
assert(demand <= max_possible_regs && "Invariant");
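
For example (my arithmetic): with an effective demand of 10 half-registers and a
128-bit-aligned register file (reg_file_alignment = 8), the spilling path bumps the
demand to MAX2(10, 12) = 12 for vertex preloading, adds 8 half-registers of GPR scratch
for the later spill/copy lowering, and ALIGN_POT(20, 8) rounds the final bound to 24.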
/* Round up the demand to the maximum number of registers we can use without
* affecting occupancy. This reduces live range splitting.
*/
@@ -1045,6 +1222,8 @@ agx_ra(agx_context *ctx)
assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
assert(max_regs <= max_possible_regs);
unsigned max_mem_slot = 0;
/* Assign registers in dominance-order. This coincides with source-order due
* to a NIR invariant, so we do not need special handling for this.
*/
@@ -1055,12 +1234,20 @@ agx_ra(agx_context *ctx)
.src_to_collect_phi = src_to_collect_phi,
.ncomps = ncomps,
.sizes = sizes,
.classes = classes,
.visited = visited,
.bound = max_regs,
.max_reg = &ctx->max_reg,
.bound[RA_GPR] = max_regs,
.bound[RA_MEM] = AGX_NUM_MODELED_REGS,
.max_reg[RA_GPR] = &ctx->max_reg,
.max_reg[RA_MEM] = &max_mem_slot,
});
}
if (spilling) {
ctx->spill_base = ctx->scratch_size;
ctx->scratch_size += (max_mem_slot + 1) * 2;
}
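
For example (illustrative numbers): if the highest memory register element RA assigned
was index 7, max_mem_slot is 7 and (7 + 1) * 2 = 16 bytes of scratch are reserved for
spill slots starting at spill_base, sitting above the NIR-level scratch already counted
in scratch_size.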
/* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
* doesn't use them. Account for that so the preload doesn't clobber GPRs.
*/
@@ -1075,6 +1262,8 @@ agx_ra(agx_context *ctx)
if (ins->op == AGX_OPCODE_COLLECT) {
assert(ins->dest[0].type == AGX_INDEX_REGISTER);
assert(!ins->dest[0].memory);
unsigned base = ins->dest[0].value;
unsigned width = agx_size_align_16(ins->src[0].size);
@@ -1111,6 +1300,8 @@ agx_ra(agx_context *ctx)
if (ins->dest[i].type != AGX_INDEX_REGISTER)
continue;
assert(!ins->dest[i].memory);
agx_index src = ins->src[0];
src.size = ins->dest[i].size;
src.channels_m1 = 0;
@@ -1147,7 +1338,8 @@ agx_ra(agx_context *ctx)
case AGX_OPCODE_MOV:
if (I->src[0].type == AGX_INDEX_REGISTER &&
I->dest[0].size == I->src[0].size &&
I->src[0].value == I->dest[0].value) {
I->src[0].value == I->dest[0].value &&
I->src[0].memory == I->dest[0].memory) {
assert(I->dest[0].type == AGX_INDEX_REGISTER);
agx_remove_instruction(I);
@@ -1167,5 +1359,6 @@ agx_ra(agx_context *ctx)
free(src_to_collect_phi);
free(ncomps);
free(sizes);
free(classes);
free(visited);
}


@@ -6,6 +6,7 @@
#include "agx_compiler.h"
#include "agx_debug.h"
#include "agx_opcodes.h"
/* Validation doesn't make sense in release builds */
#ifndef NDEBUG
@@ -67,6 +68,16 @@ agx_validate_block_form(agx_block *block)
return true;
}
/*
* Only moves and phis use the stack. Phis cannot use moves due to their
* parallel nature, so we allow phis to take memory operands, which are later
* lowered to moves.
*/
static bool
is_stack_valid(agx_instr *I)
{
return (I->op == AGX_OPCODE_MOV) || (I->op == AGX_OPCODE_PHI);
}
static bool
agx_validate_sources(agx_instr *I)
{
@@ -91,6 +102,8 @@ agx_validate_sources(agx_instr *I)
} else if (I->op == AGX_OPCODE_COLLECT && !agx_is_null(src)) {
agx_validate_assert(src.size == I->src[0].size);
}
agx_validate_assert(!src.memory || is_stack_valid(I));
}
return true;
@@ -115,6 +128,9 @@ agx_validate_defs(agx_instr *I, BITSET_WORD *defs)
return false;
BITSET_SET(defs, I->dest[d].value);
if (I->dest[d].memory && !is_stack_valid(I))
return false;
}
return true;
@@ -127,6 +143,10 @@ agx_write_registers(const agx_instr *I, unsigned d)
unsigned size = agx_size_align_16(I->dest[d].size);
switch (I->op) {
case AGX_OPCODE_MOV:
/* Tautological */
return agx_index_size_16(I->dest[d]);
case AGX_OPCODE_ITER:
case AGX_OPCODE_ITERPROJ:
assert(1 <= I->channels && I->channels <= 4);
@@ -194,6 +214,10 @@ agx_read_registers(const agx_instr *I, unsigned s)
unsigned size = agx_size_align_16(I->src[s].size);
switch (I->op) {
case AGX_OPCODE_MOV:
/* Tautological */
return agx_index_size_16(I->src[0]);
case AGX_OPCODE_SPLIT:
return I->nr_dests * agx_size_align_16(agx_split_width(I));