diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h
index 3075ba3f857..ad3e7b386a6 100644
--- a/src/asahi/compiler/agx_compiler.h
+++ b/src/asahi/compiler/agx_compiler.h
@@ -26,6 +26,9 @@ extern "C" {
 /* u0-u255 inclusive, as pairs of 16-bits */
 #define AGX_NUM_UNIFORMS (512)
 
+/* Semi-arbitrary limit for spill slot allocation */
+#define AGX_NUM_MODELED_REGS (2048)
+
 enum agx_index_type {
    AGX_INDEX_NULL = 0,
    AGX_INDEX_NORMAL = 1,
@@ -71,6 +74,9 @@ typedef struct {
    bool abs : 1;
    bool neg : 1;
 
+   /* Register class */
+   bool memory : 1;
+
    unsigned channels_m1 : 3;
    enum agx_size size : 2;
    enum agx_index_type type : 3;
@@ -139,12 +145,22 @@ agx_register(uint32_t imm, enum agx_size size)
 }
 
 static inline agx_index
-agx_register_like(uint32_t imm, agx_index like)
+agx_memory_register(uint32_t imm, enum agx_size size)
 {
-   assert(imm < AGX_NUM_REGS);
-
    return (agx_index){
       .value = imm,
+      .memory = true,
+      .size = size,
+      .type = AGX_INDEX_REGISTER,
+   };
+}
+
+static inline agx_index
+agx_register_like(uint32_t imm, agx_index like)
+{
+   return (agx_index){
+      .value = imm,
+      .memory = like.memory,
       .channels_m1 = like.channels_m1,
       .size = like.size,
       .type = AGX_INDEX_REGISTER,
@@ -398,7 +414,7 @@ typedef struct agx_block {
    /* For visited blocks during register assignment and live-out registers, the
    * mapping of SSA names to registers at the end of the block.
    */
-   uint8_t *ssa_to_reg_out;
+   uint16_t *ssa_to_reg_out;
 
    /* Is this block a loop header? If not, all of its predecessors precede it in
    * source order.
@@ -464,6 +480,15 @@ typedef struct {
    */
   agx_index vertex_id, instance_id;
 
+   /* Beginning of our stack allocation used for spilling, below that is
+    * NIR-level scratch.
+    */
+   unsigned spill_base;
+
+   /* Beginning of stack allocation used for parallel copy lowering */
+   bool has_spill_pcopy_reserved;
+   unsigned spill_pcopy_base;
+
    /* Stats for shader-db */
    unsigned loop_count;
    unsigned spills;
@@ -883,6 +908,9 @@ struct agx_copy {
    /* Base register destination of the copy */
    unsigned dest;
 
+   /* Destination is memory */
+   bool dest_mem;
+
    /* Source of the copy */
    agx_index src;
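Illustrative sketch (not part of the patch): what the new `memory` bit means in practice. The struct below is a reduced, hypothetical stand-in for agx_index with only the fields relevant here; real indices carry more state, and the real constructors are agx_register()/agx_memory_register() above. Memory "registers" use the same 16-bit numbering as GPRs but live in a separate namespace bounded by AGX_NUM_MODELED_REGS.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum mini_size { MINI_SIZE_16 = 0, MINI_SIZE_32 = 1 };

struct mini_index {
   uint32_t value;      /* register or slot number, in 16-bit units */
   bool memory;         /* register class: false = GPR, true = spill slot */
   enum mini_size size;
};

/* Analogous to agx_register(): a GPR of the given size */
static struct mini_index
mini_register(uint32_t reg, enum mini_size size)
{
   return (struct mini_index){.value = reg, .memory = false, .size = size};
}

/* Analogous to agx_memory_register(): same numbering, different class */
static struct mini_index
mini_memory_register(uint32_t slot, enum mini_size size)
{
   return (struct mini_index){.value = slot, .memory = true, .size = size};
}

int
main(void)
{
   struct mini_index r4 = mini_register(4, MINI_SIZE_16);
   struct mini_index m4 = mini_memory_register(4, MINI_SIZE_16);

   /* Same number and size, but they never alias: the class separates them */
   assert(r4.value == m4.value && r4.size == m4.size);
   assert(!r4.memory && m4.memory);
   return 0;
}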
diff --git a/src/asahi/compiler/agx_lower_parallel_copy.c b/src/asahi/compiler/agx_lower_parallel_copy.c
index a7c271f850f..ea787770e79 100644
--- a/src/asahi/compiler/agx_lower_parallel_copy.c
+++ b/src/asahi/compiler/agx_lower_parallel_copy.c
@@ -22,15 +22,47 @@
  * We only handles register-register copies, not general agx_index sources. This
  * suffices for its internal use for register allocation.
  */
+
+static agx_index
+scratch_slot(agx_context *ctx, enum agx_size size)
+{
+   /* Reserve scratch slots. scratch_size is in bytes, spill_pcopy_base and
+    * agx_memory_register are in memory registers (16-bit elements).
+    */
+   if (!ctx->has_spill_pcopy_reserved) {
+      ctx->scratch_size = align(ctx->scratch_size, 16);
+      ctx->spill_pcopy_base = ctx->scratch_size / 2;
+      ctx->scratch_size += 8;
+      ctx->has_spill_pcopy_reserved = true;
+   }
+
+   return agx_memory_register(ctx->spill_pcopy_base, size);
+}
+
 static void
 do_copy(agx_builder *b, const struct agx_copy *copy)
 {
-   agx_index dst = agx_register(copy->dest, copy->src.size);
+   agx_index dst = copy->dest_mem
+                      ? agx_memory_register(copy->dest, copy->src.size)
+                      : agx_register(copy->dest, copy->src.size);
 
-   if (copy->src.type == AGX_INDEX_IMMEDIATE)
+   if (copy->dest_mem && copy->src.memory) {
+      /* Memory-memory copies need to be lowered to memory-register and
+       * register-memory, spilling a GPR to an auxiliary memory slot. This
+       * avoids needing to reserve a scratch register for this edge case.
+       */
+      agx_index scratch_reg = agx_register(0, copy->src.size);
+      agx_index scratch_mem = scratch_slot(b->shader, copy->src.size);
+
+      agx_mov_to(b, scratch_mem, scratch_reg);
+      agx_mov_to(b, scratch_reg, copy->src);
+      agx_mov_to(b, dst, scratch_reg);
+      agx_mov_to(b, scratch_reg, scratch_mem);
+   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
       agx_mov_imm_to(b, dst, copy->src.value);
-   else
+   } else {
       agx_mov_to(b, dst, copy->src);
+   }
 }
 
 static void
@@ -43,7 +75,7 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
 
    /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
    if (copy->src.size == AGX_SIZE_16 &&
-       (copy->dest >> 1) == (copy->src.value >> 1)) {
+       (copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) {
 
       assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
              "no trivial swaps, and only 2 halves of a register");
@@ -58,9 +90,32 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
       return;
    }
 
-   agx_index x = agx_register(copy->dest, copy->src.size);
+   agx_index x = copy->dest_mem
+                    ? agx_memory_register(copy->dest, copy->src.size)
+                    : agx_register(copy->dest, copy->src.size);
    agx_index y = copy->src;
 
+   /* Memory-memory swaps need to be lowered */
+   assert(x.memory == y.memory);
+   if (x.memory) {
+      agx_index temp1 = agx_register(0, copy->src.size);
+      agx_index temp2 = agx_register(2, copy->src.size);
+
+      agx_index scratch_reg2 = agx_register(0, copy->src.size);
+      agx_index scratch_mem2 = scratch_slot(b->shader, copy->src.size);
+      scratch_reg2.channels_m1++;
+      scratch_mem2.channels_m1++;
+
+      agx_mov_to(b, scratch_mem2, scratch_reg2);
+      agx_mov_to(b, temp1, x);
+      agx_mov_to(b, temp2, y);
+      agx_mov_to(b, y, temp1);
+      agx_mov_to(b, x, temp2);
+      agx_mov_to(b, scratch_reg2, scratch_mem2);
+      return;
+   }
+
+   /* Otherwise, we're swapping GPRs and fall back on an XOR swap. */
    agx_xor_to(b, x, x, y);
    agx_xor_to(b, y, x, y);
    agx_xor_to(b, x, x, y);
@@ -74,12 +129,12 @@ struct copy_ctx {
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
-   unsigned physreg_use_count[AGX_NUM_REGS];
+   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];
 
   /* For each physreg, the pending copy_entry that uses it as a dest. */
-   struct agx_copy *physreg_dest[AGX_NUM_REGS];
+   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];
 
-   struct agx_copy entries[AGX_NUM_REGS];
+   struct agx_copy entries[AGX_NUM_MODELED_REGS];
 };
 
 static bool
@@ -96,7 +151,8 @@ entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
 static bool
 is_real(struct agx_copy *entry)
 {
-   return entry->src.type == AGX_INDEX_REGISTER;
+   return entry->src.type == AGX_INDEX_REGISTER &&
+          entry->dest_mem == entry->src.memory;
 }
 
 /* TODO: Generalize to other bit sizes */
@@ -109,6 +165,7 @@ split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
    struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];
 
    new_entry->dest = entry->dest + 1;
+   new_entry->dest_mem = entry->dest_mem;
    new_entry->src = entry->src;
    new_entry->src.value += 1;
    new_entry->done = false;
@@ -117,9 +174,9 @@
    ctx->physreg_dest[entry->dest + 1] = new_entry;
 }
 
-void
-agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
-                         unsigned num_copies)
+static void
+agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
+                                   unsigned num_copies, bool cls)
 {
    /* First, lower away 64-bit copies to smaller chunks, since we don't have
    * 64-bit ALU so we always want to split.
@@ -130,6 +187,12 @@ agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
    for (unsigned i = 0; i < num_copies; ++i) {
       struct agx_copy copy = copies[i];
 
+      /* Filter by class */
+      if (copy.dest_mem != cls)
+         continue;
+
+      assert(copy.dest < AGX_NUM_MODELED_REGS);
+
       if (copy.src.size == AGX_SIZE_64) {
          copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;
@@ -354,3 +417,14 @@ agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
 
    free(copies2);
 }
+
+void
+agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
+                         unsigned num_copies)
+{
+   /* Emit copies for each register class separately because we don't have
+    * register class awareness in the parallel copy lowering data structure.
+    */
+   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
+   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
+}
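Illustrative sketch (not part of the patch): the move sequence do_copy() above emits for a memory-memory copy. Since no GPR is reserved for the lowering, r0 is temporarily parked in the pre-reserved scratch slot, used to bounce the value, then restored. The names gpr[], mem[] and SCRATCH_SLOT are hypothetical stand-ins for the register file, the spill area, and ctx->spill_pcopy_base.

#include <assert.h>
#include <stdint.h>

#define SCRATCH_SLOT 0 /* stands in for ctx->spill_pcopy_base */

static uint16_t gpr[256];  /* 16-bit GPR file */
static uint16_t mem[2048]; /* 16-bit spill slots */

static void
copy_mem_to_mem(unsigned dst_slot, unsigned src_slot)
{
   mem[SCRATCH_SLOT] = gpr[0]; /* agx_mov_to(b, scratch_mem, scratch_reg) */
   gpr[0] = mem[src_slot];     /* agx_mov_to(b, scratch_reg, copy->src)   */
   mem[dst_slot] = gpr[0];     /* agx_mov_to(b, dst, scratch_reg)         */
   gpr[0] = mem[SCRATCH_SLOT]; /* agx_mov_to(b, scratch_reg, scratch_mem) */
}

int
main(void)
{
   gpr[0] = 0xabcd; /* a live value that must survive the bounce */
   mem[7] = 42;

   copy_mem_to_mem(3, 7);

   assert(mem[3] == 42 && gpr[0] == 0xabcd);
   return 0;
}

The memory-memory swap path in do_swap() follows the same idea, except that it parks a two-register pair so both operands can be held in GPRs at once.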
diff --git a/src/asahi/compiler/agx_print.c b/src/asahi/compiler/agx_print.c
index a2e453d7f3c..8f71ccf9426 100644
--- a/src/asahi/compiler/agx_print.c
+++ b/src/asahi/compiler/agx_print.c
@@ -30,6 +30,9 @@ agx_print_sized(char prefix, unsigned value, enum agx_size size, FILE *fp)
 static void
 agx_print_index(agx_index index, bool is_float, FILE *fp)
 {
+   if (index.memory)
+      fprintf(fp, "m");
+
    switch (index.type) {
    case AGX_INDEX_NULL:
       fprintf(fp, "_");
@@ -75,6 +78,8 @@ agx_print_index(agx_index index, bool is_float, FILE *fp)
 
       fprintf(fp, "...");
 
+      if (index.memory)
+         fprintf(fp, "m");
       agx_print_sized('r', last, index.size, fp);
    }
    break;
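Illustrative sketch (not part of the patch): a toy model of the per-class bookkeeping the allocator below keeps. Each register class gets its own occupancy set and its own bound; the memory class is only limited by the modeled slot count. The names, array sizes, and the simple first-fit loop here are illustrative, not the real data structures (the allocator uses BITSET_* and alignment-aware search).

#include <assert.h>
#include <stdbool.h>

enum toy_class { TOY_GPR, TOY_MEM, TOY_CLASSES };

#define TOY_GPR_BOUND 256
#define TOY_MEM_BOUND 2048

static bool used[TOY_CLASSES][TOY_MEM_BOUND];
static const unsigned bound[TOY_CLASSES] = {TOY_GPR_BOUND, TOY_MEM_BOUND};

/* Mirrors ra_class_for_index(): the class is just the memory bit */
static enum toy_class
class_of(bool memory)
{
   return memory ? TOY_MEM : TOY_GPR;
}

/* First-fit search within one class, in the spirit of find_regs_simple() */
static bool
find_free(enum toy_class cls, unsigned count, unsigned *out)
{
   for (unsigned base = 0; base + count <= bound[cls]; base += count) {
      bool all_free = true;
      for (unsigned i = 0; i < count; ++i)
         all_free &= !used[cls][base + i];

      if (all_free) {
         for (unsigned i = 0; i < count; ++i)
            used[cls][base + i] = true;
         *out = base;
         return true;
      }
   }
   return false;
}

int
main(void)
{
   unsigned r, m;
   assert(find_free(class_of(false), 2, &r)); /* a 32-bit GPR pair   */
   assert(find_free(class_of(true), 2, &m));  /* a 32-bit spill slot */
   assert(r == 0 && m == 0);                  /* classes never collide */
   return 0;
}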
diff --git a/src/asahi/compiler/agx_register_allocate.c b/src/asahi/compiler/agx_register_allocate.c
index 75e542ba85a..39ffbe77bd8 100644
--- a/src/asahi/compiler/agx_register_allocate.c
+++ b/src/asahi/compiler/agx_register_allocate.c
@@ -3,6 +3,8 @@
 * SPDX-License-Identifier: MIT
 */
 
+#include "util/bitset.h"
+#include "util/macros.h"
 #include "util/u_dynarray.h"
 #include "util/u_qsort.h"
 #include "agx_builder.h"
@@ -10,21 +12,41 @@
 #include "agx_compiler.h"
 #include "agx_debug.h"
 #include "agx_opcodes.h"
+#include "agx_pack.h"
+#include "shader_enums.h"
 
 /* SSA-based register allocator */
 
+enum ra_class {
+   /* General purpose register */
+   RA_GPR,
+
+   /* Memory, used to assign stack slots */
+   RA_MEM,
+
+   /* Keep last */
+   RA_CLASSES,
+};
+
+static inline enum ra_class
+ra_class_for_index(agx_index idx)
+{
+   return idx.memory ? RA_MEM : RA_GPR;
+}
+
 struct ra_ctx {
    agx_context *shader;
   agx_block *block;
   agx_instr *instr;
-   uint8_t *ssa_to_reg;
+   uint16_t *ssa_to_reg;
   uint8_t *ncomps;
   enum agx_size *sizes;
+   enum ra_class *classes;
   BITSET_WORD *visited;
-   BITSET_WORD *used_regs;
+   BITSET_WORD *used_regs[RA_CLASSES];
 
   /* Maintained while assigning registers */
-   unsigned *max_reg;
+   unsigned *max_reg[RA_CLASSES];
 
   /* For affinities */
   agx_instr **src_to_collect_phi;
@@ -32,11 +54,13 @@ struct ra_ctx {
   /* If bit i of used_regs is set, and register i is the first consecutive
    * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
    * value currently in register i.
+    *
+    * Only for GPRs. We can add reg classes later if we have a use case.
    */
   uint32_t reg_to_ssa[AGX_NUM_REGS];
 
   /* Maximum number of registers that RA is allowed to use */
-   unsigned bound;
+   unsigned bound[RA_CLASSES];
 };
 
 enum agx_size
@@ -58,12 +82,26 @@ agx_split_width(const agx_instr *I)
 }
 
 /*
- * Calculate register demand in 16-bit registers. Becuase we allocate in SSA,
- * this calculation is exact in linear-time. Depends on liveness information.
+ * Calculate register demand in 16-bit registers, while gathering widths and
+ * classes. Because we allocate in SSA, this calculation is exact in
+ * linear-time. Depends on liveness information.
 */
 static unsigned
-agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
+agx_calc_register_demand(agx_context *ctx)
 {
+   uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
+   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
+
+   agx_foreach_instr_global(ctx, I) {
+      agx_foreach_ssa_dest(I, d) {
+         unsigned v = I->dest[d].value;
+         assert(widths[v] == 0 && "broken SSA");
+         /* Round up vectors for easier live range splitting */
+         widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
+         classes[v] = ra_class_for_index(I->dest[d]);
+      }
+   }
+
   /* Calculate demand at the start of each block based on live-in, then update
    * for each instruction processed. Calculate rolling maximum.
    */
@@ -82,7 +120,8 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
   {
      int i;
      BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
-         demand += widths[i];
+         if (classes[i] == RA_GPR)
+            demand += widths[i];
      }
   }
 
@@ -110,6 +149,8 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
         if (!I->src[s].kill)
            continue;
         assert(I->src[s].type == AGX_INDEX_NORMAL);
+         if (ra_class_for_index(I->src[s]) != RA_GPR)
+            continue;
 
         bool skip = false;
 
@@ -125,10 +166,9 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
      }
 
      /* Make destinations live */
-      agx_foreach_dest(I, d) {
-         if (agx_is_null(I->dest[d]))
+      agx_foreach_ssa_dest(I, d) {
+         if (ra_class_for_index(I->dest[d]) != RA_GPR)
            continue;
-         assert(I->dest[d].type == AGX_INDEX_NORMAL);
 
         /* Live range splits allocate at power-of-two granularity. Round up
         * destination sizes (temporarily) to powers-of-two.
@@ -146,15 +186,17 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
      demand -= late_kill_count;
   }
 
+   free(widths);
+   free(classes);
   return max_demand;
 }
 
 static bool
-find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
-                 unsigned *out)
+find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
+                 unsigned align, unsigned *out)
 {
-   for (unsigned reg = 0; reg + count <= rctx->bound; reg += align) {
-      if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1)) {
+   for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
+      if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
        *out = reg;
        return true;
      }
   }
@@ -177,17 +219,18 @@ find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
 * Postcondition: at least one register in the returned region is already free.
 */
 static unsigned
-find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
+find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
                          BITSET_WORD *already_evicted, BITSET_WORD *killed)
 {
   assert(util_is_power_of_two_or_zero(size) && "precondition");
-   assert((rctx->bound % size) == 0 &&
+   assert((rctx->bound[cls] % size) == 0 &&
          "register file size must be aligned to the maximum vector size");
+   assert(cls == RA_GPR);
 
   unsigned best_base = ~0;
   unsigned best_moves = ~0;
 
-   for (unsigned base = 0; base + size <= rctx->bound; base += size) {
+   for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
      /* r0l is unevictable, skip it. By itself, this does not pose a problem.
      * We are allocating n registers, but the region containing r0l has at
      * most n-1 free. Since there are at least n free registers total, there
@@ -215,7 +258,7 @@ find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
        /* We need a move for each blocked register (TODO: we only need a
        * single move for 32-bit pairs, could optimize to use that instead.)
        */
-         if (BITSET_TEST(rctx->used_regs, reg))
+         if (BITSET_TEST(rctx->used_regs[cls], reg))
           moves++;
        else
           any_free = true;
@@ -239,7 +282,7 @@
     }
   }
 
-   assert(best_base < rctx->bound &&
+   assert(best_base < rctx->bound[cls] &&
         "not enough registers (should have spilled already)");
   return best_base;
 }
@@ -247,15 +290,22 @@
 static void
 set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
 {
-   *(rctx->max_reg) = MAX2(*(rctx->max_reg), reg + rctx->ncomps[ssa] - 1);
+   enum ra_class cls = rctx->classes[ssa];
+
+   *(rctx->max_reg[cls]) =
+      MAX2(*(rctx->max_reg[cls]), reg + rctx->ncomps[ssa] - 1);
+
   rctx->ssa_to_reg[ssa] = reg;
 }
 
 static unsigned
 assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
                       const agx_instr *I, struct util_dynarray *copies,
-                       BITSET_WORD *clobbered, BITSET_WORD *killed)
+                       BITSET_WORD *clobbered, BITSET_WORD *killed,
+                       enum ra_class cls)
 {
+   assert(cls == RA_GPR);
+
   /* XXX: This needs some special handling but so far it has been prohibitively
    * difficult to hit the case
    */
@@ -273,14 +323,15 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
    * shuffle some variables around. Look for a range of the register file
    * that is partially blocked.
    */
-   unsigned base = find_best_region_to_evict(rctx, count, clobbered, killed);
+   unsigned base =
+      find_best_region_to_evict(rctx, cls, count, clobbered, killed);
 
   assert(count <= 16 && "max allocation size (conservative)");
   BITSET_DECLARE(evict_set, 16) = {0};
 
   /* Store the set of blocking registers that need to be evicted */
   for (unsigned i = 0; i < count; ++i) {
-      if (BITSET_TEST(rctx->used_regs, base + i)) {
+      if (BITSET_TEST(rctx->used_regs[cls], base + i)) {
        BITSET_SET(evict_set, i);
      }
   }
@@ -288,7 +339,7 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
   /* We are going to allocate the destination to this range, so it is now fully
    * used. Mark it as such so we don't reassign here later.
    */
-   BITSET_SET_RANGE(rctx->used_regs, base, base + count - 1);
+   BITSET_SET_RANGE(rctx->used_regs[cls], base, base + count - 1);
 
   /* Before overwriting the range, we need to evict blocked variables */
   for (unsigned i = 0; i < 16; ++i) {
@@ -315,11 +366,13 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
      * recursion because nr is decreasing because of the gap.
      */
      assert(nr < count && "fully contained in range that's not full");
-      unsigned new_reg =
-         assign_regs_by_copying(rctx, nr, align, I, copies, clobbered, killed);
+      unsigned new_reg = assign_regs_by_copying(rctx, nr, align, I, copies,
+                                                clobbered, killed, cls);
 
      /* Copy the variable over, register by register */
      for (unsigned i = 0; i < nr; i += align) {
+         assert(cls == RA_GPR);
+
        struct agx_copy copy = {
           .dest = new_reg + i,
           .src = agx_register(reg + i, rctx->sizes[ssa]),
@@ -338,6 +391,7 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
      BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);
 
      /* Update bookkeeping for this variable */
+      assert(cls == rctx->classes[cls]);
      set_ssa_to_reg(rctx, ssa, new_reg);
      rctx->reg_to_ssa[new_reg] = ssa;
 
@@ -348,8 +402,10 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
   /* We overallocated for non-power-of-two vectors. Free up the excess now.
    * This is modelled as late kill in demand calculation.
    */
-   if (npot_count != count)
-      BITSET_CLEAR_RANGE(rctx->used_regs, base + npot_count, base + count - 1);
+   if (npot_count != count) {
+      BITSET_CLEAR_RANGE(rctx->used_regs[cls], base + npot_count,
+                         base + count - 1);
+   }
 
   return base;
 }
@@ -401,7 +457,9 @@ insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
   agx_foreach_ssa_src(I, s) {
      unsigned reg = rctx->ssa_to_reg[I->src[s].value];
 
-      if (I->src[s].kill && BITSET_TEST(clobbered, reg)) {
+      if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
+          BITSET_TEST(clobbered, reg)) {
+
        assert(nr_vars < ARRAY_SIZE(vars) &&
               "cannot clobber more than max variable size");
 
@@ -433,6 +491,7 @@ insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
      unsigned var_count = rctx->ncomps[var];
      unsigned var_align = agx_size_align_16(rctx->sizes[var]);
 
+      assert(rctx->classes[var] == RA_GPR && "construction");
      assert((base % var_align) == 0 && "induction");
      assert((var_count % var_align) == 0 && "no partial variables");
 
@@ -461,9 +520,13 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
   unsigned reg;
   assert(count == align);
 
-   if (find_regs_simple(rctx, count, align, &reg)) {
+   enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);
+
+   if (find_regs_simple(rctx, cls, count, align, &reg)) {
      return reg;
   } else {
+      assert(cls == RA_GPR && "no memory live range splits");
+
      BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
      BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
      struct util_dynarray copies = {0};
@@ -481,7 +544,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
      }
 
      reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
-                                   killed);
+                                   killed, cls);
      insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
                                        clobbered);
 
@@ -491,7 +554,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
        &b, copies.data, util_dynarray_num_elements(&copies, struct agx_copy));
 
      /* assign_regs asserts this is cleared, so clear to be reassigned */
-      BITSET_CLEAR_RANGE(rctx->used_regs, reg, reg + count - 1);
+      BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
      return reg;
   }
 }
@@ -545,10 +608,13 @@ reserve_live_in(struct ra_ctx *rctx)
         */
        assert(rctx->block->loop_header);
        phi->src[pred_idx] = agx_get_index(i, size);
+         phi->src[pred_idx].memory = rctx->classes[i] == RA_MEM;
      } else {
        /* Otherwise, we can build the phi now */
        unsigned reg = (*pred)->ssa_to_reg_out[i];
-         phi->src[pred_idx] = agx_register(reg, size);
+         phi->src[pred_idx] = rctx->classes[i] == RA_MEM
+                                 ? agx_memory_register(reg, size)
+                                 : agx_register(reg, size);
      }
   }
 
@@ -567,11 +633,14 @@ reserve_live_in(struct ra_ctx *rctx)
        base = (*pred)->ssa_to_reg_out[i];
      }
 
+      enum ra_class cls = rctx->classes[i];
      set_ssa_to_reg(rctx, i, base);
 
      for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
-         BITSET_SET(rctx->used_regs, base + j);
-         rctx->reg_to_ssa[base + j] = i;
+         BITSET_SET(rctx->used_regs[cls], base + j);
+
+         if (cls == RA_GPR)
+            rctx->reg_to_ssa[base + j] = i;
      }
   }
 }
@@ -579,7 +648,8 @@
 static void
 assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
 {
-   assert(reg < rctx->bound && "must not overflow register file");
+   enum ra_class cls = ra_class_for_index(v);
+   assert(reg < rctx->bound[cls] && "must not overflow register file");
   assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
   set_ssa_to_reg(rctx, v.value, reg);
 
@@ -589,10 +659,12 @@ assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
   assert(rctx->ncomps[v.value] >= 1);
   unsigned end = reg + rctx->ncomps[v.value] - 1;
 
-   assert(!BITSET_TEST_RANGE(rctx->used_regs, reg, end) && "no interference");
-   BITSET_SET_RANGE(rctx->used_regs, reg, end);
+   assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
+          "no interference");
+   BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);
 
-   rctx->reg_to_ssa[reg] = v.value;
+   if (cls == RA_GPR)
+      rctx->reg_to_ssa[reg] = v.value;
 }
 
 static void
@@ -641,10 +713,12 @@ try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
   }
 
   unsigned base = rctx->ssa_to_reg[ssa.value];
-   if (BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+   enum ra_class cls = ra_class_for_index(ssa);
+
+   if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
      return false;
 
-   assert(base + count <= rctx->bound && "invariant");
+   assert(base + count <= rctx->bound[cls] && "invariant");
   *out = base;
   return true;
 }
@@ -653,6 +727,7 @@ static unsigned
 pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
 {
   agx_index idx = I->dest[d];
+   enum ra_class cls = ra_class_for_index(idx);
   assert(idx.type == AGX_INDEX_NORMAL);
 
   unsigned count = rctx->ncomps[idx.value];
@@ -680,14 +755,14 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
               "and this is not a phi node, so we have assigned a register");
 
        unsigned base = affinity_base_of_collect(rctx, I, s);
-        if (base >= rctx->bound || (base + count) > rctx->bound)
+        if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
           continue;
 
        /* Unaligned destinations can happen when dest size > src size */
        if (base % align)
           continue;
 
-        if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+        if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
           return base;
      }
   }
@@ -719,17 +794,18 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
        /* Determine where the collect should start relative to the source */
        unsigned base = affinity_base_of_collect(rctx, collect, s);
-        if (base >= rctx->bound)
+        if (base >= rctx->bound[cls])
           continue;
 
        unsigned our_reg = base + (our_source * align);
 
        /* Don't allocate past the end of the register file */
-        if ((our_reg + align) > rctx->bound)
+        if ((our_reg + align) > rctx->bound[cls])
           continue;
 
        /* If those registers are free, then choose them */
-        if (!BITSET_TEST_RANGE(rctx->used_regs, our_reg, our_reg + align - 1))
+        if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
+                               our_reg + align - 1))
           return our_reg;
      }
 
@@ -739,9 +815,10 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
      /* Prefer ranges of the register file that leave room for all sources of
      * the collect contiguously.
      */
-      for (unsigned base = 0; base + (collect->nr_srcs * align) <= rctx->bound;
+      for (unsigned base = 0;
+           base + (collect->nr_srcs * align) <= rctx->bound[cls];
          base += collect_align) {
-         if (!BITSET_TEST_RANGE(rctx->used_regs, base,
+         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
                                base + (collect->nr_srcs * align) - 1))
           return base + offset;
      }
@@ -751,9 +828,9 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
      * for a register for the source such that the collect base is aligned.
      */
      if (collect_align > align) {
-         for (unsigned reg = offset; reg + collect_align <= rctx->bound;
+         for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
             reg += collect_align) {
-            if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1))
+            if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
             return reg;
        }
      }
@@ -773,7 +850,7 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
      if (phi->dest[0].type == AGX_INDEX_REGISTER) {
        unsigned base = phi->dest[0].value;
 
-        if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+        if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
           return base;
      }
   }
@@ -787,12 +864,14 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
 static void
 agx_ra_assign_local(struct ra_ctx *rctx)
 {
-   BITSET_DECLARE(used_regs, AGX_NUM_REGS) = {0};
-   uint8_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint8_t));
+   BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
+   BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
+   uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
   agx_block *block = rctx->block;
   uint8_t *ncomps = rctx->ncomps;
 
-   rctx->used_regs = used_regs;
+   rctx->used_regs[RA_GPR] = used_regs_gpr;
+   rctx->used_regs[RA_MEM] = used_regs_mem;
   rctx->ssa_to_reg = ssa_to_reg;
 
   reserve_live_in(rctx);
@@ -801,7 +880,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
    * This could be optimized (sync with agx_calc_register_demand).
    */
   if (rctx->shader->any_cf)
-      BITSET_SET(used_regs, 0);
+      BITSET_SET(used_regs_gpr, 0);
 
   agx_foreach_instr_in_block(block, I) {
      rctx->instr = I;
@@ -810,13 +889,17 @@ agx_ra_assign_local(struct ra_ctx *rctx)
      * can be removed by assigning the destinations overlapping the source.
      */
     if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
+         assert(ra_class_for_index(I->src[0]) == RA_GPR);
        unsigned reg = ssa_to_reg[I->src[0].value];
        unsigned width = agx_size_align_16(agx_split_width(I));
 
        agx_foreach_dest(I, d) {
+            assert(ra_class_for_index(I->dest[0]) == RA_GPR);
+
           /* Free up the source */
           unsigned offset_reg = reg + (d * width);
-            BITSET_CLEAR_RANGE(used_regs, offset_reg, offset_reg + width - 1);
+            BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
+                               offset_reg + width - 1);
 
           /* Assign the destination where the source was */
           if (!agx_is_null(I->dest[d]))
@@ -826,7 +909,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
 
        unsigned excess = rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
        if (excess) {
-            BITSET_CLEAR_RANGE(used_regs, reg + (I->nr_dests * width),
+            BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
                               reg + rctx->ncomps[I->src[0].value] - 1);
        }
 
@@ -846,11 +929,12 @@ agx_ra_assign_local(struct ra_ctx *rctx)
      /* First, free killed sources */
      agx_foreach_ssa_src(I, s) {
        if (I->src[s].kill) {
+            enum ra_class cls = ra_class_for_index(I->src[s]);
           unsigned reg = ssa_to_reg[I->src[s].value];
           unsigned count = ncomps[I->src[s].value];
 
           assert(count >= 1);
-            BITSET_CLEAR_RANGE(used_regs, reg, reg + count - 1);
+            BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
        }
      }
 
@@ -883,7 +967,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
        agx_replace_src(
           phi, pred_idx,
-           agx_register(rctx->ssa_to_reg[value], phi->src[pred_idx].size));
+           agx_register_like(rctx->ssa_to_reg[value], phi->src[pred_idx]));
      }
   }
 }
@@ -941,6 +1025,7 @@ agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
 
        copies[i++] = (struct agx_copy){
           .dest = dest.value,
+           .dest_mem = dest.memory,
           .src = src,
        };
      }
@@ -951,49 +1036,89 @@ agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
   }
 }
 
-void
-agx_ra(agx_context *ctx)
+static inline agx_index
+agx_index_as_mem(agx_index idx, unsigned mem_base)
 {
-   agx_compute_liveness(ctx);
-   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
-   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
-   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
-   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
-   unsigned max_ncomps = 1;
+   assert(idx.type == AGX_INDEX_NORMAL);
+   assert(!idx.memory);
+   idx.memory = true;
+   idx.value = mem_base + idx.value;
+   return idx;
+}
 
-   agx_foreach_instr_global(ctx, I) {
-      /* Record collects/phis so we can coalesce when assigning */
-      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
-         agx_foreach_ssa_src(I, s) {
-            src_to_collect_phi[I->src[s].value] = I;
+/*
+ * Spill everything to the stack, trivially. For debugging spilling.
+ *
+ * Only phis and stack moves can access memory variables.
+ */
+static void
+agx_spill_everything(agx_context *ctx)
+{
+   /* Immediates and uniforms are not allowed to be spilled, so they cannot
+    * appear in phi webs. Lower them first.
+    */
+   agx_foreach_block(ctx, block) {
+      agx_block **preds = util_dynarray_begin(&block->predecessors);
+
+      agx_foreach_phi_in_block(block, phi) {
+         agx_foreach_src(phi, s) {
+            if (phi->src[s].type == AGX_INDEX_IMMEDIATE ||
+                phi->src[s].type == AGX_INDEX_UNIFORM) {
+
+               agx_builder b =
+                  agx_init_builder(ctx, agx_after_block_logical(preds[s]));
+
+               agx_index temp = agx_temp(ctx, phi->dest[0].size);
+
+               if (phi->src[s].type == AGX_INDEX_IMMEDIATE)
+                  agx_mov_imm_to(&b, temp, phi->src[s].value);
+               else
+                  agx_mov_to(&b, temp, phi->src[s]);
+
+               agx_replace_src(phi, s, temp);
+            }
         }
      }
-
-      agx_foreach_ssa_dest(I, d) {
-         unsigned v = I->dest[d].value;
-         assert(ncomps[v] == 0 && "broken SSA");
-         /* Round up vectors for easier live range splitting */
-         ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
-         sizes[v] = I->dest[d].size;
-
-         max_ncomps = MAX2(max_ncomps, ncomps[v]);
-      }
   }
 
-   /* For live range splitting to work properly, ensure the register file is
-    * aligned to the larger vector size. Most of the time, this is a no-op since
-    * the largest vector size is usually 128-bit and the register file is
-    * naturally 128-bit aligned. However, this is required for correctness with
-    * 3D textureGrad, which can have a source vector of length 6x32-bit,
-    * rounding up to 256-bit and requiring special accounting here.
-    */
-   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
-   assert(util_is_power_of_two_nonzero(reg_file_alignment));
+   /* Now we can spill everything */
+   unsigned mem_base = ctx->alloc;
+   ctx->alloc = mem_base + ctx->alloc;
 
-   /* Calculate the demand and use it to bound register assignment */
-   unsigned demand =
-      ALIGN_POT(agx_calc_register_demand(ctx, ncomps), reg_file_alignment);
+   agx_foreach_instr_global_safe(ctx, I) {
+      if (I->op == AGX_OPCODE_PHI) {
+         agx_foreach_ssa_dest(I, d) {
+            I->dest[d] = agx_replace_index(
+               I->dest[d], agx_index_as_mem(I->dest[d], mem_base));
+         }
+         agx_foreach_ssa_src(I, s) {
+            agx_replace_src(I, s, agx_index_as_mem(I->src[s], mem_base));
+         }
+      } else {
+         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));
+         agx_foreach_ssa_src(I, s) {
+            agx_index fill =
+               agx_vec_temp(ctx, I->src[s].size, agx_channels(I->src[s]));
+
+            agx_mov_to(&b, fill, agx_index_as_mem(I->src[s], mem_base));
+            agx_replace_src(I, s, fill);
+         }
+
+         agx_foreach_ssa_dest(I, d) {
+            agx_builder b = agx_init_builder(ctx, agx_after_instr(I));
+            agx_mov_to(&b, agx_index_as_mem(I->dest[d], mem_base), I->dest[d]);
+         }
+      }
+   }
+
+   agx_validate(ctx, "Trivial spill");
+}
+
+void
+agx_ra(agx_context *ctx)
+{
+   /* Determine maximum possible registers. We won't exceed this! */
   unsigned max_possible_regs = AGX_NUM_REGS;
 
   /* Compute shaders need to have their entire workgroup together, so our
@@ -1017,19 +1142,71 @@ agx_ra(agx_context *ctx)
        agx_max_registers_for_occupancy(threads_per_workgroup);
   }
 
-   /* TODO: Spilling. Abort so we don't smash the stack in release builds. */
-   if (demand > max_possible_regs) {
-      fprintf(stderr, "\n");
-      fprintf(stderr, "------------------------------------------------\n");
-      fprintf(stderr, "Asahi Linux shader compiler limitation!\n");
-      fprintf(stderr, "We ran out of registers! Nyaaaa 😿\n");
-      fprintf(stderr, "Do not report this as a bug.\n");
-      fprintf(stderr, "We know -- we're working on it!\n");
-      fprintf(stderr, "------------------------------------------------\n");
-      fprintf(stderr, "\n");
-      abort();
+   /* Calculate the demand. We'll use it to determine if we need to spill and to
+    * bound register assignment.
+    */
+   agx_compute_liveness(ctx);
+   unsigned effective_demand = agx_calc_register_demand(ctx);
+   bool spilling = (effective_demand > max_possible_regs);
+
+   if (spilling) {
+      assert(ctx->key->has_scratch && "internal shaders are unspillable");
+      agx_spill_everything(ctx);
+
+      /* After spilling, recalculate liveness and demand */
+      agx_compute_liveness(ctx);
+      effective_demand = agx_calc_register_demand(ctx);
+
+      /* The resulting program can now be assigned registers */
+      assert(effective_demand <= max_possible_regs && "spiller post-condition");
   }
 
+   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
+   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
+   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
+   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
+   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
+   unsigned max_ncomps = 1;
+
+   agx_foreach_instr_global(ctx, I) {
+      /* Record collects/phis so we can coalesce when assigning */
+      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
+         agx_foreach_ssa_src(I, s) {
+            src_to_collect_phi[I->src[s].value] = I;
+         }
+      }
+
+      agx_foreach_ssa_dest(I, d) {
+         unsigned v = I->dest[d].value;
+         assert(ncomps[v] == 0 && "broken SSA");
+         /* Round up vectors for easier live range splitting */
+         ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
+         sizes[v] = I->dest[d].size;
+         classes[v] = ra_class_for_index(I->dest[d]);
+
+         max_ncomps = MAX2(max_ncomps, ncomps[v]);
+      }
+   }
+
+   /* For live range splitting to work properly, ensure the register file is
+    * aligned to the larger vector size. Most of the time, this is a no-op since
+    * the largest vector size is usually 128-bit and the register file is
+    * naturally 128-bit aligned. However, this is required for correctness with
+    * 3D textureGrad, which can have a source vector of length 6x32-bit,
+    * rounding up to 256-bit and requiring special accounting here.
+    */
+   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
+   assert(util_is_power_of_two_nonzero(reg_file_alignment));
+
+   if (spilling) {
+      /* We need to allocate scratch registers for lowering spilling later */
+      effective_demand = MAX2(effective_demand, 6 * 2 /* preloading */);
+      effective_demand += reg_file_alignment;
+   }
+
+   unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
+   assert(demand <= max_possible_regs && "Invariant");
+
   /* Round up the demand to the maximum number of registers we can use without
    * affecting occupancy. This reduces live range splitting.
    */
@@ -1045,6 +1222,8 @@ agx_ra(agx_context *ctx)
   assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
   assert(max_regs <= max_possible_regs);
 
+   unsigned max_mem_slot = 0;
+
   /* Assign registers in dominance-order. This coincides with source-order due
    * to a NIR invariant, so we do not need special handling for this.
    */
@@ -1055,12 +1234,20 @@ agx_ra(agx_context *ctx)
        .src_to_collect_phi = src_to_collect_phi,
        .ncomps = ncomps,
        .sizes = sizes,
+        .classes = classes,
        .visited = visited,
-        .bound = max_regs,
-        .max_reg = &ctx->max_reg,
+        .bound[RA_GPR] = max_regs,
+        .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
+        .max_reg[RA_GPR] = &ctx->max_reg,
+        .max_reg[RA_MEM] = &max_mem_slot,
     });
   }
 
+   if (spilling) {
+      ctx->spill_base = ctx->scratch_size;
+      ctx->scratch_size += (max_mem_slot + 1) * 2;
+   }
+
   /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
    * don't use them. Account for that so the preload doesn't clobber GPRs.
    */
@@ -1075,6 +1262,8 @@ agx_ra(agx_context *ctx)
      if (ins->op == AGX_OPCODE_COLLECT) {
        assert(ins->dest[0].type == AGX_INDEX_REGISTER);
+         assert(!ins->dest[0].memory);
+
        unsigned base = ins->dest[0].value;
        unsigned width = agx_size_align_16(ins->src[0].size);
@@ -1111,6 +1300,8 @@ agx_ra(agx_context *ctx)
        if (ins->dest[i].type != AGX_INDEX_REGISTER)
           continue;
 
+         assert(!ins->dest[i].memory);
+
        agx_index src = ins->src[0];
        src.size = ins->dest[i].size;
        src.channels_m1 = 0;
@@ -1147,7 +1338,8 @@ agx_ra(agx_context *ctx)
      case AGX_OPCODE_MOV:
        if (I->src[0].type == AGX_INDEX_REGISTER &&
            I->dest[0].size == I->src[0].size &&
-            I->src[0].value == I->dest[0].value) {
+            I->src[0].value == I->dest[0].value &&
+            I->src[0].memory == I->dest[0].memory) {
 
           assert(I->dest[0].type == AGX_INDEX_REGISTER);
           agx_remove_instruction(I);
@@ -1167,5 +1359,6 @@ agx_ra(agx_context *ctx)
   free(src_to_collect_phi);
   free(ncomps);
   free(sizes);
+   free(classes);
   free(visited);
 }
diff --git a/src/asahi/compiler/agx_validate.c b/src/asahi/compiler/agx_validate.c
index 44da79e4678..b3639bace1b 100644
--- a/src/asahi/compiler/agx_validate.c
+++ b/src/asahi/compiler/agx_validate.c
@@ -6,6 +6,7 @@
 #include "agx_compiler.h"
 #include "agx_debug.h"
+#include "agx_opcodes.h"
 
 /* Validatation doesn't make sense in release builds */
 #ifndef NDEBUG
@@ -67,6 +68,16 @@ agx_validate_block_form(agx_block *block)
   return true;
 }
 
+/*
+ * Only moves and phis use stack. Phis cannot use moves due to their
+ * parallel nature, so we allow phis to take memory, later lowered to moves.
+ */
+static bool
+is_stack_valid(agx_instr *I)
+{
+   return (I->op == AGX_OPCODE_MOV) || (I->op == AGX_OPCODE_PHI);
+}
+
 static bool
 agx_validate_sources(agx_instr *I)
 {
@@ -91,6 +102,8 @@ agx_validate_sources(agx_instr *I)
      } else if (I->op == AGX_OPCODE_COLLECT && !agx_is_null(src)) {
        agx_validate_assert(src.size == I->src[0].size);
      }
+
+      agx_validate_assert(!src.memory || is_stack_valid(I));
   }
 
   return true;
@@ -115,6 +128,9 @@ agx_validate_defs(agx_instr *I, BITSET_WORD *defs)
        return false;
 
      BITSET_SET(defs, I->dest[d].value);
+
+      if (I->dest[d].memory && !is_stack_valid(I))
+         return false;
   }
 
   return true;
@@ -127,6 +143,10 @@ agx_write_registers(const agx_instr *I, unsigned d)
   unsigned size = agx_size_align_16(I->dest[d].size);
 
   switch (I->op) {
+   case AGX_OPCODE_MOV:
+      /* Tautological */
+      return agx_index_size_16(I->dest[d]);
+
   case AGX_OPCODE_ITER:
   case AGX_OPCODE_ITERPROJ:
      assert(1 <= I->channels && I->channels <= 4);
@@ -194,6 +214,10 @@ agx_read_registers(const agx_instr *I, unsigned s)
   unsigned size = agx_size_align_16(I->src[s].size);
 
   switch (I->op) {
+   case AGX_OPCODE_MOV:
+      /* Tautological */
+      return agx_index_size_16(I->src[0]);
+
   case AGX_OPCODE_SPLIT:
      return I->nr_dests * agx_size_align_16(agx_split_width(I));
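Illustrative sketch (not part of the patch): the unit bookkeeping at the end of agx_ra(). Memory "registers" are 16-bit (2-byte) units while scratch_size and spill_base are byte counts, so slot k of a shader's spill area lives at byte offset spill_base + 2 * k. The struct and function names below are hypothetical; only the arithmetic mirrors the patch.

#include <assert.h>

struct toy_shader {
   unsigned scratch_size; /* bytes, includes NIR-level scratch */
   unsigned spill_base;   /* bytes, start of the RA spill area */
};

/* Mirrors: ctx->spill_base = ctx->scratch_size;
 *          ctx->scratch_size += (max_mem_slot + 1) * 2; */
static void
reserve_spill_area(struct toy_shader *s, unsigned max_mem_slot)
{
   s->spill_base = s->scratch_size;
   s->scratch_size += (max_mem_slot + 1) * 2; /* 2 bytes per 16-bit slot */
}

static unsigned
byte_offset_of_slot(const struct toy_shader *s, unsigned slot)
{
   return s->spill_base + slot * 2;
}

int
main(void)
{
   struct toy_shader s = {.scratch_size = 64}; /* pre-existing NIR scratch */

   reserve_spill_area(&s, 7); /* highest memory register used was m7 */
   assert(s.spill_base == 64);
   assert(s.scratch_size == 64 + 16);
   assert(byte_offset_of_slot(&s, 3) == 70);
   return 0;
}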