diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h
index 3075ba3f857..ad3e7b386a6 100644
--- a/src/asahi/compiler/agx_compiler.h
+++ b/src/asahi/compiler/agx_compiler.h
@@ -26,6 +26,9 @@ extern "C" {
 /* u0-u255 inclusive, as pairs of 16-bits */
 #define AGX_NUM_UNIFORMS (512)
 
+/* Semi-arbitrary limit for spill slot allocation */
+#define AGX_NUM_MODELED_REGS (2048)
+
 enum agx_index_type {
    AGX_INDEX_NULL = 0,
    AGX_INDEX_NORMAL = 1,
@@ -71,6 +74,9 @@ typedef struct {
    bool abs : 1;
    bool neg : 1;
 
+   /* Register class */
+   bool memory : 1;
+
    unsigned channels_m1 : 3;
    enum agx_size size : 2;
    enum agx_index_type type : 3;
@@ -139,12 +145,22 @@ agx_register(uint32_t imm, enum agx_size size)
 }
 
 static inline agx_index
-agx_register_like(uint32_t imm, agx_index like)
+agx_memory_register(uint32_t imm, enum agx_size size)
 {
-   assert(imm < AGX_NUM_REGS);
-
    return (agx_index){
       .value = imm,
+      .memory = true,
+      .size = size,
+      .type = AGX_INDEX_REGISTER,
+   };
+}
+
+static inline agx_index
+agx_register_like(uint32_t imm, agx_index like)
+{
+   return (agx_index){
+      .value = imm,
+      .memory = like.memory,
       .channels_m1 = like.channels_m1,
       .size = like.size,
       .type = AGX_INDEX_REGISTER,
@@ -398,7 +414,7 @@ typedef struct agx_block {
    /* For visited blocks during register assignment and live-out registers, the
    * mapping of SSA names to registers at the end of the block.
    */
-   uint8_t *ssa_to_reg_out;
+   uint16_t *ssa_to_reg_out;
 
    /* Is this block a loop header? If not, all of its predecessors precede it in
    * source order.
@@ -464,6 +480,15 @@ typedef struct {
    */
   agx_index vertex_id, instance_id;
 
+   /* Beginning of our stack allocation used for spilling, below that is
+    * NIR-level scratch.
+    */
+   unsigned spill_base;
+
+   /* Beginning of stack allocation used for parallel copy lowering */
+   bool has_spill_pcopy_reserved;
+   unsigned spill_pcopy_base;
+
    /* Stats for shader-db */
    unsigned loop_count;
    unsigned spills;
@@ -883,6 +908,9 @@ struct agx_copy {
    /* Base register destination of the copy */
    unsigned dest;
 
+   /* Destination is memory */
+   bool dest_mem;
+
    /* Source of the copy */
    agx_index src;
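Illustrative sketch (not part of the patch): what the new `memory` bit means in practice. The struct below is a reduced, hypothetical stand-in for agx_index with only the fields relevant here; real indices carry more state, and the real constructors are agx_register()/agx_memory_register() above. Memory "registers" use the same 16-bit numbering as GPRs but live in a separate namespace bounded by AGX_NUM_MODELED_REGS.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum mini_size { MINI_SIZE_16 = 0, MINI_SIZE_32 = 1 };

struct mini_index {
   uint32_t value;      /* register or slot number, in 16-bit units */
   bool memory;         /* register class: false = GPR, true = spill slot */
   enum mini_size size;
};

/* Analogous to agx_register(): a GPR of the given size */
static struct mini_index
mini_register(uint32_t reg, enum mini_size size)
{
   return (struct mini_index){.value = reg, .memory = false, .size = size};
}

/* Analogous to agx_memory_register(): same numbering, different class */
static struct mini_index
mini_memory_register(uint32_t slot, enum mini_size size)
{
   return (struct mini_index){.value = slot, .memory = true, .size = size};
}

int
main(void)
{
   struct mini_index r4 = mini_register(4, MINI_SIZE_16);
   struct mini_index m4 = mini_memory_register(4, MINI_SIZE_16);

   /* Same number and size, but they never alias: the class separates them */
   assert(r4.value == m4.value && r4.size == m4.size);
   assert(!r4.memory && m4.memory);
   return 0;
}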
diff --git a/src/asahi/compiler/agx_lower_parallel_copy.c b/src/asahi/compiler/agx_lower_parallel_copy.c
index a7c271f850f..ea787770e79 100644
--- a/src/asahi/compiler/agx_lower_parallel_copy.c
+++ b/src/asahi/compiler/agx_lower_parallel_copy.c
@@ -22,15 +22,47 @@
  * We only handles register-register copies, not general agx_index sources. This
  * suffices for its internal use for register allocation.
  */
+
+static agx_index
+scratch_slot(agx_context *ctx, enum agx_size size)
+{
+   /* Reserve scratch slots. scratch_size is in bytes, spill_pcopy_base and
+    * agx_memory_register are in memory registers (16-bit elements).
+    */
+   if (!ctx->has_spill_pcopy_reserved) {
+      ctx->scratch_size = align(ctx->scratch_size, 16);
+      ctx->spill_pcopy_base = ctx->scratch_size / 2;
+      ctx->scratch_size += 8;
+      ctx->has_spill_pcopy_reserved = true;
+   }
+
+   return agx_memory_register(ctx->spill_pcopy_base, size);
+}
+
 static void
 do_copy(agx_builder *b, const struct agx_copy *copy)
 {
-   agx_index dst = agx_register(copy->dest, copy->src.size);
+   agx_index dst = copy->dest_mem
+                      ? agx_memory_register(copy->dest, copy->src.size)
+                      : agx_register(copy->dest, copy->src.size);
 
-   if (copy->src.type == AGX_INDEX_IMMEDIATE)
+   if (copy->dest_mem && copy->src.memory) {
+      /* Memory-memory copies need to be lowered to memory-register and
+       * register-memory, spilling a GPR to an auxiliary memory slot. This
+       * avoids needing to reserve a scratch register for this edge case.
+       */
+      agx_index scratch_reg = agx_register(0, copy->src.size);
+      agx_index scratch_mem = scratch_slot(b->shader, copy->src.size);
+
+      agx_mov_to(b, scratch_mem, scratch_reg);
+      agx_mov_to(b, scratch_reg, copy->src);
+      agx_mov_to(b, dst, scratch_reg);
+      agx_mov_to(b, scratch_reg, scratch_mem);
+   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
       agx_mov_imm_to(b, dst, copy->src.value);
-   else
+   } else {
       agx_mov_to(b, dst, copy->src);
+   }
 }
 
 static void
@@ -43,7 +75,7 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
 
    /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
    if (copy->src.size == AGX_SIZE_16 &&
-       (copy->dest >> 1) == (copy->src.value >> 1)) {
+       (copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) {
 
       assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
              "no trivial swaps, and only 2 halves of a register");
@@ -58,9 +90,32 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
       return;
    }
 
-   agx_index x = agx_register(copy->dest, copy->src.size);
+   agx_index x = copy->dest_mem
+                    ? agx_memory_register(copy->dest, copy->src.size)
+                    : agx_register(copy->dest, copy->src.size);
    agx_index y = copy->src;
 
+   /* Memory-memory swaps need to be lowered */
+   assert(x.memory == y.memory);
+   if (x.memory) {
+      agx_index temp1 = agx_register(0, copy->src.size);
+      agx_index temp2 = agx_register(2, copy->src.size);
+
+      agx_index scratch_reg2 = agx_register(0, copy->src.size);
+      agx_index scratch_mem2 = scratch_slot(b->shader, copy->src.size);
+      scratch_reg2.channels_m1++;
+      scratch_mem2.channels_m1++;
+
+      agx_mov_to(b, scratch_mem2, scratch_reg2);
+      agx_mov_to(b, temp1, x);
+      agx_mov_to(b, temp2, y);
+      agx_mov_to(b, y, temp1);
+      agx_mov_to(b, x, temp2);
+      agx_mov_to(b, scratch_reg2, scratch_mem2);
+      return;
+   }
+
+   /* Otherwise, we're swapping GPRs and fall back on an XOR swap. */
    agx_xor_to(b, x, x, y);
    agx_xor_to(b, y, x, y);
    agx_xor_to(b, x, x, y);
@@ -74,12 +129,12 @@ struct copy_ctx {
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
-   unsigned physreg_use_count[AGX_NUM_REGS];
+   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];
 
   /* For each physreg, the pending copy_entry that uses it as a dest. */
-   struct agx_copy *physreg_dest[AGX_NUM_REGS];
+   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];
 
-   struct agx_copy entries[AGX_NUM_REGS];
+   struct agx_copy entries[AGX_NUM_MODELED_REGS];
 };
 
 static bool
@@ -96,7 +151,8 @@ entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
 static bool
 is_real(struct agx_copy *entry)
 {
-   return entry->src.type == AGX_INDEX_REGISTER;
+   return entry->src.type == AGX_INDEX_REGISTER &&
+          entry->dest_mem == entry->src.memory;
 }
 
 /* TODO: Generalize to other bit sizes */
@@ -109,6 +165,7 @@ split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
    struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];
 
    new_entry->dest = entry->dest + 1;
+   new_entry->dest_mem = entry->dest_mem;
    new_entry->src = entry->src;
    new_entry->src.value += 1;
    new_entry->done = false;
@@ -117,9 +174,9 @@
    ctx->physreg_dest[entry->dest + 1] = new_entry;
 }
 
-void
-agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
-                         unsigned num_copies)
+static void
+agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
+                                   unsigned num_copies, bool cls)
 {
    /* First, lower away 64-bit copies to smaller chunks, since we don't have
    * 64-bit ALU so we always want to split.
@@ -130,6 +187,12 @@ agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
    for (unsigned i = 0; i < num_copies; ++i) {
       struct agx_copy copy = copies[i];
 
+      /* Filter by class */
+      if (copy.dest_mem != cls)
+         continue;
+
+      assert(copy.dest < AGX_NUM_MODELED_REGS);
+
       if (copy.src.size == AGX_SIZE_64) {
          copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;
@@ -354,3 +417,14 @@ agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
 
    free(copies2);
 }
+
+void
+agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
+                         unsigned num_copies)
+{
+   /* Emit copies for each register class separately because we don't have
+    * register class awareness in the parallel copy lowering data structure.
+    */
+   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
+   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
+}
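Illustrative sketch (not part of the patch): the move sequence do_copy() above emits for a memory-memory copy. Since no GPR is reserved for the lowering, r0 is temporarily parked in the pre-reserved scratch slot, used to bounce the value, then restored. The names gpr[], mem[] and SCRATCH_SLOT are hypothetical stand-ins for the register file, the spill area, and ctx->spill_pcopy_base.

#include <assert.h>
#include <stdint.h>

#define SCRATCH_SLOT 0 /* stands in for ctx->spill_pcopy_base */

static uint16_t gpr[256];  /* 16-bit GPR file */
static uint16_t mem[2048]; /* 16-bit spill slots */

static void
copy_mem_to_mem(unsigned dst_slot, unsigned src_slot)
{
   mem[SCRATCH_SLOT] = gpr[0]; /* agx_mov_to(b, scratch_mem, scratch_reg) */
   gpr[0] = mem[src_slot];     /* agx_mov_to(b, scratch_reg, copy->src)   */
   mem[dst_slot] = gpr[0];     /* agx_mov_to(b, dst, scratch_reg)         */
   gpr[0] = mem[SCRATCH_SLOT]; /* agx_mov_to(b, scratch_reg, scratch_mem) */
}

int
main(void)
{
   gpr[0] = 0xabcd; /* a live value that must survive the bounce */
   mem[7] = 42;

   copy_mem_to_mem(3, 7);

   assert(mem[3] == 42 && gpr[0] == 0xabcd);
   return 0;
}

The memory-memory swap path in do_swap() follows the same idea, except that it parks a two-register pair so both operands can be held in GPRs at once.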
diff --git a/src/asahi/compiler/agx_print.c b/src/asahi/compiler/agx_print.c
index a2e453d7f3c..8f71ccf9426 100644
--- a/src/asahi/compiler/agx_print.c
+++ b/src/asahi/compiler/agx_print.c
@@ -30,6 +30,9 @@ agx_print_sized(char prefix, unsigned value, enum agx_size size, FILE *fp)
 static void
 agx_print_index(agx_index index, bool is_float, FILE *fp)
 {
+   if (index.memory)
+      fprintf(fp, "m");
+
    switch (index.type) {
    case AGX_INDEX_NULL:
       fprintf(fp, "_");
@@ -75,6 +78,8 @@ agx_print_index(agx_index index, bool is_float, FILE *fp)
 
       fprintf(fp, "...");
 
+      if (index.memory)
+         fprintf(fp, "m");
       agx_print_sized('r', last, index.size, fp);
    }
    break;
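Illustrative sketch (not part of the patch): a toy model of the per-class bookkeeping the allocator below keeps. Each register class gets its own occupancy set and its own bound; the memory class is only limited by the modeled slot count. The names, array sizes, and the simple first-fit loop here are illustrative, not the real data structures (the allocator uses BITSET_* and alignment-aware search).

#include <assert.h>
#include <stdbool.h>

enum toy_class { TOY_GPR, TOY_MEM, TOY_CLASSES };

#define TOY_GPR_BOUND 256
#define TOY_MEM_BOUND 2048

static bool used[TOY_CLASSES][TOY_MEM_BOUND];
static const unsigned bound[TOY_CLASSES] = {TOY_GPR_BOUND, TOY_MEM_BOUND};

/* Mirrors ra_class_for_index(): the class is just the memory bit */
static enum toy_class
class_of(bool memory)
{
   return memory ? TOY_MEM : TOY_GPR;
}

/* First-fit search within one class, in the spirit of find_regs_simple() */
static bool
find_free(enum toy_class cls, unsigned count, unsigned *out)
{
   for (unsigned base = 0; base + count <= bound[cls]; base += count) {
      bool all_free = true;
      for (unsigned i = 0; i < count; ++i)
         all_free &= !used[cls][base + i];

      if (all_free) {
         for (unsigned i = 0; i < count; ++i)
            used[cls][base + i] = true;
         *out = base;
         return true;
      }
   }
   return false;
}

int
main(void)
{
   unsigned r, m;
   assert(find_free(class_of(false), 2, &r)); /* a 32-bit GPR pair   */
   assert(find_free(class_of(true), 2, &m));  /* a 32-bit spill slot */
   assert(r == 0 && m == 0);                  /* classes never collide */
   return 0;
}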
diff --git a/src/asahi/compiler/agx_register_allocate.c b/src/asahi/compiler/agx_register_allocate.c
index 75e542ba85a..39ffbe77bd8 100644
--- a/src/asahi/compiler/agx_register_allocate.c
+++ b/src/asahi/compiler/agx_register_allocate.c
@@ -3,6 +3,8 @@
 * SPDX-License-Identifier: MIT
 */
 
+#include "util/bitset.h"
+#include "util/macros.h"
 #include "util/u_dynarray.h"
 #include "util/u_qsort.h"
 #include "agx_builder.h"
@@ -10,21 +12,41 @@
 #include "agx_compiler.h"
 #include "agx_debug.h"
 #include "agx_opcodes.h"
+#include "agx_pack.h"
+#include "shader_enums.h"
 
 /* SSA-based register allocator */
 
+enum ra_class {
+   /* General purpose register */
+   RA_GPR,
+
+   /* Memory, used to assign stack slots */
+   RA_MEM,
+
+   /* Keep last */
+   RA_CLASSES,
+};
+
+static inline enum ra_class
+ra_class_for_index(agx_index idx)
+{
+   return idx.memory ? RA_MEM : RA_GPR;
+}
+
 struct ra_ctx {
    agx_context *shader;
   agx_block *block;
   agx_instr *instr;
-   uint8_t *ssa_to_reg;
+   uint16_t *ssa_to_reg;
   uint8_t *ncomps;
   enum agx_size *sizes;
+   enum ra_class *classes;
   BITSET_WORD *visited;
-   BITSET_WORD *used_regs;
+   BITSET_WORD *used_regs[RA_CLASSES];
 
   /* Maintained while assigning registers */
-   unsigned *max_reg;
+   unsigned *max_reg[RA_CLASSES];
 
   /* For affinities */
   agx_instr **src_to_collect_phi;
@@ -32,11 +54,13 @@ struct ra_ctx {
   /* If bit i of used_regs is set, and register i is the first consecutive
    * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
    * value currently in register i.
+    *
+    * Only for GPRs. We can add reg classes later if we have a use case.
    */
   uint32_t reg_to_ssa[AGX_NUM_REGS];
 
   /* Maximum number of registers that RA is allowed to use */
-   unsigned bound;
+   unsigned bound[RA_CLASSES];
 };
 
 enum agx_size
@@ -58,12 +82,26 @@ agx_split_width(const agx_instr *I)
 }
 
 /*
- * Calculate register demand in 16-bit registers. Becuase we allocate in SSA,
- * this calculation is exact in linear-time. Depends on liveness information.
+ * Calculate register demand in 16-bit registers, while gathering widths and
+ * classes. Because we allocate in SSA, this calculation is exact in
+ * linear-time. Depends on liveness information.
 */
 static unsigned
-agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
+agx_calc_register_demand(agx_context *ctx)
 {
+   uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
+   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
+
+   agx_foreach_instr_global(ctx, I) {
+      agx_foreach_ssa_dest(I, d) {
+         unsigned v = I->dest[d].value;
+         assert(widths[v] == 0 && "broken SSA");
+         /* Round up vectors for easier live range splitting */
+         widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
+         classes[v] = ra_class_for_index(I->dest[d]);
+      }
+   }
+
   /* Calculate demand at the start of each block based on live-in, then update
    * for each instruction processed. Calculate rolling maximum.
    */
@@ -82,7 +120,8 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
   {
      int i;
      BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
-         demand += widths[i];
+         if (classes[i] == RA_GPR)
+            demand += widths[i];
      }
   }
 
@@ -110,6 +149,8 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
         if (!I->src[s].kill)
            continue;
         assert(I->src[s].type == AGX_INDEX_NORMAL);
+         if (ra_class_for_index(I->src[s]) != RA_GPR)
+            continue;
 
         bool skip = false;
 
@@ -125,10 +166,9 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
      }
 
      /* Make destinations live */
-      agx_foreach_dest(I, d) {
-         if (agx_is_null(I->dest[d]))
+      agx_foreach_ssa_dest(I, d) {
+         if (ra_class_for_index(I->dest[d]) != RA_GPR)
            continue;
-         assert(I->dest[d].type == AGX_INDEX_NORMAL);
 
         /* Live range splits allocate at power-of-two granularity. Round up
         * destination sizes (temporarily) to powers-of-two.
@@ -146,15 +186,17 @@ agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
      demand -= late_kill_count;
   }
 
+   free(widths);
+   free(classes);
   return max_demand;
 }
 
 static bool
-find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
-                 unsigned *out)
+find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
+                 unsigned align, unsigned *out)
 {
-   for (unsigned reg = 0; reg + count <= rctx->bound; reg += align) {
-      if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1)) {
+   for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
+      if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
        *out = reg;
        return true;
      }
   }
@@ -177,17 +219,18 @@ find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
 * Postcondition: at least one register in the returned region is already free.
 */
 static unsigned
-find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
+find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
                          BITSET_WORD *already_evicted, BITSET_WORD *killed)
 {
   assert(util_is_power_of_two_or_zero(size) && "precondition");
-   assert((rctx->bound % size) == 0 &&
+   assert((rctx->bound[cls] % size) == 0 &&
          "register file size must be aligned to the maximum vector size");
+   assert(cls == RA_GPR);
 
   unsigned best_base = ~0;
   unsigned best_moves = ~0;
 
-   for (unsigned base = 0; base + size <= rctx->bound; base += size) {
+   for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
      /* r0l is unevictable, skip it. By itself, this does not pose a problem.
      * We are allocating n registers, but the region containing r0l has at
      * most n-1 free. Since there are at least n free registers total, there
@@ -215,7 +258,7 @@ find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
        /* We need a move for each blocked register (TODO: we only need a
        * single move for 32-bit pairs, could optimize to use that instead.)
        */
-         if (BITSET_TEST(rctx->used_regs, reg))
+         if (BITSET_TEST(rctx->used_regs[cls], reg))
           moves++;
        else
           any_free = true;
@@ -239,7 +282,7 @@
     }
   }
 
-   assert(best_base < rctx->bound &&
+   assert(best_base < rctx->bound[cls] &&
         "not enough registers (should have spilled already)");
   return best_base;
 }
@@ -247,15 +290,22 @@
 static void
 set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
 {
-   *(rctx->max_reg) = MAX2(*(rctx->max_reg), reg + rctx->ncomps[ssa] - 1);
+   enum ra_class cls = rctx->classes[ssa];
+
+   *(rctx->max_reg[cls]) =
+      MAX2(*(rctx->max_reg[cls]), reg + rctx->ncomps[ssa] - 1);
+
   rctx->ssa_to_reg[ssa] = reg;
 }
 
 static unsigned
 assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
                       const agx_instr *I, struct util_dynarray *copies,
-                       BITSET_WORD *clobbered, BITSET_WORD *killed)
+                       BITSET_WORD *clobbered, BITSET_WORD *killed,
+                       enum ra_class cls)
 {
+   assert(cls == RA_GPR);
+
   /* XXX: This needs some special handling but so far it has been prohibitively
    * difficult to hit the case
    */
@@ -273,14 +323,15 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
    * shuffle some variables around. Look for a range of the register file
    * that is partially blocked.
    */
-   unsigned base = find_best_region_to_evict(rctx, count, clobbered, killed);
+   unsigned base =
+      find_best_region_to_evict(rctx, cls, count, clobbered, killed);
 
   assert(count <= 16 && "max allocation size (conservative)");
   BITSET_DECLARE(evict_set, 16) = {0};
 
   /* Store the set of blocking registers that need to be evicted */
   for (unsigned i = 0; i < count; ++i) {
-      if (BITSET_TEST(rctx->used_regs, base + i)) {
+      if (BITSET_TEST(rctx->used_regs[cls], base + i)) {
        BITSET_SET(evict_set, i);
      }
   }
@@ -288,7 +339,7 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
   /* We are going to allocate the destination to this range, so it is now fully
    * used. Mark it as such so we don't reassign here later.
    */
-   BITSET_SET_RANGE(rctx->used_regs, base, base + count - 1);
+   BITSET_SET_RANGE(rctx->used_regs[cls], base, base + count - 1);
 
   /* Before overwriting the range, we need to evict blocked variables */
   for (unsigned i = 0; i < 16; ++i) {
@@ -315,11 +366,13 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
      * recursion because nr is decreasing because of the gap.
      */
      assert(nr < count && "fully contained in range that's not full");
-      unsigned new_reg =
-         assign_regs_by_copying(rctx, nr, align, I, copies, clobbered, killed);
+      unsigned new_reg = assign_regs_by_copying(rctx, nr, align, I, copies,
+                                                clobbered, killed, cls);
 
      /* Copy the variable over, register by register */
      for (unsigned i = 0; i < nr; i += align) {
+         assert(cls == RA_GPR);
+
        struct agx_copy copy = {
           .dest = new_reg + i,
           .src = agx_register(reg + i, rctx->sizes[ssa]),
@@ -338,6 +391,7 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
      BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);
 
      /* Update bookkeeping for this variable */
+      assert(cls == rctx->classes[cls]);
      set_ssa_to_reg(rctx, ssa, new_reg);
      rctx->reg_to_ssa[new_reg] = ssa;
 
@@ -348,8 +402,10 @@ assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
   /* We overallocated for non-power-of-two vectors. Free up the excess now.
    * This is modelled as late kill in demand calculation.
    */
-   if (npot_count != count)
-      BITSET_CLEAR_RANGE(rctx->used_regs, base + npot_count, base + count - 1);
+   if (npot_count != count) {
+      BITSET_CLEAR_RANGE(rctx->used_regs[cls], base + npot_count,
+                         base + count - 1);
+   }
 
   return base;
 }
@@ -401,7 +457,9 @@ insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
   agx_foreach_ssa_src(I, s) {
      unsigned reg = rctx->ssa_to_reg[I->src[s].value];
 
-      if (I->src[s].kill && BITSET_TEST(clobbered, reg)) {
+      if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
+          BITSET_TEST(clobbered, reg)) {
+
        assert(nr_vars < ARRAY_SIZE(vars) &&
               "cannot clobber more than max variable size");
 
@@ -433,6 +491,7 @@ insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
      unsigned var_count = rctx->ncomps[var];
      unsigned var_align = agx_size_align_16(rctx->sizes[var]);
 
+      assert(rctx->classes[var] == RA_GPR && "construction");
      assert((base % var_align) == 0 && "induction");
      assert((var_count % var_align) == 0 && "no partial variables");
 
@@ -461,9 +520,13 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
   unsigned reg;
   assert(count == align);
 
-   if (find_regs_simple(rctx, count, align, &reg)) {
+   enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);
+
+   if (find_regs_simple(rctx, cls, count, align, &reg)) {
      return reg;
   } else {
+      assert(cls == RA_GPR && "no memory live range splits");
+
      BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
      BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
      struct util_dynarray copies = {0};
@@ -481,7 +544,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
      }
 
      reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
-                                   killed);
+                                   killed, cls);
      insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
                                        clobbered);
 
@@ -491,7 +554,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
        &b, copies.data, util_dynarray_num_elements(&copies, struct agx_copy));
 
      /* assign_regs asserts this is cleared, so clear to be reassigned */
-      BITSET_CLEAR_RANGE(rctx->used_regs, reg, reg + count - 1);
+      BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
      return reg;
   }
 }
@@ -545,10 +608,13 @@ reserve_live_in(struct ra_ctx *rctx)
         */
        assert(rctx->block->loop_header);
        phi->src[pred_idx] = agx_get_index(i, size);
+         phi->src[pred_idx].memory = rctx->classes[i] == RA_MEM;
      } else {
        /* Otherwise, we can build the phi now */
        unsigned reg = (*pred)->ssa_to_reg_out[i];
-         phi->src[pred_idx] = agx_register(reg, size);
+         phi->src[pred_idx] = rctx->classes[i] == RA_MEM
+                                 ? agx_memory_register(reg, size)
+                                 : agx_register(reg, size);
      }
   }
 
@@ -567,11 +633,14 @@ reserve_live_in(struct ra_ctx *rctx)
        base = (*pred)->ssa_to_reg_out[i];
      }
 
+      enum ra_class cls = rctx->classes[i];
      set_ssa_to_reg(rctx, i, base);
 
      for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
-         BITSET_SET(rctx->used_regs, base + j);
-         rctx->reg_to_ssa[base + j] = i;
+         BITSET_SET(rctx->used_regs[cls], base + j);
+
+         if (cls == RA_GPR)
+            rctx->reg_to_ssa[base + j] = i;
      }
   }
 }
@@ -579,7 +648,8 @@
 static void
 assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
 {
-   assert(reg < rctx->bound && "must not overflow register file");
+   enum ra_class cls = ra_class_for_index(v);
+   assert(reg < rctx->bound[cls] && "must not overflow register file");
   assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
   set_ssa_to_reg(rctx, v.value, reg);
 
@@ -589,10 +659,12 @@ assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
   assert(rctx->ncomps[v.value] >= 1);
   unsigned end = reg + rctx->ncomps[v.value] - 1;
 
-   assert(!BITSET_TEST_RANGE(rctx->used_regs, reg, end) && "no interference");
-   BITSET_SET_RANGE(rctx->used_regs, reg, end);
+   assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
+          "no interference");
+   BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);
 
-   rctx->reg_to_ssa[reg] = v.value;
+   if (cls == RA_GPR)
+      rctx->reg_to_ssa[reg] = v.value;
 }
 
 static void
@@ -641,10 +713,12 @@ try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
   }
 
   unsigned base = rctx->ssa_to_reg[ssa.value];
-   if (BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+   enum ra_class cls = ra_class_for_index(ssa);
+
+   if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
      return false;
 
-   assert(base + count <= rctx->bound && "invariant");
+   assert(base + count <= rctx->bound[cls] && "invariant");
   *out = base;
   return true;
 }
@@ -653,6 +727,7 @@ static unsigned
 pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
 {
   agx_index idx = I->dest[d];
+   enum ra_class cls = ra_class_for_index(idx);
   assert(idx.type == AGX_INDEX_NORMAL);
 
   unsigned count = rctx->ncomps[idx.value];
@@ -680,14 +755,14 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
               "and this is not a phi node, so we have assigned a register");
 
        unsigned base = affinity_base_of_collect(rctx, I, s);
-        if (base >= rctx->bound || (base + count) > rctx->bound)
+        if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
           continue;
 
        /* Unaligned destinations can happen when dest size > src size */
        if (base % align)
           continue;
 
-        if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+        if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
           return base;
      }
   }
@@ -719,17 +794,18 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
        /* Determine where the collect should start relative to the source */
        unsigned base = affinity_base_of_collect(rctx, collect, s);
-        if (base >= rctx->bound)
+        if (base >= rctx->bound[cls])
           continue;
 
        unsigned our_reg = base + (our_source * align);
 
        /* Don't allocate past the end of the register file */
-        if ((our_reg + align) > rctx->bound)
+        if ((our_reg + align) > rctx->bound[cls])
           continue;
 
        /* If those registers are free, then choose them */
-        if (!BITSET_TEST_RANGE(rctx->used_regs, our_reg, our_reg + align - 1))
+        if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
+                               our_reg + align - 1))
           return our_reg;
      }
 
@@ -739,9 +815,10 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
      /* Prefer ranges of the register file that leave room for all sources of
      * the collect contiguously.
      */
-      for (unsigned base = 0; base + (collect->nr_srcs * align) <= rctx->bound;
+      for (unsigned base = 0;
+           base + (collect->nr_srcs * align) <= rctx->bound[cls];
          base += collect_align) {
-         if (!BITSET_TEST_RANGE(rctx->used_regs, base,
+         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
                                base + (collect->nr_srcs * align) - 1))
           return base + offset;
      }
@@ -751,9 +828,9 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
      * for a register for the source such that the collect base is aligned.
      */
      if (collect_align > align) {
-         for (unsigned reg = offset; reg + collect_align <= rctx->bound;
+         for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
             reg += collect_align) {
-            if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1))
+            if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
             return reg;
        }
      }
@@ -773,7 +850,7 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
      if (phi->dest[0].type == AGX_INDEX_REGISTER) {
        unsigned base = phi->dest[0].value;
 
-        if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+        if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
           return base;
      }
   }
@@ -787,12 +864,14 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
 static void
 agx_ra_assign_local(struct ra_ctx *rctx)
 {
-   BITSET_DECLARE(used_regs, AGX_NUM_REGS) = {0};
-   uint8_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint8_t));
+   BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
+   BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
+   uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
   agx_block *block = rctx->block;
   uint8_t *ncomps = rctx->ncomps;
 
-   rctx->used_regs = used_regs;
+   rctx->used_regs[RA_GPR] = used_regs_gpr;
+   rctx->used_regs[RA_MEM] = used_regs_mem;
   rctx->ssa_to_reg = ssa_to_reg;
 
   reserve_live_in(rctx);
@@ -801,7 +880,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
    * This could be optimized (sync with agx_calc_register_demand).
    */
   if (rctx->shader->any_cf)
-      BITSET_SET(used_regs, 0);
+      BITSET_SET(used_regs_gpr, 0);
 
   agx_foreach_instr_in_block(block, I) {
      rctx->instr = I;
@@ -810,13 +889,17 @@ agx_ra_assign_local(struct ra_ctx *rctx)
      * can be removed by assigning the destinations overlapping the source.
      */
     if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
+         assert(ra_class_for_index(I->src[0]) == RA_GPR);
        unsigned reg = ssa_to_reg[I->src[0].value];
        unsigned width = agx_size_align_16(agx_split_width(I));
 
        agx_foreach_dest(I, d) {
+            assert(ra_class_for_index(I->dest[0]) == RA_GPR);
+
           /* Free up the source */
           unsigned offset_reg = reg + (d * width);
-            BITSET_CLEAR_RANGE(used_regs, offset_reg, offset_reg + width - 1);
+            BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
+                               offset_reg + width - 1);
 
           /* Assign the destination where the source was */
           if (!agx_is_null(I->dest[d]))
@@ -826,7 +909,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
 
        unsigned excess = rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
        if (excess) {
-            BITSET_CLEAR_RANGE(used_regs, reg + (I->nr_dests * width),
+            BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
                               reg + rctx->ncomps[I->src[0].value] - 1);
        }
 
@@ -846,11 +929,12 @@ agx_ra_assign_local(struct ra_ctx *rctx)
      /* First, free killed sources */
      agx_foreach_ssa_src(I, s) {
        if (I->src[s].kill) {
+            enum ra_class cls = ra_class_for_index(I->src[s]);
           unsigned reg = ssa_to_reg[I->src[s].value];
           unsigned count = ncomps[I->src[s].value];
 
           assert(count >= 1);
-            BITSET_CLEAR_RANGE(used_regs, reg, reg + count - 1);
+            BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
        }
      }
 
@@ -883,7 +967,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
        agx_replace_src(
           phi, pred_idx,
-           agx_register(rctx->ssa_to_reg[value], phi->src[pred_idx].size));
+           agx_register_like(rctx->ssa_to_reg[value], phi->src[pred_idx]));
      }
   }
 }
@@ -941,6 +1025,7 @@ agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
 
        copies[i++] = (struct agx_copy){
           .dest = dest.value,
+           .dest_mem = dest.memory,
           .src = src,
        };
      }
@@ -951,49 +1036,89 @@ agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
   }
 }
 
-void
-agx_ra(agx_context *ctx)
+static inline agx_index
+agx_index_as_mem(agx_index idx, unsigned mem_base)
 {
-   agx_compute_liveness(ctx);
-   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
-   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
-   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
-   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
-   unsigned max_ncomps = 1;
+   assert(idx.type == AGX_INDEX_NORMAL);
+   assert(!idx.memory);
+   idx.memory = true;
+   idx.value = mem_base + idx.value;
+   return idx;
+}
 
-   agx_foreach_instr_global(ctx, I) {
-      /* Record collects/phis so we can coalesce when assigning */
-      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
-         agx_foreach_ssa_src(I, s) {
-            src_to_collect_phi[I->src[s].value] = I;
+/*
+ * Spill everything to the stack, trivially. For debugging spilling.
+ *
+ * Only phis and stack moves can access memory variables.
+ */
+static void
+agx_spill_everything(agx_context *ctx)
+{
+   /* Immediates and uniforms are not allowed to be spilled, so they cannot
+    * appear in phi webs. Lower them first.
+    */
+   agx_foreach_block(ctx, block) {
+      agx_block **preds = util_dynarray_begin(&block->predecessors);
+
+      agx_foreach_phi_in_block(block, phi) {
+         agx_foreach_src(phi, s) {
+            if (phi->src[s].type == AGX_INDEX_IMMEDIATE ||
+                phi->src[s].type == AGX_INDEX_UNIFORM) {
+
+               agx_builder b =
+                  agx_init_builder(ctx, agx_after_block_logical(preds[s]));
+
+               agx_index temp = agx_temp(ctx, phi->dest[0].size);
+
+               if (phi->src[s].type == AGX_INDEX_IMMEDIATE)
+                  agx_mov_imm_to(&b, temp, phi->src[s].value);
+               else
+                  agx_mov_to(&b, temp, phi->src[s]);
+
+               agx_replace_src(phi, s, temp);
+            }
         }
      }
-
-      agx_foreach_ssa_dest(I, d) {
-         unsigned v = I->dest[d].value;
-         assert(ncomps[v] == 0 && "broken SSA");
-         /* Round up vectors for easier live range splitting */
-         ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
-         sizes[v] = I->dest[d].size;
-
-         max_ncomps = MAX2(max_ncomps, ncomps[v]);
-      }
   }
 
-   /* For live range splitting to work properly, ensure the register file is
-    * aligned to the larger vector size. Most of the time, this is a no-op since
-    * the largest vector size is usually 128-bit and the register file is
-    * naturally 128-bit aligned. However, this is required for correctness with
-    * 3D textureGrad, which can have a source vector of length 6x32-bit,
-    * rounding up to 256-bit and requiring special accounting here.
-    */
-   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
-   assert(util_is_power_of_two_nonzero(reg_file_alignment));
+   /* Now we can spill everything */
+   unsigned mem_base = ctx->alloc;
+   ctx->alloc = mem_base + ctx->alloc;
 
-   /* Calculate the demand and use it to bound register assignment */
-   unsigned demand =
-      ALIGN_POT(agx_calc_register_demand(ctx, ncomps), reg_file_alignment);
+   agx_foreach_instr_global_safe(ctx, I) {
+      if (I->op == AGX_OPCODE_PHI) {
+         agx_foreach_ssa_dest(I, d) {
+            I->dest[d] = agx_replace_index(
+               I->dest[d], agx_index_as_mem(I->dest[d], mem_base));
+         }
+         agx_foreach_ssa_src(I, s) {
+            agx_replace_src(I, s, agx_index_as_mem(I->src[s], mem_base));
+         }
+      } else {
+         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));
+         agx_foreach_ssa_src(I, s) {
+            agx_index fill =
+               agx_vec_temp(ctx, I->src[s].size, agx_channels(I->src[s]));
+
+            agx_mov_to(&b, fill, agx_index_as_mem(I->src[s], mem_base));
+            agx_replace_src(I, s, fill);
+         }
+
+         agx_foreach_ssa_dest(I, d) {
+            agx_builder b = agx_init_builder(ctx, agx_after_instr(I));
+            agx_mov_to(&b, agx_index_as_mem(I->dest[d], mem_base), I->dest[d]);
+         }
+      }
+   }
+
+   agx_validate(ctx, "Trivial spill");
+}
+
+void
+agx_ra(agx_context *ctx)
+{
+   /* Determine maximum possible registers. We won't exceed this! */
   unsigned max_possible_regs = AGX_NUM_REGS;
 
   /* Compute shaders need to have their entire workgroup together, so our
@@ -1017,19 +1142,71 @@ agx_ra(agx_context *ctx)
        agx_max_registers_for_occupancy(threads_per_workgroup);
   }
 
-   /* TODO: Spilling. Abort so we don't smash the stack in release builds. */
-   if (demand > max_possible_regs) {
-      fprintf(stderr, "\n");
-      fprintf(stderr, "------------------------------------------------\n");
-      fprintf(stderr, "Asahi Linux shader compiler limitation!\n");
-      fprintf(stderr, "We ran out of registers! Nyaaaa 😿\n");
-      fprintf(stderr, "Do not report this as a bug.\n");
-      fprintf(stderr, "We know -- we're working on it!\n");
-      fprintf(stderr, "------------------------------------------------\n");
-      fprintf(stderr, "\n");
-      abort();
+   /* Calculate the demand. We'll use it to determine if we need to spill and to
+    * bound register assignment.
+    */
+   agx_compute_liveness(ctx);
+   unsigned effective_demand = agx_calc_register_demand(ctx);
+   bool spilling = (effective_demand > max_possible_regs);
+
+   if (spilling) {
+      assert(ctx->key->has_scratch && "internal shaders are unspillable");
+      agx_spill_everything(ctx);
+
+      /* After spilling, recalculate liveness and demand */
+      agx_compute_liveness(ctx);
+      effective_demand = agx_calc_register_demand(ctx);
+
+      /* The resulting program can now be assigned registers */
+      assert(effective_demand <= max_possible_regs && "spiller post-condition");
   }
 
+   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
+   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
+   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
+   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
+   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
+   unsigned max_ncomps = 1;
+
+   agx_foreach_instr_global(ctx, I) {
+      /* Record collects/phis so we can coalesce when assigning */
+      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
+         agx_foreach_ssa_src(I, s) {
+            src_to_collect_phi[I->src[s].value] = I;
+         }
+      }
+
+      agx_foreach_ssa_dest(I, d) {
+         unsigned v = I->dest[d].value;
+         assert(ncomps[v] == 0 && "broken SSA");
+         /* Round up vectors for easier live range splitting */
+         ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
+         sizes[v] = I->dest[d].size;
+         classes[v] = ra_class_for_index(I->dest[d]);
+
+         max_ncomps = MAX2(max_ncomps, ncomps[v]);
+      }
+   }
+
+   /* For live range splitting to work properly, ensure the register file is
+    * aligned to the larger vector size. Most of the time, this is a no-op since
+    * the largest vector size is usually 128-bit and the register file is
+    * naturally 128-bit aligned. However, this is required for correctness with
+    * 3D textureGrad, which can have a source vector of length 6x32-bit,
+    * rounding up to 256-bit and requiring special accounting here.
+    */
+   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
+   assert(util_is_power_of_two_nonzero(reg_file_alignment));
+
+   if (spilling) {
+      /* We need to allocate scratch registers for lowering spilling later */
+      effective_demand = MAX2(effective_demand, 6 * 2 /* preloading */);
+      effective_demand += reg_file_alignment;
+   }
+
+   unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
+   assert(demand <= max_possible_regs && "Invariant");
+
   /* Round up the demand to the maximum number of registers we can use without
    * affecting occupancy. This reduces live range splitting.
    */
@@ -1045,6 +1222,8 @@ agx_ra(agx_context *ctx)
   assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
   assert(max_regs <= max_possible_regs);
 
+   unsigned max_mem_slot = 0;
+
   /* Assign registers in dominance-order. This coincides with source-order due
    * to a NIR invariant, so we do not need special handling for this.
    */
@@ -1055,12 +1234,20 @@ agx_ra(agx_context *ctx)
        .src_to_collect_phi = src_to_collect_phi,
        .ncomps = ncomps,
        .sizes = sizes,
+        .classes = classes,
        .visited = visited,
-        .bound = max_regs,
-        .max_reg = &ctx->max_reg,
+        .bound[RA_GPR] = max_regs,
+        .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
+        .max_reg[RA_GPR] = &ctx->max_reg,
+        .max_reg[RA_MEM] = &max_mem_slot,
     });
   }
 
+   if (spilling) {
+      ctx->spill_base = ctx->scratch_size;
+      ctx->scratch_size += (max_mem_slot + 1) * 2;
+   }
+
   /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
    * don't use them. Account for that so the preload doesn't clobber GPRs.
    */
@@ -1075,6 +1262,8 @@ agx_ra(agx_context *ctx)
      if (ins->op == AGX_OPCODE_COLLECT) {
        assert(ins->dest[0].type == AGX_INDEX_REGISTER);
+         assert(!ins->dest[0].memory);
+
        unsigned base = ins->dest[0].value;
        unsigned width = agx_size_align_16(ins->src[0].size);
@@ -1111,6 +1300,8 @@ agx_ra(agx_context *ctx)
        if (ins->dest[i].type != AGX_INDEX_REGISTER)
           continue;
 
+         assert(!ins->dest[i].memory);
+
        agx_index src = ins->src[0];
        src.size = ins->dest[i].size;
        src.channels_m1 = 0;
@@ -1147,7 +1338,8 @@ agx_ra(agx_context *ctx)
      case AGX_OPCODE_MOV:
        if (I->src[0].type == AGX_INDEX_REGISTER &&
            I->dest[0].size == I->src[0].size &&
-            I->src[0].value == I->dest[0].value) {
+            I->src[0].value == I->dest[0].value &&
+            I->src[0].memory == I->dest[0].memory) {
 
           assert(I->dest[0].type == AGX_INDEX_REGISTER);
           agx_remove_instruction(I);
@@ -1167,5 +1359,6 @@ agx_ra(agx_context *ctx)
   free(src_to_collect_phi);
   free(ncomps);
   free(sizes);
+   free(classes);
   free(visited);
 }
diff --git a/src/asahi/compiler/agx_validate.c b/src/asahi/compiler/agx_validate.c
index 44da79e4678..b3639bace1b 100644
--- a/src/asahi/compiler/agx_validate.c
+++ b/src/asahi/compiler/agx_validate.c
@@ -6,6 +6,7 @@
 #include "agx_compiler.h"
 #include "agx_debug.h"
+#include "agx_opcodes.h"
 
 /* Validatation doesn't make sense in release builds */
 #ifndef NDEBUG
@@ -67,6 +68,16 @@ agx_validate_block_form(agx_block *block)
   return true;
 }
 
+/*
+ * Only moves and phis use stack. Phis cannot use moves due to their
+ * parallel nature, so we allow phis to take memory, later lowered to moves.
+ */
+static bool
+is_stack_valid(agx_instr *I)
+{
+   return (I->op == AGX_OPCODE_MOV) || (I->op == AGX_OPCODE_PHI);
+}
+
 static bool
 agx_validate_sources(agx_instr *I)
 {
@@ -91,6 +102,8 @@ agx_validate_sources(agx_instr *I)
      } else if (I->op == AGX_OPCODE_COLLECT && !agx_is_null(src)) {
        agx_validate_assert(src.size == I->src[0].size);
      }
+
+      agx_validate_assert(!src.memory || is_stack_valid(I));
   }
 
   return true;
@@ -115,6 +128,9 @@ agx_validate_defs(agx_instr *I, BITSET_WORD *defs)
        return false;
 
      BITSET_SET(defs, I->dest[d].value);
+
+      if (I->dest[d].memory && !is_stack_valid(I))
+         return false;
   }
 
   return true;
@@ -127,6 +143,10 @@ agx_write_registers(const agx_instr *I, unsigned d)
   unsigned size = agx_size_align_16(I->dest[d].size);
 
   switch (I->op) {
+   case AGX_OPCODE_MOV:
+      /* Tautological */
+      return agx_index_size_16(I->dest[d]);
+
   case AGX_OPCODE_ITER:
   case AGX_OPCODE_ITERPROJ:
      assert(1 <= I->channels && I->channels <= 4);
@@ -194,6 +214,10 @@ agx_read_registers(const agx_instr *I, unsigned s)
   unsigned size = agx_size_align_16(I->src[s].size);
 
   switch (I->op) {
+   case AGX_OPCODE_MOV:
+      /* Tautological */
+      return agx_index_size_16(I->src[0]);
+
   case AGX_OPCODE_SPLIT:
      return I->nr_dests * agx_size_align_16(agx_split_width(I));
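Illustrative sketch (not part of the patch): the unit bookkeeping at the end of agx_ra(). Memory "registers" are 16-bit (2-byte) units while scratch_size and spill_base are byte counts, so slot k of a shader's spill area lives at byte offset spill_base + 2 * k. The struct and function names below are hypothetical; only the arithmetic mirrors the patch.

#include <assert.h>

struct toy_shader {
   unsigned scratch_size; /* bytes, includes NIR-level scratch */
   unsigned spill_base;   /* bytes, start of the RA spill area */
};

/* Mirrors: ctx->spill_base = ctx->scratch_size;
 *          ctx->scratch_size += (max_mem_slot + 1) * 2; */
static void
reserve_spill_area(struct toy_shader *s, unsigned max_mem_slot)
{
   s->spill_base = s->scratch_size;
   s->scratch_size += (max_mem_slot + 1) * 2; /* 2 bytes per 16-bit slot */
}

static unsigned
byte_offset_of_slot(const struct toy_shader *s, unsigned slot)
{
   return s->spill_base + slot * 2;
}

int
main(void)
{
   struct toy_shader s = {.scratch_size = 64}; /* pre-existing NIR scratch */

   reserve_spill_area(&s, 7); /* highest memory register used was m7 */
   assert(s.spill_base == 64);
   assert(s.scratch_size == 64 + 16);
   assert(byte_offset_of_slot(&s, 3) == 70);
   return 0;
}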