
/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "util/u_dynarray.h"
#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_debug.h"
#include "agx_opcodes.h"

/* SSA-based register allocator */
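
/* A rough overview of the flow implemented below: register demand is computed
 * exactly from liveness, blocks are then processed in dominance order and each
 * SSA destination receives a contiguous run of 16-bit registers, preferring
 * placements that let phis, collects and splits coalesce. When no contiguous
 * run is free, live values are shuffled out of the way with parallel copies
 * (live range splitting) rather than spilled.
 */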

struct ra_ctx {
   agx_context *shader;
   agx_block *block;
   agx_instr *instr;
   uint8_t *ssa_to_reg;
   uint8_t *ncomps;
   enum agx_size *sizes;
   BITSET_WORD *visited;
   BITSET_WORD *used_regs;

   /* Maintained while assigning registers */
   unsigned *max_reg;

   /* For affinities */
   agx_instr **src_to_collect_phi;

   /* If bit i of used_regs is set, and register i is the first consecutive
    * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
    * value currently in register i.
    */
   uint32_t reg_to_ssa[AGX_NUM_REGS];

   /* Maximum number of registers that RA is allowed to use */
   unsigned bound;
};

/** Returns number of registers written by an instruction */
unsigned
agx_write_registers(const agx_instr *I, unsigned d)
{
   unsigned size = agx_size_align_16(I->dest[d].size);

   switch (I->op) {
   case AGX_OPCODE_ITER:
   case AGX_OPCODE_ITERPROJ:
      assert(1 <= I->channels && I->channels <= 4);
      return I->channels * size;

   case AGX_OPCODE_IMAGE_LOAD:
   case AGX_OPCODE_TEXTURE_LOAD:
   case AGX_OPCODE_TEXTURE_SAMPLE:
      /* Even when masked out, these clobber 4 registers */
      return 4 * size;

   case AGX_OPCODE_DEVICE_LOAD:
   case AGX_OPCODE_LOCAL_LOAD:
   case AGX_OPCODE_LD_TILE:
      /* Can write 16-bit or 32-bit. Anything logically 64-bit is already
       * expanded to 32-bit in the mask.
       */
      return util_bitcount(I->mask) * MIN2(size, 2);

   case AGX_OPCODE_LDCF:
      return 6;
   case AGX_OPCODE_COLLECT:
      return I->nr_srcs * agx_size_align_16(I->src[0].size);
   default:
      return size;
   }
}

static inline enum agx_size
agx_split_width(const agx_instr *I)
{
   enum agx_size width = ~0;

   agx_foreach_dest(I, d) {
      if (I->dest[d].type == AGX_INDEX_NULL)
         continue;
      else if (width != ~0)
         assert(width == I->dest[d].size);
      else
         width = I->dest[d].size;
   }

   assert(width != ~0 && "should have been DCE'd");
   return width;
}

/*
 * Return the number of registers required for the coordinates of a
 * texture/image instruction. We handle the layer + sample index as 32-bit even
 * when only the lower 16 bits are present.
 */
static unsigned
agx_coordinate_registers(const agx_instr *I)
{
   switch (I->dim) {
   case AGX_DIM_1D:
      return 2 * 1;
   case AGX_DIM_1D_ARRAY:
      return 2 * 2;
   case AGX_DIM_2D:
      return 2 * 2;
   case AGX_DIM_2D_ARRAY:
      return 2 * 3;
   case AGX_DIM_2D_MS:
      return 2 * 3;
   case AGX_DIM_3D:
      return 2 * 3;
   case AGX_DIM_CUBE:
      return 2 * 3;
   case AGX_DIM_CUBE_ARRAY:
      return 2 * 4;
   case AGX_DIM_2D_MS_ARRAY:
      return 2 * 3;
   }

   unreachable("Invalid texture dimension");
}
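
/* Reading the table above: a cube map takes 3 32-bit coordinates, and since
 * registers here are counted in 16-bit units that is 2 * 3 = 6 registers.
 * Adding a layer index (cube array) gives 2 * 4 = 8.
 */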

/*
 * Calculate register demand in 16-bit registers. Because we allocate in SSA,
 * this calculation is exact and runs in linear time. It depends on liveness
 * information.
 */
static unsigned
agx_calc_register_demand(agx_context *ctx, uint8_t *widths)
{
   /* Calculate demand at the start of each block based on live-in, then update
    * for each instruction processed. Calculate rolling maximum.
    */
   unsigned max_demand = 0;

   agx_foreach_block(ctx, block) {
      unsigned demand = 0;

      /* RA treats the nesting counter as alive throughout if control flow is
       * used anywhere. This could be optimized.
       */
      if (ctx->any_cf)
         demand++;

      /* Everything live-in */
      {
         int i;
         BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
            demand += widths[i];
         }
      }

      max_demand = MAX2(demand, max_demand);

      /* To handle non-power-of-two vectors, sometimes live range splitting
       * needs extra registers for 1 instruction. This counter tracks the number
       * of registers to be freed after 1 extra instruction.
       */
      unsigned late_kill_count = 0;

      agx_foreach_instr_in_block(block, I) {
         /* Phis happen in parallel and are already accounted for in the live-in
          * set, just skip them so we don't double count.
          */
         if (I->op == AGX_OPCODE_PHI)
            continue;

         /* Handle late-kill registers from last instruction */
         demand -= late_kill_count;
         late_kill_count = 0;

         /* Kill sources the first time we see them */
         agx_foreach_src(I, s) {
            if (!I->src[s].kill)
               continue;
            assert(I->src[s].type == AGX_INDEX_NORMAL);

            bool skip = false;

            for (unsigned backwards = 0; backwards < s; ++backwards) {
               if (agx_is_equiv(I->src[backwards], I->src[s])) {
                  skip = true;
                  break;
               }
            }

            if (!skip)
               demand -= widths[I->src[s].value];
         }

         /* Make destinations live */
         agx_foreach_dest(I, d) {
            if (agx_is_null(I->dest[d]))
               continue;
            assert(I->dest[d].type == AGX_INDEX_NORMAL);

            /* Live range splits allocate at power-of-two granularity. Round up
             * destination sizes (temporarily) to powers-of-two.
             */
            unsigned real_width = widths[I->dest[d].value];
            unsigned pot_width = util_next_power_of_two(real_width);

            demand += pot_width;
            late_kill_count += (pot_width - real_width);
         }

         max_demand = MAX2(demand, max_demand);
      }

      demand -= late_kill_count;
   }

   return max_demand;
}
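
/* For instance (ignoring the nesting counter): a block whose live-in set holds
 * two 32-bit values starts at demand 4; an instruction that kills both sources
 * and writes one 32-bit value drops demand to 0 and then raises it to 2, so
 * the rolling maximum for the block stays 4.
 */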

unsigned
agx_read_registers(const agx_instr *I, unsigned s)
{
   unsigned size = agx_size_align_16(I->src[s].size);

   switch (I->op) {
   case AGX_OPCODE_SPLIT:
      return I->nr_dests * agx_size_align_16(agx_split_width(I));

   case AGX_OPCODE_DEVICE_STORE:
   case AGX_OPCODE_LOCAL_STORE:
   case AGX_OPCODE_ST_TILE:
      /* See agx_write_registers */
      if (s == 0)
         return util_bitcount(I->mask) * MIN2(size, 2);
      else
         return size;

   case AGX_OPCODE_ZS_EMIT:
      if (s == 1) {
         /* Depth (bit 0) is fp32, stencil (bit 1) is u16 in the hw but we pad
          * up to u32 for simplicity
          */
         bool z = !!(I->zs & 1);
         bool s = !!(I->zs & 2);
         assert(z || s);

         return (z && s) ? 4 : z ? 2 : 1;
      } else {
         return 1;
      }

   case AGX_OPCODE_IMAGE_WRITE:
      if (s == 0)
         return 4 * size /* data */;
      else if (s == 1)
         return agx_coordinate_registers(I);
      else
         return size;

   case AGX_OPCODE_IMAGE_LOAD:
   case AGX_OPCODE_TEXTURE_LOAD:
   case AGX_OPCODE_TEXTURE_SAMPLE:
      if (s == 0) {
         return agx_coordinate_registers(I);
      } else if (s == 1) {
         /* LOD */
         if (I->lod_mode == AGX_LOD_MODE_LOD_GRAD) {
            switch (I->dim) {
            case AGX_DIM_1D:
            case AGX_DIM_1D_ARRAY:
               return 2 * 2 * 1;
            case AGX_DIM_2D:
            case AGX_DIM_2D_ARRAY:
            case AGX_DIM_2D_MS_ARRAY:
            case AGX_DIM_2D_MS:
               return 2 * 2 * 2;
            case AGX_DIM_CUBE:
            case AGX_DIM_CUBE_ARRAY:
            case AGX_DIM_3D:
               return 2 * 2 * 3;
            }

            unreachable("Invalid texture dimension");
         } else {
            return 1;
         }
      } else if (s == 5) {
         /* Compare/offset */
         return 2 * ((!!I->shadow) + (!!I->offset));
      } else {
         return size;
      }

   case AGX_OPCODE_ATOMIC:
   case AGX_OPCODE_LOCAL_ATOMIC:
      if (s == 0 && I->atomic_opc == AGX_ATOMIC_OPC_CMPXCHG)
         return size * 2;
      else
         return size;

   default:
      return size;
   }
}

static bool
find_regs_simple(struct ra_ctx *rctx, unsigned count, unsigned align,
                 unsigned *out)
{
   for (unsigned reg = 0; reg + count <= rctx->bound; reg += align) {
      if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1)) {
         *out = reg;
         return true;
      }
   }

   return false;
}
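
/* For example, with count == align == 4 the scan above checks the aligned
 * windows [0, 3], [4, 7], [8, 11], ... and returns the first window whose
 * registers are all clear in used_regs.
 */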

/*
 * Search the register file for the best contiguous aligned region of the given
 * size to evict when shuffling registers. The region must not contain any
 * register marked in the passed bitset.
 *
 * As a hint, this also takes in the set of registers from killed sources passed
 * to this instruction. These should be deprioritized, since they are more
 * expensive to use (extra moves to shuffle the contents away).
 *
 * Precondition: such a region exists.
 *
 * Postcondition: at least one register in the returned region is already free.
 */
static unsigned
find_best_region_to_evict(struct ra_ctx *rctx, unsigned size,
                          BITSET_WORD *already_evicted, BITSET_WORD *killed)
{
   assert(util_is_power_of_two_or_zero(size) && "precondition");
   assert((rctx->bound % size) == 0 &&
          "register file size must be aligned to the maximum vector size");

   unsigned best_base = ~0;
   unsigned best_moves = ~0;

   for (unsigned base = 0; base + size <= rctx->bound; base += size) {
      /* r0l is unevictable, skip it. By itself, this does not pose a problem.
       * We are allocating n registers, but the region containing r0l has at
       * most n-1 free. Since there are at least n free registers total, there
       * is at least 1 free register outside this region. Thus the region
       * containing that free register contains at most n-1 occupied registers.
       * In the worst case, those n-1 occupied registers are moved to the region
       * with r0l and then the n free registers are used for the destination.
       * Thus, we do not need extra registers to handle "single point"
       * unevictability.
       */
      if (base == 0 && rctx->shader->any_cf)
         continue;

      /* Do not evict the same register multiple times. It's not necessary since
       * we're just shuffling, there are enough free registers elsewhere.
       */
      if (BITSET_TEST_RANGE(already_evicted, base, base + size - 1))
         continue;

      /* Estimate the number of moves required if we pick this region */
      unsigned moves = 0;
      bool any_free = false;

      for (unsigned reg = base; reg < base + size; ++reg) {
         /* We need a move for each blocked register (TODO: we only need a
          * single move for 32-bit pairs, could optimize to use that instead.)
          */
         if (BITSET_TEST(rctx->used_regs, reg))
            moves++;
         else
            any_free = true;

         /* Each clobbered killed register requires a move or a swap. Since
          * swaps require more instructions, assign a higher cost here. In
          * practice, 3 is too high but 2 is slightly better than 1.
          */
         if (BITSET_TEST(killed, reg))
            moves += 2;
      }

      /* Pick the region requiring fewest moves as a heuristic. Regions with no
       * free registers are skipped even if the heuristic estimates a lower cost
       * (due to killed sources), since the recursive splitting algorithm
       * requires at least one free register.
       */
      if (any_free && moves < best_moves) {
         best_moves = moves;
         best_base = base;
      }
   }

   assert(best_base < rctx->bound &&
          "not enough registers (should have spilled already)");
   return best_base;
}
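
/* In the cost estimate above, a plainly occupied register contributes 1 move
 * and a register from the killed set contributes 2 more, so an occupied
 * register holding a killed source counts 3. Regions overlapping the
 * instruction's own sources therefore tend to lose to equally sized regions of
 * ordinary livethrough values.
 */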

static void
set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
{
   *(rctx->max_reg) = MAX2(*(rctx->max_reg), reg + rctx->ncomps[ssa] - 1);
   rctx->ssa_to_reg[ssa] = reg;
}

static unsigned
assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
                       const agx_instr *I, struct util_dynarray *copies,
                       BITSET_WORD *clobbered, BITSET_WORD *killed)
{
   /* XXX: This needs some special handling but so far it has been prohibitively
    * difficult to hit the case
    */
   if (I->op == AGX_OPCODE_PHI)
      unreachable("TODO");

   /* Expand the destination to the next power-of-two size. This simplifies
    * splitting and is accounted for by the demand calculation, so is legal.
    */
   unsigned count = util_next_power_of_two(npot_count);
   assert(align <= count && "still aligned");
   align = count;

   /* There's not enough contiguous room in the register file. We need to
    * shuffle some variables around. Look for a range of the register file
    * that is partially blocked.
    */
   unsigned base = find_best_region_to_evict(rctx, count, clobbered, killed);

   assert(count <= 16 && "max allocation size (conservative)");
   BITSET_DECLARE(evict_set, 16) = {0};

   /* Store the set of blocking registers that need to be evicted */
   for (unsigned i = 0; i < count; ++i) {
      if (BITSET_TEST(rctx->used_regs, base + i)) {
         BITSET_SET(evict_set, i);
      }
   }

   /* We are going to allocate the destination to this range, so it is now fully
    * used. Mark it as such so we don't reassign here later.
    */
   BITSET_SET_RANGE(rctx->used_regs, base, base + count - 1);

   /* Before overwriting the range, we need to evict blocked variables */
   for (unsigned i = 0; i < 16; ++i) {
      /* Look for subranges that need eviction */
      if (!BITSET_TEST(evict_set, i))
         continue;

      unsigned reg = base + i;
      uint32_t ssa = rctx->reg_to_ssa[reg];
      uint32_t nr = rctx->ncomps[ssa];
      unsigned align = agx_size_align_16(rctx->sizes[ssa]);

      assert(nr >= 1 && "must be assigned");
      assert(rctx->ssa_to_reg[ssa] == reg &&
             "variable must start within the range, since vectors are limited");

      for (unsigned j = 0; j < nr; ++j) {
         assert(BITSET_TEST(evict_set, i + j) &&
                "variable is allocated contiguously and vectors are limited, "
                "so evicted in full");
      }

      /* Assign a new location for the variable. The recursion is finite
       * because nr decreases thanks to the gap.
       */
      assert(nr < count && "fully contained in range that's not full");
      unsigned new_reg =
         assign_regs_by_copying(rctx, nr, align, I, copies, clobbered, killed);

      /* Copy the variable over, register by register */
      for (unsigned i = 0; i < nr; i += align) {
         struct agx_copy copy = {
            .dest = new_reg + i,
            .src = agx_register(reg + i, rctx->sizes[ssa]),
         };

         assert((copy.dest % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
                "new dest must be aligned");
         assert((copy.src.value % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
                "src must be aligned");
         util_dynarray_append(copies, struct agx_copy, copy);
      }

      /* Mark down the set of clobbered registers, so that killed sources may be
       * handled correctly later.
       */
      BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);

      /* Update bookkeeping for this variable */
      set_ssa_to_reg(rctx, ssa, new_reg);
      rctx->reg_to_ssa[new_reg] = ssa;

      /* Skip to the next variable */
      i += nr - 1;
   }

   /* We overallocated for non-power-of-two vectors. Free up the excess now.
    * This is modelled as late kill in demand calculation.
    */
   if (npot_count != count)
      BITSET_CLEAR_RANGE(rctx->used_regs, base + npot_count, base + count - 1);

   return base;
}

static int
sort_by_size(const void *a_, const void *b_, void *sizes_)
{
   const enum agx_size *sizes = sizes_;
   const unsigned *a = a_, *b = b_;

   return sizes[*b] - sizes[*a];
}

/*
 * Allocating a destination of n consecutive registers may require moving those
 * registers' contents to the locations of killed sources. For the instruction
 * to read the correct values, the killed sources themselves need to be moved to
 * the space where the destination will go.
 *
 * This is legal because there is no interference between the killed source and
 * the destination. This is always possible because, after this insertion, the
 * destination needs to contain the killed sources already overlapping with the
 * destination (size k) plus the killed sources clobbered to make room for
 * livethrough sources overlapping with the destination (at most size |dest|-k),
 * so the total size is at most k + |dest| - k = |dest| and so fits in the dest.
 * Sorting by alignment may be necessary.
 */
static void
insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
                                   unsigned count, const agx_instr *I,
                                   struct util_dynarray *copies,
                                   BITSET_WORD *clobbered)
{
   unsigned vars[16] = {0};
   unsigned nr_vars = 0;

   /* Precondition: the nesting counter is not overwritten. Therefore we do not
    * have to move it. find_best_region_to_evict knows better than to try.
    */
   assert(!(reg == 0 && rctx->shader->any_cf) && "r0l is never moved");

   /* Consider the destination clobbered for the purpose of source collection.
    * This way, killed sources already in the destination will be preserved
    * (though possibly compacted).
    */
   BITSET_SET_RANGE(clobbered, reg, reg + count - 1);

   /* Collect killed clobbered sources, if any */
   agx_foreach_ssa_src(I, s) {
      unsigned reg = rctx->ssa_to_reg[I->src[s].value];

      if (I->src[s].kill && BITSET_TEST(clobbered, reg)) {
         assert(nr_vars < ARRAY_SIZE(vars) &&
                "cannot clobber more than max variable size");

         vars[nr_vars++] = I->src[s].value;
      }
   }

   if (nr_vars == 0)
      return;

   /* Sort by descending alignment so they are packed with natural alignment */
   qsort_r(vars, nr_vars, sizeof(vars[0]), sort_by_size, rctx->sizes);

   /* Reassign in the destination region */
   unsigned base = reg;

   /* We align vectors to their sizes, so this assertion holds as long as no
    * instruction has a source whose scalar size is greater than the entire size
    * of the vector destination. Yet the killed source must fit within this
    * destination, so the destination must be bigger and therefore have bigger
    * alignment.
    */
   assert((base % agx_size_align_16(rctx->sizes[vars[0]])) == 0 &&
          "destination alignment >= largest killed source alignment");

   for (unsigned i = 0; i < nr_vars; ++i) {
      unsigned var = vars[i];
      unsigned var_base = rctx->ssa_to_reg[var];
      unsigned var_count = rctx->ncomps[var];
      unsigned var_align = agx_size_align_16(rctx->sizes[var]);

      assert((base % var_align) == 0 && "induction");
      assert((var_count % var_align) == 0 && "no partial variables");

      for (unsigned j = 0; j < var_count; j += var_align) {
         struct agx_copy copy = {
            .dest = base + j,
            .src = agx_register(var_base + j, rctx->sizes[var]),
         };

         util_dynarray_append(copies, struct agx_copy, copy);
      }

      set_ssa_to_reg(rctx, var, base);
      rctx->reg_to_ssa[base] = var;

      base += var_count;
   }

   assert(base <= reg + count && "no overflow");
}
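
/* Instantiating the counting argument from the comment above: for a 4-register
 * destination where killed sources already occupy k = 1 of those registers,
 * the livethrough values being evicted cover at most 4 - 1 = 3 registers, so
 * they can displace at most 3 more registers' worth of killed sources;
 * 1 + 3 = 4 still fits in the destination.
 */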

static unsigned
find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
          unsigned align)
{
   unsigned reg;
   assert(count == align);

   if (find_regs_simple(rctx, count, align, &reg)) {
      return reg;
   } else {
      BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
      BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
      struct util_dynarray copies = {0};
      util_dynarray_init(&copies, NULL);

      /* Initialize the set of registers killed by this instruction's sources */
      agx_foreach_ssa_src(I, s) {
         unsigned v = I->src[s].value;

         if (BITSET_TEST(rctx->visited, v)) {
            unsigned base = rctx->ssa_to_reg[v];
            unsigned nr = rctx->ncomps[v];
            BITSET_SET_RANGE(killed, base, base + nr - 1);
         }
      }

      reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
                                   killed);
      insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
                                         clobbered);

      /* Insert the necessary copies */
      agx_builder b = agx_init_builder(rctx->shader, agx_before_instr(I));
      agx_emit_parallel_copies(
         &b, copies.data, util_dynarray_num_elements(&copies, struct agx_copy));

      /* assign_regs asserts this is cleared, so clear to be reassigned */
      BITSET_CLEAR_RANGE(rctx->used_regs, reg, reg + count - 1);
      return reg;
   }
}

/*
 * Loop over live-in values at the start of the block and mark their registers
 * as in-use. We process blocks in dominance order, so this handles everything
 * but loop headers.
 *
 * For loop headers, this handles the forward edges but not the back edge.
 * However, that's okay: we don't want to reserve the registers that are
 * defined within the loop, because then we'd get a contradiction. Instead we
 * leave them available and then they become fixed points of a sort.
 */
static void
reserve_live_in(struct ra_ctx *rctx)
{
   /* If there are no predecessors, there is nothing live-in */
   unsigned nr_preds = agx_num_predecessors(rctx->block);
   if (nr_preds == 0)
      return;

   agx_builder b =
      agx_init_builder(rctx->shader, agx_before_block(rctx->block));

   int i;
   BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
      /* Skip values defined in loops when processing the loop header */
      if (!BITSET_TEST(rctx->visited, i))
         continue;

      unsigned base;

      /* If we split live ranges, the variable might be defined differently at
       * the end of each predecessor. Join them together with a phi inserted at
       * the start of the block.
       */
      if (nr_preds > 1) {
         /* We'll fill in the destination after, to coalesce one of the moves */
         agx_instr *phi = agx_phi_to(&b, agx_null(), nr_preds);
         enum agx_size size = rctx->sizes[i];

         agx_foreach_predecessor(rctx->block, pred) {
            unsigned pred_idx = agx_predecessor_index(rctx->block, *pred);

            if ((*pred)->ssa_to_reg_out == NULL) {
               /* If this is a loop header, we don't know where the register
                * will end up. So, we create a phi conservatively but don't fill
                * it in until the end of the loop. Stash the information
                * we'll need to fill in the real register later.
                */
               assert(rctx->block->loop_header);
               phi->src[pred_idx] = agx_get_index(i, size);
            } else {
               /* Otherwise, we can build the phi now */
               unsigned reg = (*pred)->ssa_to_reg_out[i];
               phi->src[pred_idx] = agx_register(reg, size);
            }
         }

         /* Pick the phi destination to coalesce a move. Predecessor ordering is
          * stable, so this means all live-in values get their registers from a
          * particular predecessor. That means that such a register allocation
          * is valid here, because it was valid in the predecessor.
          */
         phi->dest[0] = phi->src[0];
         base = phi->dest[0].value;
      } else {
         /* If we don't emit a phi, there is already a unique register */
         assert(nr_preds == 1);

         agx_block **pred = util_dynarray_begin(&rctx->block->predecessors);
         base = (*pred)->ssa_to_reg_out[i];
      }

      set_ssa_to_reg(rctx, i, base);

      for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
         BITSET_SET(rctx->used_regs, base + j);
         rctx->reg_to_ssa[base + j] = i;
      }
   }
}

static void
assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
{
   assert(reg < rctx->bound && "must not overflow register file");
   assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
   set_ssa_to_reg(rctx, v.value, reg);

   assert(!BITSET_TEST(rctx->visited, v.value) && "SSA violated");
   BITSET_SET(rctx->visited, v.value);

   assert(rctx->ncomps[v.value] >= 1);
   unsigned end = reg + rctx->ncomps[v.value] - 1;

   assert(!BITSET_TEST_RANGE(rctx->used_regs, reg, end) && "no interference");
   BITSET_SET_RANGE(rctx->used_regs, reg, end);

   rctx->reg_to_ssa[reg] = v.value;
}

static void
agx_set_sources(struct ra_ctx *rctx, agx_instr *I)
{
   assert(I->op != AGX_OPCODE_PHI);

   agx_foreach_ssa_src(I, s) {
      assert(BITSET_TEST(rctx->visited, I->src[s].value) && "no phis");

      unsigned v = rctx->ssa_to_reg[I->src[s].value];
      agx_replace_src(I, s, agx_register(v, I->src[s].size));
   }
}

static void
agx_set_dests(struct ra_ctx *rctx, agx_instr *I)
{
   agx_foreach_ssa_dest(I, s) {
      unsigned v = rctx->ssa_to_reg[I->dest[s].value];
      I->dest[s] =
         agx_replace_index(I->dest[s], agx_register(v, I->dest[s].size));
   }
}

static unsigned
affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
{
   unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
   unsigned src_offset = src * agx_size_align_16(collect->src[src].size);

   if (src_reg >= src_offset)
      return src_reg - src_offset;
   else
      return ~0;
}
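
/* Example of the arithmetic above: if source index 2 of a collect holds a
 * 32-bit value (2 half-registers) currently assigned to register 10, the
 * collect would have to start at 10 - (2 * 2) = 6 for that source to line up.
 * If the source sits below its offset, no base works and ~0 is returned.
 */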

static bool
try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
                  bool may_be_unvisited, unsigned *out)
{
   assert(ssa.type == AGX_INDEX_NORMAL);
   if (!BITSET_TEST(rctx->visited, ssa.value)) {
      assert(may_be_unvisited);
      return false;
   }

   unsigned base = rctx->ssa_to_reg[ssa.value];
   if (BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
      return false;

   assert(base + count <= rctx->bound && "invariant");
   *out = base;
   return true;
}

static unsigned
pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
{
   agx_index idx = I->dest[d];
   assert(idx.type == AGX_INDEX_NORMAL);

   unsigned count = rctx->ncomps[idx.value];
   assert(count >= 1);

   unsigned align = count;

   /* Try to allocate phis compatibly with their sources */
   if (I->op == AGX_OPCODE_PHI) {
      agx_foreach_ssa_src(I, s) {
         /* Loop headers have phis with a source preceding the definition */
         bool may_be_unvisited = rctx->block->loop_header;

         unsigned out;
         if (try_coalesce_with(rctx, I->src[s], count, may_be_unvisited, &out))
            return out;
      }
   }

   /* Try to allocate collects compatibly with their sources */
   if (I->op == AGX_OPCODE_COLLECT) {
      agx_foreach_ssa_src(I, s) {
         assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
                "registers assigned in an order compatible with dominance "
                "and this is not a phi node, so we have assigned a register");

         unsigned base = affinity_base_of_collect(rctx, I, s);
         if (base >= rctx->bound || (base + count) > rctx->bound)
            continue;

         /* Unaligned destinations can happen when dest size > src size */
         if (base % align)
            continue;

         if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
            return base;
      }
   }

   /* Try to allocate sources of collects contiguously */
   agx_instr *collect_phi = rctx->src_to_collect_phi[idx.value];
   if (collect_phi && collect_phi->op == AGX_OPCODE_COLLECT) {
      agx_instr *collect = collect_phi;

      assert(count == align && "collect sources are scalar");

      /* Find our offset in the collect. If our source is repeated in the
       * collect, this may not be unique. We arbitrarily choose the first.
       */
      unsigned our_source = ~0;
      agx_foreach_ssa_src(collect, s) {
         if (agx_is_equiv(collect->src[s], idx)) {
            our_source = s;
            break;
         }
      }

      assert(our_source < collect->nr_srcs && "source must be in the collect");

      /* See if we can allocate compatibly with any source of the collect */
      agx_foreach_ssa_src(collect, s) {
         if (!BITSET_TEST(rctx->visited, collect->src[s].value))
            continue;

         /* Determine where the collect should start relative to the source */
         unsigned base = affinity_base_of_collect(rctx, collect, s);
         if (base >= rctx->bound)
            continue;

         unsigned our_reg = base + (our_source * align);

         /* Don't allocate past the end of the register file */
         if ((our_reg + align) > rctx->bound)
            continue;

         /* If those registers are free, then choose them */
         if (!BITSET_TEST_RANGE(rctx->used_regs, our_reg, our_reg + align - 1))
            return our_reg;
      }

      unsigned collect_align = rctx->ncomps[collect->dest[0].value];
      unsigned offset = our_source * align;

      /* Prefer ranges of the register file that leave room for all sources of
       * the collect contiguously.
       */
      for (unsigned base = 0; base + (collect->nr_srcs * align) <= rctx->bound;
           base += collect_align) {
         if (!BITSET_TEST_RANGE(rctx->used_regs, base,
                                base + (collect->nr_srcs * align) - 1))
            return base + offset;
      }

      /* Try to respect the alignment requirement of the collect destination,
       * which may be greater than the sources (e.g. pack_64_2x32_split). Look
       * for a register for the source such that the collect base is aligned.
       */
      if (collect_align > align) {
         for (unsigned reg = offset; reg + collect_align <= rctx->bound;
              reg += collect_align) {
            if (!BITSET_TEST_RANGE(rctx->used_regs, reg, reg + count - 1))
               return reg;
         }
      }
   }

   /* Try to allocate phi sources compatibly with their phis */
   if (collect_phi && collect_phi->op == AGX_OPCODE_PHI) {
      agx_instr *phi = collect_phi;
      unsigned out;

      agx_foreach_ssa_src(phi, s) {
         if (try_coalesce_with(rctx, phi->src[s], count, true, &out))
            return out;
      }

      /* If we're in a loop, we may have already allocated the phi. Try that. */
      if (phi->dest[0].type == AGX_INDEX_REGISTER) {
         unsigned base = phi->dest[0].value;

         if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
            return base;
      }
   }

   /* Default to any contiguous sequence of registers */
   return find_regs(rctx, I, d, count, align);
}
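
/* Summarizing the preference order above: phi destinations coalesce with their
 * sources, collect destinations overlap their sources, values feeding a
 * collect try to sit where the collect wants them (leaving room for the other
 * sources), values feeding a phi coalesce with that phi's other sources or its
 * already-allocated destination, and only then do we fall back to any free
 * contiguous range via find_regs.
 */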

/** Assign registers to SSA values in a block. */

static void
agx_ra_assign_local(struct ra_ctx *rctx)
{
   BITSET_DECLARE(used_regs, AGX_NUM_REGS) = {0};
   uint8_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint8_t));

   agx_block *block = rctx->block;
   uint8_t *ncomps = rctx->ncomps;
   rctx->used_regs = used_regs;
   rctx->ssa_to_reg = ssa_to_reg;

   reserve_live_in(rctx);

   /* Force the nesting counter r0l live throughout shaders using control flow.
    * This could be optimized (sync with agx_calc_register_demand).
    */
   if (rctx->shader->any_cf)
      BITSET_SET(used_regs, 0);

   agx_foreach_instr_in_block(block, I) {
      rctx->instr = I;

      /* Optimization: if a split contains the last use of a vector, the split
       * can be removed by assigning the destinations overlapping the source.
       */
      if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
         unsigned reg = ssa_to_reg[I->src[0].value];
         unsigned width = agx_size_align_16(agx_split_width(I));

         agx_foreach_dest(I, d) {
            /* Free up the source */
            unsigned offset_reg = reg + (d * width);
            BITSET_CLEAR_RANGE(used_regs, offset_reg, offset_reg + width - 1);

            /* Assign the destination where the source was */
            if (!agx_is_null(I->dest[d]))
               assign_regs(rctx, I->dest[d], offset_reg);
         }

         unsigned excess =
            rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
         if (excess) {
            BITSET_CLEAR_RANGE(used_regs, reg + (I->nr_dests * width),
                               reg + rctx->ncomps[I->src[0].value] - 1);
         }

         agx_set_sources(rctx, I);
         agx_set_dests(rctx, I);
         continue;
      } else if (I->op == AGX_OPCODE_PRELOAD) {
         /* We must coalesce all preload moves */
         assert(I->dest[0].size == I->src[0].size);
         assert(I->src[0].type == AGX_INDEX_REGISTER);

         assign_regs(rctx, I->dest[0], I->src[0].value);
         agx_set_dests(rctx, I);
         continue;
      }

      /* First, free killed sources */
      agx_foreach_ssa_src(I, s) {
         if (I->src[s].kill) {
            unsigned reg = ssa_to_reg[I->src[s].value];
            unsigned count = ncomps[I->src[s].value];

            assert(count >= 1);
            BITSET_CLEAR_RANGE(used_regs, reg, reg + count - 1);
         }
      }

      /* Next, assign destinations one at a time. This is always legal
       * because of the SSA form.
       */
      agx_foreach_ssa_dest(I, d) {
         assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
      }

      /* Phi sources are special. Set in the corresponding predecessors */
      if (I->op != AGX_OPCODE_PHI)
         agx_set_sources(rctx, I);

      agx_set_dests(rctx, I);
   }

   block->ssa_to_reg_out = rctx->ssa_to_reg;

   STATIC_ASSERT(sizeof(block->regs_out) == sizeof(used_regs));
   memcpy(block->regs_out, used_regs, sizeof(used_regs));

   /* Also set the sources for the phis in our successors, since that logically
    * happens now (given the possibility of live range splits, etc)
    */
   agx_foreach_successor(block, succ) {
      unsigned pred_idx = agx_predecessor_index(succ, block);

      agx_foreach_phi_in_block(succ, phi) {
         if (phi->src[pred_idx].type == AGX_INDEX_NORMAL) {
            /* This source needs a fixup */
            unsigned value = phi->src[pred_idx].value;

            agx_replace_src(
               phi, pred_idx,
               agx_register(rctx->ssa_to_reg[value], phi->src[pred_idx].size));
         }
      }
   }
}

/*
 * Lower phis to parallel copies at the logical end of a given block. If a block
 * needs parallel copies inserted, a successor of the block has a phi node. To
 * have a (nontrivial) phi node, a block must have multiple predecessors. So the
 * edge from the block to the successor (with phi) is not the only edge entering
 * the successor. Because the control flow graph has no critical edges, this
 * edge must therefore be the only edge leaving the block, so the block must
 * have only a single successor.
 */
static void
agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
{
   bool any_succ = false;
   unsigned nr_phi = 0;

   /* Phi nodes logically happen on the control flow edge, so parallel copies
    * are added at the end of the predecessor */
   agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));

   agx_foreach_successor(block, succ) {
      assert(nr_phi == 0 && "control flow graph has a critical edge");

      agx_foreach_phi_in_block(succ, phi) {
         assert(!any_succ && "control flow graph has a critical edge");
         nr_phi++;
      }

      any_succ = true;

      /* Nothing to do if there are no phi nodes */
      if (nr_phi == 0)
         continue;

      unsigned pred_index = agx_predecessor_index(succ, block);

      /* Create a parallel copy lowering all the phi nodes */
      struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);

      unsigned i = 0;

      agx_foreach_phi_in_block(succ, phi) {
         agx_index dest = phi->dest[0];
         agx_index src = phi->src[pred_index];

         assert(dest.type == AGX_INDEX_REGISTER);
         assert(dest.size == src.size);

         copies[i++] = (struct agx_copy){
            .dest = dest.value,
            .src = src,
         };
      }

      agx_emit_parallel_copies(&b, copies, nr_phi);

      free(copies);
   }
}

void
agx_ra(agx_context *ctx)
{
   agx_compute_liveness(ctx);
   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
   unsigned max_ncomps = 1;

   agx_foreach_instr_global(ctx, I) {
      /* Record collects/phis so we can coalesce when assigning */
      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
         agx_foreach_ssa_src(I, s) {
            src_to_collect_phi[I->src[s].value] = I;
         }
      }

      agx_foreach_ssa_dest(I, d) {
         unsigned v = I->dest[d].value;
         assert(ncomps[v] == 0 && "broken SSA");
         /* Round up vectors for easier live range splitting */
         ncomps[v] = util_next_power_of_two(agx_write_registers(I, d));
         sizes[v] = I->dest[d].size;

         max_ncomps = MAX2(max_ncomps, ncomps[v]);
      }
   }

   /* For live range splitting to work properly, ensure the register file is
    * aligned to the larger vector size. Most of the time, this is a no-op since
    * the largest vector size is usually 128-bit and the register file is
    * naturally 128-bit aligned. However, this is required for correctness with
    * 3D textureGrad, which can have a source vector of length 6x32-bit,
    * rounding up to 256-bit and requiring special accounting here.
    */
   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
   assert(util_is_power_of_two_nonzero(reg_file_alignment));

   /* Calculate the demand and use it to bound register assignment */
   unsigned demand =
      ALIGN_POT(agx_calc_register_demand(ctx, ncomps), reg_file_alignment);

   /* TODO: Spilling. Abort so we don't smash the stack in release builds. */
   if (demand > AGX_NUM_REGS) {
      fprintf(stderr, "\n");
      fprintf(stderr, "------------------------------------------------\n");
      fprintf(stderr, "Asahi Linux shader compiler limitation!\n");
      fprintf(stderr, "We ran out of registers! Nyaaaa 😿\n");
      fprintf(stderr, "Do not report this as a bug.\n");
      fprintf(stderr, "We know -- we're working on it!\n");
      fprintf(stderr, "------------------------------------------------\n");
      fprintf(stderr, "\n");
      abort();
   }

   /* Round up the demand to the maximum number of registers we can use without
    * affecting occupancy. This reduces live range splitting.
    */
   unsigned max_regs = agx_occupancy_for_register_count(demand).max_registers;
   max_regs = ROUND_DOWN_TO(max_regs, reg_file_alignment);

   /* Or, we can bound tightly for debugging */
   if (agx_compiler_debug & AGX_DBG_DEMAND)
      max_regs = ALIGN_POT(MAX2(demand, 12), reg_file_alignment);

   /* ...but not too tightly */
   assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
   assert(max_regs >= (6 * 2) && "space for vertex shader preloading");

   /* Assign registers in dominance order. This coincides with source order due
    * to a NIR invariant, so we do not need special handling for this.
    */
   agx_foreach_block(ctx, block) {
      agx_ra_assign_local(&(struct ra_ctx){
         .shader = ctx,
         .block = block,
         .src_to_collect_phi = src_to_collect_phi,
         .ncomps = ncomps,
         .sizes = sizes,
         .visited = visited,
         .bound = max_regs,
         .max_reg = &ctx->max_reg,
      });
   }

   /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
    * doesn't use them. Account for that so the preload doesn't clobber GPRs.
    */
   if (ctx->nir->info.stage == MESA_SHADER_VERTEX)
      ctx->max_reg = MAX2(ctx->max_reg, 6 * 2);

   assert(ctx->max_reg <= max_regs);

   agx_foreach_instr_global_safe(ctx, ins) {
      /* Lower away RA pseudo-instructions */
      agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));

      if (ins->op == AGX_OPCODE_COLLECT) {
         assert(ins->dest[0].type == AGX_INDEX_REGISTER);
         unsigned base = ins->dest[0].value;
         unsigned width = agx_size_align_16(ins->src[0].size);

         struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
         unsigned n = 0;

         /* Move the sources */
         agx_foreach_src(ins, i) {
            if (agx_is_null(ins->src[i]) || ins->src[i].type == AGX_INDEX_UNDEF)
               continue;
            assert(ins->src[i].size == ins->src[0].size);

            copies[n++] = (struct agx_copy){
               .dest = base + (i * width),
               .src = ins->src[i],
            };
         }

         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      } else if (ins->op == AGX_OPCODE_SPLIT) {
         assert(ins->src[0].type == AGX_INDEX_REGISTER ||
                ins->src[0].type == AGX_INDEX_UNIFORM);

         struct agx_copy copies[4];
         assert(ins->nr_dests <= ARRAY_SIZE(copies));

         unsigned n = 0;
         unsigned width = agx_size_align_16(agx_split_width(ins));

         /* Move the source into the destinations */
         agx_foreach_dest(ins, i) {
            if (ins->dest[i].type != AGX_INDEX_REGISTER)
               continue;

            agx_index src = ins->src[0];
            src.size = ins->dest[i].size;
            src.value += (i * width);

            copies[n++] = (struct agx_copy){
               .dest = ins->dest[i].value,
               .src = src,
            };
         }

         /* Lower away */
         agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      }
   }

   /* Insert parallel copies lowering phi nodes */
   agx_foreach_block(ctx, block) {
      agx_insert_parallel_copies(ctx, block);
   }

   agx_foreach_instr_global_safe(ctx, I) {
      switch (I->op) {
      /* Pseudoinstructions for RA must be removed now */
      case AGX_OPCODE_PHI:
      case AGX_OPCODE_PRELOAD:
         agx_remove_instruction(I);
         break;

      /* Coalesced moves can be removed */
      case AGX_OPCODE_MOV:
         if (I->src[0].type == AGX_INDEX_REGISTER &&
             I->dest[0].size == I->src[0].size &&
             I->src[0].value == I->dest[0].value) {

            assert(I->dest[0].type == AGX_INDEX_REGISTER);
            agx_remove_instruction(I);
         }
         break;

      default:
         break;
      }
   }

   agx_foreach_block(ctx, block) {
      free(block->ssa_to_reg_out);
      block->ssa_to_reg_out = NULL;
   }

   free(src_to_collect_phi);
   free(ncomps);
   free(sizes);
   free(visited);
}