agx: early-kill sources only if it won't shuffle

rather than always early-killing and then hitting pathological shuffle
situations, only early-kill when we can prove that we won't need to shuffle. it
turns out that's the case most of the time.
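
concretely: the killed sources must form a contiguous region whose base is
aligned to the (power-of-two) destination size, and any tail past the region up
to the destination's end must already be free. a minimal sketch of that test in
isolation (reg_is_free() is a hypothetical stand-in for the allocator's
used_regs bitset, not an actual mesa helper):

    #include <stdbool.h>

    static bool
    can_early_kill(unsigned region_base, unsigned region_end,
                   unsigned dest_size, bool (*reg_is_free)(unsigned))
    {
       /* no killed registers means no region for the destination to reuse */
       if (region_base == region_end)
          return false;

       /* the destination must be able to start exactly at the region base,
        * so the base must be aligned to the destination size */
       if (region_base & (dest_size - 1))
          return false;

       /* if the region is narrower than the destination, the registers from
        * the region's end up to the destination's end must already be free */
       for (unsigned r = region_end; r < region_base + dest_size; ++r) {
          if (!reg_is_free(r))
             return false;
       }

       return true;
    }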

even with this heuristic, we still get hurt badly in shader-db due to extra moves.
but hopefully, the #s here are small enough that we can move on with our lives
and fix this source of known unsoundness.

this is tagged for backport as it's needed to avoid a perf regression with the
previous patch.

combined stats from this commit and the previous commit:

total instrs in shared programs: 2846065 -> 2852257 (0.22%)
instrs in affected programs: 618734 -> 624926 (1.00%)

total alu in shared programs: 2329477 -> 2335534 (0.26%)
alu in affected programs: 508119 -> 514176 (1.19%)

total gprs in shared programs: 894762 -> 901327 (0.73%)
gprs in affected programs: 36946 -> 43511 (17.77%)

Backport-to: 25.1
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34595>
(cherry picked from commit b1e86b3eaeb7efea6b6b38280c3367b0579a7d0a)

.pick_status.json

@@ -844,7 +844,7 @@
"description": "agx: early-kill sources only if it won't shuffle",
"nominated": true,
"nomination_type": 4,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

src/asahi/compiler/agx_register_allocate.c

@@ -89,6 +89,11 @@ struct ra_ctx {
   BITSET_WORD *visited;
   BITSET_WORD *used_regs[RA_CLASSES];

+   /* Were any sources killed early this instruction? We assert this is not true
+    * when shuffling.
+    */
+   bool early_killed;
+
   /* Maintained while assigning registers. Count of registers required, i.e.
    * the maximum register assigned + 1.
    */
@@ -604,6 +609,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
   if (find_regs_simple(rctx, cls, count, align, &reg)) {
      return reg;
   } else {
+      assert(!rctx->early_killed && "no live range splits with early kill");
      assert(cls == RA_GPR && "no memory live range splits");

      struct util_dynarray copies = {0};
@@ -998,6 +1004,47 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
   return find_regs(rctx, I, d, count, align);
}

+static void
+kill_source(struct ra_ctx *rctx, const agx_instr *I, unsigned s)
+{
+   enum ra_class cls = ra_class_for_index(I->src[s]);
+   unsigned reg = rctx->ssa_to_reg[I->src[s].value];
+   unsigned count = rctx->ncomps[I->src[s].value];
+
+   assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");
+   assert(count >= 1);
+
+   BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
+}
+
+static void
+try_kill_early_sources(struct ra_ctx *rctx, const agx_instr *I,
+                       unsigned first_source, unsigned last_source,
+                       unsigned region_end, unsigned region_base)
+{
+   unsigned dest_size = util_next_power_of_two(rctx->ncomps[I->dest[0].value]);
+   unsigned dest_end = region_base + dest_size;
+
+   /* We can only early-kill a region if we can trivially allocate the
+    * destination to it. That way we never shuffle killed sources.
+    *
+    * To ensure that, the region must be aligned and cover the destination.
+    */
+   if (region_base == region_end ||
+       (rctx->ssa_to_reg[I->src[first_source].value] & (dest_size - 1)) ||
+       ((region_end < dest_end) &&
+        BITSET_TEST_RANGE(rctx->used_regs[RA_GPR], region_end, dest_end)))
+      return;
+
+   for (unsigned s = first_source; s <= last_source; ++s) {
+      if (I->src[s].kill && !I->src[s].memory) {
+         kill_source(rctx, I, s);
+         rctx->early_killed = true;
+         I->src[s].kill = false;
+      }
+   }
+}
+
/** Assign registers to SSA values in a block. */
static void
@@ -1008,7 +1055,6 @@ agx_ra_assign_local(struct ra_ctx *rctx)
   uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));

   agx_block *block = rctx->block;
-   uint8_t *ncomps = rctx->ncomps;
   rctx->used_regs[RA_GPR] = used_regs_gpr;
   rctx->used_regs[RA_MEM] = used_regs_mem;
   rctx->ssa_to_reg = ssa_to_reg;
@@ -1085,6 +1131,31 @@ agx_ra_assign_local(struct ra_ctx *rctx)
         continue;
      }

+      /* Search for regions of contiguous killed sources to early-kill. */
+      rctx->early_killed = false;
+
+      if (I->nr_dests == 1) {
+         unsigned first_src = 0;
+         unsigned end = 0;
+         unsigned start = 0;
+
+         agx_foreach_ssa_src(I, s) {
+            if (I->src[s].kill && !I->src[s].memory) {
+               unsigned reg = rctx->ssa_to_reg[I->src[s].value];
+
+               if (start == end || end != reg) {
+                  try_kill_early_sources(rctx, I, first_src, s, end, start);
+                  first_src = s;
+                  start = reg;
+               }
+
+               end = reg + rctx->ncomps[I->src[s].value];
+            }
+         }
+
+         try_kill_early_sources(rctx, I, first_src, I->nr_srcs - 1, end, start);
+      }
+
      /* Next, assign destinations one at a time. This is always legal
       * because of the SSA form.
       */
@@ -1095,17 +1166,10 @@ agx_ra_assign_local(struct ra_ctx *rctx)
         assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
      }

-      /* Free killed sources */
+      /* Free late-killed sources */
      agx_foreach_ssa_src(I, s) {
         if (I->src[s].kill) {
-            assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");
-
-            enum ra_class cls = ra_class_for_index(I->src[s]);
-            unsigned reg = ssa_to_reg[I->src[s].value];
-            unsigned count = ncomps[I->src[s].value];
-
-            assert(count >= 1);
-            BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
+            kill_source(rctx, I, s);
         }
      }
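
as a worked example of the grouping loop (with hypothetical register
assignments, purely for illustration): three killed 2-register sources at r4,
r6 and r12 produce two regions, since the first two are contiguous and the
third is not. a standalone program walking the sources the same way the loop
above does:

    #include <stdio.h>

    int
    main(void)
    {
       /* hypothetical killed sources: base register and register count */
       const unsigned reg[]    = {4, 6, 12};
       const unsigned ncomps[] = {2, 2, 2};

       unsigned start = 0, end = 0;

       for (unsigned s = 0; s < 3; ++s) {
          /* a source extends the current region only if it starts exactly
           * where the region ends; otherwise flush and start a new region */
          if (start == end || end != reg[s]) {
             if (start != end)
                printf("region [r%u, r%u)\n", start, end);
             start = reg[s];
          }
          end = reg[s] + ncomps[s];
       }

       if (start != end)
          printf("region [r%u, r%u)\n", start, end);

       /* prints:
        *   region [r4, r8)
        *   region [r12, r14)
        */
       return 0;
    }

with a 4-register destination, region [r4, r8) has an aligned base and covers
the destination, so its sources get early-killed. region [r12, r14) is aligned
but only two registers wide, so it only qualifies if the registers from r14 up
to the destination's end at r16 are already free.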