agx: early-kill sources only if it won't shuffle
rather than always early killing and then hitting pathological shuffle
situations, only early-kill when we can prove that we won't need to
shuffle. it turns out that's most of the time. even with this heuristic,
we still get hurt bad in shader-db due to extra moves. but hopefully,
the #s here are small enough that we can move on with our lives and fix
this source of known unsoundness.

this is tagged for backport as it's needed to avoid a perf regression
with the previous patch.

combined stats from this commit and the previous commit:

   total instrs in shared programs: 2846065 -> 2852257 (0.22%)
   instrs in affected programs: 618734 -> 624926 (1.00%)
   total alu in shared programs: 2329477 -> 2335534 (0.26%)
   alu in affected programs: 508119 -> 514176 (1.19%)
   total gprs in shared programs: 894762 -> 901327 (0.73%)
   gprs in affected programs: 36946 -> 43511 (17.77%)

Backport-to: 25.1
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34595>
(cherry picked from commit b1e86b3eaeb7efea6b6b38280c3367b0579a7d0a)
committed by Eric Engestrom
parent 27b46ecfb8
commit 2099c23dab
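
For illustration, the unsoundness the message refers to: if a killed source's registers are freed before the destination is placed, the shuffle (live-range split) fallback can reuse them while the instruction still needs to read the source. A toy, self-contained C sketch of that failure mode, with a hypothetical four-entry register file (none of these names come from the patch):

   #include <stdbool.h>
   #include <stdio.h>

   int main(void)
   {
      /* hypothetical register file: r0 holds killed source a = 10,
       * r1 holds an unrelated live value b = 11, r2/r3 are free */
      int regfile[4] = {10, 11, 0, 0};
      bool reg_free[4] = {false, false, true, true};

      /* unconditional early kill: mark a's register free up front */
      reg_free[0] = true;

      /* a later shuffle moves b into the lowest free register... */
      unsigned dst = 0;
      while (!reg_free[dst])
         dst++;
      regfile[dst] = regfile[1]; /* ...which is now r0 */
      reg_free[dst] = false;
      reg_free[1] = true;

      /* the instruction then reads source a from r0 and sees b instead */
      printf("src a = %d (should be 10)\n", regfile[0]);
      return 0;
   }

With the guard in this patch, a's registers are only freed early when the destination provably lands on them, so the shuffle path (which now asserts !early_killed) is never reached with stale free bits.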
@@ -844,7 +844,7 @@
         "description": "agx: early-kill sources only if it won't shuffle",
         "nominated": true,
         "nomination_type": 4,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
@@ -89,6 +89,11 @@ struct ra_ctx {
    BITSET_WORD *visited;
    BITSET_WORD *used_regs[RA_CLASSES];
 
+   /* Were any sources killed early this instruction? We assert this is not true
+    * when shuffling.
+    */
+   bool early_killed;
+
    /* Maintained while assigning registers. Count of registers required, i.e.
     * the maximum register assigned + 1.
     */
@@ -604,6 +609,7 @@ find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
    if (find_regs_simple(rctx, cls, count, align, &reg)) {
       return reg;
    } else {
+      assert(!rctx->early_killed && "no live range splits with early kill");
       assert(cls == RA_GPR && "no memory live range splits");
 
       struct util_dynarray copies = {0};
@@ -998,6 +1004,47 @@ pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
    return find_regs(rctx, I, d, count, align);
 }
 
+static void
+kill_source(struct ra_ctx *rctx, const agx_instr *I, unsigned s)
+{
+   enum ra_class cls = ra_class_for_index(I->src[s]);
+   unsigned reg = rctx->ssa_to_reg[I->src[s].value];
+   unsigned count = rctx->ncomps[I->src[s].value];
+
+   assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");
+   assert(count >= 1);
+
+   BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
+}
+
+static void
+try_kill_early_sources(struct ra_ctx *rctx, const agx_instr *I,
+                       unsigned first_source, unsigned last_source,
+                       unsigned region_end, unsigned region_base)
+{
+   unsigned dest_size = util_next_power_of_two(rctx->ncomps[I->dest[0].value]);
+   unsigned dest_end = region_base + dest_size;
+
+   /* We can only early-kill a region if we can trivially allocate the
+    * destination to it. That way we never shuffle killed sources.
+    *
+    * To ensure that, the region must be aligned and cover the destination.
+    */
+   if (region_base == region_end ||
+       (rctx->ssa_to_reg[I->src[first_source].value] & (dest_size - 1)) ||
+       ((region_end < dest_end) &&
+        BITSET_TEST_RANGE(rctx->used_regs[RA_GPR], region_end, dest_end)))
+      return;
+
+   for (unsigned s = first_source; s <= last_source; ++s) {
+      if (I->src[s].kill && !I->src[s].memory) {
+         kill_source(rctx, I, s);
+         rctx->early_killed = true;
+         I->src[s].kill = false;
+      }
+   }
+}
+
 /** Assign registers to SSA values in a block. */
 
 static void
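
The bail-out at the top of try_kill_early_sources() is the entire heuristic. A minimal standalone restatement, where regs_free_above is a hypothetical stand-in for the BITSET_TEST_RANGE() probe of used_regs (illustrative only, not code from the patch):

   #include <stdbool.h>
   #include <stdio.h>

   /* Can the region [region_base, region_end) of killed sources be freed
    * before the destination is assigned? Only if a dest_size-aligned
    * destination trivially lands at region_base. */
   static bool
   early_kill_safe(unsigned region_base, unsigned region_end,
                   unsigned dest_size,   /* power of two, in registers */
                   bool regs_free_above) /* [region_end, region_base +
                                            dest_size) unoccupied? */
   {
      if (region_base == region_end)
         return false; /* empty region, nothing to kill */

      if (region_base & (dest_size - 1))
         return false; /* misaligned, the destination can't start here */

      if (region_end < region_base + dest_size && !regs_free_above)
         return false; /* region too short and the tail is occupied */

      return true;
   }

   int main(void)
   {
      /* killed vec4 at r0..r3, vec4 destination: trivially placeable */
      printf("%d\n", early_kill_safe(0, 4, 4, false)); /* 1 */
      /* killed vec4 at r2..r5: r2 isn't 4-aligned, a shuffle would be
       * needed, so don't early-kill */
      printf("%d\n", early_kill_safe(2, 6, 4, false)); /* 0 */
      return 0;
   }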
@@ -1008,7 +1055,6 @@ agx_ra_assign_local(struct ra_ctx *rctx)
    uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
 
    agx_block *block = rctx->block;
-   uint8_t *ncomps = rctx->ncomps;
    rctx->used_regs[RA_GPR] = used_regs_gpr;
    rctx->used_regs[RA_MEM] = used_regs_mem;
    rctx->ssa_to_reg = ssa_to_reg;
@@ -1085,6 +1131,31 @@ agx_ra_assign_local(struct ra_ctx *rctx)
          continue;
       }
 
+      /* Search for regions of contiguous killed sources to early-kill. */
+      rctx->early_killed = false;
+
+      if (I->nr_dests == 1) {
+         unsigned first_src = 0;
+         unsigned end = 0;
+         unsigned start = 0;
+
+         agx_foreach_ssa_src(I, s) {
+            if (I->src[s].kill && !I->src[s].memory) {
+               unsigned reg = rctx->ssa_to_reg[I->src[s].value];
+
+               if (start == end || end != reg) {
+                  try_kill_early_sources(rctx, I, first_src, s, end, start);
+                  first_src = s;
+                  start = reg;
+               }
+
+               end = reg + rctx->ncomps[I->src[s].value];
+            }
+         }
+
+         try_kill_early_sources(rctx, I, first_src, I->nr_srcs - 1, end, start);
+      }
+
       /* Next, assign destinations one at a time. This is always legal
        * because of the SSA form.
        */
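
The scan above coalesces killed sources into maximal [start, end) register regions and flushes a region at the first discontinuity. A toy simulation over made-up (reg, ncomps) pairs, showing the regions it would hand to try_kill_early_sources():

   #include <stdio.h>

   int main(void)
   {
      /* (reg, ncomps) of killed sources, in source order: two contiguous
       * vec2s at r0 and r2, then a vec2 at r6 after a gap */
      const unsigned srcs[][2] = {{0, 2}, {2, 2}, {6, 2}};
      const unsigned n = sizeof(srcs) / sizeof(srcs[0]);

      unsigned start = 0, end = 0;
      for (unsigned s = 0; s < n; ++s) {
         unsigned reg = srcs[s][0];

         /* discontinuity (or first source): flush the previous region */
         if (start == end || end != reg) {
            if (start != end)
               printf("region [r%u, r%u)\n", start, end);
            start = reg;
         }

         end = reg + srcs[s][1];
      }

      if (start != end)
         printf("region [r%u, r%u)\n", start, end);

      /* prints: region [r0, r4) then region [r6, r8) */
      return 0;
   }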
@@ -1095,17 +1166,10 @@ agx_ra_assign_local(struct ra_ctx *rctx)
          assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
       }
 
-      /* Free killed sources */
+      /* Free late-killed sources */
       agx_foreach_ssa_src(I, s) {
          if (I->src[s].kill) {
-            assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");
-
-            enum ra_class cls = ra_class_for_index(I->src[s]);
-            unsigned reg = ssa_to_reg[I->src[s].value];
-            unsigned count = ncomps[I->src[s].value];
-
-            assert(count >= 1);
-            BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
+            kill_source(rctx, I, s);
          }
       }
 