agx: Use a transfer graph for parallel copies
Lifted from ir3. Algorithm is the same; the data structures and interface are
lightly modified to decouple from ir3's IR. Sequentializing parallel copies
after RA is tricky. ir3's implementation works well enough, so I use that one.
Original implementation by Connor Abbott.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16268>
commit 4fc023ed42
parent 330ec4260d
committed by Alyssa Rosenzweig
@@ -674,6 +674,23 @@ void agx_pack_binary(agx_context *ctx, struct util_dynarray *emission);

unsigned agx_write_registers(agx_instr *I, unsigned d);

struct agx_copy {
   /* Base register destination of the copy */
   unsigned dest;

   /* Base register source of the copy */
   unsigned src;

   /* Size of the copy */
   enum agx_size size;

   /* Whether the copy has been handled. Callers must leave this set to false. */
   bool done;
};

void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies, unsigned n);

void agx_compute_liveness(agx_context *ctx);
void agx_liveness_ins_update(BITSET_WORD *live, agx_instr *I);

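For illustration only (not part of the commit), a minimal sketch of how a caller might drive this interface. Register numbers are hypothetical and counted in 16-bit units, so a 32-bit copy at dest 0 covers registers 0 and 1, matching the +1 arithmetic in split_32bit_copy below; ctx and ins stand for whatever context and instruction the caller already has in hand, as in the RA code further down.

   /* Swap the 32-bit values in registers {0,1} and {2,3}, and independently
    * move 16-bit register 5 into register 4, all with parallel semantics.
    * The .done fields default to false, as the interface requires. */
   struct agx_copy copies[] = {
      { .dest = 0, .src = 2, .size = AGX_SIZE_32 },
      { .dest = 2, .src = 0, .size = AGX_SIZE_32 },
      { .dest = 4, .src = 5, .size = AGX_SIZE_16 },
   };

   agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
   agx_emit_parallel_copies(&b, copies, 3);

The lowering itself decides the instruction order and inserts XOR swaps where the requested copies form a cycle, so the caller only lists destination/source pairs.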
src/asahi/compiler/agx_lower_parallel_copy.c (new file, 283 lines)
@@ -0,0 +1,283 @@
/*
 * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "agx_compiler.h"
#include "agx_builder.h"

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its internal use for register allocation.
 */

static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_mov_to(b, agx_register(copy->dest, copy->size),
                 agx_register(copy->src, copy->size));
}

static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   if (copy->dest == copy->src)
      return;

   agx_index x = agx_register(copy->dest, copy->size);
   agx_index y = agx_register(copy->src, copy->size);

   agx_xor_to(b, x, x, y);
   agx_xor_to(b, y, x, y);
   agx_xor_to(b, x, x, y);
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_REGS];

   struct agx_copy entries[AGX_NUM_REGS];
};

static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

static bool
is_real(struct agx_copy *entry)
{
   /* TODO: Allow immediates in agx_copy */
   return true;
}

/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->size) == 2);
   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dest = entry->dest + 1;
   new_entry->src = entry->src + 1;
   new_entry->done = false;
   entry->size = AGX_SIZE_16;
   new_entry->size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

void
agx_emit_parallel_copies(agx_builder *b,
                         struct agx_copy *copies,
                         unsigned num_copies)
{
   struct copy_ctx _ctx = {
      .entry_count = num_copies
   };

   struct copy_ctx *ctx = &_ctx;

   /* Set up the bookkeeping */
   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination isn't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);
            for (unsigned j = 0; j < agx_size_align_16(entry->size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->size) != 2))
            continue;

         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *      n_1 -> n_2 -> ... -> n_i
    *              ^             |
    *              |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *      n_1 -> n_2 -> ... -> n_i
    *       ^                    |
    *       |--------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *      n_1 -> ... -> n_i
    *       ^             |
    *       |-------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src <= entry->dest &&
                blocking->src + 1 >= entry->dest &&
                agx_size_align_16(blocking->size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src >= entry->dest &&
             blocking->src < entry->dest + agx_size_align_16(entry->size)) {
            blocking->src = entry->src + (blocking->src - entry->dest);
         }
      }

      entry->done = true;
   }
}
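To see the cycle resolution (step 3) in action, consider a hypothetical 16-bit rotation, not taken from the commit, with copies r0 <- r1, r1 <- r2, r2 <- r0 and a builder b as in the sketch above:

   struct agx_copy rot[] = {
      { .dest = 0, .src = 1, .size = AGX_SIZE_16 },
      { .dest = 1, .src = 2, .size = AGX_SIZE_16 },
      { .dest = 2, .src = 0, .size = AGX_SIZE_16 },
   };
   agx_emit_parallel_copies(&b, rot, 3);

Every destination is also a pending source, so step 1 emits nothing and the pass falls through to step 3. The first entry swaps r0 and r1 with three XORs, leaving r0 holding its final value, and the remaining copy r2 <- r0 is rewritten to r2 <- r1 because r0's old value now lives in r1. The second entry then swaps r1 and r2 the same way, after which the last entry has degenerated to the trivial r2 <- r2 and is skipped. The rotation costs two XOR-swap triples and no scratch register.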
src/asahi/compiler/agx_ra.c
@@ -139,6 +139,21 @@ agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps, unsi
   memcpy(block->regs_out, used_regs, sizeof(used_regs));
}

/*
 * Resolve an agx_index of type NORMAL or REGISTER to a physical register, once
 * registers have been allocated for all SSA values.
 */
static unsigned
agx_index_to_reg(uint8_t *ssa_to_reg, agx_index idx)
{
   if (idx.type == AGX_INDEX_NORMAL) {
      return ssa_to_reg[idx.value];
   } else {
      assert(idx.type == AGX_INDEX_REGISTER);
      return idx.value;
   }
}

void
agx_ra(agx_context *ctx)
{
@@ -170,57 +185,33 @@ agx_ra(agx_context *ctx)
   agx_foreach_instr_global_safe(ctx, ins) {
      /* Lower away RA pseudo-instructions */
      if (ins->op == AGX_OPCODE_P_COMBINE) {
         /* TODO: Optimize out the moves! */
         assert(ins->dest[0].type == AGX_INDEX_NORMAL);
         enum agx_size common_size = ins->dest[0].size;
         unsigned base = ssa_to_reg[ins->dest[0].value];
         unsigned size = common_size == AGX_SIZE_32 ? 2 : 1;
         unsigned base = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
         unsigned width = agx_size_align_16(ins->dest[0].size);

         struct agx_copy copies[4];
         unsigned n = 0;

         /* Move the sources */
         for (unsigned i = 0; i < 4; ++i) {
            if (agx_is_null(ins->src[i])) continue;
            assert(ins->src[i].size == ins->dest[0].size);

            copies[n++] = (struct agx_copy) {
               .dest = base + (i * width),
               .src = agx_index_to_reg(ssa_to_reg, ins->src[i]),
               .size = ins->src[i].size
            };
         }

         /* Lower away the copies pseudo-instruction */
         agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));

         /* TODO: Eliminate the intermediate copy by handling parallel copies */
         for (unsigned i = 0; i < 4; ++i) {
            if (agx_is_null(ins->src[i])) continue;
            unsigned base = ins->src[i].value;
            if (ins->src[i].type == AGX_INDEX_NORMAL)
               base = ssa_to_reg[base];
            else
               assert(ins->src[i].type == AGX_INDEX_REGISTER);

            assert(ins->src[i].size == common_size);

            agx_mov_to(&b, agx_register(124*2 + (i * size), common_size),
                       agx_register(base, common_size));
         }

         for (unsigned i = 0; i < 4; ++i) {
            if (agx_is_null(ins->src[i])) continue;
            agx_index src = ins->src[i];

            if (src.type == AGX_INDEX_NORMAL)
               src = agx_register(alloc[src.value], src.size);

            agx_mov_to(&b, agx_register(base + (i * size), common_size),
                       agx_register(124*2 + (i * size), common_size));
         }

         /* We've lowered away, delete the old */
         agx_remove_instruction(ins);
         agx_emit_parallel_copies(&b, copies, n);
         continue;
      } else if (ins->op == AGX_OPCODE_P_EXTRACT) {
         /* Uses the destination size */
         assert(ins->dest[0].type == AGX_INDEX_NORMAL);
         unsigned base = ins->src[0].value;

         if (ins->src[0].type != AGX_INDEX_REGISTER) {
            assert(ins->src[0].type == AGX_INDEX_NORMAL);
            base = alloc[base];
         }

         unsigned size = ins->dest[0].size == AGX_SIZE_64 ? 4 : ins->dest[0].size == AGX_SIZE_32 ? 2 : 1;
         unsigned left = ssa_to_reg[ins->dest[0].value];
         unsigned right = ssa_to_reg[ins->src[0].value] + (size * ins->imm);
         unsigned size = agx_size_align_16(ins->dest[0].size);
         unsigned left = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
         unsigned right = agx_index_to_reg(ssa_to_reg, ins->src[0]) + (size * ins->imm);

         if (left != right) {
            agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
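As a concrete illustration of the arithmetic above (hypothetical register assignment, not from the commit): a P_COMBINE collecting four 32-bit sources into a destination vector allocated at register 10, with the sources allocated at registers 6, 10, 0 and 2, has width = 2 and builds the copies

   { .dest = 10, .src =  6, .size = AGX_SIZE_32 },
   { .dest = 12, .src = 10, .size = AGX_SIZE_32 },
   { .dest = 14, .src =  0, .size = AGX_SIZE_32 },
   { .dest = 16, .src =  2, .size = AGX_SIZE_32 },

Register 10 is the destination of the first copy but still the source of the second, so emitting moves in array order would clobber that source. agx_emit_parallel_copies tracks register 10 as a pending source, emits the copy out of it before the copy into it, and thereby avoids the hazard that the staging moves through register 124*2 in the removed code were working around.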
src/asahi/compiler/meson.build
@@ -23,6 +23,7 @@ libasahi_agx_files = files(
  'agx_compile.c',
  'agx_dce.c',
  'agx_liveness.c',
  'agx_lower_parallel_copy.c',
  'agx_lower_pseudo.c',
  'agx_pack.c',
  'agx_print.c',