diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c index 03bce50571a..f6674d6320d 100644 --- a/src/asahi/compiler/agx_compile.c +++ b/src/asahi/compiler/agx_compile.c @@ -44,6 +44,7 @@ static const struct debug_named_value agx_debug_options[] = { {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"}, {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"}, {"noopt", AGX_DBG_NOOPT, "Disable backend optimizations"}, + {"wait", AGX_DBG_WAIT, "Wait after all async instructions"}, DEBUG_NAMED_VALUE_END }; /* clang-format on */ diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h index d1f1ce6ef56..3cb30ccc06d 100644 --- a/src/asahi/compiler/agx_compiler.h +++ b/src/asahi/compiler/agx_compiler.h @@ -47,6 +47,7 @@ enum agx_dbg { AGX_DBG_INTERNAL = BITFIELD_BIT(4), AGX_DBG_NOVALIDATE = BITFIELD_BIT(5), AGX_DBG_NOOPT = BITFIELD_BIT(6), + AGX_DBG_WAIT = BITFIELD_BIT(7), }; /* clang-format on */ diff --git a/src/asahi/compiler/agx_insert_waits.c b/src/asahi/compiler/agx_insert_waits.c index 05abf04dc2f..348204b3357 100644 --- a/src/asahi/compiler/agx_insert_waits.c +++ b/src/asahi/compiler/agx_insert_waits.c @@ -6,6 +6,8 @@ #include "agx_builder.h" #include "agx_compiler.h" +#define AGX_MAX_PENDING (8) /* cap on struct slot's nr_pending */ + /* * Returns whether an instruction is asynchronous and needs a scoreboard slot */ @@ -15,6 +17,16 @@ instr_is_async(agx_instr *I) return agx_opcodes_info[I->op].immediates & AGX_IMMEDIATE_SCOREBOARD; } +struct slot { /* tracking state for one scoreboard slot */ + /* Set of registers this slot is currently writing (AGX_NUM_REGS entries) */ + BITSET_DECLARE(writes, AGX_NUM_REGS); + + /* Number of messages still pending on this slot. Must not exceed + * AGX_MAX_PENDING (8, so uint8_t suffices) for correct results. + */ + uint8_t nr_pending; +}; + /* * Insert waits within a block to stall after every async instruction. Useful * for debugging. 
@@ -30,6 +42,92 @@ agx_insert_waits_trivial(agx_context *ctx, agx_block *block) } } +/* + * Insert waits within a block, assigning async instructions a free scoreboard + * slot if any. This waits for everything at the end of the block, rather than + * doing something more intelligent/global. This should be optimized. + * + * XXX: Do any instructions read their sources asynchronously? + */ +static void +agx_insert_waits_local(agx_context *ctx, agx_block *block) +{ + struct slot slots[2] = {0}; + + agx_foreach_instr_in_block_safe(block, I) { + uint8_t wait_mask = 0; + + /* Read-after-write: wait on slots still writing a register I reads */ + agx_foreach_src(I, s) { + if (I->src[s].type != AGX_INDEX_REGISTER) + continue; + + unsigned nr_read = agx_read_registers(I, s); + for (unsigned slot = 0; slot < ARRAY_SIZE(slots); ++slot) { + if (BITSET_TEST_RANGE(slots[slot].writes, I->src[s].value, + I->src[s].value + nr_read - 1)) + wait_mask |= BITSET_BIT(slot); + } + } + + /* Write-after-write: wait on slots still writing a register I writes */ + agx_foreach_dest(I, d) { + if (I->dest[d].type != AGX_INDEX_REGISTER) + continue; + + unsigned nr_writes = agx_write_registers(I, d); + for (unsigned slot = 0; slot < ARRAY_SIZE(slots); ++slot) { + if (BITSET_TEST_RANGE(slots[slot].writes, I->dest[d].value, + I->dest[d].value + nr_writes - 1)) + wait_mask |= BITSET_BIT(slot); + } + } + + /* Try to assign a free slot, else I->scoreboard is left unchanged */ + if (instr_is_async(I)) { + for (unsigned slot = 0; slot < ARRAY_SIZE(slots); ++slot) { + if (slots[slot].nr_pending == 0) { + I->scoreboard = slot; + break; + } + } + } + + /* Slot overflow: wait if the slot already has AGX_MAX_PENDING messages */ + if (instr_is_async(I) && + slots[I->scoreboard].nr_pending >= AGX_MAX_PENDING) + wait_mask |= BITSET_BIT(I->scoreboard); + + /* Insert the waits before I, resetting each waited slot's state */ + u_foreach_bit(slot, wait_mask) { + agx_builder b = agx_init_builder(ctx, agx_before_instr(I)); + agx_wait(&b, slot); + + BITSET_ZERO(slots[slot].writes); + slots[slot].nr_pending = 0; + } + + /* Record I's register writes and pending message on its slot */ + if (instr_is_async(I)) { + agx_foreach_dest(I, d) { 
+ assert(I->dest[d].type == AGX_INDEX_REGISTER); + BITSET_SET_RANGE(slots[I->scoreboard].writes, I->dest[d].value, + I->dest[d].value + agx_write_registers(I, d) - 1); + } + + slots[I->scoreboard].nr_pending++; + } + } + + /* If messages are still outstanding, wait at the logical block end */ + for (unsigned slot = 0; slot < ARRAY_SIZE(slots); ++slot) { + if (slots[slot].nr_pending) { + agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block)); + agx_wait(&b, slot); + } + } +} + /* * Assign scoreboard slots to asynchronous instructions and insert waits for the * appropriate hazard tracking. @@ -38,6 +136,9 @@ void agx_insert_waits(agx_context *ctx) { agx_foreach_block(ctx, block) { - agx_insert_waits_trivial(ctx, block); + if (agx_debug & AGX_DBG_WAIT) + agx_insert_waits_trivial(ctx, block); + else + agx_insert_waits_local(ctx, block); } }