diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 83e7ba24701..f0807d60602 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -32,6 +32,7 @@ static const struct debug_named_value agx_debug_options[] = {
    {"wait",      AGX_DBG_WAIT,      "Wait after all async instructions"},
    {"nopreamble",AGX_DBG_NOPREAMBLE,"Do not use shader preambles"},
    {"demand",    AGX_DBG_DEMAND,    "Bound tightly to register demand"},
+   {"nosched",   AGX_DBG_NOSCHED,   "Do not schedule the shader"},
    DEBUG_NAMED_VALUE_END
 };
 /* clang-format on */
@@ -2557,6 +2558,14 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
    agx_dce(ctx, true);
    agx_validate(ctx, "Pre-RA passes");
 
+   if (agx_should_dump(nir, AGX_DBG_SHADERS))
+      agx_print_shader(ctx, stdout);
+
+   if (likely(!(agx_compiler_debug & AGX_DBG_NOSCHED))) {
+      agx_pressure_schedule(ctx);
+      agx_validate(ctx, "Pre-RA scheduler");
+   }
+
    if (agx_should_dump(nir, AGX_DBG_SHADERS))
       agx_print_shader(ctx, stdout);
 
diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h
index 1378b5ed0fd..32468192ba7 100644
--- a/src/asahi/compiler/agx_compiler.h
+++ b/src/asahi/compiler/agx_compiler.h
@@ -759,6 +759,7 @@ void agx_lower_pseudo(agx_context *ctx);
 void agx_lower_uniform_sources(agx_context *ctx);
 void agx_opt_cse(agx_context *ctx);
 void agx_dce(agx_context *ctx, bool partial);
+void agx_pressure_schedule(agx_context *ctx);
 void agx_ra(agx_context *ctx);
 void agx_lower_64bit_postra(agx_context *ctx);
 void agx_insert_waits(agx_context *ctx);
diff --git a/src/asahi/compiler/agx_debug.h b/src/asahi/compiler/agx_debug.h
index c06649127c7..9ff6b4ced29 100644
--- a/src/asahi/compiler/agx_debug.h
+++ b/src/asahi/compiler/agx_debug.h
@@ -25,6 +25,7 @@ enum agx_compiler_dbg {
    AGX_DBG_WAIT        = BITFIELD_BIT(7),
    AGX_DBG_NOPREAMBLE  = BITFIELD_BIT(8),
    AGX_DBG_DEMAND      = BITFIELD_BIT(9),
+   AGX_DBG_NOSCHED     = BITFIELD_BIT(10),
 };
 /* clang-format on */
 
diff --git a/src/asahi/compiler/agx_pressure_schedule.c b/src/asahi/compiler/agx_pressure_schedule.c
new file mode 100644
index 00000000000..4ae25f15161
--- /dev/null
+++ b/src/asahi/compiler/agx_pressure_schedule.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2022 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Bottom-up local scheduler to reduce register pressure */
+
+#include "util/dag.h"
+#include "agx_compiler.h"
+#include "agx_opcodes.h"
+
+struct sched_ctx {
+   /* Dependency graph */
+   struct dag *dag;
+
+   /* Live set */
+   BITSET_WORD *live;
+};
+
+struct sched_node {
+   struct dag_node dag;
+
+   /* Instruction this node represents */
+   agx_instr *instr;
+};
+
+/*
+ * Record that node a depends on node b by adding a DAG edge from a to b. If
+ * either node is NULL there is nothing to order against and no edge is added.
+ */
+static void
+add_dep(struct sched_node *a, struct sched_node *b)
+{
+   assert(a != b && "no self-dependencies");
+
+   if (a && b)
+      dag_add_edge(&a->dag, &b->dag, 0);
+}
+
+/*
+ * Order a against the node currently tracked in *b, then make a the tracked
+ * node so that whatever is serialized next is ordered against a.
+ */
+static void
+serialize(struct sched_node *a, struct sched_node **b)
+{
+   add_dep(a, *b);
+   *b = a;
+}
+
+/*
+ * Build the dependency graph for a block, stopping at the logical end so that
+ * control flow is left untouched. The graph and its nodes are both allocated
+ * out of memctx, so freeing memctx releases everything built here.
+ */
+static struct dag *
+create_dag(agx_context *ctx, agx_block *block, void *memctx)
+{
+   struct dag *dag = dag_create(memctx);
+
+   struct sched_node **last_write =
+      calloc(ctx->alloc, sizeof(struct sched_node *));
+   struct sched_node *coverage = NULL;
+   struct sched_node *preload = NULL;
+
+   /* Last memory load, to serialize stores against */
+   struct sched_node *memory_load = NULL;
+
+   /* Last memory store, to serialize loads and stores against */
+   struct sched_node *memory_store = NULL;
+
+   agx_foreach_instr_in_block(block, I) {
+      /* Don't touch control flow */
+      if (I->op == AGX_OPCODE_LOGICAL_END)
+         break;
+
+      struct sched_node *node = rzalloc(memctx, struct sched_node);
+      node->instr = I;
+      dag_init_node(dag, &node->dag);
+
+      /* Reads depend on writes, no other hazards in SSA */
+      agx_foreach_ssa_src(I, s) {
+         add_dep(node, last_write[I->src[s].value]);
+      }
+
+      agx_foreach_ssa_dest(I, d) {
+         assert(I->dest[d].value < ctx->alloc);
+         last_write[I->dest[d].value] = node;
+      }
+
+      /* Classify the instruction and add dependencies according to the class */
+      enum agx_schedule_class dep = agx_opcodes_info[I->op].schedule_class;
+      assert(dep != AGX_SCHEDULE_CLASS_INVALID && "invalid instruction seen");
+
+      bool barrier = dep == AGX_SCHEDULE_CLASS_BARRIER;
+
+      if (dep == AGX_SCHEDULE_CLASS_STORE)
+         add_dep(node, memory_load);
+      else if (dep == AGX_SCHEDULE_CLASS_ATOMIC || barrier)
+         serialize(node, &memory_load);
+
+      if (dep == AGX_SCHEDULE_CLASS_LOAD || dep == AGX_SCHEDULE_CLASS_STORE ||
+          dep == AGX_SCHEDULE_CLASS_ATOMIC || barrier)
+         serialize(node, &memory_store);
+
+      if (dep == AGX_SCHEDULE_CLASS_COVERAGE || barrier)
+         serialize(node, &coverage);
+
+      if (dep == AGX_SCHEDULE_CLASS_PRELOAD)
+         serialize(node, &preload);
+      else
+         add_dep(node, preload);
+   }
+
+   free(last_write);
+
+   return dag;
+}
+
+/*
+ * Calculate the change in register pressure from scheduling a given
+ * instruction. Equivalently, calculate the difference in the number of live
+ * registers before and after the instruction, given the live set after the
+ * instruction. This calculation follows immediately from the dataflow
+ * definition of liveness:
+ *
+ *      live_in = (live_out - KILL) + GEN
+ */
+static signed
+calculate_pressure_delta(agx_instr *I, BITSET_WORD *live)
+{
+   signed delta = 0;
+
+   /* Destinations must be unique */
+   agx_foreach_ssa_dest(I, d) {
+      if (BITSET_TEST(live, I->dest[d].value))
+         delta -= agx_write_registers(I, d);
+   }
+
+   agx_foreach_ssa_src(I, src) {
+      /* Filter duplicates */
+      bool dupe = false;
+
+      for (unsigned i = 0; i < src; ++i) {
+         if (agx_is_equiv(I->src[i], I->src[src])) {
+            dupe = true;
+            break;
+         }
+      }
+
+      if (!dupe && !BITSET_TEST(live, I->src[src].value))
+         delta += agx_read_registers(I, src);
+   }
+
+   return delta;
+}
+
+/*
+ * Choose the next instruction, bottom-up. For now we use a simple greedy
+ * heuristic: choose the instruction that has the best effect on liveness.
+ */
+static struct sched_node *
+choose_instr(struct sched_ctx *s)
+{
+   int32_t min_delta = INT32_MAX;
+   struct sched_node *best = NULL;
+
+   list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) {
+      int32_t delta = calculate_pressure_delta(n->instr, s->live);
+
+      if (delta < min_delta) {
+         best = n;
+         min_delta = delta;
+      }
+   }
+
+   return best;
+}
+
+/*
+ * Reschedule a single block bottom-up. The pressure of the existing order is
+ * measured first, and the new order is applied only if its estimated maximum
+ * pressure is strictly lower; otherwise the block is left as it was.
+ */
+static void
+pressure_schedule_block(agx_context *ctx, agx_block *block, struct sched_ctx *s)
+{
+   /* off by a constant, that's ok */
+   signed pressure = 0;
+   signed orig_max_pressure = 0;
+   unsigned nr_ins = 0;
+
+   memcpy(s->live, block->live_out,
+          BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
+
+   agx_foreach_instr_in_block_rev(block, I) {
+      pressure += calculate_pressure_delta(I, s->live);
+      orig_max_pressure = MAX2(pressure, orig_max_pressure);
+      agx_liveness_ins_update(s->live, I);
+      nr_ins++;
+   }
+
+   memcpy(s->live, block->live_out,
+          BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
+
+   /* off by a constant, that's ok */
+   signed max_pressure = 0;
+   pressure = 0;
+
+   struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *));
+   nr_ins = 0;
+
+   while (!list_is_empty(&s->dag->heads)) {
+      struct sched_node *node = choose_instr(s);
+      pressure += calculate_pressure_delta(node->instr, s->live);
+      max_pressure = MAX2(pressure, max_pressure);
+      dag_prune_head(s->dag, &node->dag);
+
+      schedule[nr_ins++] = node;
+      agx_liveness_ins_update(s->live, node->instr);
+   }
+
+   /* Bail if it looks like it's worse */
+   if (max_pressure >= orig_max_pressure) {
+      free(schedule);
+      return;
+   }
+
+   /* Apply the schedule */
+   for (unsigned i = 0; i < nr_ins; ++i) {
+      agx_remove_instruction(schedule[i]->instr);
+      list_add(&schedule[i]->instr->link, &block->instructions);
+   }
+
+   free(schedule);
+}
+
+/*
+ * Pass entry point: compute liveness, try to reschedule every block, then
+ * clear the kill flags on SSA sources before returning.
+ */
+void
+agx_pressure_schedule(agx_context *ctx)
+{
+   agx_compute_liveness(ctx);
+   void *memctx = ralloc_context(ctx);
+   BITSET_WORD *live =
+      ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(ctx->alloc));
+
+   agx_foreach_block(ctx, block) {
+      struct sched_ctx sctx = {
+         .dag = create_dag(ctx, block, memctx),
+         .live = live,
+      };
+
+      pressure_schedule_block(ctx, block, &sctx);
+   }
+
+   /* Clean up after liveness analysis */
+   agx_foreach_instr_global(ctx, I) {
+      agx_foreach_ssa_src(I, s)
+         I->src[s].kill = false;
+   }
+
+   ralloc_free(memctx);
+}
diff --git a/src/asahi/compiler/meson.build b/src/asahi/compiler/meson.build
index e3e2730e90d..171c2dedb5b 100644
--- a/src/asahi/compiler/meson.build
+++ b/src/asahi/compiler/meson.build
@@ -24,6 +24,7 @@ libasahi_agx_files = files(
   'agx_pack.c',
   'agx_performance.c',
   'agx_print.c',
+  'agx_pressure_schedule.c',
   'agx_ir.c',
   'agx_opt_cse.c',
   'agx_opt_empty_else.c',