diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 98146f158a8..4dfcdf0da51 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1190,7 +1190,6 @@ void ir3_remove_nops(struct ir3 *ir); /* depth calculation: */ struct ir3_shader_variant; -void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list); void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so); /* fp16 conversion folding */ diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c index 6d389772570..00d0c54fac5 100644 --- a/src/freedreno/ir3/ir3_depth.c +++ b/src/freedreno/ir3/ir3_depth.c @@ -48,23 +48,6 @@ * blocks depth sorted list, which is used by the scheduling pass. */ -void -ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list) -{ - /* remove from existing spot in list: */ - list_delinit(&instr->node); - - /* find where to re-insert instruction: */ - foreach_instr (pos, list) { - if (pos->depth > instr->depth) { - list_add(&instr->node, &pos->node); - return; - } - } - /* if we get here, we didn't find an insertion spot: */ - list_addtail(&instr->node, list); -} - static void ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep) { @@ -97,8 +80,6 @@ ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep) if (!is_meta(instr)) instr->depth++; - - ir3_insert_by_depth(instr, &instr->block->instr_list); } static bool diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index 9d0bf69d193..95c89ae8461 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -25,6 +25,7 @@ */ +#include "util/dag.h" #include "util/u_math.h" #include "ir3.h" @@ -47,12 +48,17 @@ /* * Instruction Scheduling: * - * A recursive depth based scheduling algo. Recursively find an eligible - * instruction to schedule from the deepest instruction (recursing through - * it's unscheduled src instructions). Normally this would result in a - * lot of re-traversal of the same instructions, so we cache results in - * instr->data (and clear cached results that would be no longer valid - * after scheduling an instruction). + * A block-level pre-RA scheduler, which works by creating a DAG of + * instruction dependencies, and heuristically picking a DAG head + * (instruction with no unscheduled dependencies). + * + * Where possible, it tries to pick instructions that avoid nop delay + * slots, but it will prefer to pick instructions that reduce (or do + * not increase) the number of live values. + * + * If the only possible choices are instructions that increase the + * number of live values, it will try to pick the one with the earliest + * consumer (based on pre-sched program order). * * There are a few special cases that need to be handled, since sched * is currently independent of register allocation. Usages of address @@ -62,161 +68,59 @@ * To solve this, when we are in such a scheduling "critical section", * and we encounter a conflicting write to a special register, we try * to schedule any remaining instructions that use that value first. + * + * TODO we can detect too-large live_values here.. would be a good place + * to "spill" cheap things, like move from uniform/immed. (Constructing + * list of ssa def consumers before sched pass would make this easier. + * Also, in general it is general it might be best not to re-use load_immed + * across blocks. 
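The "spill cheap things" TODO above has no implementation yet; one way it could look is to rematerialize a mov-from-immediate next to its consumer once the live-value estimate gets high, instead of keeping the original result live. A minimal sketch, assuming a hypothetical is_cheap_remat()/remat_before_use() pair and reusing only helpers that exist in this patch (ir3_instr_clone, sched_node_init, sched_node_add_dep):

    /* Hypothetical sketch of the "spill cheap things" TODO: clone a cheap
     * producer (a mov from immed) right before an instruction that still
     * needs it, so the original result does not have to stay live.
     * Neither helper below is part of this patch.
     */
    static bool
    is_cheap_remat(struct ir3_instruction *instr)
    {
       /* a mov whose only src is an immediate is trivially recomputable */
       return (instr->opc == OPC_MOV) &&
              (instr->regs_count == 2) &&
              (instr->regs[1]->flags & IR3_REG_IMMED);
    }

    static struct ir3_instruction *
    remat_before_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *src,
                     struct ir3_instruction *use)
    {
       if (!is_cheap_remat(src))
          return NULL;

       struct ir3_instruction *clone = ir3_instr_clone(src);
       clone->flags &= ~IR3_INSTR_MARK;     /* clone starts out unscheduled */
       sched_node_init(ctx, clone);
       sched_node_add_dep(use, clone, 0);   /* 'use' now depends on the clone */
       list_addtail(&clone->node, &ctx->unscheduled_list);

       /* NOTE: a real implementation would also repoint the consumer's src
        * register at the clone and fix up the ssa use information; that
        * bookkeeping is omitted from this sketch.
        */
       return clone;
    }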
+ * + * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce + * the # of immediates in play (or at least that would help with + * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably + * do this in a nir pass that inserts fneg/etc? The cp pass should fold + * these into src modifiers.. */ struct ir3_sched_ctx { struct ir3_block *block; /* the current block */ - struct list_head depth_list; /* depth sorted unscheduled instrs */ - struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ + struct dag *dag; + + struct list_head unscheduled_list; /* unscheduled instructions */ + struct ir3_instruction *scheduled; /* last scheduled instr */ struct ir3_instruction *addr0; /* current a0.x user, if any */ struct ir3_instruction *addr1; /* current a1.x user, if any */ struct ir3_instruction *pred; /* current p0.x user, if any */ - int live_values; /* estimate of current live values */ - int half_live_values; /* estimate of current half precision live values */ - bool error; - unsigned live_threshold_hi; - unsigned live_threshold_lo; - unsigned depth_threshold_hi; - unsigned depth_threshold_lo; + int remaining_kills; + + bool error; }; +struct ir3_sched_node { + struct dag_node dag; /* must be first for util_dynarray_foreach */ + struct ir3_instruction *instr; + + unsigned delay; + unsigned max_delay; + + /* Is this instruction a direct or indirect dependency for a kill? + * If so, we should prioritize it when possible + */ + bool kill_path; +}; + +#define foreach_sched_node(__n, __list) \ + list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link) + +static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr); +static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i); + static bool is_scheduled(struct ir3_instruction *instr) { return !!(instr->flags & IR3_INSTR_MARK); } -static void -unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) -{ - struct ir3_instruction *src; - - foreach_ssa_src_n (src, n, instr) { - if (__is_false_dep(instr, n)) - continue; - if (instr->block != src->block) - continue; - if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) { - unuse_each_src(ctx, src); - } else { - debug_assert(src->use_count > 0); - - if (--src->use_count == 0) { - if (is_half(src)) { - ctx->half_live_values -= dest_regs(src); - debug_assert(ctx->half_live_values >= 0); - } else { - ctx->live_values -= dest_regs(src); - debug_assert(ctx->live_values >= 0); - } - } - } - } -} - -static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr); -static void use_instr(struct ir3_instruction *instr); - -/* transfers a use-count to new instruction, for cases where we - * "spill" address or predicate. Note this might cause the - * previous instruction that loaded a0.x/p0.x to become live - * again, when we previously thought it was dead. 
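The "must be first" comment on the embedded dag_node matters because the pass later casts a bare struct dag_node * (for example edge->child in dump_state() and sched_dag_max_delay_cb()) directly to struct ir3_sched_node * rather than going through a container_of-style helper. A compile-time guard for that layout assumption, shown only as an illustration (it is not part of the patch), could be:

    #include <stddef.h>   /* offsetof */

    /* Casts like (struct ir3_sched_node *)edge->child are only valid while
     * the dag_node sits at offset zero of ir3_sched_node:
     */
    _Static_assert(offsetof(struct ir3_sched_node, dag) == 0,
                   "dag_node must be the first member of ir3_sched_node");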
- */ -static void -transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr, - struct ir3_instruction *new_instr) -{ - struct ir3_instruction *src; - - debug_assert(is_scheduled(orig_instr)); - - foreach_ssa_src_n (src, n, new_instr) { - if (__is_false_dep(new_instr, n)) - continue; - if (is_half(new_instr)) { - ctx->half_live_values += dest_regs(src); - } else { - ctx->live_values += dest_regs(src); - } - use_instr(src); - } - - clear_cache(ctx, orig_instr); -} - -static void -use_each_src(struct ir3_instruction *instr) -{ - struct ir3_instruction *src; - - foreach_ssa_src_n (src, n, instr) { - if (__is_false_dep(instr, n)) - continue; - use_instr(src); - } -} - -static void -use_instr(struct ir3_instruction *instr) -{ - if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) { - use_each_src(instr); - } else { - instr->use_count++; - } -} - -static void -update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *scheduled) -{ - if ((scheduled->opc == OPC_META_COLLECT) || (scheduled->opc == OPC_META_SPLIT)) - return; - - if ((scheduled->regs_count > 0) && is_half(scheduled)) { - ctx->half_live_values += dest_regs(scheduled); - } else { - ctx->live_values += dest_regs(scheduled); - } - - unuse_each_src(ctx, scheduled); -} - -static void -update_use_count(struct ir3 *ir) -{ - foreach_block (block, &ir->block_list) { - foreach_instr (instr, &block->instr_list) { - instr->use_count = 0; - } - } - - foreach_block (block, &ir->block_list) { - foreach_instr (instr, &block->instr_list) { - if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) - continue; - - use_each_src(instr); - } - } - - /* Shader outputs are also used: - */ - struct ir3_instruction *out; - foreach_output (out, ir) - use_instr(out); -} - -#define NULL_INSTR ((void *)~0) - -static void -clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) -{ - foreach_instr (instr2, &ctx->depth_list) { - if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) - instr2->data = NULL; - } -} - static void schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { @@ -248,35 +152,13 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) list_addtail(&instr->node, &instr->block->instr_list); ctx->scheduled = instr; - update_live_values(ctx, instr); - - if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr) || is_input(instr)) { - clear_cache(ctx, NULL); - } else { - /* invalidate only the necessary entries.. */ - clear_cache(ctx, instr); + if (is_kill(instr)){ + ctx->remaining_kills--; } -} -static struct ir3_instruction * -deepest(struct ir3_instruction **srcs, unsigned nsrcs) -{ - struct ir3_instruction *d = NULL; - unsigned i = 0, id = 0; + struct ir3_sched_node *n = instr->data; - while ((i < nsrcs) && !(d = srcs[id = i])) - i++; - - if (!d) - return NULL; - - for (; i < nsrcs; i++) - if (srcs[i] && (srcs[i]->depth > d->depth)) - d = srcs[id = i]; - - srcs[id] = NULL; - - return d; + dag_prune_head(ctx->dag, &n->dag); } struct ir3_sched_notes { @@ -313,11 +195,23 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, { debug_assert(!is_scheduled(instr)); + if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) { + /* avoid texture/memory access if we have unscheduled kills + * that could make the expensive operation unnecessary. By + * definition, if there are remaining kills, and this instr + * is not a dependency of a kill, there are other instructions + * that we can choose from. 
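For readers new to util/dag: schedule() above relies on a small lifecycle that this patch uses end to end, i.e. dag_create(), dag_init_node(), dag_add_edge(), dag_traverse_bottom_up() and dag_prune_head(). The toy below only strings those same calls together on a dummy node type; it is illustrative, not driver code:

    #include "util/dag.h"
    #include "util/ralloc.h"

    struct toy_node {
       struct dag_node dag;   /* embedded, first member, as in ir3_sched_node */
       int id;
    };

    static void
    toy_cb(struct dag_node *node, void *data)
    {
       /* bottom-up traversal visits consumers before producers, which is how
        * sched_dag_max_delay_cb() can accumulate max_delay in one pass
        */
       struct toy_node *n = (struct toy_node *)node;
       (void)n;
    }

    static void
    toy_dag_example(void *mem_ctx)
    {
       struct dag *dag = dag_create(mem_ctx);
       struct toy_node *producer = rzalloc(dag, struct toy_node);
       struct toy_node *consumer = rzalloc(dag, struct toy_node);

       dag_init_node(dag, &producer->dag);
       dag_init_node(dag, &consumer->dag);

       /* producer feeds consumer, matching dag_add_edge(&sn->dag, &n->dag, NULL)
        * in sched_node_add_dep(); heads are nodes with no unscheduled producers
        */
       dag_add_edge(&producer->dag, &consumer->dag, NULL);

       dag_traverse_bottom_up(dag, toy_cb, NULL);

       /* "scheduling" the producer removes it from the heads list and exposes
        * the consumer as a new head, as schedule() does via dag_prune_head()
        */
       dag_prune_head(dag, &producer->dag);

       ralloc_free(dag);
    }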
+ */ + struct ir3_sched_node *n = instr->data; + if (!n->kill_path) + return false; + } + /* For instructions that write address register we need to * make sure there is at least one instruction that uses the * addr value which is otherwise ready. * - * TODO if any instructions use pred register and have other + * NOTE if any instructions use pred register and have other * src args, we would need to do the same for writes_pred().. */ if (writes_addr0(instr)) { @@ -384,10 +278,7 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * virtual ssa src for the kill instruction. But we have * fixed length instr->regs[]. * - * TODO this wouldn't be quite right if we had multiple - * basic blocks, if any block was conditional. We'd need - * to schedule the bary.f's outside of any block which - * was conditional that contained a kill.. I think.. + * TODO we could handle this by false-deps now, probably. */ if (is_kill(instr)) { struct ir3 *ir = instr->block->shader; @@ -406,63 +297,43 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, return true; } -/* Find the best instruction to schedule from specified instruction or - * recursively it's ssa sources. +/* Find the instr->ip of the closest use of an instruction, in + * pre-sched order. This isn't going to be the same as post-sched + * order, but it is a reasonable approximation to limit scheduling + * instructions *too* early. This is mostly to prevent bad behavior + * in cases where we have a large number of possible instructions + * to choose, to avoid creating too much parallelism (ie. blowing + * up register pressure) + * + * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread */ -static struct ir3_instruction * -find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct ir3_instruction *instr) +static int +nearest_use(struct ir3_instruction *instr) { - struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; - struct ir3_instruction *src; - unsigned nsrcs = 0; + unsigned nearest = ~0; + foreach_ssa_use (use, instr) + if (!is_scheduled(use)) + nearest = MIN2(nearest, use->ip); - if (is_scheduled(instr)) - return NULL; - - /* use instr->data to cache the results of recursing up the - * instr src's. Otherwise the recursive algo can scale quite - * badly w/ shader size. But this takes some care to clear - * the cache appropriately when instructions are scheduled. + /* slight hack.. this heuristic tends to push bary.f's to later + * in the shader, closer to their uses. 
But we actually would + * prefer to get these scheduled earlier, to unlock varying + * storage for more VS jobs: */ - if (instr->data) { - if (instr->data == NULL_INSTR) - return NULL; - return instr->data; - } + if (is_input(instr)) + nearest /= 2; - /* find unscheduled srcs: */ - foreach_ssa_src (src, instr) { - if (!is_scheduled(src) && (src->block == instr->block)) { - debug_assert(nsrcs < ARRAY_SIZE(srcs)); - srcs[nsrcs++] = src; - } - } + return nearest; +} - /* if all our src's are already scheduled: */ - if (nsrcs == 0) { - if (check_instr(ctx, notes, instr)) { - instr->data = instr; - return instr; - } - return NULL; - } - - while ((src = deepest(srcs, nsrcs))) { - struct ir3_instruction *candidate; - - candidate = find_instr_recursive(ctx, notes, src); - if (!candidate) - continue; - - if (check_instr(ctx, notes, candidate)) { - instr->data = candidate; - return candidate; - } - } - - instr->data = NULL_INSTR; - return NULL; +static int +use_count(struct ir3_instruction *instr) +{ + unsigned cnt = 0; + foreach_ssa_use (use, instr) + if (!is_scheduled(use)) + cnt++; + return cnt; } /* find net change to live values if instruction were scheduled: */ @@ -471,7 +342,7 @@ live_effect(struct ir3_instruction *instr) { struct ir3_instruction *src; int new_live = dest_regs(instr); - int old_live = 0; + int freed_live = 0; foreach_ssa_src_n (src, n, instr) { if (__is_false_dep(instr, n)) @@ -480,137 +351,226 @@ live_effect(struct ir3_instruction *instr) if (instr->block != src->block) continue; - /* for split, just pass things along to the real src: */ - if (src->opc == OPC_META_SPLIT) - src = ssa(src->regs[1]); + if (use_count(src) == 1) + freed_live += dest_regs(src); + } - /* for collect, if this is the last use of *each* src, - * then it will decrease the live values, since RA treats - * them as a whole: - */ - if (src->opc == OPC_META_COLLECT) { - struct ir3_instruction *src2; - bool last_use = true; + return new_live - freed_live; +} - foreach_ssa_src (src2, src) { - if (src2->use_count > 1) { - last_use = false; - break; - } - } +/** + * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code + * Scheduling for Register pressure) heuristic. + */ +static struct ir3_sched_node * +choose_instr_csr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes) +{ + struct ir3_sched_node *chosen = NULL; - if (last_use) - old_live += dest_regs(src); + /* Find a ready inst with regs freed and pick the one with max cost. */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false); - } else { - debug_assert(src->use_count > 0); + if (d > 0) + continue; - if (src->use_count == 1) { - old_live += dest_regs(src); - } + if (live_effect(n->instr) > -1) + continue; + + if (!check_instr(ctx, notes, n->instr)) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) { + chosen = n; } } - return new_live - old_live; + if (chosen) { + di(chosen->instr, "csr: chose (freed+ready)"); + return chosen; + } + + /* Find a leader with regs freed and pick the one with max cost. */ + foreach_sched_node (n, &ctx->dag->heads) { + if (live_effect(n->instr) > -1) + continue; + + if (!check_instr(ctx, notes, n->instr)) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) { + chosen = n; + } + } + + if (chosen) { + di(chosen->instr, "csr: chose (freed)"); + return chosen; + } + + /* Contra the paper, pick a leader with no effect on used regs. 
This may + * open up new opportunities, as otherwise a single-operand instr consuming + * a value will tend to block finding freeing that value. This had a + * massive effect on reducing spilling on V3D. + * + * XXX: Should this prioritize ready? + */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false); + + if (d > 0) + continue; + + if (live_effect(n->instr) > 0) + continue; + + if (!check_instr(ctx, notes, n->instr)) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csr: chose (neutral+ready)"); + return chosen; + } + + foreach_sched_node (n, &ctx->dag->heads) { + if (live_effect(n->instr) > 0) + continue; + + if (!check_instr(ctx, notes, n->instr)) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csr: chose (neutral)"); + return chosen; + } + + /* + * From hear on out, we are picking something that increases + * register pressure. So try to pick something which will + * be consumed soon: + */ + unsigned chosen_distance = 0; + + /* Pick the max delay of the remaining ready set. */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false); + + if (d > 0) + continue; + + if (!check_instr(ctx, notes, n->instr)) + continue; + + unsigned distance = nearest_use(n->instr); + + if (!chosen || distance < chosen_distance) { + chosen = n; + chosen_distance = distance; + } + } + + if (chosen) { + di(chosen->instr, "csr: chose (distance+ready)"); + return chosen; + } + + /* Pick the max delay of the remaining leaders. */ + foreach_sched_node (n, &ctx->dag->heads) { + if (!check_instr(ctx, notes, n->instr)) + continue; + + unsigned distance = nearest_use(n->instr); + + if (!chosen || distance < chosen_distance) { + chosen = n; + chosen_distance = distance; + } + } + + if (chosen) { + di(chosen->instr, "csr: chose (distance)"); + return chosen; + } + + return NULL; +} + +/* Handles instruction selections for instructions we want to prioritize + * even if csp/csr would not pick them. + */ +static struct ir3_sched_node * +choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes) +{ + struct ir3_sched_node *chosen = NULL; + + foreach_sched_node (n, &ctx->dag->heads) { + if (!is_meta(n->instr)) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "prio: chose (meta)"); + return chosen; + } + + return NULL; +} + +static void +dump_state(struct ir3_sched_ctx *ctx) +{ + if (!SCHED_DEBUG) + return; + + foreach_sched_node (n, &ctx->dag->heads) { + di(n->instr, "maxdel=%3d le=%d del=%u ", + n->max_delay, live_effect(n->instr), + ir3_delay_calc(ctx->block, n->instr, false, false)); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child; + + di(child->instr, " -> (%d parents) ", child->dag.parent_count); + } + } } /* find instruction to schedule: */ static struct ir3_instruction * -find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - bool soft) +choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes) { - struct ir3_instruction *best_instr = NULL; - int best_rank = INT_MAX; /* lower is better */ - unsigned deepest = 0; + struct ir3_sched_node *chosen; - /* TODO we'd really rather use the list/array of block outputs. But we - * don't have such a thing. 
Recursing *every* instruction in the list - * will result in a lot of repeated traversal, since instructions will - * get traversed both when they appear as ssa src to a later instruction - * as well as where they appear in the depth_list. - */ - foreach_instr_rev (instr, &ctx->depth_list) { - struct ir3_instruction *candidate; + dump_state(ctx); - candidate = find_instr_recursive(ctx, notes, instr); - if (!candidate) - continue; + chosen = choose_instr_prio(ctx, notes); + if (chosen) + return chosen->instr; - if (is_meta(candidate)) - return candidate; + chosen = choose_instr_csr(ctx, notes); + if (chosen) + return chosen->instr; - deepest = MAX2(deepest, candidate->depth); - } - - /* traverse the list a second time.. but since we cache the result of - * find_instr_recursive() it isn't as bad as it looks. - */ - foreach_instr_rev (instr, &ctx->depth_list) { - struct ir3_instruction *candidate; - - candidate = find_instr_recursive(ctx, notes, instr); - if (!candidate) - continue; - - /* determine net change to # of live values: */ - int le = live_effect(candidate); - unsigned live_values = (2 * ctx->live_values) + ctx->half_live_values; - - /* if there is a net increase in # of live values, then apply some - * threshold to avoid instructions getting scheduled *too* early - * and increasing register pressure. - */ - if (le >= 1) { - unsigned threshold; - - if (live_values > ctx->live_threshold_lo) { - threshold = ctx->depth_threshold_lo; - } else { - threshold = ctx->depth_threshold_hi; - } - - /* Filter out any "shallow" instructions which would otherwise - * tend to get scheduled too early to fill delay slots even - * when they are not needed for a while. There will probably - * be later delay slots that they could just as easily fill. - * - * A classic case where this comes up is frag shaders that - * write a constant value (like 1.0f) to one of the channels - * of the output color(s). Since the mov from immed has no - * dependencies, it would otherwise get scheduled early to - * fill delay slots, occupying a register until the end of - * the program. 
- */ - if ((deepest - candidate->depth) > threshold) - continue; - } - - int rank = ir3_delay_calc(ctx->block, candidate, soft, false); - - /* if too many live values, prioritize instructions that reduce the - * number of live values: - */ - if (live_values > ctx->live_threshold_hi) { - rank = le; - } else if (live_values > ctx->live_threshold_lo) { - rank += le; - } - - if (rank < best_rank) { - best_instr = candidate; - best_rank = rank; - } - } - - return best_instr; + return NULL; } static struct ir3_instruction * split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr) { struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr); - ir3_insert_by_depth(new_instr, &ctx->depth_list); - transfer_use(ctx, orig_instr, new_instr); + di(new_instr, "split instruction"); + sched_node_init(ctx, new_instr); return new_instr; } @@ -646,8 +606,12 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr, /* original addr is scheduled, but new one isn't: */ new_addr->flags &= ~IR3_INSTR_MARK; } - indirect->address = NULL; - ir3_instr_set_address(indirect, new_addr); + indirect->address = new_addr; + /* don't need to remove old dag edge since old addr is + * already scheduled: + */ + sched_node_add_dep(indirect, new_addr, 0); + di(indirect, "new address"); } } @@ -692,6 +656,11 @@ split_pred(struct ir3_sched_ctx *ctx) new_pred->flags &= ~IR3_INSTR_MARK; } predicated->regs[1]->instr = new_pred; + /* don't need to remove old dag edge since old pred is + * already scheduled: + */ + sched_node_add_dep(predicated, new_pred, 0); + di(predicated, "new predicate"); } } @@ -701,11 +670,109 @@ split_pred(struct ir3_sched_ctx *ctx) return new_pred; } +static void +sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node); + + dag_init_node(ctx->dag, &n->dag); + + n->instr = instr; + instr->data = n; +} + +static void +sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i) +{ + /* don't consider dependencies in other blocks: */ + if (src->block != instr->block) + return; + + /* we could have false-dep's that end up unused: */ + if (src->flags & IR3_INSTR_UNUSED) { + debug_assert(__is_false_dep(instr, i)); + return; + } + + struct ir3_sched_node *n = instr->data; + struct ir3_sched_node *sn = src->data; + + dag_add_edge(&sn->dag, &n->dag, NULL); + + unsigned d = ir3_delayslots(src, instr, i, true); + n->delay = MAX2(n->delay, d); +} + +static void +mark_kill_path(struct ir3_instruction *instr) +{ + struct ir3_sched_node *n = instr->data; + n->kill_path = true; + struct ir3_instruction *src; + foreach_ssa_src (src, instr) { + if (src->block != instr->block) + continue; + mark_kill_path(src); + } +} + +static void +sched_node_add_deps(struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + + /* Since foreach_ssa_src() already handles false-dep's we can construct + * the DAG easily in a single pass. 
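To make the two per-node fields concrete: delay is the number of slots an instruction must wait on its slowest producer (the MAX2 over ir3_delayslots() in sched_node_add_dep() above), while max_delay accumulates those waits along the longest consumer chain, so the heuristics can favor the critical path. A self-contained toy of that propagation (the numbers are invented, not real ir3 latencies):

    #include "util/u_math.h"   /* MAX2 */

    struct toy {
       unsigned delay;       /* wait on this node's slowest producer */
       unsigned max_delay;   /* accumulated along the longest consumer chain */
       struct toy *consumer; /* single consumer is enough for the illustration */
    };

    static void
    toy_max_delay(struct toy *n)
    {
       /* visited bottom-up, so the consumer's max_delay is already final: */
       unsigned downstream = n->consumer ? n->consumer->max_delay : 0;
       n->max_delay = MAX2(n->max_delay, downstream + n->delay);
    }

    /* For a chain  coord -> tex -> mul -> store  with made-up delays
     * 0 / 1 / 6 / 3, the bottom-up pass yields max_delay = 3 (store),
     * 9 (mul), 10 (tex), 10 (coord): the nodes at the start of the long
     * latency chain get the largest max_delay and are preferred early.
     */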
+ */ + foreach_ssa_src_n (src, i, instr) { + sched_node_add_dep(instr, src, i); + } + + /* NOTE that all inputs must be scheduled before a kill, so + * mark these to be prioritized as well: + */ + if (is_kill(instr) || is_input(instr)) { + mark_kill_path(instr); + } +} + +static void +sched_dag_max_delay_cb(struct dag_node *node, void *state) +{ + struct ir3_sched_node *n = (struct ir3_sched_node *)node; + uint32_t max_delay = 0; + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child; + max_delay = MAX2(child->max_delay, max_delay); + } + + n->max_delay = MAX2(n->max_delay, max_delay + n->delay); +} + +static void +sched_dag_init(struct ir3_sched_ctx *ctx) +{ + ctx->dag = dag_create(ctx); + + foreach_instr (instr, &ctx->unscheduled_list) { + sched_node_init(ctx, instr); + sched_node_add_deps(instr); + } + + dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL); +} + +static void +sched_dag_destroy(struct ir3_sched_ctx *ctx) +{ + ralloc_free(ctx->dag); + ctx->dag = NULL; +} + static void sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) { - struct list_head unscheduled_list; - ctx->block = block; /* addr/pred writes are per-block: */ @@ -717,9 +784,16 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) * empty the block's instruction list (to which we will * be inserting). */ - list_replace(&block->instr_list, &unscheduled_list); + list_replace(&block->instr_list, &ctx->unscheduled_list); list_inithead(&block->instr_list); - list_inithead(&ctx->depth_list); + + sched_dag_init(ctx); + + ctx->remaining_kills = 0; + foreach_instr_safe (instr, &ctx->unscheduled_list) { + if (is_kill(instr)) + ctx->remaining_kills++; + } /* First schedule all meta:input instructions, followed by * tex-prefetch. We want all of the instructions that load @@ -729,29 +803,20 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) * a FS's bary_ij input may not actually be live in the * shader, but it should not be scheduled on top of any * other input (but can be overwritten by a tex prefetch) - * - * Finally, move all the remaining instructions to the depth- - * list */ - foreach_instr_safe (instr, &unscheduled_list) + foreach_instr_safe (instr, &ctx->unscheduled_list) if (instr->opc == OPC_META_INPUT) schedule(ctx, instr); - foreach_instr_safe (instr, &unscheduled_list) + foreach_instr_safe (instr, &ctx->unscheduled_list) if (instr->opc == OPC_META_TEX_PREFETCH) schedule(ctx, instr); - foreach_instr_safe (instr, &unscheduled_list) - ir3_insert_by_depth(instr, &ctx->depth_list); - - while (!list_is_empty(&ctx->depth_list)) { + while (!list_is_empty(&ctx->unscheduled_list)) { struct ir3_sched_notes notes = {0}; struct ir3_instruction *instr; - instr = find_eligible_instr(ctx, ¬es, true); - if (!instr) - instr = find_eligible_instr(ctx, ¬es, false); - + instr = choose_instr(ctx, ¬es); if (instr) { unsigned delay = ir3_delay_calc(ctx->block, instr, false, false); d("delay=%u", delay); @@ -784,62 +849,47 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } else if (notes.pred_conflict) { new_instr = split_pred(ctx); } else { + d("unscheduled_list:"); + foreach_instr (instr, &ctx->unscheduled_list) + di(instr, "unscheduled: "); debug_assert(0); ctx->error = true; return; } if (new_instr) { - /* clearing current addr/pred can change what is - * available to schedule, so clear cache.. 
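Condensed, the per-block flow that sched_block() implements is: move the block's instructions onto the unscheduled list, build the DAG, pre-schedule inputs and tex-prefetches, then repeatedly pick a DAG head, pad with nops as needed, and schedule it. The sketch strips out the conflict notes, debug output and error paths of the real code, so treat it only as an outline:

    static void
    sched_block_outline(struct ir3_sched_ctx *ctx, struct ir3_block *block)
    {
       ctx->block = block;

       list_replace(&block->instr_list, &ctx->unscheduled_list);
       list_inithead(&block->instr_list);

       sched_dag_init(ctx);   /* one node per instr, edges for ssa/false deps */

       /* (inputs and tex-prefetch instructions are scheduled first here) */

       while (!list_is_empty(&ctx->unscheduled_list)) {
          struct ir3_sched_notes notes = {0};
          struct ir3_instruction *instr = choose_instr(ctx, &notes);

          if (instr) {
             /* pad with nops until the chosen instr's sources are ready: */
             unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
             while (delay > 0) {
                ir3_NOP(block);
                delay--;
             }
             schedule(ctx, instr);   /* marks it, appends it, prunes the DAG head */
          } else {
             /* no legal pick: the real code clones the current a0.x/p0.x
              * writer via split_addr()/split_pred() and retries
              */
             break;
          }
       }

       sched_dag_destroy(ctx);
    }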
- */ - clear_cache(ctx, NULL); - - ir3_insert_by_depth(new_instr, &ctx->depth_list); - /* the original instr that wrote addr/pred may have - * originated from a different block: - */ - new_instr->block = block; + list_delinit(&new_instr->node); + list_addtail(&new_instr->node, &ctx->unscheduled_list); } } } -} -static void -setup_thresholds(struct ir3_sched_ctx *ctx, struct ir3 *ir) -{ - if (ir3_has_latency_to_hide(ir)) { - ctx->live_threshold_hi = 2 * 16 * 4; - ctx->live_threshold_lo = 2 * 4 * 4; - ctx->depth_threshold_hi = 6; - ctx->depth_threshold_lo = 4; - } else { - ctx->live_threshold_hi = 2 * 16 * 4; - ctx->live_threshold_lo = 2 * 12 * 4; - ctx->depth_threshold_hi = 16; - ctx->depth_threshold_lo = 16; - } + sched_dag_destroy(ctx); } int ir3_sched(struct ir3 *ir) { - struct ir3_sched_ctx ctx = {0}; - - setup_thresholds(&ctx, ir); - - ir3_clear_mark(ir); - update_use_count(ir); + struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx); foreach_block (block, &ir->block_list) { - ctx.live_values = 0; - ctx.half_live_values = 0; - sched_block(&ctx, block); + foreach_instr (instr, &block->instr_list) { + instr->data = NULL; + } } - if (ctx.error) - return -1; + ir3_count_instructions(ir); + ir3_clear_mark(ir); + ir3_find_ssa_uses(ir, ctx, false); - return 0; + foreach_block (block, &ir->block_list) { + sched_block(ctx, block); + } + + int ret = ctx->error ? -1 : 0; + + ralloc_free(ctx); + + return ret; } static unsigned