ir3: Implement helper invocation optimization
This kills helper invocations to ensure that subsequent memory accesses don't fetch unused memory and unnecessary branch divergence from helper invocations is eliminated. shader-db results: total instructions in shared programs: 3840580 -> 3841531 (0.02%) instructions in affected programs: 278416 -> 279367 (0.34%) helped: 0 HURT: 744 HURT stats (abs) min: 1 max: 16 x̄: 1.28 x̃: 1 HURT stats (rel) min: 0.05% max: 8.51% x̄: 0.75% x̃: 0.39% 95% mean confidence interval for instructions value: 1.22 1.34 95% mean confidence interval for instructions %-change: 0.67% 0.83% Instructions are HURT. total nops in shared programs: 866716 -> 867667 (0.11%) nops in affected programs: 72851 -> 73802 (1.31%) helped: 0 HURT: 744 HURT stats (abs) min: 1 max: 16 x̄: 1.28 x̃: 1 HURT stats (rel) min: 0.17% max: 33.33% x̄: 2.84% x̃: 1.82% 95% mean confidence interval for nops value: 1.22 1.34 95% mean confidence interval for nops %-change: 2.59% 3.08% Nops are HURT. total last-baryf in shared programs: 139806 -> 139864 (0.04%) last-baryf in affected programs: 11772 -> 11830 (0.49%) helped: 0 HURT: 58 HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 0.40% max: 5.26% x̄: 0.60% x̃: 0.47% 95% mean confidence interval for last-baryf value: 1.00 1.00 95% mean confidence interval for last-baryf %-change: 0.42% 0.78% Last-baryf are HURT. total last-helper in shared programs: 1508295 -> 935561 (-37.97%) last-helper in affected programs: 1192594 -> 619860 (-48.02%) helped: 7816 HURT: 3 helped stats (abs) min: 1 max: 1095 x̄: 73.28 x̃: 34 helped stats (rel) min: 0.42% max: 100.00% x̄: 71.91% x̃: 100.00% HURT stats (abs) min: 1 max: 11 x̄: 4.67 x̃: 2 HURT stats (rel) min: 0.80% max: 1.44% x̄: 1.03% x̃: 0.86% 95% mean confidence interval for last-helper value: -75.64 -70.86 95% mean confidence interval for last-helper %-change: -72.67% -71.10% Last-helper are helped. 
fossil-db results: Totals: Instrs: 55172795 -> 55189122 (+0.03%) CodeSize: 108952746 -> 108984452 (+0.03%) NOPs: 11536680 -> 11553007 (+0.14%) (ss)-stall: 4166810 -> 4166581 (-0.01%) (sy)-stall: 15890324 -> 15884974 (-0.03%) last-baryf: 659588 -> 659633 (+0.01%) last-helper: 25742996 -> 12601636 (-51.05%); split: -51.05%, +0.00% Cat0: 12294891 -> 12311218 (+0.13%) Totals from 39576 (25.22% of 156916) affected shaders: Instrs: 24200008 -> 24216335 (+0.07%) CodeSize: 44968736 -> 45000442 (+0.07%) NOPs: 5854965 -> 5871292 (+0.28%) (ss)-stall: 2357830 -> 2357601 (-0.01%) (sy)-stall: 6166670 -> 6161320 (-0.09%) last-baryf: 590330 -> 590375 (+0.01%) last-helper: 24160432 -> 11019072 (-54.39%); split: -54.39%, +0.00% Cat0: 6205561 -> 6221888 (+0.26%) Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24433>
This commit is contained in:
@@ -691,6 +691,12 @@ ir3_start_block(struct ir3 *ir)
|
||||
return list_first_entry(&ir->block_list, struct ir3_block, node);
|
||||
}
|
||||
|
||||
static inline struct ir3_block *
|
||||
ir3_end_block(struct ir3 *ir)
|
||||
{
|
||||
return list_last_entry(&ir->block_list, struct ir3_block, node);
|
||||
}
|
||||
|
||||
static inline struct ir3_block *
|
||||
ir3_after_preamble(struct ir3 *ir)
|
||||
{
|
||||
@@ -1074,6 +1080,53 @@ is_input(struct ir3_instruction *instr)
|
||||
}
|
||||
}
|
||||
|
||||
/* Whether non-helper invocations can read the value of helper invocations. We
|
||||
* cannot insert (eq) before these instructions.
|
||||
*/
|
||||
static inline bool
|
||||
uses_helpers(struct ir3_instruction *instr)
|
||||
{
|
||||
switch (instr->opc) {
|
||||
/* These require helper invocations to be present */
|
||||
case OPC_SAM:
|
||||
case OPC_SAMB:
|
||||
case OPC_GETLOD:
|
||||
case OPC_DSX:
|
||||
case OPC_DSY:
|
||||
case OPC_DSXPP_1:
|
||||
case OPC_DSYPP_1:
|
||||
case OPC_DSXPP_MACRO:
|
||||
case OPC_DSYPP_MACRO:
|
||||
case OPC_QUAD_SHUFFLE_BRCST:
|
||||
case OPC_QUAD_SHUFFLE_HORIZ:
|
||||
case OPC_QUAD_SHUFFLE_VERT:
|
||||
case OPC_QUAD_SHUFFLE_DIAG:
|
||||
case OPC_META_TEX_PREFETCH:
|
||||
return true;
|
||||
|
||||
/* Subgroup operations don't require helper invocations to be present, but
|
||||
* will use helper invocations if they are present.
|
||||
*/
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ALL_MACRO:
|
||||
case OPC_ELECT_MACRO:
|
||||
case OPC_READ_FIRST_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_MOVMSK:
|
||||
case OPC_BRCST_ACTIVE:
|
||||
return true;
|
||||
|
||||
/* Catch lowered READ_FIRST/READ_COND. */
|
||||
case OPC_MOV:
|
||||
return (instr->dsts[0]->flags & IR3_REG_SHARED) &&
|
||||
!(instr->srcs[0]->flags & IR3_REG_SHARED);
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_bool(struct ir3_instruction *instr)
|
||||
{
|
||||
@@ -1704,6 +1757,9 @@ __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
|
||||
/* iterators for instructions: */
|
||||
#define foreach_instr(__instr, __list) \
|
||||
list_for_each_entry (struct ir3_instruction, __instr, __list, node)
|
||||
/* Walk the instructions of __list, starting from __start. */
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr,                   \
                            &(__start)->node, __list, node)
|
||||
#define foreach_instr_rev(__instr, __list) \
|
||||
list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
|
||||
#define foreach_instr_safe(__instr, __list) \
|
||||
|
@@ -908,6 +908,215 @@ nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
|
||||
}
|
||||
}
|
||||
|
||||
struct ir3_helper_block_data {
|
||||
/* Whether helper invocations may be used on any path starting at the
|
||||
* beginning of the block.
|
||||
*/
|
||||
bool uses_helpers_beginning;
|
||||
|
||||
/* Whether helper invocations may be used by the end of the block. Branch
|
||||
* instructions are considered to be "between" blocks, because (eq) has to be
|
||||
* inserted after them in the successor blocks, so branch instructions using
|
||||
* helpers will result in uses_helpers_end = true for their block.
|
||||
*/
|
||||
bool uses_helpers_end;
|
||||
};
|
||||
|
||||
/* Insert (eq) after the last instruction using the results of helper
|
||||
* invocations. Use a backwards dataflow analysis to determine at which points
|
||||
* in the program helper invocations are definitely never used, and then insert
|
||||
* (eq) at the point where we cross from a point where they may be used to a
|
||||
* point where they are never used.
|
||||
*/
|
||||
static void
|
||||
helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
|
||||
struct ir3_shader_variant *so)
|
||||
{
|
||||
bool non_prefetch_helpers = false;
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
struct ir3_helper_block_data *bd =
|
||||
rzalloc(ctx, struct ir3_helper_block_data);
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (uses_helpers(instr)) {
|
||||
bd->uses_helpers_beginning = true;
|
||||
if (instr->opc != OPC_META_TEX_PREFETCH) {
|
||||
non_prefetch_helpers = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->opc == OPC_SHPE) {
|
||||
/* (eq) is not allowed in preambles, mark the whole preamble as
|
||||
* requiring helpers to avoid putting it there.
|
||||
*/
|
||||
bd->uses_helpers_beginning = true;
|
||||
bd->uses_helpers_end = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (block->brtype == IR3_BRANCH_ALL ||
|
||||
block->brtype == IR3_BRANCH_ANY ||
|
||||
block->brtype == IR3_BRANCH_GETONE) {
|
||||
bd->uses_helpers_end = true;
|
||||
}
|
||||
|
||||
block->data = bd;
|
||||
}
|
||||
|
||||
/* If only prefetches use helpers then we can disable them in the shader via
|
||||
* a register setting.
|
||||
*/
|
||||
if (!non_prefetch_helpers) {
|
||||
so->prefetch_end_of_quad = true;
|
||||
return;
|
||||
}
|
||||
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
foreach_block_rev (block, &ir->block_list) {
|
||||
struct ir3_helper_block_data *bd = block->data;
|
||||
|
||||
if (!bd->uses_helpers_beginning)
|
||||
continue;
|
||||
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
struct ir3_helper_block_data *pred_bd = pred->data;
|
||||
if (!pred_bd->uses_helpers_end) {
|
||||
pred_bd->uses_helpers_end = true;
|
||||
}
|
||||
if (!pred_bd->uses_helpers_beginning) {
|
||||
pred_bd->uses_helpers_beginning = true;
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (progress);
|
||||
|
||||
/* Now, we need to determine the points where helper invocations become
|
||||
* unused.
|
||||
*/
|
||||
foreach_block (block, &ir->block_list) {
|
||||
struct ir3_helper_block_data *bd = block->data;
|
||||
if (bd->uses_helpers_end)
|
||||
continue;
|
||||
|
||||
/* We need to check the predecessors because of situations with critical
|
||||
* edges like this that can occur after optimizing jumps:
|
||||
*
|
||||
* br p0.x, #endif
|
||||
* ...
|
||||
* sam ...
|
||||
* ...
|
||||
* endif:
|
||||
* ...
|
||||
* end
|
||||
*
|
||||
* The endif block will have uses_helpers_beginning = false and
|
||||
* uses_helpers_end = false, but because we jump to there from the
|
||||
* beginning of the if where uses_helpers_end = true, we still want to
|
||||
* add an (eq) at the beginning of the block:
|
||||
*
|
||||
* br p0.x, #endif
|
||||
* ...
|
||||
* sam ...
|
||||
* (eq)nop
|
||||
* ...
|
||||
* endif:
|
||||
* (eq)nop
|
||||
* ...
|
||||
* end
|
||||
*
|
||||
* This an extra nop in the case where the branch isn't taken, but that's
|
||||
* probably preferable to adding an extra jump instruction which is what
|
||||
* would happen if we ran this pass before optimizing jumps:
|
||||
*
|
||||
* br p0.x, #else
|
||||
* ...
|
||||
* sam ...
|
||||
* (eq)nop
|
||||
* ...
|
||||
* jump #endif
|
||||
* else:
|
||||
* (eq)nop
|
||||
* endif:
|
||||
* ...
|
||||
* end
|
||||
*
|
||||
* We also need this to make sure we insert (eq) after branches which use
|
||||
* helper invocations.
|
||||
*/
|
||||
bool pred_uses_helpers = bd->uses_helpers_beginning;
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
struct ir3_helper_block_data *pred_bd = pred->data;
|
||||
if (pred_bd->uses_helpers_end) {
|
||||
pred_uses_helpers = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!pred_uses_helpers)
|
||||
continue;
|
||||
|
||||
/* The last use of helpers is somewhere between the beginning and the
|
||||
* end. first_instr will be the first instruction where helpers are no
|
||||
* longer required, or NULL if helpers are not required just at the end.
|
||||
*/
|
||||
struct ir3_instruction *first_instr = NULL;
|
||||
foreach_instr_rev (instr, &block->instr_list) {
|
||||
/* Skip prefetches because they actually execute before the block
|
||||
* starts and at this stage they aren't guaranteed to be at the start
|
||||
* of the block.
|
||||
*/
|
||||
if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
|
||||
break;
|
||||
first_instr = instr;
|
||||
}
|
||||
|
||||
bool killed = false;
|
||||
bool expensive_instruction_in_block = false;
|
||||
if (first_instr) {
|
||||
foreach_instr_from (instr, first_instr, &block->instr_list) {
|
||||
/* If there's already a nop, we don't have to worry about whether to
|
||||
* insert one.
|
||||
*/
|
||||
if (instr->opc == OPC_NOP) {
|
||||
instr->flags |= IR3_INSTR_EQ;
|
||||
killed = true;
|
||||
break;
|
||||
}
|
||||
|
||||
/* ALU and SFU instructions probably aren't going to benefit much
|
||||
* from killing helper invocations, because they complete at least
|
||||
* an entire quad in a cycle and don't access any quad-divergent
|
||||
* memory, so delay emitting (eq) in the hopes that we find a nop
|
||||
* afterwards.
|
||||
*/
|
||||
if (is_alu(instr) || is_sfu(instr))
|
||||
continue;
|
||||
|
||||
expensive_instruction_in_block = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If this block isn't the last block before the end instruction, assume
|
||||
* that there may be expensive instructions in later blocks so it's worth
|
||||
* it to insert a nop.
|
||||
*/
|
||||
if (!killed && (expensive_instruction_in_block ||
|
||||
block->successors[0] != ir3_end_block(ir))) {
|
||||
struct ir3_instruction *nop = ir3_NOP(block);
|
||||
nop->flags |= IR3_INSTR_EQ;
|
||||
if (first_instr)
|
||||
ir3_instr_move_before(nop, first_instr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
|
||||
{
|
||||
@@ -976,6 +1185,11 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
|
||||
while (opt_jump(ir))
|
||||
;
|
||||
|
||||
/* TODO: does (eq) exist before a6xx? */
|
||||
if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
|
||||
so->compiler->gen >= 6)
|
||||
helper_sched(ctx, ir, so);
|
||||
|
||||
ir3_count_instructions(ir);
|
||||
resolve_jumps(ir);
|
||||
|
||||
|
Reference in New Issue
Block a user