ir3: Implement helper invocation optimization

This kills helper invocations so that subsequent memory accesses don't fetch
memory that will never be used, and so that unnecessary branch divergence
caused by helper invocations is eliminated.
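
Roughly, the pass tags a nop with the (eq) flag right after the last
instruction that reads helper-invocation results, so everything past that
point executes without helper lanes. An illustrative sketch (reusing the
ir3-style notation from the comments in the new pass, not actual
disassembly):

    ...
    sam ...
    (eq)nop
    ...
    end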

shader-db results:

total instructions in shared programs: 3840580 -> 3841531 (0.02%)
instructions in affected programs: 278416 -> 279367 (0.34%)
helped: 0
HURT: 744
HURT stats (abs)   min: 1 max: 16 x̄: 1.28 x̃: 1
HURT stats (rel)   min: 0.05% max: 8.51% x̄: 0.75% x̃: 0.39%
95% mean confidence interval for instructions value: 1.22 1.34
95% mean confidence interval for instructions %-change: 0.67% 0.83%
Instructions are HURT.

total nops in shared programs: 866716 -> 867667 (0.11%)
nops in affected programs: 72851 -> 73802 (1.31%)
helped: 0
HURT: 744
HURT stats (abs)   min: 1 max: 16 x̄: 1.28 x̃: 1
HURT stats (rel)   min: 0.17% max: 33.33% x̄: 2.84% x̃: 1.82%
95% mean confidence interval for nops value: 1.22 1.34
95% mean confidence interval for nops %-change: 2.59% 3.08%
Nops are HURT.

total last-baryf in shared programs: 139806 -> 139864 (0.04%)
last-baryf in affected programs: 11772 -> 11830 (0.49%)
helped: 0
HURT: 58
HURT stats (abs)   min: 1 max: 1 x̄: 1.00 x̃: 1
HURT stats (rel)   min: 0.40% max: 5.26% x̄: 0.60% x̃: 0.47%
95% mean confidence interval for last-baryf value: 1.00 1.00
95% mean confidence interval for last-baryf %-change: 0.42% 0.78%
Last-baryf are HURT.

total last-helper in shared programs: 1508295 -> 935561 (-37.97%)
last-helper in affected programs: 1192594 -> 619860 (-48.02%)
helped: 7816
HURT: 3
helped stats (abs) min: 1 max: 1095 x̄: 73.28 x̃: 34
helped stats (rel) min: 0.42% max: 100.00% x̄: 71.91% x̃: 100.00%
HURT stats (abs)   min: 1 max: 11 x̄: 4.67 x̃: 2
HURT stats (rel)   min: 0.80% max: 1.44% x̄: 1.03% x̃: 0.86%
95% mean confidence interval for last-helper value: -75.64 -70.86
95% mean confidence interval for last-helper %-change: -72.67% -71.10%
Last-helper are helped.

fossil-db results:

Totals:
Instrs: 55172795 -> 55189122 (+0.03%)
CodeSize: 108952746 -> 108984452 (+0.03%)
NOPs: 11536680 -> 11553007 (+0.14%)
(ss)-stall: 4166810 -> 4166581 (-0.01%)
(sy)-stall: 15890324 -> 15884974 (-0.03%)
last-baryf: 659588 -> 659633 (+0.01%)
last-helper: 25742996 -> 12601636 (-51.05%); split: -51.05%, +0.00%
Cat0: 12294891 -> 12311218 (+0.13%)

Totals from 39576 (25.22% of 156916) affected shaders:
Instrs: 24200008 -> 24216335 (+0.07%)
CodeSize: 44968736 -> 45000442 (+0.07%)
NOPs: 5854965 -> 5871292 (+0.28%)
(ss)-stall: 2357830 -> 2357601 (-0.01%)
(sy)-stall: 6166670 -> 6161320 (-0.09%)
last-baryf: 590330 -> 590375 (+0.01%)
last-helper: 24160432 -> 11019072 (-54.39%); split: -54.39%, +0.00%
Cat0: 6205561 -> 6221888 (+0.26%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24433>
Author:    Connor Abbott
Date:      2023-07-31 19:57:06 +02:00
Committed: Marge Bot
Commit:    aa322a37fc (parent: b9489dea3d)

2 changed files with 270 additions and 0 deletions


@@ -691,6 +691,12 @@ ir3_start_block(struct ir3 *ir)
return list_first_entry(&ir->block_list, struct ir3_block, node);
}
static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
return list_last_entry(&ir->block_list, struct ir3_block, node);
}
static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
@@ -1074,6 +1080,53 @@ is_input(struct ir3_instruction *instr)
}
}
/* Whether non-helper invocations can read the value of helper invocations. We
* cannot insert (eq) before these instructions.
*/
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
switch (instr->opc) {
/* These require helper invocations to be present */
case OPC_SAM:
case OPC_SAMB:
case OPC_GETLOD:
case OPC_DSX:
case OPC_DSY:
case OPC_DSXPP_1:
case OPC_DSYPP_1:
case OPC_DSXPP_MACRO:
case OPC_DSYPP_MACRO:
case OPC_QUAD_SHUFFLE_BRCST:
case OPC_QUAD_SHUFFLE_HORIZ:
case OPC_QUAD_SHUFFLE_VERT:
case OPC_QUAD_SHUFFLE_DIAG:
case OPC_META_TEX_PREFETCH:
return true;
/* Subgroup operations don't require helper invocations to be present, but
* will use helper invocations if they are present.
*/
case OPC_BALLOT_MACRO:
case OPC_ANY_MACRO:
case OPC_ALL_MACRO:
case OPC_ELECT_MACRO:
case OPC_READ_FIRST_MACRO:
case OPC_READ_COND_MACRO:
case OPC_MOVMSK:
case OPC_BRCST_ACTIVE:
return true;
/* Catch lowered READ_FIRST/READ_COND. */
case OPC_MOV:
return (instr->dsts[0]->flags & IR3_REG_SHARED) &&
!(instr->srcs[0]->flags & IR3_REG_SHARED);
default:
return false;
}
}
static inline bool
is_bool(struct ir3_instruction *instr)
{
@@ -1704,6 +1757,9 @@ __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
/* iterators for instructions: */
#define foreach_instr(__instr, __list) \
list_for_each_entry (struct ir3_instruction, __instr, __list, node)
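/* Iterate instructions starting at __start (inclusive) through the end of
 * __list.
 */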
#define foreach_instr_from(__instr, __start, __list) \
list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
__list, node)
#define foreach_instr_rev(__instr, __list) \
list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list) \


@@ -908,6 +908,215 @@ nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
}
}
struct ir3_helper_block_data {
/* Whether helper invocations may be used on any path starting at the
* beginning of the block.
*/
bool uses_helpers_beginning;
/* Whether helper invocations may be used by the end of the block. Branch
* instructions are considered to be "between" blocks, because (eq) has to be
* inserted after them in the successor blocks, so branch instructions using
* helpers will result in uses_helpers_end = true for their block.
*/
bool uses_helpers_end;
};
/* Insert (eq) after the last instruction using the results of helper
* invocations. Use a backwards dataflow analysis to determine at which points
* in the program helper invocations are definitely never used, and then insert
* (eq) at the point where we cross from a point where they may be used to a
* point where they are never used.
*/
static void
helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
struct ir3_shader_variant *so)
{
bool non_prefetch_helpers = false;
foreach_block (block, &ir->block_list) {
struct ir3_helper_block_data *bd =
rzalloc(ctx, struct ir3_helper_block_data);
foreach_instr (instr, &block->instr_list) {
if (uses_helpers(instr)) {
bd->uses_helpers_beginning = true;
if (instr->opc != OPC_META_TEX_PREFETCH) {
non_prefetch_helpers = true;
break;
}
}
if (instr->opc == OPC_SHPE) {
/* (eq) is not allowed in preambles, so mark the whole preamble as
 * requiring helpers to avoid putting it there.
 */
bd->uses_helpers_beginning = true;
bd->uses_helpers_end = true;
}
}
if (block->brtype == IR3_BRANCH_ALL ||
block->brtype == IR3_BRANCH_ANY ||
block->brtype == IR3_BRANCH_GETONE) {
bd->uses_helpers_end = true;
}
block->data = bd;
}
/* If only prefetches use helpers then we can disable them in the shader via
* a register setting.
*/
if (!non_prefetch_helpers) {
so->prefetch_end_of_quad = true;
return;
}
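/* Backwards dataflow: if a block may use helpers at its beginning, then each
 * of its predecessors may use helpers at its end (and therefore at its
 * beginning too). Iterate until we reach a fixed point.
 */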
bool progress;
do {
progress = false;
foreach_block_rev (block, &ir->block_list) {
struct ir3_helper_block_data *bd = block->data;
if (!bd->uses_helpers_beginning)
continue;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
struct ir3_helper_block_data *pred_bd = pred->data;
if (!pred_bd->uses_helpers_end) {
pred_bd->uses_helpers_end = true;
}
if (!pred_bd->uses_helpers_beginning) {
pred_bd->uses_helpers_beginning = true;
progress = true;
}
}
}
} while (progress);
/* Now, we need to determine the points where helper invocations become
* unused.
*/
foreach_block (block, &ir->block_list) {
struct ir3_helper_block_data *bd = block->data;
if (bd->uses_helpers_end)
continue;
/* We need to check the predecessors because of situations with critical
 * edges like this that can occur after optimizing jumps:
 *
 *    br p0.x, #endif
 *    ...
 *    sam ...
 *    ...
 *    endif:
 *    ...
 *    end
 *
 * The endif block will have uses_helpers_beginning = false and
 * uses_helpers_end = false, but because we jump to there from the
 * beginning of the if where uses_helpers_end = true, we still want to
 * add an (eq) at the beginning of the block:
 *
 *    br p0.x, #endif
 *    ...
 *    sam ...
 *    (eq)nop
 *    ...
 *    endif:
 *    (eq)nop
 *    ...
 *    end
 *
 * This adds an extra nop in the case where the branch isn't taken, but that's
 * probably preferable to adding an extra jump instruction which is what
 * would happen if we ran this pass before optimizing jumps:
 *
 *    br p0.x, #else
 *    ...
 *    sam ...
 *    (eq)nop
 *    ...
 *    jump #endif
 *    else:
 *    (eq)nop
 *    endif:
 *    ...
 *    end
 *
 * We also need this to make sure we insert (eq) after branches which use
 * helper invocations.
 */
bool pred_uses_helpers = bd->uses_helpers_beginning;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
struct ir3_helper_block_data *pred_bd = pred->data;
if (pred_bd->uses_helpers_end) {
pred_uses_helpers = true;
break;
}
}
if (!pred_uses_helpers)
continue;
/* The last use of helpers is somewhere between the beginning and the end of
 * this block. first_instr will be the first instruction where helpers are no
 * longer required, or NULL if the last instruction in the block still
 * requires helpers (so they only become unneeded at the very end).
 */
struct ir3_instruction *first_instr = NULL;
foreach_instr_rev (instr, &block->instr_list) {
/* Skip prefetches because they actually execute before the block
* starts and at this stage they aren't guaranteed to be at the start
* of the block.
*/
if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
break;
first_instr = instr;
}
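/* killed: an existing nop at or after first_instr has been tagged with (eq),
 * so no new nop is needed. expensive_instruction_in_block: a non-ALU/SFU
 * instruction follows first_instr, so inserting a fresh (eq) nop is likely
 * worth the cost.
 */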
bool killed = false;
bool expensive_instruction_in_block = false;
if (first_instr) {
foreach_instr_from (instr, first_instr, &block->instr_list) {
/* If there's already a nop, we don't have to worry about whether to
* insert one.
*/
if (instr->opc == OPC_NOP) {
instr->flags |= IR3_INSTR_EQ;
killed = true;
break;
}
/* ALU and SFU instructions probably aren't going to benefit much
* from killing helper invocations, because they complete at least
* an entire quad in a cycle and don't access any quad-divergent
* memory, so delay emitting (eq) in the hopes that we find a nop
* afterwards.
*/
if (is_alu(instr) || is_sfu(instr))
continue;
expensive_instruction_in_block = true;
break;
}
}
/* If this block isn't the last block before the end instruction, assume
* that there may be expensive instructions in later blocks so it's worth
* it to insert a nop.
*/
if (!killed && (expensive_instruction_in_block ||
block->successors[0] != ir3_end_block(ir))) {
struct ir3_instruction *nop = ir3_NOP(block);
nop->flags |= IR3_INSTR_EQ;
if (first_instr)
ir3_instr_move_before(nop, first_instr);
}
}
}
bool
ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
{
@@ -976,6 +1185,11 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
while (opt_jump(ir))
;
/* TODO: does (eq) exist before a6xx? */
if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
so->compiler->gen >= 6)
helper_sched(ctx, ir, so);
ir3_count_instructions(ir);
resolve_jumps(ir);