nir/opt_if: add opt_if_rewrite_uniform_uses

Turns:
if (a == (b=readfirstlane(a)))
   use(a)
into:
if (a == (b=readfirstlane(a)))
   use(b)

Inside the if, b is known to be uniform, so this improves divergence
analysis and lets us scalarize what was use(a). Improves Cyberpunk 2077
performance.
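
Such comparisons commonly come from waterfall loops for non-uniform
resource indexing; as a rough illustrative sketch (not taken from this
commit), the pattern looks like:

loop {
   if (idx == (uni_idx=readfirstlane(idx))) {
      use(idx)   /* becomes use(uni_idx), which is uniform inside the if */
      break
   }
}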

fossil-db (Sienna Cichlid, Cyberpunk 2077):
Totals from 57 (10.56% of 540) affected shaders:
VGPRs: 4904 -> 4040 (-17.62%)
CodeSize: 624360 -> 626828 (+0.40%); split: -0.06%, +0.46%
MaxWaves: 656 -> 824 (+25.61%)
Instrs: 119770 -> 119447 (-0.27%); split: -0.49%, +0.22%
Latency: 1950256 -> 1633110 (-16.26%); split: -16.26%, +0.00%
InvThroughput: 364852 -> 292089 (-19.94%)
VClause: 1512 -> 1008 (-33.33%)
SClause: 2693 -> 3196 (+18.68%)
Copies: 10050 -> 9955 (-0.95%); split: -3.34%, +2.40%
Branches: 3476 -> 3547 (+2.04%)
PreSGPRs: 4003 -> 5076 (+26.80%)
PreVGPRs: 4709 -> 3810 (-19.09%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12472>
commit e43007af56 (parent 69f9a96af1)
Author:    Rhys Perry
Date:      2021-08-18 19:08:54 +01:00
Committer: Marge Bot

@@ -1200,6 +1200,111 @@ opt_if_evaluate_condition_use(nir_builder *b, nir_if *nif)
   return progress;
}

static bool
rewrite_comp_uses_within_if(nir_builder *b, nir_if *nif, bool invert,
                            nir_ssa_scalar scalar, nir_ssa_scalar new_scalar)
{
   bool progress = false;

   nir_block *first = invert ? nir_if_first_else_block(nif) : nir_if_first_then_block(nif);
   nir_block *last = invert ? nir_if_last_else_block(nif) : nir_if_last_then_block(nif);

   nir_ssa_def *new_ssa = NULL;
   nir_foreach_use_safe(use, scalar.def) {
      if (use->parent_instr->block->index < first->index ||
          use->parent_instr->block->index > last->index)
         continue;

      /* Only rewrite users which use only the new component. This is to avoid a
       * situation where copy propagation will undo the rewrite and we risk an infinite
       * loop.
       *
       * We could rewrite users which use a mix of the old and new components, but if
       * nir_src_components_read() is incomplete, then we risk the new component actually
       * being unused and some optimization later undoing the rewrite.
       */
      if (nir_src_components_read(use) != BITFIELD64_BIT(scalar.comp))
         continue;

      if (!new_ssa) {
         b->cursor = nir_before_cf_node(&nif->cf_node);
         new_ssa = nir_channel(b, new_scalar.def, new_scalar.comp);
         if (scalar.def->num_components > 1) {
            nir_ssa_def *vec = nir_ssa_undef(b, scalar.def->num_components, scalar.def->bit_size);
            new_ssa = nir_vector_insert_imm(b, vec, new_ssa, scalar.comp);
         }
      }

      nir_instr_rewrite_src_ssa(use->parent_instr, use, new_ssa);
      progress = true;
   }

   return progress;
}

/*
 * This optimization turns:
 *
 *    if (a == (b=readfirstlane(a)))
 *       use(a)
 *    if (c == (d=load_const))
 *       use(c)
 *
 * into:
 *
 *    if (a == (b=readfirstlane(a)))
 *       use(b)
 *    if (c == (d=load_const))
 *       use(d)
 */
static bool
opt_if_rewrite_uniform_uses(nir_builder *b, nir_if *nif, nir_ssa_scalar cond, bool accept_ine)
{
   bool progress = false;

   if (!nir_ssa_scalar_is_alu(cond))
      return false;

   nir_op op = nir_ssa_scalar_alu_op(cond);
   if (op == nir_op_iand) {
      progress |= opt_if_rewrite_uniform_uses(b, nif, nir_ssa_scalar_chase_alu_src(cond, 0), false);
      progress |= opt_if_rewrite_uniform_uses(b, nif, nir_ssa_scalar_chase_alu_src(cond, 1), false);
      return progress;
   }

   if (op != nir_op_ieq && (op != nir_op_ine || !accept_ine))
      return false;

   for (unsigned i = 0; i < 2; i++) {
      nir_ssa_scalar src_uni = nir_ssa_scalar_chase_alu_src(cond, i);
      nir_ssa_scalar src_div = nir_ssa_scalar_chase_alu_src(cond, !i);

      if (src_uni.def->parent_instr->type == nir_instr_type_load_const && src_div.def != src_uni.def)
         return rewrite_comp_uses_within_if(b, nif, op == nir_op_ine, src_div, src_uni);

      if (src_uni.def->parent_instr->type != nir_instr_type_intrinsic)
         continue;
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src_uni.def->parent_instr);
      if (intrin->intrinsic != nir_intrinsic_read_first_invocation &&
          (intrin->intrinsic != nir_intrinsic_reduce || nir_intrinsic_cluster_size(intrin)))
         continue;

      nir_ssa_scalar intrin_src = {intrin->src[0].ssa, src_uni.comp};
      nir_ssa_scalar resolved_intrin_src = nir_ssa_scalar_resolved(intrin_src.def, intrin_src.comp);

      if (resolved_intrin_src.comp != src_div.comp || resolved_intrin_src.def != src_div.def)
         continue;

      progress |= rewrite_comp_uses_within_if(b, nif, op == nir_op_ine, resolved_intrin_src, src_uni);
      if (intrin_src.comp != resolved_intrin_src.comp || intrin_src.def != resolved_intrin_src.def)
         progress |= rewrite_comp_uses_within_if(b, nif, op == nir_op_ine, intrin_src, src_uni);

      return progress;
   }

   return false;
}

static void
simple_merge_if(nir_if *dest_if, nir_if *src_if, bool dest_if_then,
bool src_if_then)
@@ -1387,6 +1492,8 @@ opt_if_safe_cf_list(nir_builder *b, struct exec_list *cf_list)
         progress |= opt_if_safe_cf_list(b, &nif->then_list);
         progress |= opt_if_safe_cf_list(b, &nif->else_list);
         progress |= opt_if_evaluate_condition_use(b, nif);
         nir_ssa_scalar cond = nir_ssa_scalar_resolved(nif->condition.ssa, 0);
         progress |= opt_if_rewrite_uniform_uses(b, nif, cond, true);
         break;
      }