aco: value number VOPC instructions with different exec masks

This becomes possible as long as we emit

   val = s_and_b32/64 exec, val

before any subgroup operations.

This precautionary instruction can be removed by the
optimizer if 'val' was computed by a VOPC instruction
using the same exec mask.

Totals from 59 (0.04% of 146267) affected shaders (Navi10):
VGPRs: 2808 -> 2816 (+0.28%)
CodeSize: 340888 -> 340852 (-0.01%); split: -0.20%, +0.19%
Instrs: 61733 -> 61625 (-0.17%); split: -0.18%, +0.01%
Cycles: 470636 -> 469112 (-0.32%); split: -0.33%, +0.01%
VMEM: 8091 -> 7993 (-1.21%)
SMEM: 2736 -> 2719 (-0.62%); split: +0.29%, -0.91%
VClause: 1745 -> 1741 (-0.23%)
SClause: 2394 -> 2392 (-0.08%); split: -0.25%, +0.17%
Copies: 3249 -> 3253 (+0.12%); split: -0.62%, +0.74%
Branches: 1210 -> 1206 (-0.33%)
PreSGPRs: 3126 -> 3176 (+1.60%); split: -0.16%, +1.76%

Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9195>
This commit is contained in:
Daniel Schürmann
2020-12-21 12:49:07 +01:00
parent e6ff50f7d3
commit fbf791e70c
2 changed files with 10 additions and 11 deletions

View File

@@ -8169,23 +8169,26 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
case nir_intrinsic_ballot: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
Definition tmp = bld.def(dst.regClass());
Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(bld.lm);
if (instr->src[0].ssa->bit_size == 1) {
assert(src.regClass() == bld.lm);
bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand(0u), src);
} else {
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
/* Make sure that all inactive lanes return zero.
* Value-numbering might remove the comparison above */
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
if (dst.size() != bld.lm.size()) {
/* Wave32 with ballot size set to 64 */
bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
src = bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand(0u));
}
emit_wqm(bld, tmp.getTemp(), dst);
emit_wqm(bld, src, dst);
break;
}
case nir_intrinsic_shuffle:

View File

@@ -173,10 +173,6 @@ struct InstrPred {
if (a->opcode == aco_opcode::v_readfirstlane_b32)
return a->pass_flags == b->pass_flags;
/* The results of VOPC depend on the exec mask if used for subgroup operations. */
if (a->isVOPC() && a->pass_flags != b->pass_flags)
return false;
if (a->isVOP3()) {
VOP3_instruction& a3 = a->vop3();
VOP3_instruction& b3 = b->vop3();