aco: value number VOPC instructions with different exec masks
Value numbering VOPC instructions across different exec masks is safe as long as we emit `val = s_and_b32/64 exec, val` before any subgroup operation that consumes 'val'. This precautionary instruction can be removed by the optimizer if 'val' was computed by a VOPC instruction using the same exec mask.

Totals from 59 (0.04% of 146267) affected shaders (Navi10):
VGPRs: 2808 -> 2816 (+0.28%)
CodeSize: 340888 -> 340852 (-0.01%); split: -0.20%, +0.19%
Instrs: 61733 -> 61625 (-0.17%); split: -0.18%, +0.01%
Cycles: 470636 -> 469112 (-0.32%); split: -0.33%, +0.01%
VMEM: 8091 -> 7993 (-1.21%)
SMEM: 2736 -> 2719 (-0.62%); split: +0.29%, -0.91%
VClause: 1745 -> 1741 (-0.23%)
SClause: 2394 -> 2392 (-0.08%); split: -0.25%, +0.17%
Copies: 3249 -> 3253 (+0.12%); split: -0.62%, +0.74%
Branches: 1210 -> 1206 (-0.33%)
PreSGPRs: 3126 -> 3176 (+1.60%); split: -0.16%, +1.76%

Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9195>
This commit is contained in:
@@ -8169,23 +8169,26 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||
case nir_intrinsic_ballot: {
|
||||
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
|
||||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||
Definition tmp = bld.def(dst.regClass());
|
||||
Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(bld.lm);
|
||||
|
||||
if (instr->src[0].ssa->bit_size == 1) {
|
||||
assert(src.regClass() == bld.lm);
|
||||
bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
|
||||
} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
|
||||
bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
|
||||
src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
|
||||
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
|
||||
bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
|
||||
src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand(0u), src);
|
||||
} else {
|
||||
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
|
||||
}
|
||||
|
||||
/* Make sure that all inactive lanes return zero.
|
||||
* Value-numbering might remove the comparison above */
|
||||
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
|
||||
if (dst.size() != bld.lm.size()) {
|
||||
/* Wave32 with ballot size set to 64 */
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
|
||||
src = bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand(0u));
|
||||
}
|
||||
emit_wqm(bld, tmp.getTemp(), dst);
|
||||
|
||||
emit_wqm(bld, src, dst);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_shuffle:
|
||||
|
@@ -173,10 +173,6 @@ struct InstrPred {
|
||||
if (a->opcode == aco_opcode::v_readfirstlane_b32)
|
||||
return a->pass_flags == b->pass_flags;
|
||||
|
||||
/* The results of VOPC depend on the exec mask if used for subgroup operations. */
|
||||
if (a->isVOPC() && a->pass_flags != b->pass_flags)
|
||||
return false;
|
||||
|
||||
if (a->isVOP3()) {
|
||||
VOP3_instruction& a3 = a->vop3();
|
||||
VOP3_instruction& b3 = b->vop3();
|
||||
|
Reference in New Issue
Block a user