From fbf791e70cfae1218c66d56a8a15125e99cac2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 21 Dec 2020 12:49:07 +0100 Subject: [PATCH] aco: value number VOPC instructions with different exec masks This becomes possible as long as we do `val = s_and_b32/64 exec, val` before any subgroup operations. This precautionary instruction can be removed by the optimizer if 'val' was computed by a VOPC instruction using the same exec mask. Totals from 59 (0.04% of 146267) affected shaders (Navi10): VGPRs: 2808 -> 2816 (+0.28%) CodeSize: 340888 -> 340852 (-0.01%); split: -0.20%, +0.19% Instrs: 61733 -> 61625 (-0.17%); split: -0.18%, +0.01% Cycles: 470636 -> 469112 (-0.32%); split: -0.33%, +0.01% VMEM: 8091 -> 7993 (-1.21%) SMEM: 2736 -> 2719 (-0.62%); split: +0.29%, -0.91% VClause: 1745 -> 1741 (-0.23%) SClause: 2394 -> 2392 (-0.08%); split: -0.25%, +0.17% Copies: 3249 -> 3253 (+0.12%); split: -0.62%, +0.74% Branches: 1210 -> 1206 (-0.33%) PreSGPRs: 3126 -> 3176 (+1.60%); split: -0.16%, +1.76% Reviewed-by: Tony Wasserka Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 17 ++++++++++------- src/amd/compiler/aco_opt_value_numbering.cpp | 4 ---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 80834236486..c2c4ed868d4 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8169,23 +8169,26 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_ballot: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - Definition tmp = bld.def(dst.regClass()); - Definition lanemask_tmp = dst.size() == bld.lm.size() ? 
tmp : bld.def(bld.lm); + if (instr->src[0].ssa->bit_size == 1) { assert(src.regClass() == bld.lm); - bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src); } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { - bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src); + src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { - bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src); + src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand(0u), src); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } + + /* Make sure that all inactive lanes return zero. + * Value-numbering might remove the comparison above */ + src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); if (dst.size() != bld.lm.size()) { /* Wave32 with ballot size set to 64 */ - bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u)); + src = bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand(0u)); } - emit_wqm(bld, tmp.getTemp(), dst); + + emit_wqm(bld, src, dst); break; } case nir_intrinsic_shuffle: diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 42aa701d40e..bf0d06f63d5 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -173,10 +173,6 @@ struct InstrPred { if (a->opcode == aco_opcode::v_readfirstlane_b32) return a->pass_flags == b->pass_flags; - /* The results of VOPC depend on the exec mask if used for subgroup operations. */ - if (a->isVOPC() && a->pass_flags != b->pass_flags) - return false; - if (a->isVOP3()) { VOP3_instruction& a3 = a->vop3(); VOP3_instruction& b3 = b->vop3();