From fbf791e70cfae1218c66d56a8a15125e99cac2ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Mon, 21 Dec 2020 12:49:07 +0100
Subject: [PATCH] aco: value number VOPC instructions with different exec masks

This becomes possible as long as we do

   val = s_and_b32/64 exec, val

before any subgroup operations.

This precautional instruction can be removed by the
optimizer if 'val' was computed by a VOPC instruction
using the same exec mask.

Totals from 59 (0.04% of 146267) affected shaders (Navi10):
VGPRs: 2808 -> 2816 (+0.28%)
CodeSize: 340888 -> 340852 (-0.01%); split: -0.20%, +0.19%
Instrs: 61733 -> 61625 (-0.17%); split: -0.18%, +0.01%
Cycles: 470636 -> 469112 (-0.32%); split: -0.33%, +0.01%
VMEM: 8091 -> 7993 (-1.21%)
SMEM: 2736 -> 2719 (-0.62%); split: +0.29%, -0.91%
VClause: 1745 -> 1741 (-0.23%)
SClause: 2394 -> 2392 (-0.08%); split: -0.25%, +0.17%
Copies: 3249 -> 3253 (+0.12%); split: -0.62%, +0.74%
Branches: 1210 -> 1206 (-0.33%)
PreSGPRs: 3126 -> 3176 (+1.60%); split: -0.16%, +1.76%

Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9195>
---
 src/amd/compiler/aco_instruction_selection.cpp | 17 ++++++++++-------
 src/amd/compiler/aco_opt_value_numbering.cpp   |  4 ----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 80834236486..c2c4ed868d4 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -8169,23 +8169,26 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
    case nir_intrinsic_ballot: {
       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-      Definition tmp = bld.def(dst.regClass());
-      Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(bld.lm);
+
       if (instr->src[0].ssa->bit_size == 1) {
          assert(src.regClass() == bld.lm);
-         bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
-         bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
+         src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
-         bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
+         src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand(0u), src);
       } else {
          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
       }
+
+      /* Make sure that all inactive lanes return zero.
+       * Value-numbering might remove the comparison above */
+      src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
       if (dst.size() != bld.lm.size()) {
          /* Wave32 with ballot size set to 64 */
-         bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
+         src = bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand(0u));
       }
-      emit_wqm(bld, tmp.getTemp(), dst);
+
+      emit_wqm(bld, src, dst);
       break;
    }
    case nir_intrinsic_shuffle:
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index 42aa701d40e..bf0d06f63d5 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -173,10 +173,6 @@ struct InstrPred {
       if (a->opcode == aco_opcode::v_readfirstlane_b32)
          return a->pass_flags == b->pass_flags;
 
-      /* The results of VOPC depend on the exec mask if used for subgroup operations. */
-      if (a->isVOPC() && a->pass_flags != b->pass_flags)
-         return false;
-
       if (a->isVOP3()) {
          VOP3_instruction& a3 = a->vop3();
          VOP3_instruction& b3 = b->vop3();