nir/algebraic: intel/fs: Optimize some patterns before lowering 64-bit integers

v2: Add some comments explaining some of the nuance of the shift
optimizations. Fix a bug in the shift count calculation of the upper
32 bits. Move the @64 from the variable to the opcode. All suggested
by Jordan.

No shader-db changes on any Intel platform.

fossil-db:

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 154507026 -> 154506576 (-0.00%)
Cycle count: 17436298868 -> 17436295016 (-0.00%)
Max live registers: 32635309 -> 32635297 (-0.00%)

Totals from 42 (0.01% of 632575) affected shaders:
Instrs: 5616 -> 5166 (-8.01%)
Cycle count: 133680 -> 129828 (-2.88%)
Max live registers: 1158 -> 1146 (-1.04%)

No fossil-db changes on any other Intel platform.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29148>
Author: Ian Romanick
Date:   2023-08-12 16:17:46 -07:00
Parent: 4834df82e2
Commit: 7b7e5cf5d4

3 changed files with 48 additions and 0 deletions

src/compiler/nir/nir.h

@@ -6472,6 +6472,7 @@ typedef struct nir_opt_access_options {
bool nir_opt_access(nir_shader *shader, const nir_opt_access_options *options);
bool nir_opt_algebraic(nir_shader *shader);
bool nir_opt_algebraic_before_ffma(nir_shader *shader);
bool nir_opt_algebraic_before_lower_int64(nir_shader *shader);
bool nir_opt_algebraic_late(nir_shader *shader);
bool nir_opt_algebraic_distribute_src_mods(nir_shader *shader);
bool nir_opt_constant_folding(nir_shader *shader);

src/compiler/nir/nir_opt_algebraic.py

@@ -3447,9 +3447,48 @@ distribute_src_mods = [
(('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]
before_lower_int64_optimizations = [
   # The i2i64(a) implies that 'a' has at most 32 bits of data.
   (('ishl', ('i2i64', a), b),
    # Effective shift count of zero, just return 'a'.
    ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a),
     ('bcsel', ('ilt', ('iand', b, 63), 32),
      # Shifting less than 32 bits, so both 32-bit halves will have some
      # data. These shift counts (and those in the else case) apply to
      # 32-bit values, so they are implicitly modulo 32.
      ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a), ('iadd', ('ineg', b), 32))),
      # Shifting 32 bits or more, so the lower 32 bits must be zero.
      ('pack_64_2x32_split', 0, ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
    '(options->lower_int64_options & nir_lower_shift64) != 0'),
   (('ishl', ('u2u64', a), b),
    ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a),
     ('bcsel', ('ilt', ('iand', b, 63), 32),
      ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a), ('iadd', ('ineg', b), 32))),
      ('pack_64_2x32_split', 0, ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
    '(options->lower_int64_options & nir_lower_shift64) != 0'),
   # If ineg64 is lowered, then the negation is not free. Try to eliminate
   # some of the negations.
   (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)),
   (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)),
   # If the hardware can do int64, the shift is the same cost as the add. It
   # should be fine to do this transformation unconditionally.
   (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)),
   (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)),
]
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64",
before_lower_int64_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
late_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",

src/intel/compiler/brw_nir.c

@@ -1678,6 +1678,12 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   brw_vectorize_lower_mem_access(nir, compiler, robust_flags);

   /* Potentially perform this optimization pass twice because it can create
    * additional opportunities for itself.
    */
   if (OPT(nir_opt_algebraic_before_lower_int64))
      OPT(nir_opt_algebraic_before_lower_int64);

   if (OPT(nir_lower_int64))
      brw_nir_optimize(nir, devinfo);
@@ -1762,6 +1768,8 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   if (OPT(nir_opt_uniform_atomics)) {
      OPT(nir_lower_subgroups, &subgroups_options);

      OPT(nir_opt_algebraic_before_lower_int64);

      if (OPT(nir_lower_int64))
         brw_nir_optimize(nir, devinfo);