nir/algebraic: intel/fs: Optimize some patterns before lowering 64-bit integers

v2: Add some comments explaining some of the nuance of the shift
optimizations. Fix a bug in the shift count calculation of the upper
32 bits. Move the @64 from the variable to the opcode. All suggested
by Jordan.

No shader-db changes on any Intel platform.

fossil-db:

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 154507026 -> 154506576 (-0.00%)
Cycle count: 17436298868 -> 17436295016 (-0.00%)
Max live registers: 32635309 -> 32635297 (-0.00%)

Totals from 42 (0.01% of 632575) affected shaders:
Instrs: 5616 -> 5166 (-8.01%)
Cycle count: 133680 -> 129828 (-2.88%)
Max live registers: 1158 -> 1146 (-1.04%)

No fossil-db changes on any other Intel platform.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29148>
Author: Ian Romanick
Date:   2023-08-12 16:17:46 -07:00
Parent: 4834df82e2
Commit: 7b7e5cf5d4

3 changed files with 48 additions and 0 deletions

src/compiler/nir/nir.h

@@ -6472,6 +6472,7 @@ typedef struct nir_opt_access_options {
bool nir_opt_access(nir_shader *shader, const nir_opt_access_options *options);
bool nir_opt_algebraic(nir_shader *shader);
bool nir_opt_algebraic_before_ffma(nir_shader *shader);
bool nir_opt_algebraic_before_lower_int64(nir_shader *shader);
bool nir_opt_algebraic_late(nir_shader *shader);
bool nir_opt_algebraic_distribute_src_mods(nir_shader *shader);
bool nir_opt_constant_folding(nir_shader *shader);

src/compiler/nir/nir_opt_algebraic.py

@@ -3447,9 +3447,48 @@ distribute_src_mods = [
(('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]
before_lower_int64_optimizations = [
   # The i2i64(a) implies that 'a' has at most 32 bits of data.
   (('ishl', ('i2i64', a), b),
    # Effective shift count of zero, just return 'a'.
    ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a),
     ('bcsel', ('ilt', ('iand', b, 63), 32),
      # Shifting less than 32 bits, so both 32-bit halves will have some
      # data. These shift counts (and those in the else case) apply to
      # 32-bit values, so they are implicitly modulo 32.
      ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a), ('iadd', ('ineg', b), 32))),
      # Shifting 32 bits or more, so the lower 32 bits must be zero.
      ('pack_64_2x32_split', 0, ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
    '(options->lower_int64_options & nir_lower_shift64) != 0'),
   (('ishl', ('u2u64', a), b),
    ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a),
     ('bcsel', ('ilt', ('iand', b, 63), 32),
      ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a), ('iadd', ('ineg', b), 32))),
      ('pack_64_2x32_split', 0, ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
    '(options->lower_int64_options & nir_lower_shift64) != 0'),
   # If ineg64 is lowered, then the negation is not free. Try to eliminate
   # some of the negations.
   (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)),
   (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)),
   # If the hardware can do int64, the shift is the same cost as the add. It
   # should be fine to do this transformation unconditionally.
   (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)),
   (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)),
]
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64",
before_lower_int64_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
late_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",

src/intel/compiler/brw_nir.c

@@ -1678,6 +1678,12 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   brw_vectorize_lower_mem_access(nir, compiler, robust_flags);

   /* Potentially perform this optimization pass twice because it can create
    * additional opportunities for itself.
    */
   if (OPT(nir_opt_algebraic_before_lower_int64))
      OPT(nir_opt_algebraic_before_lower_int64);

   if (OPT(nir_lower_int64))
      brw_nir_optimize(nir, devinfo);
@@ -1762,6 +1768,8 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   if (OPT(nir_opt_uniform_atomics)) {
      OPT(nir_lower_subgroups, &subgroups_options);

      OPT(nir_opt_algebraic_before_lower_int64);

      if (OPT(nir_lower_int64))
         brw_nir_optimize(nir, devinfo);