nir/algebraic: intel/fs: Optimize some patterns before lowering 64-bit integers
v2: Add some comments explaining some of the nuance of the shift
optimizations. Fix a bug in the shift count calculation of the upper
32-bits. Move the @64 from the variable to the opcode. All suggested by
Jordan.

No shader-db changes on any Intel platform.

fossil-db: Meteor Lake and DG2 had similar results. (Meteor Lake shown)

Totals:
Instrs: 154507026 -> 154506576 (-0.00%)
Cycle count: 17436298868 -> 17436295016 (-0.00%)
Max live registers: 32635309 -> 32635297 (-0.00%)

Totals from 42 (0.01% of 632575) affected shaders:
Instrs: 5616 -> 5166 (-8.01%)
Cycle count: 133680 -> 129828 (-2.88%)
Max live registers: 1158 -> 1146 (-1.04%)

No fossil-db changes on any other Intel platform.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29148>
@@ -6472,6 +6472,7 @@ typedef struct nir_opt_access_options {
bool nir_opt_access(nir_shader *shader, const nir_opt_access_options *options);
bool nir_opt_algebraic(nir_shader *shader);
bool nir_opt_algebraic_before_ffma(nir_shader *shader);
bool nir_opt_algebraic_before_lower_int64(nir_shader *shader);
bool nir_opt_algebraic_late(nir_shader *shader);
bool nir_opt_algebraic_distribute_src_mods(nir_shader *shader);
bool nir_opt_constant_folding(nir_shader *shader);
@@ -3447,9 +3447,48 @@ distribute_src_mods = [
   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]

before_lower_int64_optimizations = [
   # The i2i64(a) implies that 'a' has at most 32-bits of data.
   (('ishl', ('i2i64', a), b),
    # Effective shift count of zero, just return 'a'.
    ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a),
     ('bcsel', ('ilt', ('iand', b, 63), 32),
      # Shifting less than 32 bits, so both 32-bit halves will have
      # some data. These (and the else case) shift counts are of 32-bit
      # values, so the shift counts are implicitly modulo 32.
      ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a), ('iadd', ('ineg', b), 32))),
      # Shifting 32 bits or more, so lower 32 bits must be zero.
      ('pack_64_2x32_split', 0, ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
    '(options->lower_int64_options & nir_lower_shift64) != 0'),

   (('ishl', ('u2u64', a), b),
    ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a),
     ('bcsel', ('ilt', ('iand', b, 63), 32),
      ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a), ('iadd', ('ineg', b), 32))),
      ('pack_64_2x32_split', 0, ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
    '(options->lower_int64_options & nir_lower_shift64) != 0'),

   # If ineg64 is lowered, then the negation is not free. Try to eliminate
   # some of the negations.
   (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
   (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'),

   (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)),
   (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)),

   # If the hardware can do int64, the shift is the same cost as the add. It
   # should be fine to do this transformation unconditionally.
   (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)),
   (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)),
]

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                  before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64",
                                  before_lower_int64_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                  late_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
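As a sanity check on the shift decomposition above, the standalone Python sketch below models the replacement pattern with plain integers and compares it against a true 64-bit shift. It is not part of the patch, and the helper names (pack_64_2x32_split, ishl64_of_i2i64) are illustrative only.

MASK32 = 0xffffffff
MASK64 = 0xffffffffffffffff

def pack_64_2x32_split(lo, hi):
    # Matches NIR's pack_64_2x32_split: lo in bits 0..31, hi in bits 32..63.
    return ((hi & MASK32) << 32) | (lo & MASK32)

def ishl64_of_i2i64(a, b):
    # Model of the replacement for ishl(i2i64(a), b), where 'a' is a signed
    # 32-bit value.  64-bit shift counts are implicitly masked to 6 bits;
    # 32-bit shift counts are implicitly masked to 5 bits.
    s = b & 63
    if s == 0:
        return a & MASK64                      # just i2i64(a), sign-extended
    if s < 32:
        lo = (a << s) & MASK32                 # ishl(i2i32(a), b)
        hi = (a >> (32 - s)) & MASK32          # ishr(i2i32(a), 32 - b), arithmetic
        return pack_64_2x32_split(lo, hi)
    # Shifting by 32 or more: the low half is zero, and the 32-bit ishl of the
    # high half uses |32 - b|, which is congruent to (s - 32) modulo 32.
    hi = (a << (abs(32 - b) & 31)) & MASK32
    return pack_64_2x32_split(0, hi)

if __name__ == "__main__":
    for a in (0, 1, -1, -5, 0x7fffffff, -0x80000000):
        for b in range(64):
            ref = (a << (b & 63)) & MASK64     # reference: true 64-bit shift
            assert ishl64_of_i2i64(a, b) == ref, (a, b)
    print("shift decomposition matches the 64-bit reference")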
@@ -1678,6 +1678,12 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   brw_vectorize_lower_mem_access(nir, compiler, robust_flags);

   /* Potentially perform this optimization pass twice because it can create
    * additional opportunities for itself.
    */
   if (OPT(nir_opt_algebraic_before_lower_int64))
      OPT(nir_opt_algebraic_before_lower_int64);

   if (OPT(nir_lower_int64))
      brw_nir_optimize(nir, devinfo);

@@ -1762,6 +1768,8 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   if (OPT(nir_opt_uniform_atomics)) {
      OPT(nir_lower_subgroups, &subgroups_options);

      OPT(nir_opt_algebraic_before_lower_int64);

      if (OPT(nir_lower_int64))
         brw_nir_optimize(nir, devinfo);
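For context on the "run twice" comment above: one rewrite can expose another, so a second application may make further progress. A plausible chain, assuming nir_lower_shift64 is set so the shift rule applies (this is an illustration, not something stated in the commit):

# First application:  iadd(i2i64(a), i2i64(a))  ->  ishl(i2i64(a), 1)
#                     (the unconditional add-to-shift rule)
# Second application: ishl(i2i64(a), 1)  ->  pack_64_2x32_split(ishl(i2i32(a), 1),
#                                                               ishr(i2i32(a), 31))
#                     (the shift rule, once its constant bcsel conditions fold away)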