nir: Be smarter fusing ffma
If there is a single use of fmul, and that single use is fadd, it makes sense to fuse ffma, as we already do. However, if there are multiple uses, fusing may impede code gen. Consider the source fragment:

    a = fmul(x, y)
    b = fadd(a, z)
    c = fmin(a, t)
    d = fmax(b, c)

The fmul has two uses. The current ffma fusing is greedy and will produce the following "optimized" code:

    a = fmul(x, y)
    b = ffma(x, y, z)
    c = fmin(a, t)
    d = fmax(b, c)

Actually, this code is worse! Instead of 1 fmul + 1 fadd, we now have 1 fmul + 1 ffma. In effect, two multiplies (and a fused add) instead of one multiply and an add. Depending on the ISA, that could impede scheduling or increase code size. It can also increase register pressure, extending the live range.

It's tempting to gate on is_used_once, but that would hurt in cases where we really do fuse everything, e.g.:

    a = fmul(x, y)
    b = fadd(a, z)
    c = fadd(a, t)

For ISAs that fuse ffma, we expect that 2 ffma is faster than 1 fmul + 2 fadd.

So what we really want is to fuse ffma iff the fmul will get deleted. That occurs iff all uses of the fmul are fadd and will themselves get fused to ffma, leaving fmul to get dead code eliminated. That's easy to implement with a new NIR search helper, checking that all uses are fadd.
shader-db results on Mali-G57 [open shader-db + subset of closed]:

total instructions in shared programs: 179491 -> 178991 (-0.28%)
instructions in affected programs: 36862 -> 36362 (-1.36%)
helped: 190
HURT: 27

total cycles in shared programs: 10573.20 -> 10571.75 (-0.01%)
cycles in affected programs: 72.02 -> 70.56 (-2.02%)
helped: 28
HURT: 1

total fma in shared programs: 1590.47 -> 1582.61 (-0.49%)
fma in affected programs: 319.95 -> 312.09 (-2.46%)
helped: 194
HURT: 1

total cvt in shared programs: 812.98 -> 813.03 (<.01%)
cvt in affected programs: 118.53 -> 118.58 (0.04%)
helped: 65
HURT: 81

total quadwords in shared programs: 98968 -> 98840 (-0.13%)
quadwords in affected programs: 2960 -> 2832 (-4.32%)
helped: 20
HURT: 4

total threads in shared programs: 4693 -> 4697 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0

v2: Update trace checksums for virgl due to numerical differences.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18814>
This commit is contained in:

committed by
Marge Bot

parent
07c654e08f
commit
ac2964dfbd
@@ -2589,10 +2589,10 @@ late_optimizations = [
|
||||
|
||||
# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
|
||||
# gets combined to fma(a, b, -c).
|
||||
(('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
|
||||
(('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
|
||||
(('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
|
||||
(('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),
|
||||
(('~fadd@16', ('fmul(is_only_used_by_fadd)', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
|
||||
(('~fadd@32', ('fmul(is_only_used_by_fadd)', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
|
||||
(('~fadd@64', ('fmul(is_only_used_by_fadd)', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
|
||||
(('~fadd@32', ('fmulz(is_only_used_by_fadd)', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),
|
||||
|
||||
# Subtractions get lowered during optimization, so we need to recombine them
|
||||
(('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
|
||||
|
@@ -422,6 +422,24 @@ is_only_used_as_float(const nir_alu_instr *instr)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_only_used_by_fadd(const nir_alu_instr *instr)
|
||||
{
|
||||
nir_foreach_use(src, &instr->dest.dest.ssa) {
|
||||
const nir_instr *const user_instr = src->parent_instr;
|
||||
if (user_instr->type != nir_instr_type_alu)
|
||||
return false;
|
||||
|
||||
const nir_alu_instr *const user_alu = nir_instr_as_alu(user_instr);
|
||||
assert(instr != user_alu);
|
||||
|
||||
if (user_alu->op != nir_op_fadd)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
only_lower_8_bits_used(const nir_alu_instr *instr)
|
||||
{
|
||||
|
Reference in New Issue
Block a user