nir,vc4: Suffix a bunch of unorm 4x8 opcodes _vc4

Reviewed-by: Alyssa Rosenzweig <alyssa@collabora.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11463>
This commit is contained in:
Jason Ekstrand
2021-06-18 09:28:59 -05:00
parent 0afbfee8da
commit 2e08bae9b3
4 changed files with 66 additions and 64 deletions

View File

@@ -883,51 +883,6 @@ binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0") binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0") binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
# Saturated vector add for four 8-bit channels packed into one 32-bit value.
# Each byte lane of src0 and src1 is extracted with a 0xff mask, the two lane
# values are added, and the sum is clamped to 0xff before being OR'd back into
# dst at the same bit position.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")
# Saturated vector subtract for four 8-bit channels packed into one 32-bit
# value. For each byte lane, the result is src0 - src1 when the src0 lane is
# strictly greater than the src1 lane; otherwise that lane of dst stays 0.
# Operand order matters, so no commutative/associative properties are declared.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")
# Vector minimum for four 8-bit channels packed into one 32-bit value.
# Each byte lane of src0 and src1 is masked to 0..0xff, and the smaller of the
# two lane values is OR'd into dst at the same bit position.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
# Vector maximum for four 8-bit channels packed into one 32-bit value.
# Each byte lane of src0 and src1 is masked to 0..0xff, and the larger of the
# two lane values is OR'd into dst at the same bit position.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
# Unorm multiply for four 8-bit channels packed into one 32-bit value:
# per lane, (a * b) / 255 using integer division, so 0xff acts as 1.0 and a
# lane multiplied by 0xff is returned unchanged.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)") binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32, binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
@@ -1286,6 +1241,53 @@ binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")
unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)") unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)") unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
# vc4-specific opcodes
# Saturated vector add for four 8-bit channels packed into one 32-bit value.
# Each byte lane of src0 and src1 is extracted with a 0xff mask, the two lane
# values are added, and the sum is clamped to 0xff before being OR'd back into
# dst at the same bit position.
binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")
# Saturated vector subtract for four 8-bit channels packed into one 32-bit
# value. For each byte lane, the result is src0 - src1 when the src0 lane is
# strictly greater than the src1 lane; otherwise that lane of dst stays 0.
# Operand order matters, so no commutative/associative properties are declared.
binop("ussub_4x8_vc4", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")
# Vector minimum for four 8-bit channels packed into one 32-bit value.
# Each byte lane of src0 and src1 is masked to 0..0xff, and the smaller of the
# two lane values is OR'd into dst at the same bit position.
binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
# Vector maximum for four 8-bit channels packed into one 32-bit value.
# Each byte lane of src0 and src1 is masked to 0..0xff, and the larger of the
# two lane values is OR'd into dst at the same bit position.
binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
# Unorm multiply for four 8-bit channels packed into one 32-bit value:
# per lane, (a * b) / 255 using integer division, so 0xff acts as 1.0 and a
# lane multiplied by 0xff is returned unchanged.
binop("umul_unorm_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
# Mali-specific opcodes # Mali-specific opcodes
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)")) unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)")) unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))

View File

@@ -133,8 +133,8 @@ optimizations = [
(('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16), (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
(('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32), (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
(('iadd', a, 0), a), (('iadd', a, 0), a),
(('usadd_4x8', a, 0), a), (('usadd_4x8_vc4', a, 0), a),
(('usadd_4x8', a, ~0), ~0), (('usadd_4x8_vc4', a, ~0), ~0),
(('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
(('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
(('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))), (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
@@ -151,8 +151,8 @@ optimizations = [
(('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16), (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
(('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32), (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
(('imul', a, 0), 0), (('imul', a, 0), 0),
(('umul_unorm_4x8', a, 0), 0), (('umul_unorm_4x8_vc4', a, 0), 0),
(('umul_unorm_4x8', a, ~0), a), (('umul_unorm_4x8_vc4', a, ~0), a),
(('~fmul', a, 1.0), a), (('~fmul', a, 1.0), a),
# The only effect a*1.0 can have is flushing denormals. If it's only used by # The only effect a*1.0 can have is flushing denormals. If it's only used by
# a floating point instruction, they should flush any input denormals and # a floating point instruction, they should flush any input denormals and
@@ -1333,8 +1333,8 @@ for op in ('extract_u8', 'extract_i8'):
optimizations.extend([ optimizations.extend([
# Subtracts # Subtracts
(('ussub_4x8', a, 0), a), (('ussub_4x8_vc4', a, 0), a),
(('ussub_4x8', a, ~0), 0), (('ussub_4x8_vc4', a, ~0), 0),
# Lower all Subtractions first - they can get recombined later # Lower all Subtractions first - they can get recombined later
(('fsub', a, b), ('fadd', a, ('fneg', b))), (('fsub', a, b), ('fadd', a, ('fneg', b))),
(('isub', a, b), ('iadd', a, ('ineg', b))), (('isub', a, b), ('iadd', a, ('ineg', b))),

View File

@@ -159,7 +159,7 @@ vc4_blend_channel_i(nir_builder *b,
return dst; return dst;
case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
return vc4_nir_set_packed_chan(b, return vc4_nir_set_packed_chan(b,
nir_umin_4x8(b, nir_umin_4x8_vc4(b,
src_a, src_a,
nir_inot(b, dst_a)), nir_inot(b, dst_a)),
nir_imm_int(b, ~0), nir_imm_int(b, ~0),
@@ -226,15 +226,15 @@ vc4_blend_func_i(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
{ {
switch (func) { switch (func) {
case PIPE_BLEND_ADD: case PIPE_BLEND_ADD:
return nir_usadd_4x8(b, src, dst); return nir_usadd_4x8_vc4(b, src, dst);
case PIPE_BLEND_SUBTRACT: case PIPE_BLEND_SUBTRACT:
return nir_ussub_4x8(b, src, dst); return nir_ussub_4x8_vc4(b, src, dst);
case PIPE_BLEND_REVERSE_SUBTRACT: case PIPE_BLEND_REVERSE_SUBTRACT:
return nir_ussub_4x8(b, dst, src); return nir_ussub_4x8_vc4(b, dst, src);
case PIPE_BLEND_MIN: case PIPE_BLEND_MIN:
return nir_umin_4x8(b, src, dst); return nir_umin_4x8_vc4(b, src, dst);
case PIPE_BLEND_MAX: case PIPE_BLEND_MAX:
return nir_umax_4x8(b, src, dst); return nir_umax_4x8_vc4(b, src, dst);
default: default:
/* Unsupported. */ /* Unsupported. */
@@ -353,8 +353,8 @@ vc4_do_blending_i(struct vc4_compile *c, nir_builder *b,
dst_alpha_factor, dst_alpha_factor,
alpha_chan); alpha_chan);
} }
nir_ssa_def *src_blend = nir_umul_unorm_4x8(b, src_color, src_factor); nir_ssa_def *src_blend = nir_umul_unorm_4x8_vc4(b, src_color, src_factor);
nir_ssa_def *dst_blend = nir_umul_unorm_4x8(b, dst_color, dst_factor); nir_ssa_def *dst_blend = nir_umul_unorm_4x8_vc4(b, dst_color, dst_factor);
nir_ssa_def *result = nir_ssa_def *result =
vc4_blend_func_i(b, src_blend, dst_blend, blend->rgb_func); vc4_blend_func_i(b, src_blend, dst_blend, blend->rgb_func);

View File

@@ -1276,23 +1276,23 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
result = ntq_emit_ubfe(c, src[0], src[1], src[2]); result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
break; break;
case nir_op_usadd_4x8: case nir_op_usadd_4x8_vc4:
result = qir_V8ADDS(c, src[0], src[1]); result = qir_V8ADDS(c, src[0], src[1]);
break; break;
case nir_op_ussub_4x8: case nir_op_ussub_4x8_vc4:
result = qir_V8SUBS(c, src[0], src[1]); result = qir_V8SUBS(c, src[0], src[1]);
break; break;
case nir_op_umin_4x8: case nir_op_umin_4x8_vc4:
result = qir_V8MIN(c, src[0], src[1]); result = qir_V8MIN(c, src[0], src[1]);
break; break;
case nir_op_umax_4x8: case nir_op_umax_4x8_vc4:
result = qir_V8MAX(c, src[0], src[1]); result = qir_V8MAX(c, src[0], src[1]);
break; break;
case nir_op_umul_unorm_4x8: case nir_op_umul_unorm_4x8_vc4:
result = qir_V8MULD(c, src[0], src[1]); result = qir_V8MULD(c, src[0], src[1]);
break; break;