aco: use v_add_f{16,32} with clamp for fsat
v_add can be dual-issued on gfx11; v_med3 cannot. v_add is not emitted directly: the v_med3 is converted in combine_instruction instead, so that omod(fsat(x)) can still be optimized.

Foz-DB GFX1100:
Totals from 32702 (24.24% of 134913) affected shaders:
Latency: 475008203 -> 474928037 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 59226198 -> 59140787 (-0.14%); split: -0.14%, +0.00%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21402>
This commit is contained in:
@@ -1296,6 +1296,33 @@ is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64
|
||||
return offset >= min && offset <= max;
|
||||
}
|
||||
|
||||
bool
|
||||
detect_clamp(Instruction* instr, unsigned* clamped_idx)
|
||||
{
|
||||
VALU_instruction& valu = instr->valu();
|
||||
if (valu.omod != 0 || valu.opsel != 0)
|
||||
return false;
|
||||
|
||||
unsigned idx = 0;
|
||||
bool found_zero = false, found_one = false;
|
||||
bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (!valu.neg[i] && instr->operands[i].constantEquals(0))
|
||||
found_zero = true;
|
||||
else if (!valu.neg[i] &&
|
||||
instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
|
||||
found_one = true;
|
||||
else
|
||||
idx = i;
|
||||
}
|
||||
if (found_zero && found_one && instr->operands[idx].isTemp()) {
|
||||
*clamped_idx = idx;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
@@ -1882,22 +1909,8 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
break;
|
||||
case aco_opcode::v_med3_f16:
|
||||
case aco_opcode::v_med3_f32: { /* clamp */
|
||||
VALU_instruction& vop3 = instr->valu();
|
||||
if (vop3.abs != 0 || vop3.neg != 0 || vop3.omod != 0 || vop3.opsel != 0)
|
||||
break;
|
||||
|
||||
unsigned idx = 0;
|
||||
bool found_zero = false, found_one = false;
|
||||
bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (instr->operands[i].constantEquals(0))
|
||||
found_zero = true;
|
||||
else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
|
||||
found_one = true;
|
||||
else
|
||||
idx = i;
|
||||
}
|
||||
if (found_zero && found_one && instr->operands[idx].isTemp())
|
||||
unsigned idx;
|
||||
if (detect_clamp(instr.get(), &idx) && !instr->valu().abs && !instr->valu().neg)
|
||||
ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
|
||||
break;
|
||||
}
|
||||
@@ -4503,6 +4516,19 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
*/
|
||||
ctx.mad_infos.emplace_back(nullptr, 0);
|
||||
ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
|
||||
} else if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) {
|
||||
unsigned idx;
|
||||
if (detect_clamp(instr.get(), &idx)) {
|
||||
instr->format = asVOP3(Format::VOP2);
|
||||
instr->operands[0] = instr->operands[idx];
|
||||
instr->operands[1] = Operand::zero();
|
||||
instr->opcode =
|
||||
instr->opcode == aco_opcode::v_med3_f32 ? aco_opcode::v_add_f32 : aco_opcode::v_add_f16;
|
||||
instr->valu().clamp = true;
|
||||
instr->valu().abs = (uint8_t)instr->valu().abs[idx];
|
||||
instr->valu().neg = (uint8_t)instr->valu().neg[idx];
|
||||
instr->operands.pop_back();
|
||||
}
|
||||
} else {
|
||||
aco_opcode min, max, min3, max3, med3, minmax;
|
||||
bool some_gfx9_only;
|
||||
|
@@ -1174,12 +1174,12 @@ BEGIN_TEST(optimize.casts)
|
||||
writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(a16))), a16));
|
||||
|
||||
//! v1: %res3_tmp = v_mul_f32 %a, %a
|
||||
//! v2b: %res3 = v_med3_f16 0, 1.0, %res3_tmp
|
||||
//! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp
|
||||
//! p_unit_test 3, %res3
|
||||
writeout(3, fsat(u2u16(fmul(a, a))));
|
||||
|
||||
//! v2b: %res4_tmp = v_mul_f16 %a16, %a16
|
||||
//! v1: %res4 = v_med3_f32 0, 1.0, %res4_tmp
|
||||
//! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp
|
||||
//! p_unit_test 4, %res4
|
||||
writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));
|
||||
|
||||
@@ -1701,12 +1701,12 @@ BEGIN_TEST(optimize.mad_mix.cast)
|
||||
writeout(3, f2f32(u2u16(fmul(a, a))));
|
||||
|
||||
//! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
|
||||
//! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul
|
||||
//! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp
|
||||
//! p_unit_test 4, %res4
|
||||
writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
|
||||
|
||||
//! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
|
||||
//! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul
|
||||
//! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
|
||||
//! p_unit_test 5, %res5
|
||||
writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
|
||||
|
||||
|
@@ -551,7 +551,7 @@ BEGIN_TEST(optimize.sdwa.insert_modifiers)
|
||||
writeout(2, val);
|
||||
|
||||
//! v1: %tmp3 = v_rcp_f32 %a dst_sel:ubyte0 src0_sel:dword
|
||||
//! v1: %res3 = v_med3_f32 %tmp3, 0, 1.0
|
||||
//! v1: %res3 = v_add_f32 %tmp3, 0 clamp
|
||||
//! p_unit_test 3, %res3
|
||||
val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]);
|
||||
val = bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u));
|
||||
|
Reference in New Issue
Block a user