aco: try harder to not create v_mul_lo_u32

fossil-db (Vega):
Totals from 4 (0.00% of 137413) affected shaders:
CodeSize: 13708 -> 13716 (+0.06%)
Instrs: 2742 -> 2744 (+0.07%)
Cycles: 24348 -> 24236 (-0.46%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5390>
This commit is contained in:
Rhys Perry
2020-06-05 17:52:24 +01:00
committed by Marge Bot
parent 8ca23bcf39
commit 02c5519e6c

View File

@@ -371,6 +371,10 @@ public:
Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false)
{
assert(tmp.type() == RegType::vgpr);
bool has_lshl_add = program->chip_class >= GFX9;
/* v_mul_lo_u32 has 1.6x the latency of most VALU on GFX10 (8 vs 5 cycles),
* compared to 4x the latency on <GFX10. */
unsigned mul_cost = program->chip_class >= GFX10 ? 1 : (4 + Operand(imm).isLiteral());
if (imm == 0) {
return copy(dst, Operand(0u));
} else if (imm == 1) {
@@ -379,10 +383,40 @@ public:
return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp);
} else if (bits24) {
return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp);
} else {
Temp imm_tmp = copy(def(s1), Operand(imm));
return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
} else if (util_is_power_of_two_nonzero(imm - 1u)) {
return vadd32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm - 1u) - 1u), tmp), tmp);
} else if (mul_cost > 2 && util_is_power_of_two_nonzero(imm + 1u)) {
return vsub32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm + 1u) - 1u), tmp), tmp);
}
unsigned instrs_required = util_bitcount(imm);
if (!has_lshl_add) {
instrs_required = util_bitcount(imm) - (imm & 0x1); /* shifts */
instrs_required += util_bitcount(imm) - 1; /* additions */
}
if (instrs_required < mul_cost) {
Result res(NULL);
Temp cur;
while (imm) {
unsigned shift = u_bit_scan(&imm);
Definition tmp_dst = imm ? def(v1) : dst;
if (shift && cur.id())
res = vadd32(Definition(tmp_dst), vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand(shift), tmp), cur);
else if (shift)
res = vop2(aco_opcode::v_lshlrev_b32, Definition(tmp_dst), Operand(shift), tmp);
else if (cur.id())
res = vadd32(Definition(tmp_dst), tmp, cur);
else
tmp_dst = Definition(tmp);
cur = tmp_dst.getTemp();
}
return res;
}
Temp imm_tmp = copy(def(s1), Operand(imm));
return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
}
Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)