aco: try harder to not create v_mul_lo_u32
fossil-db (Vega):
Totals from 4 (0.00% of 137413) affected shaders:
CodeSize: 13708 -> 13716 (+0.06%)
Instrs: 2742 -> 2744 (+0.07%)
Cycles: 24348 -> 24236 (-0.46%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5390>
This commit is contained in:
@@ -371,6 +371,10 @@ public:
|
||||
Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false)
|
||||
{
|
||||
assert(tmp.type() == RegType::vgpr);
|
||||
bool has_lshl_add = program->chip_class >= GFX9;
|
||||
/* v_mul_lo_u32 has 1.6x the latency of most VALU on GFX10 (8 vs 5 cycles),
|
||||
* compared to 4x the latency on <GFX10. */
|
||||
unsigned mul_cost = program->chip_class >= GFX10 ? 1 : (4 + Operand(imm).isLiteral());
|
||||
if (imm == 0) {
|
||||
return copy(dst, Operand(0u));
|
||||
} else if (imm == 1) {
|
||||
@@ -379,10 +383,40 @@ public:
|
||||
return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp);
|
||||
} else if (bits24) {
|
||||
return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp);
|
||||
} else {
|
||||
Temp imm_tmp = copy(def(s1), Operand(imm));
|
||||
return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
|
||||
} else if (util_is_power_of_two_nonzero(imm - 1u)) {
|
||||
return vadd32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm - 1u) - 1u), tmp), tmp);
|
||||
} else if (mul_cost > 2 && util_is_power_of_two_nonzero(imm + 1u)) {
|
||||
return vsub32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm + 1u) - 1u), tmp), tmp);
|
||||
}
|
||||
|
||||
unsigned instrs_required = util_bitcount(imm);
|
||||
if (!has_lshl_add) {
|
||||
instrs_required = util_bitcount(imm) - (imm & 0x1); /* shifts */
|
||||
instrs_required += util_bitcount(imm) - 1; /* additions */
|
||||
}
|
||||
if (instrs_required < mul_cost) {
|
||||
Result res(NULL);
|
||||
Temp cur;
|
||||
while (imm) {
|
||||
unsigned shift = u_bit_scan(&imm);
|
||||
Definition tmp_dst = imm ? def(v1) : dst;
|
||||
|
||||
if (shift && cur.id())
|
||||
res = vadd32(Definition(tmp_dst), vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand(shift), tmp), cur);
|
||||
else if (shift)
|
||||
res = vop2(aco_opcode::v_lshlrev_b32, Definition(tmp_dst), Operand(shift), tmp);
|
||||
else if (cur.id())
|
||||
res = vadd32(Definition(tmp_dst), tmp, cur);
|
||||
else
|
||||
tmp_dst = Definition(tmp);
|
||||
|
||||
cur = tmp_dst.getTemp();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
Temp imm_tmp = copy(def(s1), Operand(imm));
|
||||
return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
|
||||
}
|
||||
|
||||
Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)
|
||||
|
Reference in New Issue
Block a user