aco: try harder to not create v_mul_lo_u32
fossil-db (Vega):
Totals from 4 (0.00% of 137413) affected shaders:
CodeSize: 13708 -> 13716 (+0.06%)
Instrs: 2742 -> 2744 (+0.07%)
Cycles: 24348 -> 24236 (-0.46%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5390>
This commit is contained in:
@@ -371,6 +371,10 @@ public:
|
|||||||
Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false)
|
Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false)
|
||||||
{
|
{
|
||||||
assert(tmp.type() == RegType::vgpr);
|
assert(tmp.type() == RegType::vgpr);
|
||||||
|
bool has_lshl_add = program->chip_class >= GFX9;
|
||||||
|
/* v_mul_lo_u32 has 1.6x the latency of most VALU on GFX10 (8 vs 5 cycles),
|
||||||
|
* compared to 4x the latency on <GFX10. */
|
||||||
|
unsigned mul_cost = program->chip_class >= GFX10 ? 1 : (4 + Operand(imm).isLiteral());
|
||||||
if (imm == 0) {
|
if (imm == 0) {
|
||||||
return copy(dst, Operand(0u));
|
return copy(dst, Operand(0u));
|
||||||
} else if (imm == 1) {
|
} else if (imm == 1) {
|
||||||
@@ -379,10 +383,40 @@ public:
|
|||||||
return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp);
|
return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp);
|
||||||
} else if (bits24) {
|
} else if (bits24) {
|
||||||
return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp);
|
return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp);
|
||||||
} else {
|
} else if (util_is_power_of_two_nonzero(imm - 1u)) {
|
||||||
Temp imm_tmp = copy(def(s1), Operand(imm));
|
return vadd32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm - 1u) - 1u), tmp), tmp);
|
||||||
return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
|
} else if (mul_cost > 2 && util_is_power_of_two_nonzero(imm + 1u)) {
|
||||||
|
return vsub32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm + 1u) - 1u), tmp), tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned instrs_required = util_bitcount(imm);
|
||||||
|
if (!has_lshl_add) {
|
||||||
|
instrs_required = util_bitcount(imm) - (imm & 0x1); /* shifts */
|
||||||
|
instrs_required += util_bitcount(imm) - 1; /* additions */
|
||||||
|
}
|
||||||
|
if (instrs_required < mul_cost) {
|
||||||
|
Result res(NULL);
|
||||||
|
Temp cur;
|
||||||
|
while (imm) {
|
||||||
|
unsigned shift = u_bit_scan(&imm);
|
||||||
|
Definition tmp_dst = imm ? def(v1) : dst;
|
||||||
|
|
||||||
|
if (shift && cur.id())
|
||||||
|
res = vadd32(Definition(tmp_dst), vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand(shift), tmp), cur);
|
||||||
|
else if (shift)
|
||||||
|
res = vop2(aco_opcode::v_lshlrev_b32, Definition(tmp_dst), Operand(shift), tmp);
|
||||||
|
else if (cur.id())
|
||||||
|
res = vadd32(Definition(tmp_dst), tmp, cur);
|
||||||
|
else
|
||||||
|
tmp_dst = Definition(tmp);
|
||||||
|
|
||||||
|
cur = tmp_dst.getTemp();
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
Temp imm_tmp = copy(def(s1), Operand(imm));
|
||||||
|
return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)
|
Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)
|
||||||
|
Reference in New Issue
Block a user