diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 7f8b5c472c4..27f477856c1 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -371,6 +371,10 @@ public: Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false) { assert(tmp.type() == RegType::vgpr); + bool has_lshl_add = program->chip_class >= GFX9; + /* v_mul_lo_u32 has 1.6x the latency of most VALU on GFX10 (8 vs 5 cycles), + * compared to 4x the latency on chips older than GFX10. */ + unsigned mul_cost = program->chip_class >= GFX10 ? 1 : (4 + Operand(imm).isLiteral()); if (imm == 0) { return copy(dst, Operand(0u)); } else if (imm == 1) { @@ -379,10 +383,40 @@ public: return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp); } else if (bits24) { return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp); - } else { - Temp imm_tmp = copy(def(s1), Operand(imm)); - return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp); + } else if (util_is_power_of_two_nonzero(imm - 1u)) { + return vadd32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm - 1u) - 1u), tmp), tmp); + } else if (mul_cost > 2 && util_is_power_of_two_nonzero(imm + 1u)) { + return vsub32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm + 1u) - 1u), tmp), tmp); } + unsigned instrs_required = util_bitcount(imm); + if (!has_lshl_add) { + instrs_required = util_bitcount(imm) - (imm & 0x1); /* shifts */ + instrs_required += util_bitcount(imm) - 1; /* additions */ + } + if (instrs_required < mul_cost) { + Result res(NULL); + Temp cur; + while (imm) { + unsigned shift = u_bit_scan(&imm); + Definition tmp_dst = imm ? 
def(v1) : dst; + + if (shift && cur.id()) + res = vadd32(Definition(tmp_dst), vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand(shift), tmp), cur); + else if (shift) + res = vop2(aco_opcode::v_lshlrev_b32, Definition(tmp_dst), Operand(shift), tmp); + else if (cur.id()) + res = vadd32(Definition(tmp_dst), tmp, cur); + else + tmp_dst = Definition(tmp); + + cur = tmp_dst.getTemp(); + } + return res; + } + + Temp imm_tmp = copy(def(s1), Operand(imm)); + return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp); } Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)