From 02c5519e6c6bca75f9cd5603fa6e8a6e51eff4e3 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 5 Jun 2020 17:52:24 +0100 Subject: [PATCH] aco: try harder to not create v_mul_lo_u32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Vega): Totals from 4 (0.00% of 137413) affected shaders: CodeSize: 13708 -> 13716 (+0.06%) Instrs: 2742 -> 2744 (+0.07%) Cycles: 24348 -> 24236 (-0.46%) Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Part-of: --- src/amd/compiler/aco_builder_h.py | 40 ++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 7f8b5c472c4..27f477856c1 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -371,6 +371,10 @@ public: Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false) { assert(tmp.type() == RegType::vgpr); + bool has_lshl_add = program->chip_class >= GFX9; + /* v_mul_lo_u32 has 1.6x the latency of most VALU on GFX10 (8 vs 5 cycles), + * compared to 4x the latency on <GFX10. */ + unsigned mul_cost = program->chip_class >= GFX10 ? 
1 : (4 + Operand(imm).isLiteral()); if (imm == 0) { return copy(dst, Operand(0u)); } else if (imm == 1) { @@ -379,10 +383,40 @@ public: return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp); } else if (bits24) { return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp); - } else { - Temp imm_tmp = copy(def(s1), Operand(imm)); - return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp); + } else if (util_is_power_of_two_nonzero(imm - 1u)) { + return vadd32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm - 1u) - 1u), tmp), tmp); + } else if (mul_cost > 2 && util_is_power_of_two_nonzero(imm + 1u)) { + return vsub32(dst, vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand((uint32_t)ffs(imm + 1u) - 1u), tmp), tmp); } + + unsigned instrs_required = util_bitcount(imm); + if (!has_lshl_add) { + instrs_required = util_bitcount(imm) - (imm & 0x1); /* shifts */ + instrs_required += util_bitcount(imm) - 1; /* additions */ + } + if (instrs_required < mul_cost) { + Result res(NULL); + Temp cur; + while (imm) { + unsigned shift = u_bit_scan(&imm); + Definition tmp_dst = imm ? def(v1) : dst; + + if (shift && cur.id()) + res = vadd32(Definition(tmp_dst), vop2(aco_opcode::v_lshlrev_b32, def(v1), Operand(shift), tmp), cur); + else if (shift) + res = vop2(aco_opcode::v_lshlrev_b32, Definition(tmp_dst), Operand(shift), tmp); + else if (cur.id()) + res = vadd32(Definition(tmp_dst), tmp, cur); + else + tmp_dst = Definition(tmp); + + cur = tmp_dst.getTemp(); + } + return res; + } + + Temp imm_tmp = copy(def(s1), Operand(imm)); + return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp); } Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)