diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 10592a4d1ef..f427b02c926 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -2939,6 +2939,57 @@ bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
    return true;
 }
 
+/* Try to replace a v_mad_u32_u16 whose accumulator is zero with the VOP2
+ * v_mul_u32_u24. The instruction is rewritten in place on success and left
+ * untouched otherwise. ctx is currently unused.
+ */
+void select_mul_u32_u24(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+{
+   if (instr->usesModifiers())
+      return;
+
+   /* Only valid if the accumulator is zero (this is selected by isel to
+    * combine more v_add_u32+v_mad_u32_u16 together), but the optimizer
+    * falls back here when that is not possible.
+    */
+   if (!instr->operands[2].constantEquals(0))
+      return;
+
+   /* Only valid if the upper 16-bits of both operands are zero (because
+    * v_mul_u32_u24 doesn't mask them): temporaries must be flagged as
+    * 16-bit and constants must fit in 16 bits.
+    */
+   for (unsigned i = 0; i < 2; i++) {
+      const Operand& op = instr->operands[i];
+      if (op.isTemp() && !op.is16bit())
+         return;
+      if (op.isConstant() && op.constantValue() > 0xffffu)
+         return;
+   }
+
+   /* VOP2 instructions can only take constants/sgprs in operand 0. */
+   auto is_const_or_sgpr = [](const Operand& op) {
+      return op.isConstant() ||
+             (op.hasRegClass() && op.regClass().type() == RegType::sgpr);
+   };
+
+   bool swap = false;
+   if (is_const_or_sgpr(instr->operands[1])) {
+      /* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because
+       * v_mul_u32_u24 has no advantages.
+       */
+      if (is_const_or_sgpr(instr->operands[0]))
+         return;
+      swap = true;
+   }
+
+   VOP2_instruction *new_instr = create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);
+   new_instr->operands[0] = instr->operands[swap];
+   new_instr->operands[1] = instr->operands[!swap];
+   new_instr->definitions[0] = instr->definitions[0];
+   instr.reset(new_instr);
+}
+
 void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 {
    const uint32_t threshold = 4;
@@ -3102,6 +3153,9 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
       return;
    }
 
+   if (instr->opcode == aco_opcode::v_mad_u32_u16)
+      select_mul_u32_u24(ctx, instr);
+
    if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
       return; /* some encodings can't ever take literals */
 
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 4be2f6226ae..4ac8dc4dd11 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -156,3 +156,48 @@ BEGIN_TEST(optimize.add_lshl)
       finish_opt_test();
    }
 END_TEST
+
+/* Build "v_mad_u32_u16 a, b, c" with a v1 result, after setting the 16-bit
+ * flag of a and b to is16bit.
+ */
+Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
+{
+   a.set16bit(is16bit);
+   b.set16bit(is16bit);
+
+   return bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
+}
+
+BEGIN_TEST(optimize.mad_u32_u16)
+   for (unsigned i = GFX9; i <= GFX10; i++) {
+      //>> v1: %a, v1: %b, s1: %c, s2: %_:exec = p_startpgm
+      if (!setup_cs("v1 v1 s1", (chip_class)i))
+         continue;
+
+      //! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
+      //! p_unit_test 0, %res0
+      writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand(0u)));
+
+      //! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
+      //! p_unit_test 1, %res1
+      writeout(1, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u)));
+
+      //! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
+      //! p_unit_test 2, %res2
+      writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand(42u), Operand(0u)));
+
+      //! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
+      //! p_unit_test 3, %res3
+      writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand(0u)));
+
+      //! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
+      //! p_unit_test 4, %res4
+      writeout(4, create_mad_u32_u16(Operand(42u), Operand(inputs[2]), Operand(0u)));
+
+      //! v1: %res5 = v_mad_u32_u16 42, %a, 0
+      //! p_unit_test 5, %res5
+      writeout(5, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u), false));
+
+      finish_opt_test();
+   }
+END_TEST