diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index bb4ae1bcad7..8d78351fd65 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -98,8 +98,37 @@ reg(asm_context& ctx, Definition def, unsigned width = 32)
    return reg(ctx, def.physReg()) & BITFIELD_MASK(width);
 }
 
+/* Returns whether a VOP1/VOP2/VOPC instruction has to use the VOP3 encoding on gfx levels
+ * after GFX10_3 because a true16 operand or definition is v128 or higher. "dpp_op" is the
+ * original src0 of a DPP instruction (operands[0] has been replaced by then) or NULL.
+ */
+bool
+needs_vop3_gfx11(asm_context& ctx, Instruction* instr, Operand* dpp_op)
+{
+   if (ctx.gfx_level <= GFX10_3)
+      return false;
+
+   uint8_t mask = get_gfx11_true16_mask(instr->opcode);
+   if (!mask)
+      return false;
+
+   u_foreach_bit (i, mask & 0x3) {
+      if (i == 0 && dpp_op && dpp_op->physReg().reg() >= (256 + 128))
+         return true;
+      if (instr->operands[i].physReg().reg() >= (256 + 128))
+         return true;
+   }
+   if ((mask & 0x8) && instr->definitions[0].physReg().reg() >= (256 + 128))
+      return true;
+   return false;
+}
+
+/* "dpp_op_ptr" and "dpp16_ptr" are only passed when emitting the base encoding of a DPP
+ * instruction: they are the original src0 and, for DPP16, the instruction with abs/neg.
+ */
 void
-emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
+emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr,
+                 Operand* dpp_op_ptr = NULL, DPP16_instruction* dpp16_ptr = NULL)
 {
    /* lower remaining pseudo-instructions */
    if (instr->opcode == aco_opcode::p_constaddr_getpc) {
@@ -298,30 +327,80 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       return;
    }
    case Format::VOP2: {
-      uint32_t encoding = 0;
-      encoding |= opcode << 25;
-      encoding |= reg(ctx, instr->definitions[0], 8) << 17;
-      encoding |= reg(ctx, instr->operands[1], 8) << 9;
-      encoding |= reg(ctx, instr->operands[0]);
-      out.push_back(encoding);
+      if (needs_vop3_gfx11(ctx, instr, dpp_op_ptr)) {
+         if (instr->opcode == aco_opcode::v_fmaak_f16) {
+            opcode = ctx.opcode[(int)aco_opcode::v_fma_f16];
+         } else if (instr->opcode == aco_opcode::v_fmamk_f16) {
+            std::swap(instr->operands[1], instr->operands[2]);
+            opcode = ctx.opcode[(int)aco_opcode::v_fma_f16];
+         } else {
+            opcode += 0x100;
+         }
+
+         uint32_t encoding = (0b110101 << 26);
+         encoding |= opcode << 16;
+         encoding |= reg(ctx, instr->definitions[0], 8);
+         encoding |= dpp16_ptr ? (dpp16_ptr->abs[0] << 8) | (dpp16_ptr->abs[1] << 9) : 0;
+         out.push_back(encoding);
+
+         encoding = reg(ctx, instr->operands[0]);
+         encoding |= reg(ctx, instr->operands[1]) << 9;
+         if (instr->opcode == aco_opcode::v_fmaak_f16 ||
+             instr->opcode == aco_opcode::v_fmamk_f16)
+            encoding |= reg(ctx, instr->operands[2]) << 18;
+         encoding |= dpp16_ptr ? (dpp16_ptr->neg[0] << 29) | (dpp16_ptr->neg[1] << 30) : 0;
+         out.push_back(encoding);
+      } else {
+         uint32_t encoding = 0;
+         encoding |= opcode << 25;
+         encoding |= reg(ctx, instr->definitions[0], 8) << 17;
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+         encoding |= reg(ctx, instr->operands[0]);
+         out.push_back(encoding);
+      }
       break;
    }
    case Format::VOP1: {
-      uint32_t encoding = (0b0111111 << 25);
-      if (!instr->definitions.empty())
-         encoding |= reg(ctx, instr->definitions[0], 8) << 17;
-      encoding |= opcode << 9;
-      if (!instr->operands.empty())
-         encoding |= reg(ctx, instr->operands[0]);
-      out.push_back(encoding);
+      if (needs_vop3_gfx11(ctx, instr, dpp_op_ptr)) {
+         uint32_t encoding = (0b110101 << 26);
+         encoding |= (opcode + 0x180) << 16;
+         encoding |= reg(ctx, instr->definitions[0], 8);
+         encoding |= dpp16_ptr ? dpp16_ptr->abs[0] << 8 : 0;
+         out.push_back(encoding);
+
+         encoding = reg(ctx, instr->operands[0]);
+         encoding |= dpp16_ptr ? dpp16_ptr->neg[0] << 29 : 0;
+         out.push_back(encoding);
+      } else {
+         uint32_t encoding = (0b0111111 << 25);
+         if (!instr->definitions.empty())
+            encoding |= reg(ctx, instr->definitions[0], 8) << 17;
+         encoding |= opcode << 9;
+         if (!instr->operands.empty())
+            encoding |= reg(ctx, instr->operands[0]);
+         out.push_back(encoding);
+      }
       break;
    }
    case Format::VOPC: {
-      uint32_t encoding = (0b0111110 << 25);
-      encoding |= opcode << 17;
-      encoding |= reg(ctx, instr->operands[1], 8) << 9;
-      encoding |= reg(ctx, instr->operands[0]);
-      out.push_back(encoding);
+      if (needs_vop3_gfx11(ctx, instr, dpp_op_ptr)) {
+         uint32_t encoding = (0b110101 << 26);
+         encoding |= opcode << 16;
+         encoding |= reg(ctx, instr->definitions[0], 8);
+         encoding |= dpp16_ptr ? (dpp16_ptr->abs[0] << 8) | (dpp16_ptr->abs[1] << 9) : 0;
+         out.push_back(encoding);
+
+         encoding = reg(ctx, instr->operands[0]);
+         encoding |= reg(ctx, instr->operands[1]) << 9;
+         encoding |= dpp16_ptr ? (dpp16_ptr->neg[0] << 29) | (dpp16_ptr->neg[1] << 30) : 0;
+         out.push_back(encoding);
+      } else {
+         uint32_t encoding = (0b0111110 << 25);
+         encoding |= opcode << 17;
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+         encoding |= reg(ctx, instr->operands[0]);
+         out.push_back(encoding);
+      }
       break;
    }
    case Format::VINTRP: {
@@ -802,7 +881,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          Operand dpp_op = instr->operands[0];
          instr->operands[0] = Operand(PhysReg{250}, v1);
          instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP16);
-         emit_instruction(ctx, out, instr);
+         emit_instruction(ctx, out, instr, &dpp_op, &dpp);
          uint32_t encoding = (0xF & dpp.row_mask) << 28;
          encoding |= (0xF & dpp.bank_mask) << 24;
          encoding |= dpp.abs[1] << 23;
@@ -824,7 +903,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          Operand dpp_op = instr->operands[0];
          instr->operands[0] = Operand(PhysReg{234}, v1);
          instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP8);
-         emit_instruction(ctx, out, instr);
+         emit_instruction(ctx, out, instr, &dpp_op);
          uint32_t encoding = reg(ctx, dpp_op, 8);
          for (unsigned i = 0; i < 8; ++i)
             encoding |= dpp.lane_sel[i] << (8 + i * 3);
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 44a975b6f4e..eb1daf4d3d1 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -533,6 +533,115 @@ instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
    }
 }
 
+/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
+ * only supports v0-v127.
+ *
+ * Returns a mask of the affected fields: bit 0 and bit 1 are operands 0 and 1, bit 3 is the
+ * definition. 0 means the opcode is not affected.
+ */
+// TODO: take advantage of this functionality in the RA and assembler
+uint8_t
+get_gfx11_true16_mask(aco_opcode op)
+{
+   switch (op) {
+   case aco_opcode::v_ceil_f16:
+   case aco_opcode::v_cos_f16:
+   case aco_opcode::v_cvt_f16_i16:
+   case aco_opcode::v_cvt_f16_u16:
+   case aco_opcode::v_cvt_i16_f16:
+   case aco_opcode::v_cvt_u16_f16:
+   case aco_opcode::v_cvt_norm_i16_f16:
+   case aco_opcode::v_cvt_norm_u16_f16:
+   case aco_opcode::v_exp_f16:
+   case aco_opcode::v_floor_f16:
+   case aco_opcode::v_fract_f16:
+   case aco_opcode::v_frexp_exp_i16_f16:
+   case aco_opcode::v_frexp_mant_f16:
+   case aco_opcode::v_log_f16:
+   case aco_opcode::v_not_b16:
+   case aco_opcode::v_rcp_f16:
+   case aco_opcode::v_rndne_f16:
+   case aco_opcode::v_rsq_f16:
+   case aco_opcode::v_sin_f16:
+   case aco_opcode::v_sqrt_f16:
+   case aco_opcode::v_trunc_f16:
+   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
+   case aco_opcode::v_add_f16:
+   case aco_opcode::v_fmaak_f16:
+   case aco_opcode::v_fmac_f16:
+   case aco_opcode::v_fmamk_f16:
+   case aco_opcode::v_ldexp_f16:
+   case aco_opcode::v_max_f16:
+   case aco_opcode::v_min_f16:
+   case aco_opcode::v_mul_f16:
+   case aco_opcode::v_sub_f16:
+   case aco_opcode::v_subrev_f16:
+   case aco_opcode::v_and_b16:
+   case aco_opcode::v_or_b16:
+   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
+   case aco_opcode::v_cmp_class_f16:
+   case aco_opcode::v_cmpx_class_f16:
+   case aco_opcode::v_cvt_f32_f16:
+   case aco_opcode::v_cvt_i32_i16:
+   case aco_opcode::v_cvt_u32_u16: return 0x1;
+   case aco_opcode::v_cmp_eq_f16:
+   case aco_opcode::v_cmp_eq_i16:
+   case aco_opcode::v_cmp_eq_u16:
+   case aco_opcode::v_cmp_ge_f16:
+   case aco_opcode::v_cmp_ge_i16:
+   case aco_opcode::v_cmp_ge_u16:
+   case aco_opcode::v_cmp_gt_f16:
+   case aco_opcode::v_cmp_gt_i16:
+   case aco_opcode::v_cmp_gt_u16:
+   case aco_opcode::v_cmp_le_f16:
+   case aco_opcode::v_cmp_le_i16:
+   case aco_opcode::v_cmp_le_u16:
+   case aco_opcode::v_cmp_lg_f16:
+   case aco_opcode::v_cmp_lg_i16:
+   case aco_opcode::v_cmp_lg_u16:
+   case aco_opcode::v_cmp_lt_f16:
+   case aco_opcode::v_cmp_lt_i16:
+   case aco_opcode::v_cmp_lt_u16:
+   case aco_opcode::v_cmp_neq_f16:
+   case aco_opcode::v_cmp_nge_f16:
+   case aco_opcode::v_cmp_ngt_f16:
+   case aco_opcode::v_cmp_nle_f16:
+   case aco_opcode::v_cmp_nlg_f16:
+   case aco_opcode::v_cmp_nlt_f16:
+   case aco_opcode::v_cmp_o_f16:
+   case aco_opcode::v_cmp_u_f16:
+   case aco_opcode::v_cmpx_eq_f16:
+   case aco_opcode::v_cmpx_eq_i16:
+   case aco_opcode::v_cmpx_eq_u16:
+   case aco_opcode::v_cmpx_ge_f16:
+   case aco_opcode::v_cmpx_ge_i16:
+   case aco_opcode::v_cmpx_ge_u16:
+   case aco_opcode::v_cmpx_gt_f16:
+   case aco_opcode::v_cmpx_gt_i16:
+   case aco_opcode::v_cmpx_gt_u16:
+   case aco_opcode::v_cmpx_le_f16:
+   case aco_opcode::v_cmpx_le_i16:
+   case aco_opcode::v_cmpx_le_u16:
+   case aco_opcode::v_cmpx_lg_f16:
+   case aco_opcode::v_cmpx_lg_i16:
+   case aco_opcode::v_cmpx_lg_u16:
+   case aco_opcode::v_cmpx_lt_f16:
+   case aco_opcode::v_cmpx_lt_i16:
+   case aco_opcode::v_cmpx_lt_u16:
+   case aco_opcode::v_cmpx_neq_f16:
+   case aco_opcode::v_cmpx_nge_f16:
+   case aco_opcode::v_cmpx_ngt_f16:
+   case aco_opcode::v_cmpx_nle_f16:
+   case aco_opcode::v_cmpx_nlg_f16:
+   case aco_opcode::v_cmpx_nlt_f16:
+   case aco_opcode::v_cmpx_o_f16:
+   case aco_opcode::v_cmpx_u_f16: return 0x3;
+   case aco_opcode::v_cvt_f16_f32:
+   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
+   default: return 0x0;
+   }
+}
+
 uint32_t
 get_reduction_identity(ReduceOp op, unsigned idx)
 {
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index cbdd300117c..baf64b04267 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1872,6 +1872,7 @@ is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
 
 bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
+uint8_t get_gfx11_true16_mask(aco_opcode op);
 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
 bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp
index 533f69c6ba7..4f4c29a60a7 100644
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@@ -807,4 +807,90 @@ BEGIN_TEST(assembler.gfx11.ldsdir)
 
    finish_assembler_test();
 END_TEST
+
+BEGIN_TEST(assembler.gfx11.vop12c_v128)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   Definition dst_v0 = bld.def(v1);
+   dst_v0.setFixed(PhysReg(256));
+
+   Definition dst_v128 = bld.def(v1);
+   dst_v128.setFixed(PhysReg(256 + 128));
+
+   Operand op_v1(bld.tmp(v1));
+   op_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v2(bld.tmp(v1));
+   op_v2.setFixed(PhysReg(256 + 2));
+
+   Operand op_v129(bld.tmp(v1));
+   op_v129.setFixed(PhysReg(256 + 129));
+
+   Operand op_v130(bld.tmp(v1));
+   op_v130.setFixed(PhysReg(256 + 130));
+
+   //>> BB0:
+   //! v_mul_f16_e32 v0, v1, v2 ; Error: VGPR_32_Lo128: unknown register 128 ; 6a000501
+   bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v2);
+
+   //! v_mul_f16_e64 v128, v1, v2 ; d5350080 00020501
+   bld.vop2(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
+
+   //! v_mul_f16_e64 v0, v129, v2 ; d5350000 00020581
+   bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
+
+   //! v_mul_f16_e64 v0, v1, v130 ; d5350000 00030501
+   bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
+
+   //! v_rcp_f16_e64 v128, v1 ; d5d40080 00000101
+   bld.vop1(aco_opcode::v_rcp_f16, dst_v128, op_v1);
+
+   //! v_cmp_eq_f16_e64 vcc, v129, v2 ; d402006a 00020581
+   bld.vopc(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2);
+
+   //! v_mul_f16_e64_dpp v128, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 000204fa ff0d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1));
+
+   //! v_mul_f16_e64_dpp v0, v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000204fa ff0d2181
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2, dpp_row_rr(1));
+
+   //! v_mul_f16_e64_dpp v0, v1, v130 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000304fa ff0d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130, dpp_row_rr(1));
+
+   //! v_mul_f16_e64_dpp v128, v1, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350080 000204ea 00000001
+   bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
+
+   //! v_mul_f16_e64_dpp v0, v129, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350000 000204ea 00000081
+   bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
+
+   //! v_mul_f16_e64_dpp v0, v1, v130 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350000 000304ea 00000001
+   bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
+
+   //! v_fma_f16 v128, v1, v2, 0x60 ; d6480080 03fe0501 00000060
+   bld.vop2(aco_opcode::v_fmaak_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
+
+   //! v_fma_f16 v128, v1, 0x60, v2 ; d6480080 0409ff01 00000060
+   bld.vop2(aco_opcode::v_fmamk_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
+
+   //! v_rcp_f16_e64_dpp v128, -v1 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40080 200000fa ff1d2101
+   bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().neg[0] = true;
+
+   //! v_rcp_f16_e64_dpp v128, |v1| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40180 000000fa ff2d2101
+   bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().abs[0] = true;
+
+   //! v_mul_f16_e64_dpp v128, -v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 200204fa ff1d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().neg[0] = true;
+
+   //! v_mul_f16_e64_dpp v128, |v1|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350180 000204fa ff2d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().abs[0] = true;
+
+   //! v_cmp_eq_f16_e64_dpp vcc, -v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402006a 200204fa ff1d2181
+   bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))->dpp16().neg[0] = true;
+
+   //! v_cmp_eq_f16_e64_dpp vcc, |v129|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402016a 000204fa ff2d2181
+   bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))->dpp16().abs[0] = true;
+
+   finish_assembler_test();
+END_TEST
 #endif