diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 1cbcb9b1619..92768a637ab 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4201,32 +4201,33 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) opsel_hi |= opsel_hi << 1; Temp tl = src; - Temp tr = src; - if (nir_src_is_divergent(instr->src[0].src)) { + if (nir_src_is_divergent(instr->src[0].src)) tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1); - tr = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl2); - } - VALU_instruction& sub = - bld.vop3p(aco_opcode::v_pk_add_f16, Definition(dst), tr, tl, opsel_lo, opsel_hi) - .instr->valu(); - sub.neg_lo[1] = true; - sub.neg_hi[1] = true; + Builder::Result sub = + bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), src, tl, opsel_lo, opsel_hi); + sub->valu().neg_lo[1] = true; + sub->valu().neg_hi[1] = true; + + if (nir_src_is_divergent(instr->src[0].src)) + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), sub, dpp_ctrl2); + else + bld.copy(Definition(dst), sub); emit_split_vector(ctx, dst, 2); } else { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); - aco_opcode sub = - instr->def.bit_size == 16 ? aco_opcode::v_sub_f16 : aco_opcode::v_sub_f32; + aco_opcode subrev = + instr->def.bit_size == 16 ? 
aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32; if (!nir_src_is_divergent(instr->src[0].src)) { - bld.vop2(sub, Definition(dst), src, src); + bld.vop2(subrev, Definition(dst), src, src); } else if (ctx->program->gfx_level >= GFX8) { - Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1); - bld.vop2_dpp(sub, Definition(dst), src, tl, dpp_ctrl2); + Temp tmp = bld.vop2_dpp(subrev, bld.def(v1), src, src, dpp_ctrl1); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), tmp, dpp_ctrl2); } else { Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1); Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2); - bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl); + bld.vop2(subrev, Definition(dst), tl, tr); } } set_wqm(ctx, true); diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp index aa4071798cd..5f2a2265281 100644 --- a/src/amd/compiler/tests/test_d3d11_derivs.cpp +++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp @@ -509,8 +509,8 @@ BEGIN_TEST(d3d11_derivs.fddxy) pbld.add_vsfs(vs, fs); /* Must be before BB1 */ - //>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1 fi - //>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1 fi + //>> v1: %_ = v_subrev_f32 (kill)%_, (kill)%_ quad_perm:[0,0,2,2] bound_ctrl:1 fi + //>> v1: %_ = v_subrev_f32 (kill)%_, (kill)%_ quad_perm:[0,0,0,0] bound_ctrl:1 fi //>> BB1 pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR"); END_TEST @@ -579,12 +579,12 @@ BEGIN_TEST(d3d11_derivs.get_lod) //>> v1: %x = v_interp_p2_f32 %_, %_:m0, (kill)%_ attr0.x //>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y //>> lv2: %wqm = p_start_linear_vgpr %x, %y - //>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi - //>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi - //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 
quad_perm:[2,2,2,2] bound_ctrl:1 fi - //>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1 fi - //>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1 fi - //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1 fi + //>> v1: %x12_m_x0 = v_subrev_f32 (kill)%x, (kill)%x quad_perm:[0,0,0,0] bound_ctrl:1 fi + //>> v1: %x1_m_x0 = v_mov_b32 %x12_m_x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi + //>> v1: %x2_m_x0 = v_mov_b32 (kill)%x12_m_x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi + //>> v1: %y12_m_y0 = v_subrev_f32 (kill)%y, (kill)%y quad_perm:[0,0,0,0] bound_ctrl:1 fi + //>> v1: %y1_m_y0 = v_mov_b32 %y12_m_y0 quad_perm:[1,1,1,1] bound_ctrl:1 fi + //>> v1: %y2_m_y0 = v_mov_b32 (kill)%y12_m_y0 quad_perm:[2,2,2,2] bound_ctrl:1 fi //>> BB1 //>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d //>> BB2