aco: reorder dpp for ddx/ddy
Having the mov last allows us to fuse it with the use instruction.

Foz-DB Navi31:
Totals from 9400 (11.84% of 79395) affected shaders:
MaxWaves: 273998 -> 274030 (+0.01%)
Instrs: 8303778 -> 8282997 (-0.25%); split: -0.29%, +0.04%
CodeSize: 44428088 -> 44464860 (+0.08%); split: -0.09%, +0.18%
VGPRs: 506616 -> 504492 (-0.42%)
SpillSGPRs: 1389 -> 1393 (+0.29%)
Latency: 76923466 -> 76983332 (+0.08%); split: -0.06%, +0.14%
InvThroughput: 12386888 -> 12391262 (+0.04%); split: -0.04%, +0.07%
VClause: 125136 -> 125059 (-0.06%); split: -0.13%, +0.07%
SClause: 227361 -> 226615 (-0.33%); split: -0.43%, +0.10%
Copies: 440787 -> 440749 (-0.01%); split: -0.17%, +0.16%
PreVGPRs: 339783 -> 333343 (-1.90%); split: -1.92%, +0.02%
VALU: 5088362 -> 5069737 (-0.37%); split: -0.37%, +0.01%
SALU: 606596 -> 606609 (+0.00%); split: -0.01%, +0.01%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30350>
This commit is contained in:
@@ -4201,32 +4201,33 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
|
||||
opsel_hi |= opsel_hi << 1;
|
||||
|
||||
Temp tl = src;
|
||||
Temp tr = src;
|
||||
if (nir_src_is_divergent(instr->src[0].src)) {
|
||||
if (nir_src_is_divergent(instr->src[0].src))
|
||||
tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
|
||||
tr = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl2);
|
||||
}
|
||||
|
||||
VALU_instruction& sub =
|
||||
bld.vop3p(aco_opcode::v_pk_add_f16, Definition(dst), tr, tl, opsel_lo, opsel_hi)
|
||||
.instr->valu();
|
||||
sub.neg_lo[1] = true;
|
||||
sub.neg_hi[1] = true;
|
||||
Builder::Result sub =
|
||||
bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), src, tl, opsel_lo, opsel_hi);
|
||||
sub->valu().neg_lo[1] = true;
|
||||
sub->valu().neg_hi[1] = true;
|
||||
|
||||
if (nir_src_is_divergent(instr->src[0].src))
|
||||
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), sub, dpp_ctrl2);
|
||||
else
|
||||
bld.copy(Definition(dst), sub);
|
||||
emit_split_vector(ctx, dst, 2);
|
||||
} else {
|
||||
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
|
||||
|
||||
aco_opcode sub =
|
||||
instr->def.bit_size == 16 ? aco_opcode::v_sub_f16 : aco_opcode::v_sub_f32;
|
||||
aco_opcode subrev =
|
||||
instr->def.bit_size == 16 ? aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32;
|
||||
if (!nir_src_is_divergent(instr->src[0].src)) {
|
||||
bld.vop2(sub, Definition(dst), src, src);
|
||||
bld.vop2(subrev, Definition(dst), src, src);
|
||||
} else if (ctx->program->gfx_level >= GFX8) {
|
||||
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
|
||||
bld.vop2_dpp(sub, Definition(dst), src, tl, dpp_ctrl2);
|
||||
Temp tmp = bld.vop2_dpp(subrev, bld.def(v1), src, src, dpp_ctrl1);
|
||||
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), tmp, dpp_ctrl2);
|
||||
} else {
|
||||
Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
|
||||
Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
|
||||
bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
|
||||
bld.vop2(subrev, Definition(dst), tl, tr);
|
||||
}
|
||||
}
|
||||
set_wqm(ctx, true);
|
||||
|
@@ -509,8 +509,8 @@ BEGIN_TEST(d3d11_derivs.fddxy)
|
||||
pbld.add_vsfs(vs, fs);
|
||||
|
||||
/* Must be before BB1 */
|
||||
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1 fi
|
||||
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> v1: %_ = v_subrev_f32 (kill)%_, (kill)%_ quad_perm:[0,0,2,2] bound_ctrl:1 fi
|
||||
//>> v1: %_ = v_subrev_f32 (kill)%_, (kill)%_ quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> BB1
|
||||
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
|
||||
END_TEST
|
||||
@@ -579,12 +579,12 @@ BEGIN_TEST(d3d11_derivs.get_lod)
|
||||
//>> v1: %x = v_interp_p2_f32 %_, %_:m0, (kill)%_ attr0.x
|
||||
//>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
|
||||
//>> lv2: %wqm = p_start_linear_vgpr %x, %y
|
||||
//>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
|
||||
//>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
|
||||
//>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> v1: %x12_m_x0 = v_subrev_f32 (kill)%x, (kill)%x quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> v1: %x1_m_x0 = v_mov_b32 %x12_m_x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
|
||||
//>> v1: %x2_m_x0 = v_mov_b32 (kill)%x12_m_x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> v1: %y12_m_y0 = v_subrev_f32 (kill)%y, (kill)%y quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> v1: %y1_m_y0 = v_mov_b32 %y12_m_x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
|
||||
//>> v1: %y2_m_y0 = v_mov_b32 (kill)%y12_m_x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> BB1
|
||||
//>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
|
||||
//>> BB2
|
||||
|
Reference in New Issue
Block a user