aco: add fetch_inactive field to DPP instructions

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25525>
This commit is contained in:
Rhys Perry
2023-10-02 15:47:11 +01:00
committed by Marge Bot
parent 26fce534b5
commit 0e79f76aa5
12 changed files with 82 additions and 59 deletions

View File

@@ -795,8 +795,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
encoding |= dpp.neg[1] << 22;
encoding |= dpp.abs[0] << 21;
encoding |= dpp.neg[0] << 20;
if (ctx.gfx_level >= GFX10)
encoding |= 1 << 18; /* set Fetch Inactive */
encoding |= dpp.fetch_inactive << 18;
encoding |= dpp.bound_ctrl << 19;
encoding |= dpp.dpp_ctrl << 8;
encoding |= reg(ctx, dpp_op, 8);
@@ -809,7 +808,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
/* first emit the instruction without the DPP operand */
Operand dpp_op = instr->operands[0];
instr->operands[0] = Operand(PhysReg{234}, v1);
instr->operands[0] = Operand(PhysReg{233u + dpp.fetch_inactive}, v1);
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP8);
emit_instruction(ctx, out, instr);
uint32_t encoding = reg(ctx, dpp_op, 8);

View File

@@ -456,11 +456,13 @@ convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
if (dpp8) {
DPP8_instruction* dpp = &instr->dpp8();
dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
dpp->fetch_inactive = gfx_level >= GFX10;
} else {
DPP16_instruction* dpp = &instr->dpp16();
dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
dpp->row_mask = 0xf;
dpp->bank_mask = 0xf;
dpp->fetch_inactive = gfx_level >= GFX10;
}
instr->valu().neg = tmp->valu().neg;

View File

@@ -1456,13 +1456,15 @@ struct DPP16_instruction : public VALU_instruction {
uint8_t row_mask : 4;
uint8_t bank_mask : 4;
bool bound_ctrl : 1;
uint8_t padding3 : 7;
uint8_t fetch_inactive : 1;
uint8_t padding3 : 6;
};
static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
struct DPP8_instruction : public VALU_instruction {
uint32_t lane_sel : 24;
uint32_t padding : 8;
uint32_t fetch_inactive : 1;
uint32_t padding : 7;
};
static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");

View File

@@ -160,9 +160,11 @@ class Format(Enum):
return [('uint16_t', 'dpp_ctrl', None),
('uint8_t', 'row_mask', '0xF'),
('uint8_t', 'bank_mask', '0xF'),
('bool', 'bound_ctrl', 'true')]
('bool', 'bound_ctrl', 'true'),
('bool', 'fetch_inactive', 'true')]
elif self == Format.DPP8:
return [('uint32_t', 'lane_sel', 0)]
return [('uint32_t', 'lane_sel', 0),
('bool', 'fetch_inactive', 'true')]
elif self == Format.VOP3P:
return [('uint8_t', 'opsel_lo', None),
('uint8_t', 'opsel_hi', None)]
@@ -194,6 +196,8 @@ class Format(Enum):
for i in range(min(num_operands, 2)):
res += 'instr->sel[{0}] = SubdwordSel(op{0}.op.bytes(), 0, false);'.format(i)
res += 'instr->dst_sel = SubdwordSel(def0.bytes(), 0, false);\n'
elif self in [Format.DPP16, Format.DPP8]:
res += 'instr->fetch_inactive &= program->gfx_level >= GFX10;\n'
return res

View File

@@ -181,12 +181,13 @@ struct InstrPred {
DPP16_instruction& bDPP = b->dpp16();
return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
aDPP.bound_ctrl == bDPP.bound_ctrl;
aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.fetch_inactive == bDPP.fetch_inactive;
}
if (a->isDPP8()) {
DPP8_instruction& aDPP = a->dpp8();
DPP8_instruction& bDPP = b->dpp8();
return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel;
return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel &&
aDPP.fetch_inactive == bDPP.fetch_inactive;
}
if (a->isSDWA()) {
SDWA_instruction& aSDWA = a->sdwa();

View File

@@ -4866,12 +4866,14 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (dpp8) {
DPP8_instruction* dpp = &instr->dpp8();
dpp->lane_sel = info.instr->dpp8().lane_sel;
dpp->fetch_inactive = info.instr->dpp8().fetch_inactive;
if (mov_uses_mods)
instr->format = asVOP3(instr->format);
} else {
DPP16_instruction* dpp = &instr->dpp16();
dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
dpp->fetch_inactive = info.instr->dpp16().fetch_inactive;
}
instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];

View File

@@ -507,8 +507,10 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
continue;
/* GFX8/9 don't have fetch-inactive. */
if (ctx.program->gfx_level < GFX10 &&
bool dpp8 = mov->isDPP8();
/* Fetch-inactive means exec is ignored, which allows us to combine across exec changes. */
if (!(dpp8 ? mov->dpp8().fetch_inactive : mov->dpp16().fetch_inactive) &&
is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
continue;
@@ -519,7 +521,6 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (op_used_twice)
continue;
bool dpp8 = mov->isDPP8();
bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i) &&
get_operand_size(instr, i) == 32;
bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0];
@@ -548,12 +549,14 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (dpp8) {
DPP8_instruction* dpp = &instr->dpp8();
dpp->lane_sel = mov->dpp8().lane_sel;
dpp->fetch_inactive = mov->dpp8().fetch_inactive;
if (mov_uses_mods)
instr->format = asVOP3(instr->format);
} else {
DPP16_instruction* dpp = &instr->dpp16();
dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
dpp->bound_ctrl = true;
dpp->fetch_inactive = mov->dpp16().fetch_inactive;
}
instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0];
instr->valu().abs[0] |= mov->valu().abs[0];

View File

@@ -707,12 +707,16 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
fprintf(output, " bank_mask:0x%.1x", dpp.bank_mask);
if (dpp.bound_ctrl)
fprintf(output, " bound_ctrl:1");
if (dpp.fetch_inactive)
fprintf(output, " fi");
} else if (instr->isDPP8()) {
const DPP8_instruction& dpp = instr->dpp8();
fprintf(output, " dpp8:[");
for (unsigned i = 0; i < 8; i++)
fprintf(output, "%s%u", i ? "," : "", (dpp.lane_sel >> (i * 3)) & 0x7);
fprintf(output, "]");
if (dpp.fetch_inactive)
fprintf(output, " fi");
} else if (instr->isSDWA()) {
const SDWA_instruction& sdwa = instr->sdwa();
if (!instr->isVOPC()) {

View File

@@ -146,6 +146,11 @@ validate_ir(Program* program)
"Format cannot have DPP applied", instr.get());
check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
"VOP3+DPP is GFX11+ only", instr.get());
bool fi =
instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
instr.get());
}
/* check SDWA */

View File

@@ -526,8 +526,8 @@ BEGIN_TEST(d3d11_derivs.fddxy)
pbld.add_vsfs(vs, fs);
/* Must be before BB1 */
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1 fi
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1 fi
//>> BB1
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
END_TEST
@@ -598,12 +598,12 @@ BEGIN_TEST(d3d11_derivs.get_lod)
//>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
//>> v2: %vec = p_create_vector %x, %y
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
//>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
//>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1
//>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1
//>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
//>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1
//>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1
//>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi
//>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
//>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
//>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1 fi
//>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
//>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
//>> BB1
//>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, %wqm 2d
//>> BB2

View File

@@ -59,7 +59,8 @@ BEGIN_TEST(optimize.neg)
Temp neg_abs_a = fneg(abs_a);
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
//~gfx9! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
//~gfx10! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 fi
//! p_unit_test 5, %res5
writeout(5,
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
@@ -999,42 +1000,42 @@ BEGIN_TEST(optimizer.dpp)
Operand d(inputs[3]);
/* basic optimization */
//! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
//! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 fi
//! p_unit_test 0, %res0
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
writeout(0, res0);
/* operand swapping */
//! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
//! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 fi
//! p_unit_test 1, %res1
Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
writeout(1, res1);
//! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
//! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
//! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
//! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 fi
//! p_unit_test 2, %res2
Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
writeout(2, res2);
/* modifiers */
//! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
//! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
//! p_unit_test 3, %res3
auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
tmp3->dpp16().neg[0] = true;
Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
writeout(3, res3);
//! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
//! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
//! p_unit_test 4, %res4
Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
res4->valu().neg[0] = true;
writeout(4, res4);
//! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
//! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
//! v1: %res5 = v_add_f32 %tmp5, %b clamp
//! p_unit_test 5, %res5
Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1042,7 +1043,7 @@ BEGIN_TEST(optimizer.dpp)
res5->valu().clamp = true;
writeout(5, res5);
//! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
//! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 fi
//! p_unit_test 6, %res6
auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
tmp6->dpp16().neg[0] = true;
@@ -1050,14 +1051,14 @@ BEGIN_TEST(optimizer.dpp)
res6->valu().abs[0] = true;
writeout(6, res6);
//! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
//! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 fi
//! p_unit_test 7, %res7
Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
res7->valu().abs[0] = true;
writeout(7, res7);
//! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1
//! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
//! v1: %res11 = v_add_u32 %tmp11, %b
//! p_unit_test 11, %res11
auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1065,7 +1066,7 @@ BEGIN_TEST(optimizer.dpp)
Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b);
writeout(11, res11);
//! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1
//! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
//! v1: %res12 = v_add_f16 %tmp12, %b
//! p_unit_test 12, %res12
auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1074,21 +1075,21 @@ BEGIN_TEST(optimizer.dpp)
writeout(12, res12);
/* vcc */
//! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
//! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 fi
//! p_unit_test 8, %res8
Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
writeout(8, res8);
/* sgprs */
//! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
//! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
//! v1: %res9 = v_add_f32 %tmp9, %d
//! p_unit_test 9, %res9
Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
writeout(9, res9);
//! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
//! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
//! v1: %res10 = v_add_f32 %d, %tmp10
//! p_unit_test 10, %res10
Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1109,7 +1110,7 @@ BEGIN_TEST(optimize.dpp_prop)
Temp one = bld.copy(bld.def(v1), Operand::c32(1));
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1 fi
//! p_unit_test 1, %res1
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
@@ -1120,7 +1121,7 @@ BEGIN_TEST(optimize.dpp_prop)
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
//! v1: %literal2 = p_parallelcopy 0x12345679
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1 fi
//! p_unit_test 3, %res3
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
writeout(3,
@@ -1132,7 +1133,7 @@ BEGIN_TEST(optimize.dpp_prop)
Temp b_v = bld.copy(bld.def(v1), inputs[1]);
writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1 fi
//! p_unit_test 5, %res5
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
@@ -2006,11 +2007,11 @@ BEGIN_TEST(optimize.dpp_opsel)
Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
Temp b_lo = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(0));
//! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1
//! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1 fi
//! p_unit_test 0, %res0
writeout(0, fadd(dpp16_hi, b_hi));
//! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0]
//! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0] fi
//! p_unit_test 1, %res1
writeout(1, fadd(b_lo, dpp8_hi));

View File

@@ -365,21 +365,21 @@ BEGIN_TEST(optimizer_postRA.dpp)
Operand d(inputs[3], PhysReg(0));
/* basic optimization */
//! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
//! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 0, %res0:v[2]
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
writeout(0, Operand(res0, reg_v2));
/* operand swapping */
//! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
//! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 1, %res1:v[2]
Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2));
writeout(1, Operand(res1, reg_v2));
//! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1
//! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
//! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi
//! p_unit_test 2, %res2:v[2]
Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2),
@@ -387,21 +387,21 @@ BEGIN_TEST(optimizer_postRA.dpp)
writeout(2, Operand(res2, reg_v2));
/* modifiers */
//! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
//! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 3, %res3:v[2]
auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
tmp3->dpp16().neg[0] = true;
Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b);
writeout(3, Operand(res3, reg_v2));
//! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
//! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 4, %res4:v[2]
Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b);
res4->valu().neg[0] = true;
writeout(4, Operand(res4, reg_v2));
//! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
//! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp
//! p_unit_test 5, %res5:v[2]
Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -409,7 +409,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
res5->valu().clamp = true;
writeout(5, Operand(res5, reg_v2));
//! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1
//! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 6, %res6:v[2]
auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
tmp6->dpp16().neg[0] = true;
@@ -417,14 +417,14 @@ BEGIN_TEST(optimizer_postRA.dpp)
res6->valu().abs[0] = true;
writeout(6, Operand(res6, reg_v2));
//! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1
//! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi
//! p_unit_test 7, %res7:v[2]
Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2));
res7->valu().abs[0] = true;
writeout(7, Operand(res7, reg_v2));
//! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1
//! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
//! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1]
//! p_unit_test 12, %res12:v[2]
auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -432,7 +432,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b);
writeout(12, Operand(res12, reg_v2));
//! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1
//! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
//! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1]
//! p_unit_test 13, %res13:v[2]
auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -441,14 +441,14 @@ BEGIN_TEST(optimizer_postRA.dpp)
writeout(13, Operand(res13, reg_v2));
/* vcc */
//! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1
//! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi
//! p_unit_test 8, %res8:v[2]
Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
Temp res8 =
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c);
writeout(8, Operand(res8, reg_v2));
//! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
//! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1]
//! p_unit_test 9, %res9:v[2]
Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -459,7 +459,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
/* control flow */
//! BB1
//! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
//! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
//! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 10, %res10:v[2]
Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -473,7 +473,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
writeout(10, Operand(res10, reg_v2));
/* can't combine if the v_mov_b32's operand is modified */
//! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
//! v1: %tmp11_2:v[0] = v_mov_b32 0
//! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1]
//! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0]
@@ -501,7 +501,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_exec)
//~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
//~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
//~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
//~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 0, %res0:v[2]
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
@@ -525,7 +525,7 @@ BEGIN_TEST(optimizer_postRA.dpp_vcmpx)
Operand a(inputs[0], PhysReg(256));
Operand b(inputs[1], PhysReg(257));
//! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
//! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1]
//! p_unit_test 0, %res0:exec
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -605,7 +605,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf)
//! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
//! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
//! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
//! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
//! p_unit_test 10, %res10:v[12]
Temp result =
bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
@@ -635,7 +635,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten)
Operand f(inputs[5], PhysReg(2)); /* buffer store address (scalar) */
PhysReg reg_v12(268); /* temporary register */
//! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
//! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
//! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec