aco: add fetch_inactive field to DPP instructions
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25525>
This commit is contained in:
@@ -795,8 +795,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
|
||||
encoding |= dpp.neg[1] << 22;
|
||||
encoding |= dpp.abs[0] << 21;
|
||||
encoding |= dpp.neg[0] << 20;
|
||||
if (ctx.gfx_level >= GFX10)
|
||||
encoding |= 1 << 18; /* set Fetch Inactive */
|
||||
encoding |= dpp.fetch_inactive << 18;
|
||||
encoding |= dpp.bound_ctrl << 19;
|
||||
encoding |= dpp.dpp_ctrl << 8;
|
||||
encoding |= reg(ctx, dpp_op, 8);
|
||||
@@ -809,7 +808,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
|
||||
|
||||
/* first emit the instruction without the DPP operand */
|
||||
Operand dpp_op = instr->operands[0];
|
||||
instr->operands[0] = Operand(PhysReg{234}, v1);
|
||||
instr->operands[0] = Operand(PhysReg{233u + dpp.fetch_inactive}, v1);
|
||||
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP8);
|
||||
emit_instruction(ctx, out, instr);
|
||||
uint32_t encoding = reg(ctx, dpp_op, 8);
|
||||
|
@@ -456,11 +456,13 @@ convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
|
||||
if (dpp8) {
|
||||
DPP8_instruction* dpp = &instr->dpp8();
|
||||
dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
|
||||
dpp->fetch_inactive = gfx_level >= GFX10;
|
||||
} else {
|
||||
DPP16_instruction* dpp = &instr->dpp16();
|
||||
dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
|
||||
dpp->row_mask = 0xf;
|
||||
dpp->bank_mask = 0xf;
|
||||
dpp->fetch_inactive = gfx_level >= GFX10;
|
||||
}
|
||||
|
||||
instr->valu().neg = tmp->valu().neg;
|
||||
|
@@ -1456,13 +1456,15 @@ struct DPP16_instruction : public VALU_instruction {
|
||||
uint8_t row_mask : 4;
|
||||
uint8_t bank_mask : 4;
|
||||
bool bound_ctrl : 1;
|
||||
uint8_t padding3 : 7;
|
||||
uint8_t fetch_inactive : 1;
|
||||
uint8_t padding3 : 6;
|
||||
};
|
||||
static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
|
||||
|
||||
struct DPP8_instruction : public VALU_instruction {
|
||||
uint32_t lane_sel : 24;
|
||||
uint32_t padding : 8;
|
||||
uint32_t fetch_inactive : 1;
|
||||
uint32_t padding : 7;
|
||||
};
|
||||
static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
|
||||
|
||||
|
@@ -160,9 +160,11 @@ class Format(Enum):
|
||||
return [('uint16_t', 'dpp_ctrl', None),
|
||||
('uint8_t', 'row_mask', '0xF'),
|
||||
('uint8_t', 'bank_mask', '0xF'),
|
||||
('bool', 'bound_ctrl', 'true')]
|
||||
('bool', 'bound_ctrl', 'true'),
|
||||
('bool', 'fetch_inactive', 'true')]
|
||||
elif self == Format.DPP8:
|
||||
return [('uint32_t', 'lane_sel', 0)]
|
||||
return [('uint32_t', 'lane_sel', 0),
|
||||
('bool', 'fetch_inactive', 'true')]
|
||||
elif self == Format.VOP3P:
|
||||
return [('uint8_t', 'opsel_lo', None),
|
||||
('uint8_t', 'opsel_hi', None)]
|
||||
@@ -194,6 +196,8 @@ class Format(Enum):
|
||||
for i in range(min(num_operands, 2)):
|
||||
res += 'instr->sel[{0}] = SubdwordSel(op{0}.op.bytes(), 0, false);'.format(i)
|
||||
res += 'instr->dst_sel = SubdwordSel(def0.bytes(), 0, false);\n'
|
||||
elif self in [Format.DPP16, Format.DPP8]:
|
||||
res += 'instr->fetch_inactive &= program->gfx_level >= GFX10;\n'
|
||||
return res
|
||||
|
||||
|
||||
|
@@ -181,12 +181,13 @@ struct InstrPred {
|
||||
DPP16_instruction& bDPP = b->dpp16();
|
||||
return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
|
||||
aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
|
||||
aDPP.bound_ctrl == bDPP.bound_ctrl;
|
||||
aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.fetch_inactive == bDPP.fetch_inactive;
|
||||
}
|
||||
if (a->isDPP8()) {
|
||||
DPP8_instruction& aDPP = a->dpp8();
|
||||
DPP8_instruction& bDPP = b->dpp8();
|
||||
return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel;
|
||||
return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel &&
|
||||
aDPP.fetch_inactive == bDPP.fetch_inactive;
|
||||
}
|
||||
if (a->isSDWA()) {
|
||||
SDWA_instruction& aSDWA = a->sdwa();
|
||||
|
@@ -4866,12 +4866,14 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
if (dpp8) {
|
||||
DPP8_instruction* dpp = &instr->dpp8();
|
||||
dpp->lane_sel = info.instr->dpp8().lane_sel;
|
||||
dpp->fetch_inactive = info.instr->dpp8().fetch_inactive;
|
||||
if (mov_uses_mods)
|
||||
instr->format = asVOP3(instr->format);
|
||||
} else {
|
||||
DPP16_instruction* dpp = &instr->dpp16();
|
||||
dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
|
||||
dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
|
||||
dpp->fetch_inactive = info.instr->dpp16().fetch_inactive;
|
||||
}
|
||||
|
||||
instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];
|
||||
|
@@ -507,8 +507,10 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
|
||||
continue;
|
||||
|
||||
/* GFX8/9 don't have fetch-inactive. */
|
||||
if (ctx.program->gfx_level < GFX10 &&
|
||||
bool dpp8 = mov->isDPP8();
|
||||
|
||||
/* Fetch-inactive means exec is ignored, which allows us to combine across exec changes. */
|
||||
if (!(dpp8 ? mov->dpp8().fetch_inactive : mov->dpp16().fetch_inactive) &&
|
||||
is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
|
||||
continue;
|
||||
|
||||
@@ -519,7 +521,6 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
if (op_used_twice)
|
||||
continue;
|
||||
|
||||
bool dpp8 = mov->isDPP8();
|
||||
bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i) &&
|
||||
get_operand_size(instr, i) == 32;
|
||||
bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0];
|
||||
@@ -548,12 +549,14 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
if (dpp8) {
|
||||
DPP8_instruction* dpp = &instr->dpp8();
|
||||
dpp->lane_sel = mov->dpp8().lane_sel;
|
||||
dpp->fetch_inactive = mov->dpp8().fetch_inactive;
|
||||
if (mov_uses_mods)
|
||||
instr->format = asVOP3(instr->format);
|
||||
} else {
|
||||
DPP16_instruction* dpp = &instr->dpp16();
|
||||
dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
|
||||
dpp->bound_ctrl = true;
|
||||
dpp->fetch_inactive = mov->dpp16().fetch_inactive;
|
||||
}
|
||||
instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0];
|
||||
instr->valu().abs[0] |= mov->valu().abs[0];
|
||||
|
@@ -707,12 +707,16 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
|
||||
fprintf(output, " bank_mask:0x%.1x", dpp.bank_mask);
|
||||
if (dpp.bound_ctrl)
|
||||
fprintf(output, " bound_ctrl:1");
|
||||
if (dpp.fetch_inactive)
|
||||
fprintf(output, " fi");
|
||||
} else if (instr->isDPP8()) {
|
||||
const DPP8_instruction& dpp = instr->dpp8();
|
||||
fprintf(output, " dpp8:[");
|
||||
for (unsigned i = 0; i < 8; i++)
|
||||
fprintf(output, "%s%u", i ? "," : "", (dpp.lane_sel >> (i * 3)) & 0x8);
|
||||
fprintf(output, "]");
|
||||
if (dpp.fetch_inactive)
|
||||
fprintf(output, " fi");
|
||||
} else if (instr->isSDWA()) {
|
||||
const SDWA_instruction& sdwa = instr->sdwa();
|
||||
if (!instr->isVOPC()) {
|
||||
|
@@ -146,6 +146,11 @@ validate_ir(Program* program)
|
||||
"Format cannot have DPP applied", instr.get());
|
||||
check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
|
||||
"VOP3+DPP is GFX11+ only", instr.get());
|
||||
|
||||
bool fi =
|
||||
instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
|
||||
check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
|
||||
instr.get());
|
||||
}
|
||||
|
||||
/* check SDWA */
|
||||
|
@@ -526,8 +526,8 @@ BEGIN_TEST(d3d11_derivs.fddxy)
|
||||
pbld.add_vsfs(vs, fs);
|
||||
|
||||
/* Must be before BB1 */
|
||||
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1
|
||||
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1
|
||||
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1 fi
|
||||
//>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> BB1
|
||||
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
|
||||
END_TEST
|
||||
@@ -598,12 +598,12 @@ BEGIN_TEST(d3d11_derivs.get_lod)
|
||||
//>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
|
||||
//>> v2: %vec = p_create_vector %x, %y
|
||||
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
|
||||
//>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
|
||||
//>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1
|
||||
//>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1
|
||||
//>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
|
||||
//>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1
|
||||
//>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1
|
||||
//>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
|
||||
//>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1 fi
|
||||
//>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
|
||||
//>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
|
||||
//>> BB1
|
||||
//>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, %wqm 2d
|
||||
//>> BB2
|
||||
|
@@ -59,7 +59,8 @@ BEGIN_TEST(optimize.neg)
|
||||
Temp neg_abs_a = fneg(abs_a);
|
||||
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
|
||||
|
||||
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
|
||||
//~gfx9! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
|
||||
//~gfx10! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 fi
|
||||
//! p_unit_test 5, %res5
|
||||
writeout(5,
|
||||
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
|
||||
@@ -999,42 +1000,42 @@ BEGIN_TEST(optimizer.dpp)
|
||||
Operand d(inputs[3]);
|
||||
|
||||
/* basic optimization */
|
||||
//! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
|
||||
//! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 0, %res0
|
||||
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
|
||||
writeout(0, res0);
|
||||
|
||||
/* operand swapping */
|
||||
//! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
|
||||
//! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 1, %res1
|
||||
Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
|
||||
writeout(1, res1);
|
||||
|
||||
//! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
|
||||
//! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
|
||||
//! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 2, %res2
|
||||
Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
|
||||
writeout(2, res2);
|
||||
|
||||
/* modifiers */
|
||||
//! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
|
||||
//! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 3, %res3
|
||||
auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
tmp3->dpp16().neg[0] = true;
|
||||
Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
|
||||
writeout(3, res3);
|
||||
|
||||
//! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
|
||||
//! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 4, %res4
|
||||
Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
|
||||
res4->valu().neg[0] = true;
|
||||
writeout(4, res4);
|
||||
|
||||
//! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
|
||||
//! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res5 = v_add_f32 %tmp5, %b clamp
|
||||
//! p_unit_test 5, %res5
|
||||
Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
@@ -1042,7 +1043,7 @@ BEGIN_TEST(optimizer.dpp)
|
||||
res5->valu().clamp = true;
|
||||
writeout(5, res5);
|
||||
|
||||
//! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
|
||||
//! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 6, %res6
|
||||
auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
tmp6->dpp16().neg[0] = true;
|
||||
@@ -1050,14 +1051,14 @@ BEGIN_TEST(optimizer.dpp)
|
||||
res6->valu().abs[0] = true;
|
||||
writeout(6, res6);
|
||||
|
||||
//! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
|
||||
//! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 7, %res7
|
||||
Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
|
||||
res7->valu().abs[0] = true;
|
||||
writeout(7, res7);
|
||||
|
||||
//! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1
|
||||
//! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res11 = v_add_u32 %tmp11, %b
|
||||
//! p_unit_test 11, %res11
|
||||
auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
@@ -1065,7 +1066,7 @@ BEGIN_TEST(optimizer.dpp)
|
||||
Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b);
|
||||
writeout(11, res11);
|
||||
|
||||
//! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1
|
||||
//! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res12 = v_add_f16 %tmp12, %b
|
||||
//! p_unit_test 12, %res12
|
||||
auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
@@ -1074,21 +1075,21 @@ BEGIN_TEST(optimizer.dpp)
|
||||
writeout(12, res12);
|
||||
|
||||
/* vcc */
|
||||
//! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
|
||||
//! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 8, %res8
|
||||
Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
|
||||
writeout(8, res8);
|
||||
|
||||
/* sgprs */
|
||||
//! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
|
||||
//! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res9 = v_add_f32 %tmp9, %d
|
||||
//! p_unit_test 9, %res9
|
||||
Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
|
||||
writeout(9, res9);
|
||||
|
||||
//! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
|
||||
//! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res10 = v_add_f32 %d, %tmp10
|
||||
//! p_unit_test 10, %res10
|
||||
Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
@@ -1109,7 +1110,7 @@ BEGIN_TEST(optimize.dpp_prop)
|
||||
Temp one = bld.copy(bld.def(v1), Operand::c32(1));
|
||||
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
|
||||
|
||||
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
|
||||
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1 fi
|
||||
//! p_unit_test 1, %res1
|
||||
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
|
||||
|
||||
@@ -1120,7 +1121,7 @@ BEGIN_TEST(optimize.dpp_prop)
|
||||
bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
|
||||
|
||||
//! v1: %literal2 = p_parallelcopy 0x12345679
|
||||
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
|
||||
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1 fi
|
||||
//! p_unit_test 3, %res3
|
||||
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
|
||||
writeout(3,
|
||||
@@ -1132,7 +1133,7 @@ BEGIN_TEST(optimize.dpp_prop)
|
||||
Temp b_v = bld.copy(bld.def(v1), inputs[1]);
|
||||
writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
|
||||
|
||||
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
|
||||
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1 fi
|
||||
//! p_unit_test 5, %res5
|
||||
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
|
||||
|
||||
@@ -2006,11 +2007,11 @@ BEGIN_TEST(optimize.dpp_opsel)
|
||||
Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
|
||||
Temp b_lo = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(0));
|
||||
|
||||
//! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1
|
||||
//! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 0, %res0
|
||||
writeout(0, fadd(dpp16_hi, b_hi));
|
||||
|
||||
//! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0]
|
||||
//! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0] fi
|
||||
//! p_unit_test 1, %res1
|
||||
writeout(1, fadd(b_lo, dpp8_hi));
|
||||
|
||||
|
@@ -365,21 +365,21 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
Operand d(inputs[3], PhysReg(0));
|
||||
|
||||
/* basic optimization */
|
||||
//! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 0, %res0:v[2]
|
||||
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
|
||||
writeout(0, Operand(res0, reg_v2));
|
||||
|
||||
/* operand swapping */
|
||||
//! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 1, %res1:v[2]
|
||||
Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2));
|
||||
writeout(1, Operand(res1, reg_v2));
|
||||
|
||||
//! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1
|
||||
//! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 2, %res2:v[2]
|
||||
Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2),
|
||||
@@ -387,21 +387,21 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
writeout(2, Operand(res2, reg_v2));
|
||||
|
||||
/* modifiers */
|
||||
//! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 3, %res3:v[2]
|
||||
auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
tmp3->dpp16().neg[0] = true;
|
||||
Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b);
|
||||
writeout(3, Operand(res3, reg_v2));
|
||||
|
||||
//! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 4, %res4:v[2]
|
||||
Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b);
|
||||
res4->valu().neg[0] = true;
|
||||
writeout(4, Operand(res4, reg_v2));
|
||||
|
||||
//! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp
|
||||
//! p_unit_test 5, %res5:v[2]
|
||||
Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
@@ -409,7 +409,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
res5->valu().clamp = true;
|
||||
writeout(5, Operand(res5, reg_v2));
|
||||
|
||||
//! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 6, %res6:v[2]
|
||||
auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
tmp6->dpp16().neg[0] = true;
|
||||
@@ -417,14 +417,14 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
res6->valu().abs[0] = true;
|
||||
writeout(6, Operand(res6, reg_v2));
|
||||
|
||||
//! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1
|
||||
//! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 7, %res7:v[2]
|
||||
Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2));
|
||||
res7->valu().abs[0] = true;
|
||||
writeout(7, Operand(res7, reg_v2));
|
||||
|
||||
//! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1]
|
||||
//! p_unit_test 12, %res12:v[2]
|
||||
auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
@@ -432,7 +432,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b);
|
||||
writeout(12, Operand(res12, reg_v2));
|
||||
|
||||
//! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1]
|
||||
//! p_unit_test 13, %res13:v[2]
|
||||
auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
@@ -441,14 +441,14 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
writeout(13, Operand(res13, reg_v2));
|
||||
|
||||
/* vcc */
|
||||
//! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1
|
||||
//! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 8, %res8:v[2]
|
||||
Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
Temp res8 =
|
||||
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c);
|
||||
writeout(8, Operand(res8, reg_v2));
|
||||
|
||||
//! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1]
|
||||
//! p_unit_test 9, %res9:v[2]
|
||||
Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
@@ -459,7 +459,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
/* control flow */
|
||||
//! BB1
|
||||
//! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
|
||||
//! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 10, %res10:v[2]
|
||||
Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
|
||||
@@ -473,7 +473,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
|
||||
writeout(10, Operand(res10, reg_v2));
|
||||
|
||||
/* can't combine if the v_mov_b32's operand is modified */
|
||||
//! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! v1: %tmp11_2:v[0] = v_mov_b32 0
|
||||
//! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1]
|
||||
//! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0]
|
||||
@@ -501,7 +501,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_exec)
|
||||
//~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
|
||||
//~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 0, %res0:v[2]
|
||||
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
|
||||
@@ -525,7 +525,7 @@ BEGIN_TEST(optimizer_postRA.dpp_vcmpx)
|
||||
Operand a(inputs[0], PhysReg(256));
|
||||
Operand b(inputs[1], PhysReg(257));
|
||||
|
||||
//! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
|
||||
//! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1]
|
||||
//! p_unit_test 0, %res0:exec
|
||||
Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
|
||||
@@ -605,7 +605,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf)
|
||||
//! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
|
||||
//! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
|
||||
|
||||
//! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
|
||||
//! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 10, %res10:v[12]
|
||||
Temp result =
|
||||
bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
|
||||
@@ -635,7 +635,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten)
|
||||
Operand f(inputs[5], PhysReg(2)); /* buffer store address (scalar) */
|
||||
PhysReg reg_v12(268); /* temporary register */
|
||||
|
||||
//! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
|
||||
//! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
|
||||
Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
|
||||
|
||||
//! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
|
||||
|
Reference in New Issue
Block a user