aco: always use p_parallelcopy for pre-RA copies

Most of the fossil-db changes come from literals now being applied earlier
(in label_instruction), which makes use counts more accurate and allows more
literals to be applied.

fossil-db (Navi):
Totals from 79551 (57.89% of 137413) affected shaders:
SGPRs: 4549610 -> 4542802 (-0.15%); split: -0.19%, +0.04%
VGPRs: 3326764 -> 3324172 (-0.08%); split: -0.10%, +0.03%
SpillSGPRs: 38886 -> 34562 (-11.12%); split: -11.14%, +0.02%
CodeSize: 240143456 -> 240001008 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 1078919 -> 1079281 (+0.03%); split: +0.04%, -0.01%
Instrs: 46627073 -> 46528490 (-0.21%); split: -0.22%, +0.01%

fossil-db (Polaris):
Totals from 98463 (70.90% of 138881) affected shaders:
SGPRs: 5164689 -> 5164353 (-0.01%); split: -0.02%, +0.01%
VGPRs: 3920936 -> 3921856 (+0.02%); split: -0.00%, +0.03%
SpillSGPRs: 56298 -> 52259 (-7.17%); split: -7.22%, +0.04%
CodeSize: 258680092 -> 258692712 (+0.00%); split: -0.02%, +0.03%
MaxWaves: 620863 -> 620823 (-0.01%); split: +0.00%, -0.01%
Instrs: 50776289 -> 50757577 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7216>
This commit is contained in:
Rhys Perry
2020-10-14 13:50:24 +01:00
committed by Marge Bot
parent 6db5fbf9f2
commit e54c111c45
6 changed files with 13 additions and 84 deletions

View File

@@ -85,8 +85,6 @@ ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
aco_ptr<Instruction> create_s_mov(Definition dst, Operand src);
extern uint8_t int8_mul_table[512];
enum sendmsg {
sendmsg_none = 0,
_sendmsg_gs = 2,
@@ -386,82 +384,8 @@ public:
return v_mul_imm(dst, tmp, imm, true);
}
// Emits a copy of `op_` into `dst`, picking the instruction from dst's register
// class and from the operand's size and kind (see the branches below). The
// operand and the definition must have the same byte size (asserted).
// NOTE(review): in this diff view these appear to be the lines the commit
// removes; the p_parallelcopy one-liner that follows is the replacement, and the
// closing brace after it is shared by both versions.
Result copy(Definition dst, Op op_) {
Operand op = op_.op;
assert(op.bytes() == dst.bytes());
// 32-bit scalar destination with a literal source: try alternative encodings
// first (presumably to avoid emitting the 32-bit literal itself — confirm).
if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) {
uint32_t imm = op.constantValue();
// NOTE(review): 0x3e22f983 looks like the float bit pattern of 1/(2*pi) — confirm.
if (imm == 0x3e22f983) {
if (program->chip_class >= GFX8)
op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
} else if (imm >= 0xffff8000 || imm <= 0x7fff) {
// imm lies in the range of a sign-extended 16-bit value; only its low
// 16 bits are handed to s_movk_i32.
return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
} else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
// The bit-reversed value is small (<= 64) or >= 0xFFFFFFF0, so emit a
// bit-reverse of that value instead of the literal.
// NOTE(review): this branch is guarded by dst.regClass() == s1, so the
// v1 / v_bfrev_b32 arm of the ternary looks unreachable here.
uint32_t rev = util_bitreverse(imm);
return dst.regClass() == v1 ?
vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) :
sop1(aco_opcode::s_brev_b32, dst, Operand(rev));
} else if (imm != 0) {
// start: index of the lowest set bit; size: number of set bits
// (presumably what util_bitcount returns). Both are masked to 5 bits.
unsigned start = (ffs(imm) - 1) & 0x1f;
unsigned size = util_bitcount(imm) & 0x1f;
// Only use s_bfm_b32 when imm is exactly `size` consecutive ones
// beginning at bit `start`.
if ((((1u << size) - 1u) << start) == imm)
return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start));
}
}
// General selection by destination register class.
if (dst.regClass() == s1) {
return sop1(aco_opcode::s_mov_b32, dst, op);
} else if (dst.regClass() == s2) {
return sop1(aco_opcode::s_mov_b64, dst, op);
} else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) {
return vop1(aco_opcode::v_mov_b32, dst, op);
} else if (op.bytes() > 2 || (op.isLiteral() && dst.regClass().is_subdword())) {
// Operands wider than 2 bytes, and literals going to a sub-dword
// destination, are emitted as a single-operand p_create_vector.
return pseudo(aco_opcode::p_create_vector, dst, op);
} else if (op.bytes() == 1 && op.isConstant()) {
// 1-byte constant: sign-extend the byte to 32 bits and write it with an
// SDWA instruction that selects only the destination byte.
uint8_t val = op.constantValue();
Operand op32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
aco_ptr<SDWA_instruction> sdwa;
if (op32.isLiteral()) {
// The sign-extended value is a literal: use v_mul_u32_u24 on the two
// table entries for `val` instead, each sign-extended from 8 bits.
// NOTE(review): presumably int8_mul_table holds a factor pair per
// byte value whose product yields `val` in the low byte — confirm.
sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mul_u32_u24, asSDWA(Format::VOP2), 2, 1));
uint32_t a = (uint32_t)int8_mul_table[val * 2];
uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
sdwa->operands[0] = Operand(a | (a & 0x80u ? 0xffffff00u : 0x0u));
sdwa->operands[1] = Operand(b | (b & 0x80u ? 0xffffff00u : 0x0u));
} else {
sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1));
sdwa->operands[0] = op32;
}
sdwa->definitions[0] = dst;
sdwa->sel[0] = sdwa_udword;
sdwa->sel[1] = sdwa_udword;
sdwa->dst_sel = sdwa_ubyte;
sdwa->dst_preserve = true;
return insert(std::move(sdwa));
} else if (op.bytes() == 2 && op.isConstant() && !op.isLiteral()) {
// 2-byte non-literal constant: SDWA v_add_f16 with 0 as the second operand.
sdwa->operands[0] = op;
aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_add_f16, asSDWA(Format::VOP2), 2, 1)};
sdwa->operands[0] = op;
sdwa->operands[1] = Operand(0u);
sdwa->definitions[0] = dst;
sdwa->sel[0] = sdwa_uword;
sdwa->sel[1] = sdwa_udword;
sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_preserve = true;
return insert(std::move(sdwa));
} else if (dst.regClass().is_subdword()) {
// Any remaining sub-dword destination: SDWA v_mov_b32 on GFX8 and newer,
// with source/destination selects chosen from the byte sizes; a plain
// v_mov_b32 on older chips.
if (program->chip_class >= GFX8) {
aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
sdwa->operands[0] = op;
sdwa->definitions[0] = dst;
sdwa->sel[0] = op.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_preserve = true;
return insert(std::move(sdwa));
} else {
return vop1(aco_opcode::v_mov_b32, dst, op);
}
} else {
unreachable("Unhandled case in bld.copy()");
}
// Emits a p_parallelcopy pseudo-instruction that copies `op` into `dst`.
// NOTE(review): per the commit title this is intended for copies created
// before register allocation; how the pseudo-instruction is later expanded
// into real moves is not visible in this view.
Result copy(Definition dst, Op op) {
return pseudo(aco_opcode::p_parallelcopy, dst, op);
}
Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2)), bool post_ra=false) {