intel/fs: add 64 bit integer multiplication lowering
While NIR's lower_imul64() handles 64-bit integer multiplications that are generated early, we have no way to lower such instructions when they are generated by our own backend, for example by the scan/reduce intrinsics. We'll need this soon, so implement it now.

An easy way to test this is to simply disable nir_lower_imul64 so that those operations reach the backend.

v2:
- Fix Q/UQ copy/paste errors (Caio).
- Transform an 'if' into 'else if' (Caio).
- Add an extra comment to clarify the need for 64b = 32b * 32b (Caio).
- Make private functions private (Caio).

v3:
- Remove ambiguity with 'b' and 'd' variables (Caio).
- Allocate potentially less regs for the dwords (Caio).

Cc: Jason Ekstrand <jason.ekstrand@intel.com>
Cc: Matt Turner <matt.turner@intel.com>
Cc: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
This commit is contained in:

committed by
Caio Marcelo de Oliveira Filho

parent
9217cf3b5e
commit
866bb775de
@@ -3990,6 +3990,62 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
|
||||||
|
{
|
||||||
|
const fs_builder ibld(this, block, inst);
|
||||||
|
|
||||||
|
/* Considering two 64-bit integers ab and cd where each letter ab
|
||||||
|
* corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd
|
||||||
|
* only need to provide the YZ part of the result. -------
|
||||||
|
* BD
|
||||||
|
* Only BD needs to be 64 bits. For AD and BC we only care + AD
|
||||||
|
* about the lower 32 bits (since they are part of the upper + BC
|
||||||
|
* 32 bits of our result). AC is not needed since it starts + AC
|
||||||
|
* on the 65th bit of the result. -------
|
||||||
|
* WXYZ
|
||||||
|
*/
|
||||||
|
unsigned int q_regs = regs_written(inst);
|
||||||
|
unsigned int d_regs = (q_regs + 1) / 2;
|
||||||
|
|
||||||
|
fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
|
||||||
|
fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
|
||||||
|
fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
|
/* Here we need the full 64 bit result for 32b * 32b. */
|
||||||
|
if (devinfo->has_integer_dword_mul) {
|
||||||
|
ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
|
||||||
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
|
||||||
|
} else {
|
||||||
|
fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
|
||||||
|
fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
|
||||||
|
fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
|
fs_inst *mul = ibld.MUL(acc,
|
||||||
|
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
|
||||||
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
|
||||||
|
mul->writes_accumulator = true;
|
||||||
|
|
||||||
|
ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
|
||||||
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
|
||||||
|
ibld.MOV(bd_low, acc);
|
||||||
|
|
||||||
|
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
|
||||||
|
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
|
||||||
|
}
|
||||||
|
|
||||||
|
ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
|
||||||
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
|
||||||
|
ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
|
||||||
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));
|
||||||
|
|
||||||
|
ibld.ADD(ad, ad, bc);
|
||||||
|
ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
|
||||||
|
subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);
|
||||||
|
|
||||||
|
ibld.MOV(inst->dst, bd);
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
|
fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
|
||||||
{
|
{
|
||||||
@@ -4062,10 +4118,19 @@ fs_visitor::lower_integer_multiplication()
|
|||||||
|
|
||||||
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
||||||
if (inst->opcode == BRW_OPCODE_MUL) {
|
if (inst->opcode == BRW_OPCODE_MUL) {
|
||||||
if (!inst->dst.is_accumulator() &&
|
if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
|
||||||
(inst->dst.type == BRW_REGISTER_TYPE_D ||
|
inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
|
||||||
inst->dst.type == BRW_REGISTER_TYPE_UD) &&
|
(inst->src[0].type == BRW_REGISTER_TYPE_Q ||
|
||||||
!devinfo->has_integer_dword_mul) {
|
inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
|
||||||
|
(inst->src[1].type == BRW_REGISTER_TYPE_Q ||
|
||||||
|
inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
|
||||||
|
lower_mul_qword_inst(inst, block);
|
||||||
|
inst->remove(block);
|
||||||
|
progress = true;
|
||||||
|
} else if (!inst->dst.is_accumulator() &&
|
||||||
|
(inst->dst.type == BRW_REGISTER_TYPE_D ||
|
||||||
|
inst->dst.type == BRW_REGISTER_TYPE_UD) &&
|
||||||
|
!devinfo->has_integer_dword_mul) {
|
||||||
lower_mul_dword_inst(inst, block);
|
lower_mul_dword_inst(inst, block);
|
||||||
inst->remove(block);
|
inst->remove(block);
|
||||||
progress = true;
|
progress = true;
|
||||||
|
@@ -407,6 +407,7 @@ private:
|
|||||||
void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr,
|
void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr,
|
||||||
fs_reg *op);
|
fs_reg *op);
|
||||||
void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
|
void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
|
||||||
|
void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
|
||||||
void lower_mulh_inst(fs_inst *inst, bblock_t *block);
|
void lower_mulh_inst(fs_inst *inst, bblock_t *block);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user