intel/nir: Lower 8-bit ops to 16-bit in NIR on Gen11+
Intel hardware supports 8-bit arithmetic but it's tricky and annoying: - Byte operations don't actually execute with a byte type. The execution type for byte operations is actually word. (I don't know if this has implications for the HW implementation. Probably?) - Destinations are required to be strided out to at least the execution type size. This means that B-type operations always have a stride of at least 2. This wreaks havoc on the back-end in multiple ways. - Thanks to the strided destination, we don't actually save register space by storing things in bytes. We could, in theory, interleave two byte values into a single 2B-strided register but that's both a pain for RA and would lead to piles of false dependencies pre-Gen12 and on Gen12+, we'd need some significant improvements to the SWSB pass. - Also thanks to the strided destination, all byte writes are treated as partial writes by the back-end and we don't know how to copy-prop them. - On Gen11, they added a new hardware restriction that byte types aren't allowed in the 2nd and 3rd sources of instructions. This means that we have to emit B->W conversions all over to resolve things. If we emit said conversions in NIR, instead, there's a chance NIR can get rid of some of them for us. We can get rid of a lot of this pain by just asking NIR to get rid of 8-bit arithmetic for us. It may lead to a few more conversions in some cases but having back-end copy-prop actually work is probably a bigger bonus. There is still a bit we have to handle in the back-end. In particular, basic MOVs and conversions because 8-bit load/store ops still require 8-bit types. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7482>
This commit is contained in:

committed by
Marge Bot

parent
b98f0d3d7c
commit
68092df8d8
@@ -49,7 +49,6 @@
|
||||
.vertex_id_zero_based = true, \
|
||||
.lower_base_vertex = true, \
|
||||
.use_scoped_barrier = true, \
|
||||
.support_8bit_alu = true, \
|
||||
.support_16bit_alu = true, \
|
||||
.lower_uniforms_to_ubo = true
|
||||
|
||||
@@ -187,6 +186,9 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
|
||||
nir_options->lower_int64_options = int64_options;
|
||||
nir_options->lower_doubles_options = fp64_options;
|
||||
|
||||
/* Starting with Gen11, we lower away 8-bit arithmetic */
|
||||
nir_options->support_8bit_alu = devinfo->gen < 11;
|
||||
|
||||
nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
|
||||
|
||||
compiler->glsl_compiler_options[i].NirOptions = nir_options;
|
||||
|
@@ -304,11 +304,11 @@ namespace brw {
|
||||
case SHADER_OPCODE_INT_REMAINDER:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
fix_math_operand(src0),
|
||||
fix_math_operand(fix_byte_src(src1))));
|
||||
fix_math_operand(src1)));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
src0, fix_byte_src(src1)));
|
||||
src0, src1));
|
||||
|
||||
}
|
||||
}
|
||||
@@ -327,12 +327,12 @@ namespace brw {
|
||||
case BRW_OPCODE_LRP:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
fix_3src_operand(src0),
|
||||
fix_3src_operand(fix_byte_src(src1)),
|
||||
fix_3src_operand(fix_byte_src(src2))));
|
||||
fix_3src_operand(src1),
|
||||
fix_3src_operand(src2)));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
src0, fix_byte_src(src1), fix_byte_src(src2)));
|
||||
src0, src1, src2));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -394,8 +394,8 @@ namespace brw {
|
||||
/* In some cases we can't have bytes as operand for src1, so use the
|
||||
* same type for both operand.
|
||||
*/
|
||||
return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
|
||||
fix_unsigned_negate(fix_byte_src(src1))));
|
||||
return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -659,8 +659,8 @@ namespace brw {
|
||||
emit(BRW_OPCODE_CSEL,
|
||||
retype(dst, BRW_REGISTER_TYPE_F),
|
||||
retype(src0, BRW_REGISTER_TYPE_F),
|
||||
retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
|
||||
fix_byte_src(src2)));
|
||||
retype(src1, BRW_REGISTER_TYPE_F),
|
||||
src2));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -721,21 +721,6 @@ namespace brw {
|
||||
|
||||
backend_shader *shader;
|
||||
|
||||
/**
|
||||
* Byte sized operands are not supported for src1 on Gen11+.
|
||||
*/
|
||||
src_reg
|
||||
fix_byte_src(const src_reg &src) const
|
||||
{
|
||||
if (shader->devinfo->gen < 11 || type_sz(src.type) != 1)
|
||||
return src;
|
||||
|
||||
dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
|
||||
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
|
||||
MOV(temp, src);
|
||||
return src_reg(temp);
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* Workaround for negation of UD registers. See comment in
|
||||
|
@@ -1424,18 +1424,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
||||
case nir_op_ine32: {
|
||||
fs_reg dest = result;
|
||||
|
||||
/* On Gen11 we have an additional issue being that src1 cannot be a byte
|
||||
* type. So we convert both operands for the comparison.
|
||||
*/
|
||||
fs_reg temp_op[2];
|
||||
temp_op[0] = bld.fix_byte_src(op[0]);
|
||||
temp_op[1] = bld.fix_byte_src(op[1]);
|
||||
|
||||
const uint32_t bit_size = type_sz(temp_op[0].type) * 8;
|
||||
const uint32_t bit_size = type_sz(op[0].type) * 8;
|
||||
if (bit_size != 32)
|
||||
dest = bld.vgrf(temp_op[0].type, 1);
|
||||
dest = bld.vgrf(op[0].type, 1);
|
||||
|
||||
bld.CMP(dest, temp_op[0], temp_op[1],
|
||||
bld.CMP(dest, op[0], op[1],
|
||||
brw_cmod_for_nir_comparison(instr->op));
|
||||
|
||||
if (bit_size > 32) {
|
||||
|
@@ -667,6 +667,15 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
|
||||
case nir_op_fcos:
|
||||
return devinfo->gen < 9 ? 32 : 0;
|
||||
default:
|
||||
if (devinfo->gen >= 11) {
|
||||
if (nir_op_infos[alu->op].num_inputs >= 2 &&
|
||||
alu->dest.dest.ssa.bit_size == 8)
|
||||
return 16;
|
||||
|
||||
if (nir_alu_instr_is_comparison(alu) &&
|
||||
alu->src[0].src.ssa->bit_size == 8)
|
||||
return 16;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
@@ -675,6 +684,22 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
|
||||
case nir_instr_type_intrinsic: {
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_read_invocation:
|
||||
case nir_intrinsic_read_first_invocation:
|
||||
case nir_intrinsic_vote_feq:
|
||||
case nir_intrinsic_vote_ieq:
|
||||
case nir_intrinsic_shuffle:
|
||||
case nir_intrinsic_shuffle_xor:
|
||||
case nir_intrinsic_shuffle_up:
|
||||
case nir_intrinsic_shuffle_down:
|
||||
case nir_intrinsic_quad_broadcast:
|
||||
case nir_intrinsic_quad_swap_horizontal:
|
||||
case nir_intrinsic_quad_swap_vertical:
|
||||
case nir_intrinsic_quad_swap_diagonal:
|
||||
if (intrin->src[0].ssa->bit_size == 8 && devinfo->gen >= 11)
|
||||
return 16;
|
||||
return 0;
|
||||
|
||||
case nir_intrinsic_reduce:
|
||||
case nir_intrinsic_inclusive_scan:
|
||||
case nir_intrinsic_exclusive_scan:
|
||||
|
Reference in New Issue
Block a user