From 68092df8d8872bffb07dbd21d1d58733651dc97c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 5 Nov 2020 23:19:31 -0600 Subject: [PATCH] intel/nir: Lower 8-bit ops to 16-bit in NIR on Gen11+ Intel hardware supports 8-bit arithmetic but it's tricky and annoying: - Byte operations don't actually execute with a byte type. The execution type for byte operations is actually word. (I don't know if this has implications for the HW implementation. Probably?) - Destinations are required to be strided out to at least the execution type size. This means that B-type operations always have a stride of at least 2. This wreaks havoc on the back-end in multiple ways. - Thanks to the strided destination, we don't actually save register space by storing things in bytes. We could, in theory, interleave two byte values into a single 2B-strided register but that's both a pain for RA and would lead to piles of false dependencies pre-Gen12 and on Gen12+, we'd need some significant improvements to the SWSB pass. - Also thanks to the strided destination, all byte writes are treated as partial writes by the back-end and we don't know how to copy-prop them. - On Gen11, they added a new hardware restriction that byte types aren't allowed in the 2nd and 3rd sources of instructions. This means that we have to emit B->W conversions all over to resolve things. If we emit said conversions in NIR, instead, there's a chance NIR can get rid of some of them for us. We can get rid of a lot of this pain by just asking NIR to get rid of 8-bit arithmetic for us. It may lead to a few more conversions in some cases but having back-end copy-prop actually work is probably a bigger bonus. There is still a bit we have to handle in the back-end. In particular, basic MOVs and conversions because 8-bit load/store ops still require 8-bit types. 
Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_compiler.c | 4 +++- src/intel/compiler/brw_fs_builder.h | 33 ++++++++--------------------- src/intel/compiler/brw_fs_nir.cpp | 13 +++--------- src/intel/compiler/brw_nir.c | 25 ++++++++++++++++++++++ 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 9b2a0427144..933b00440dd 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -49,7 +49,6 @@ .vertex_id_zero_based = true, \ .lower_base_vertex = true, \ .use_scoped_barrier = true, \ - .support_8bit_alu = true, \ .support_16bit_alu = true, \ .lower_uniforms_to_ubo = true @@ -187,6 +186,9 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo) nir_options->lower_int64_options = int64_options; nir_options->lower_doubles_options = fp64_options; + /* Starting with Gen11, we lower away 8-bit arithmetic */ + nir_options->support_8bit_alu = devinfo->gen < 11; + nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT; compiler->glsl_compiler_options[i].NirOptions = nir_options; diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index bdd10638389..368285a3cbe 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -304,11 +304,11 @@ namespace brw { case SHADER_OPCODE_INT_REMAINDER: return emit(instruction(opcode, dispatch_width(), dst, fix_math_operand(src0), - fix_math_operand(fix_byte_src(src1)))); + fix_math_operand(src1))); default: return emit(instruction(opcode, dispatch_width(), dst, - src0, fix_byte_src(src1))); + src0, src1)); } } @@ -327,12 +327,12 @@ namespace brw { case BRW_OPCODE_LRP: return emit(instruction(opcode, dispatch_width(), dst, fix_3src_operand(src0), - fix_3src_operand(fix_byte_src(src1)), - fix_3src_operand(fix_byte_src(src2)))); + fix_3src_operand(src1), + fix_3src_operand(src2))); default: return 
emit(instruction(opcode, dispatch_width(), dst, - src0, fix_byte_src(src1), fix_byte_src(src2))); + src0, src1, src2)); } } @@ -394,8 +394,8 @@ namespace brw { /* In some cases we can't have bytes as operand for src1, so use the * same type for both operand. */ - return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)), - fix_unsigned_negate(fix_byte_src(src1)))); + return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); } /** @@ -659,8 +659,8 @@ namespace brw { emit(BRW_OPCODE_CSEL, retype(dst, BRW_REGISTER_TYPE_F), retype(src0, BRW_REGISTER_TYPE_F), - retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F), - fix_byte_src(src2))); + retype(src1, BRW_REGISTER_TYPE_F), + src2)); } /** @@ -721,21 +721,6 @@ namespace brw { backend_shader *shader; - /** - * Byte sized operands are not supported for src1 on Gen11+. - */ - src_reg - fix_byte_src(const src_reg &src) const - { - if (shader->devinfo->gen < 11 || type_sz(src.type) != 1) - return src; - - dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ? - BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D); - MOV(temp, src); - return src_reg(temp); - } - private: /** * Workaround for negation of UD registers. See comment in diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 38d7540fce0..56a0b2942bf 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -1424,18 +1424,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_ine32: { fs_reg dest = result; - /* On Gen11 we have an additional issue being that src1 cannot be a byte - * type. So we convert both operands for the comparison. 
- */ - fs_reg temp_op[2]; - temp_op[0] = bld.fix_byte_src(op[0]); - temp_op[1] = bld.fix_byte_src(op[1]); - - const uint32_t bit_size = type_sz(temp_op[0].type) * 8; + const uint32_t bit_size = type_sz(op[0].type) * 8; if (bit_size != 32) - dest = bld.vgrf(temp_op[0].type, 1); + dest = bld.vgrf(op[0].type, 1); - bld.CMP(dest, temp_op[0], temp_op[1], + bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); if (bit_size > 32) { diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 282eac338fa..8c45f762c13 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -667,6 +667,15 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) case nir_op_fcos: return devinfo->gen < 9 ? 32 : 0; default: + if (devinfo->gen >= 11) { + if (nir_op_infos[alu->op].num_inputs >= 2 && + alu->dest.dest.ssa.bit_size == 8) + return 16; + + if (nir_alu_instr_is_comparison(alu) && + alu->src[0].src.ssa->bit_size == 8) + return 16; + } return 0; } break; @@ -675,6 +684,22 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) case nir_instr_type_intrinsic: { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_xor: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + if (intrin->src[0].ssa->bit_size == 8 && devinfo->gen >= 11) + return 16; + return 0; + case nir_intrinsic_reduce: case nir_intrinsic_inclusive_scan: case nir_intrinsic_exclusive_scan: