From 68092df8d8872bffb07dbd21d1d58733651dc97c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 5 Nov 2020 23:19:31 -0600 Subject: [PATCH] intel/nir: Lower 8-bit ops to 16-bit in NIR on Gen11+ Intel hardware supports 8-bit arithmetic but it's tricky and annoying: - Byte operations don't actually execute with a byte type. The execution type for byte operations is actually word. (I don't know if this has implications for the HW implementation. Probably?) - Destinations are required to be strided out to at least the execution type size. This means that B-type operations always have a stride of at least 2. This wreaks havoc on the back-end in multiple ways. - Thanks to the strided destination, we don't actually save register space by storing things in bytes. We could, in theory, interleave two byte values into a single 2B-strided register but that's both a pain for RA and would lead to piles of false dependencies pre-Gen12 and on Gen12+, we'd need some significant improvements to the SWSB pass. - Also thanks to the strided destination, all byte writes are treated as partial writes by the back-end and we don't know how to copy-prop them. - On Gen11, they added a new hardware restriction that byte types aren't allowed in the 2nd and 3rd sources of instructions. This means that we have to emit B->W conversions all over to resolve things. If we emit said conversions in NIR, instead, there's a chance NIR can get rid of some of them for us. We can get rid of a lot of this pain by just asking NIR to get rid of 8-bit arithmetic for us. It may lead to a few more conversions in some cases but having back-end copy-prop actually work is probably a bigger bonus. There is still a bit we have to handle in the back-end. In particular, basic MOVs and conversions because 8-bit load/store ops still require 8-bit types. 
Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_compiler.c | 4 +++- src/intel/compiler/brw_fs_builder.h | 33 ++++++++--------------------- src/intel/compiler/brw_fs_nir.cpp | 13 +++--------- src/intel/compiler/brw_nir.c | 25 ++++++++++++++++++++++ 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 9b2a0427144..933b00440dd 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -49,7 +49,6 @@ .vertex_id_zero_based = true, \ .lower_base_vertex = true, \ .use_scoped_barrier = true, \ - .support_8bit_alu = true, \ .support_16bit_alu = true, \ .lower_uniforms_to_ubo = true @@ -187,6 +186,9 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo) nir_options->lower_int64_options = int64_options; nir_options->lower_doubles_options = fp64_options; + /* Starting with Gen11, we lower away 8-bit arithmetic */ + nir_options->support_8bit_alu = devinfo->gen < 11; + nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT; compiler->glsl_compiler_options[i].NirOptions = nir_options; diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index bdd10638389..368285a3cbe 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -304,11 +304,11 @@ namespace brw { case SHADER_OPCODE_INT_REMAINDER: return emit(instruction(opcode, dispatch_width(), dst, fix_math_operand(src0), - fix_math_operand(fix_byte_src(src1)))); + fix_math_operand(src1))); default: return emit(instruction(opcode, dispatch_width(), dst, - src0, fix_byte_src(src1))); + src0, src1)); } } @@ -327,12 +327,12 @@ namespace brw { case BRW_OPCODE_LRP: return emit(instruction(opcode, dispatch_width(), dst, fix_3src_operand(src0), - fix_3src_operand(fix_byte_src(src1)), - fix_3src_operand(fix_byte_src(src2)))); + fix_3src_operand(src1), + fix_3src_operand(src2))); default: return 
emit(instruction(opcode, dispatch_width(), dst, - src0, fix_byte_src(src1), fix_byte_src(src2))); + src0, src1, src2)); } } @@ -394,8 +394,8 @@ namespace brw { /* In some cases we can't have bytes as operand for src1, so use the * same type for both operand. */ - return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)), - fix_unsigned_negate(fix_byte_src(src1)))); + return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); } /** @@ -659,8 +659,8 @@ namespace brw { emit(BRW_OPCODE_CSEL, retype(dst, BRW_REGISTER_TYPE_F), retype(src0, BRW_REGISTER_TYPE_F), - retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F), - fix_byte_src(src2))); + retype(src1, BRW_REGISTER_TYPE_F), + src2)); } /** @@ -721,21 +721,6 @@ namespace brw { backend_shader *shader; - /** - * Byte sized operands are not supported for src1 on Gen11+. - */ - src_reg - fix_byte_src(const src_reg &src) const - { - if (shader->devinfo->gen < 11 || type_sz(src.type) != 1) - return src; - - dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ? - BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D); - MOV(temp, src); - return src_reg(temp); - } - private: /** * Workaround for negation of UD registers. See comment in diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 38d7540fce0..56a0b2942bf 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -1424,18 +1424,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_ine32: { fs_reg dest = result; - /* On Gen11 we have an additional issue being that src1 cannot be a byte - * type. So we convert both operands for the comparison. 
- */ - fs_reg temp_op[2]; - temp_op[0] = bld.fix_byte_src(op[0]); - temp_op[1] = bld.fix_byte_src(op[1]); - - const uint32_t bit_size = type_sz(temp_op[0].type) * 8; + const uint32_t bit_size = type_sz(op[0].type) * 8; if (bit_size != 32) - dest = bld.vgrf(temp_op[0].type, 1); + dest = bld.vgrf(op[0].type, 1); - bld.CMP(dest, temp_op[0], temp_op[1], + bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); if (bit_size > 32) { diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 282eac338fa..8c45f762c13 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -667,6 +667,15 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) case nir_op_fcos: return devinfo->gen < 9 ? 32 : 0; default: + if (devinfo->gen >= 11) { + if (nir_op_infos[alu->op].num_inputs >= 2 && + alu->dest.dest.ssa.bit_size == 8) + return 16; + + if (nir_alu_instr_is_comparison(alu) && + alu->src[0].src.ssa->bit_size == 8) + return 16; + } return 0; } break; @@ -675,6 +684,22 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) case nir_instr_type_intrinsic: { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_xor: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + if (intrin->src[0].ssa->bit_size == 8 && devinfo->gen >= 11) + return 16; + return 0; + case nir_intrinsic_reduce: case nir_intrinsic_inclusive_scan: case nir_intrinsic_exclusive_scan: