i965/fs: Add support for nir_intrinsic_shuffle

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
2017-08-29 09:21:32 -07:00
parent 8256ee3fa3
commit 90c9f29518
7 changed files with 150 additions and 0 deletions
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -451,6 +451,15 @@ enum opcode {
    */
   SHADER_OPCODE_BROADCAST,

+   /* Pick the channel from its first source register given by the index
+    * specified as second source.
+    *
+    * This is similar to the BROADCAST instruction except that it takes a
+    * dynamic index and potentially puts a different value in each output
+    * channel.
+    */
+   SHADER_OPCODE_SHUFFLE,
+
   SHADER_OPCODE_GET_BUFFER_SIZE,

   VEC4_OPCODE_MOV_BYTES,
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -310,6 +310,13 @@ fs_inst::has_source_and_destination_hazard() const
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      /* Multiple partial writes to the destination */
      return true;
+   case SHADER_OPCODE_SHUFFLE:
+      /* This instruction returns an arbitrary channel from the source and
+       * gets split into smaller instructions in the generator.  It's possible
+       * that one of the instructions will read from a channel corresponding
+       * to an earlier instruction.
+       */
+      return true;
   default:
      /* The SIMD16 compressed instruction
       *
@@ -2531,6 +2538,20 @@ fs_visitor::opt_algebraic()
         }
         break;

+      case SHADER_OPCODE_SHUFFLE:
+         if (is_uniform(inst->src[0])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->sources = 1;
+            progress = true;
+         } else if (inst->src[1].file == IMM) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = component(inst->src[0],
+                                     inst->src[1].ud);
+            inst->sources = 1;
+            progress = true;
+         }
+         break;
+
      default:
 	 break;
      }
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -471,6 +471,11 @@ private:
                              struct brw_reg reg,
                              struct brw_reg indirect_byte_offset);

+   void generate_shuffle(fs_inst *inst,
+                         struct brw_reg dst,
+                         struct brw_reg src,
+                         struct brw_reg idx);
+
   bool patch_discard_jumps_to_fb_writes();

   const struct brw_compiler *compiler;
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -540,6 +540,106 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
   }
 }

+void
+fs_generator::generate_shuffle(fs_inst *inst,
+                               struct brw_reg dst,
+                               struct brw_reg src,
+                               struct brw_reg idx)
+{
+   /* Ivy bridge has some strange behavior that makes this a real pain to
+    * implement for 64-bit values so we just don't bother.
+    */
+   assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
+
+   /* Because we're using the address register, we're limited to 8-wide
+    * execution on gen7.  On gen8, we're limited to 16-wide by the address
+    * register file and 8-wide for 64-bit types.  We could try and make this
+    * instruction splittable higher up in the compiler but that gets weird
+    * because it reads all of the channels regardless of execution size.  It's
+    * easier just to split it here.
+    */
+   const unsigned lower_width =
+      (devinfo->gen <= 7 || type_sz(src.type) > 4) ?
+      8 : MIN2(16, inst->exec_size);
+
+   brw_set_default_exec_size(p, cvt(lower_width) - 1);
+   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
+      brw_set_default_group(p, group);
+
+      if ((src.vstride == 0 && src.hstride == 0) ||
+          idx.file == BRW_IMMEDIATE_VALUE) {
+         /* Trivial, the source is already uniform or the index is a constant.
+          * We will typically not get here if the optimizer is doing its job,
+          * but asserting would be mean.
+          */
+         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
+         brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
+      } else {
+         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+         struct brw_reg addr = vec8(brw_address_reg(0));
+
+         struct brw_reg group_idx = suboffset(idx, group);
+
+         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
+            /* Things get grumpy if the register is too wide. */
+            group_idx.width--;
+            group_idx.vstride--;
+         }
+
+         assert(type_sz(group_idx.type) <= 4);
+         if (type_sz(group_idx.type) == 4) {
+            /* The destination stride of an instruction (in bytes) must be
+             * greater than or equal to the size of the rest of the
+             * instruction.  Since the address register is of type UW, we
+             * can't use a D-type instruction.  In order to get around this,
+             * re retype to UW and use a stride.
+             */
+            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
+         }
+
+         /* Take into account the component size and horizontal stride. */
+         assert(src.vstride == src.hstride + src.width);
+         brw_SHL(p, addr, group_idx,
+                 brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
+                            src.hstride - 1));
+
+         /* Add on the register start offset */
+         brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
+
+         if (type_sz(src.type) > 4 &&
+             ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+              devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+            /* IVB has an issue (which we found empirically) where it reads
+             * two address register components per channel for indirectly
+             * addressed 64-bit sources.
+             *
+             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+             *
+             *    "When source or destination datatype is 64b or operation is
+             *    integer DWord multiply, indirect addressing must not be
+             *    used."
+             *
+             * To work around both of these, we do two integer MOVs insead of
+             * one 64-bit MOV.  Because no double value should ever cross a
+             * register boundary, it's safe to use the immediate offset in the
+             * indirect here to handle adding 4 bytes to the offset and avoid
+             * the extra ADD to the register file.
+             */
+            struct brw_reg gdst = suboffset(dst, group);
+            struct brw_reg dst_d = retype(spread(gdst, 2),
+                                          BRW_REGISTER_TYPE_D);
+            brw_MOV(p, dst_d,
+                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+            brw_MOV(p, byte_offset(dst_d, 4),
+                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
+         } else {
+            brw_MOV(p, suboffset(dst, group),
+                    retype(brw_VxH_indirect(0, 0), src.type));
+         }
+      }
+   }
+}
+
 void
 fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
@@ -2189,6 +2289,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
         brw_broadcast(p, dst, src[0], src[1]);
         break;

+      case SHADER_OPCODE_SHUFFLE:
+         generate_shuffle(inst, dst, src[0], src[1]);
+         break;
+
      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4507,6 +4507,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      break;
   }

+   case nir_intrinsic_shuffle: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg index = get_nir_src(instr->src[1]);
+
+      bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
+      break;
+   }
+
   case nir_intrinsic_first_invocation: {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -656,6 +656,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
      .lower_to_scalar = true,
      .lower_subgroup_masks = true,
      .lower_vote_trivial = !is_scalar,
+      .lower_shuffle = true,
   };
   OPT(nir_lower_subgroups, &subgroups_options);

--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -330,6 +330,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
      return "find_live_channel";
   case SHADER_OPCODE_BROADCAST:
      return "broadcast";
+   case SHADER_OPCODE_SHUFFLE:
+      return "shuffle";

   case SHADER_OPCODE_GET_BUFFER_SIZE:
      return "get_buffer_size";