intel/brw: Introduce a new SSA-based copy propagation pass

(Quite a few of the restrictions here are ported from the old pass.)

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28666>
Kenneth Graunke
2024-03-08 23:18:33 -08:00
committed by Marge Bot
parent 9690bd369d
commit 580e1c592d
2 changed files with 373 additions and 0 deletions


@@ -671,6 +671,7 @@ bool brw_fs_opt_cmod_propagation(fs_visitor &s);
bool brw_fs_opt_combine_constants(fs_visitor &s);
bool brw_fs_opt_compact_virtual_grfs(fs_visitor &s);
bool brw_fs_opt_copy_propagation(fs_visitor &s);
bool brw_fs_opt_copy_propagation_defs(fs_visitor &s);
bool brw_fs_opt_cse_defs(fs_visitor &s);
bool brw_fs_opt_dead_code_eliminate(fs_visitor &s);
bool brw_fs_opt_dead_control_flow_eliminate(fs_visitor &s);


@@ -1482,3 +1482,375 @@ brw_fs_opt_copy_propagation(fs_visitor &s)
return progress;
}
static bool
try_copy_propagate_def(const brw_compiler *compiler,
const brw::simple_allocator &alloc,
fs_inst *def, const fs_reg &val,
fs_inst *inst, int arg,
uint8_t max_polygons)
{
const struct intel_device_info *devinfo = compiler->devinfo;
assert(val.file != BAD_FILE);
/* We can't generally copy-propagate UD negations because we can end up
* accessing the resulting values as signed integers instead.
*/
if (val.negate && val.type == BRW_TYPE_UD)
return false;
/* Bail if the instruction type is larger than the execution type of the
* copy, which implies that each channel is reading multiple channels of the
* destination of the copy, and simply replacing the sources would give a
* program with different semantics.
*/
if (inst->opcode != BRW_OPCODE_MOV &&
brw_type_size_bits(def->dst.type) <
brw_type_size_bits(inst->src[arg].type))
return false;
const bool has_source_modifiers = val.abs || val.negate;
if (has_source_modifiers) {
if (is_logic_op(inst->opcode) || !inst->can_do_source_mods(devinfo))
return false;
/* Since semantics of source modifiers are type-dependent we need to
* ensure that the meaning of the instruction remains the same if we
* change the type. If the sizes of the types are different the new
* instruction will read a different amount of data than the original
* and the semantics will always be different.
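* For example, a negated :F value can be folded into an instruction that
* reads the source as :D only by retyping the whole instruction (both
* types are 32 bits wide), while folding it into a :HF read is never
* safe, since each channel would then read 16 bits instead of 32.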
*/
if (def->dst.type != inst->src[arg].type &&
(!inst->can_change_types() ||
brw_type_size_bits(def->dst.type) !=
brw_type_size_bits(inst->src[arg].type)))
return false;
}
/* Send messages with EOT set are restricted to use g112-g127 (and we
* sometimes need g127 for other purposes), so avoid copy propagating
* anything that would make it impossible to satisfy that restriction.
*/
if (inst->eot) {
/* Don't propagate things that are already pinned. */
if (val.file != VGRF)
return false;
/* We might be propagating from a large register, while the SEND is only
* reading a portion of it (say the .A channel in an RGBA value).
* We need to pin both split SEND sources in g112-g126/127, so only
* allow this if the registers aren't too large.
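* For example, if the other split SEND source occupies 10 GRFs and the
* propagated VGRF occupies 6, then 10 + 6 = 16 > 15 and we bail, since
* the two payloads could not both be pinned into g112-g126.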
*/
if (inst->opcode == SHADER_OPCODE_SEND && inst->sources >= 4 &&
val.file == VGRF) {
int other_src = arg == 2 ? 3 : 2;
unsigned other_size = inst->src[other_src].file == VGRF ?
alloc.sizes[inst->src[other_src].nr] :
inst->size_read(other_src);
unsigned prop_src_size = alloc.sizes[val.nr];
if (other_size + prop_src_size > 15)
return false;
}
}
/* Reject cases that would violate register regioning restrictions. */
if ((val.file == UNIFORM || !val.is_contiguous()) &&
(inst->is_send_from_grf() || inst->uses_indirect_addressing())) {
return false;
}
/* Some instructions implemented in the generator backend, such as
* derivatives, assume that their operands are packed so we can't
* generally propagate strided regions to them.
*/
const unsigned entry_stride = val.file == FIXED_GRF ? 1 : val.stride;
if (instruction_requires_packed_data(inst) && entry_stride != 1)
return false;
const brw_reg_type dst_type = (has_source_modifiers &&
def->dst.type != inst->src[arg].type) ?
def->dst.type : inst->dst.type;
/* Bail if the result of composing both strides would exceed the
* hardware limit.
*/
if (!can_take_stride(inst, dst_type, arg,
entry_stride * inst->src[arg].stride,
compiler))
return false;
/* Bail if the source FIXED_GRF region of the copy cannot be trivially
* composed with the source region of the instruction -- e.g. because the
* copy uses some extended stride greater than 4 not supported natively by
* the hardware as a horizontal stride, or because instruction compression
* could require us to use a vertical stride shorter than a GRF.
*/
if (val.file == FIXED_GRF &&
(inst->src[arg].stride > 4 ||
inst->dst.component_size(inst->exec_size) >
inst->src[arg].component_size(inst->exec_size)))
return false;
/* Bail if the result of composing both strides cannot be expressed
* as another stride. This avoids, for example, trying to transform
* this:
*
* MOV (8) rX<1>UD rY<0;1,0>UD
* FOO (8) ... rX<8;8,1>UW
*
* into this:
*
* FOO (8) ... rY<0;1,0>UW
*
* Which would have different semantics.
*/
if (entry_stride != 1 &&
(inst->src[arg].stride *
brw_type_size_bytes(inst->src[arg].type)) % brw_type_size_bytes(val.type) != 0)
return false;
/* From the Cherry Trail/Braswell PRMs, Volume 7: 3D Media GPGPU:
* EU Overview
* Register Region Restrictions
* Special Requirements for Handling Double Precision Data Types :
*
* "When source or destination datatype is 64b or operation is integer
* DWord multiply, regioning in Align1 must follow these rules:
*
* 1. Source and Destination horizontal stride must be aligned to the
* same qword.
* 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
* 3. Source and Destination offset must be the same, except the case
* of scalar source."
*
* Most of this is already checked in can_take_stride(); we're only left
* with checking rule 3.
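* For example, if the copied value starts at byte 8 within its GRF while
* the instruction's destination starts at byte 0 of its GRF, rule 3 is
* violated and we have to bail.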
*/
if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) &&
entry_stride != 0 &&
(reg_offset(inst->dst) % (REG_SIZE * reg_unit(devinfo))) !=
(reg_offset(val) % (REG_SIZE * reg_unit(devinfo))))
return false;
/* The <8;8,0> regions used for FS attributes in multipolygon
* dispatch mode could violate regioning restrictions; don't copy
* propagate them in such cases.
*/
if (max_polygons > 1 && val.file == ATTR &&
(has_dst_aligned_region_restriction(devinfo, inst, dst_type) ||
instruction_requires_packed_data(inst) ||
(inst->is_3src(compiler) && arg == 2) ||
def->dst.type != inst->src[arg].type))
return false;
/* Fold the copy into the instruction consuming it. */
inst->src[arg].file = val.file;
inst->src[arg].nr = val.nr;
inst->src[arg].subnr = val.subnr;
inst->src[arg].offset = val.offset;
/* Compose the strides of both regions. */
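/* For example, composing a contiguous <8;8,1>:F FIXED_GRF region with a
* source that reads it at stride 2 gives reg_width = 32 / (4 * 2) = 4,
* so the resulting region is <8;4,2>:F: rows of 4 elements with a
* horizontal stride of 2 and successive rows 8 elements apart, i.e. an
* overall element stride of 2.
*/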
if (val.file == FIXED_GRF) {
if (inst->src[arg].stride) {
const unsigned orig_width = 1 << val.width;
const unsigned reg_width =
REG_SIZE / (brw_type_size_bytes(inst->src[arg].type) *
inst->src[arg].stride);
inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
inst->src[arg].hstride = cvt(inst->src[arg].stride);
inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
} else {
inst->src[arg].vstride = inst->src[arg].hstride =
inst->src[arg].width = 0;
}
inst->src[arg].stride = 1;
/* Hopefully no Align16 around here... */
assert(val.swizzle == BRW_SWIZZLE_XYZW);
inst->src[arg].swizzle = val.swizzle;
} else {
inst->src[arg].stride *= val.stride;
}
/* Handle NoMask cases where the def replicates a small scalar to a number
* of channels, but the use is a lower SIMD width but larger type, so each
* invocation reads multiple channels worth of data, e.g.
*
* mov(16) vgrf1:UW, u0<0>:UW NoMask
* mov(8) vgrf2:UD, vgrf1:UD NoMask group0
*
* In this case, we should just use the scalar's type.
*/
if (val.stride == 0 &&
inst->opcode == BRW_OPCODE_MOV &&
inst->force_writemask_all && def->force_writemask_all &&
inst->exec_size < def->exec_size &&
(inst->exec_size * brw_type_size_bytes(inst->src[arg].type) ==
def->exec_size * brw_type_size_bytes(val.type))) {
inst->src[arg].type = val.type;
inst->dst.type = val.type;
inst->exec_size = def->exec_size;
}
if (has_source_modifiers) {
if (def->dst.type != inst->src[arg].type) {
/* We are propagating source modifiers from a MOV with a different
* type. If we got here, then we can just change the source and
* destination types of the instruction and keep going.
*/
for (int i = 0; i < inst->sources; i++) {
inst->src[i].type = def->dst.type;
}
inst->dst.type = def->dst.type;
}
if (!inst->src[arg].abs) {
inst->src[arg].abs = val.abs;
inst->src[arg].negate ^= val.negate;
}
}
return true;
}
static bool
try_constant_propagate_def(fs_inst *def, fs_reg val, fs_inst *inst, int arg)
{
/* Bail if inst is reading more than a single vector component of the def */
if (inst->size_read(arg) > def->dst.component_size(inst->exec_size))
return false;
return try_constant_propagate_value(val, def->dst.type, inst, arg);
}
/**
* Handle cases like UW subreads of a UD immediate, with an offset.
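* For instance, given the UD immediate 0x87654321, extracting the second
* UW (offset 1) shifts right by 16 and masks, yielding 0x8765; the signed
* path sign-extends the extracted piece instead, so a W subread at
* offset 1 of 0x87654321:D yields 0xFFFF8765.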
*/
static fs_reg
extract_imm(fs_reg val, brw_reg_type type, unsigned offset)
{
assert(val.file == IMM);
const unsigned bitsize = brw_type_size_bits(type);
if (offset == 0 || bitsize == brw_type_size_bits(val.type))
return val;
assert(bitsize < brw_type_size_bits(val.type));
switch (val.type) {
case BRW_TYPE_UD:
val.ud = (val.ud >> (bitsize * offset)) & ((1u << bitsize) - 1);
break;
case BRW_TYPE_D:
val.d = (val.d << (bitsize * (32/bitsize - 1 - offset))) >> ((32/bitsize - 1) * bitsize);
break;
default:
return fs_reg();
}
return val;
}
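/**
* Look through the def that produces a source and find the value it
* writes at the region the source reads: the (byte-offset adjusted)
* source of a MOV, or the LOAD_PAYLOAD source covering that offset.
* Returns a BAD_FILE reg if no single value covers the read.
*/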
static fs_reg
find_value_for_offset(fs_inst *def, const fs_reg &src, unsigned src_size)
{
fs_reg val;
switch (def->opcode) {
case BRW_OPCODE_MOV:
if (def->dst.type == def->src[0].type && def->src[0].stride <= 1) {
val = def->src[0];
unsigned rel_offset = src.offset - def->dst.offset;
if (val.stride == 0)
rel_offset %= brw_type_size_bytes(def->dst.type);
if (val.file == IMM)
val = extract_imm(val, src.type, rel_offset);
else
val = byte_offset(def->src[0], rel_offset);
}
break;
case SHADER_OPCODE_LOAD_PAYLOAD: {
unsigned offset = 0;
for (int i = def->header_size; i < def->sources; i++) {
const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1;
if (offset == src.offset) {
if (def->dst.type == def->src[i].type &&
def->src[i].stride <= 1 &&
def->src[i].component_size(def->exec_size) * splat == src_size)
val = def->src[i];
break;
}
offset += def->exec_size * brw_type_size_bytes(def->src[i].type);
}
break;
}
default:
break;
}
return val;
}
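/**
* SSA-based copy propagation: for each instruction source with a single
* (SSA) def, try to fold the def's value -- a copied register, a
* LOAD_PAYLOAD source, or an immediate -- directly into the consuming
* instruction.
*/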
bool
brw_fs_opt_copy_propagation_defs(fs_visitor &s)
{
const brw::def_analysis &defs = s.def_analysis.require();
bool progress = false;
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
/* Try propagating into this instruction. */
bool instruction_progress = false;
for (int i = inst->sources - 1; i >= 0; i--) {
fs_inst *def = defs.get(inst->src[i]);
if (!def || def->saturate)
continue;
if (def->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
if (inst->size_read(i) == def->size_written &&
def->src[0].file != BAD_FILE && def->src[0].file != IMM &&
is_identity_payload(def->src[0].file, def)) {
instruction_progress |=
try_copy_propagate_def(s.compiler, s.alloc, def, def->src[0],
inst, i, s.max_polygons);
continue;
}
}
fs_reg val =
find_value_for_offset(def, inst->src[i], inst->size_read(i));
if (val.file == IMM) {
instruction_progress |=
try_constant_propagate_def(def, val, inst, i);
} else if (val.file == VGRF ||
val.file == ATTR || val.file == UNIFORM ||
(val.file == FIXED_GRF && val.is_contiguous())) {
instruction_progress |=
try_copy_propagate_def(s.compiler, s.alloc, def, val, inst, i,
s.max_polygons);
}
}
if (instruction_progress) {
progress = true;
commute_immediates(inst);
}
}
if (progress) {
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_INSTRUCTION_DETAIL);
}
return progress;
}
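
As a rough sketch (not part of this diff), the new entry point would be called
from the backend's optimization loop; the OPT() wrapper, pass ordering, and
loop structure below are assumptions for illustration only:

   /* Hypothetical call site in the optimization loop. */
   do {
      progress = false;
      progress = OPT(brw_fs_opt_copy_propagation_defs) || progress;
      progress = OPT(brw_fs_opt_dead_code_eliminate) || progress;
   } while (progress);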