diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index 2c901434e5b..127499ac99e 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -46,6 +46,99 @@ namespace {
          !inst->src[0].abs;
    }
 
+   /*
+    * Return an acceptable byte stride for the specified source of an
+    * instruction affected by a regioning restriction.
+    */
+   unsigned
+   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
+                            unsigned i)
+   {
+      if (has_dst_aligned_region_restriction(devinfo, inst)) {
+         return MAX2(type_sz(inst->dst.type), byte_stride(inst->dst));
+
+      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
+                 type_sz(inst->src[i].type) < 4 && byte_stride(inst->src[i]) >= 4) {
+         /* Use a stride of 32bits if possible, since that will guarantee that
+          * the copy emitted to lower this region won't be affected by the
+          * sub-dword integer region restrictions. This may not be possible
+          * for the second source of an instruction if we're required to use
+          * packed data due to Wa_16012383669.
+          */
+         return (i == 1 ? type_sz(inst->src[i].type) : 4);
+
+      } else {
+         return byte_stride(inst->src[i]);
+      }
+   }
+
+   /*
+    * Return an acceptable byte sub-register offset for the specified source
+    * of an instruction affected by a regioning restriction.
+    */
+   unsigned
+   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
+                            unsigned i)
+   {
+      if (has_dst_aligned_region_restriction(devinfo, inst)) {
+         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+
+      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
+                 type_sz(inst->src[i].type) < 4 && byte_stride(inst->src[i]) >= 4) {
+         const unsigned dst_byte_stride = MAX2(byte_stride(inst->dst),
+                                               type_sz(inst->dst.type));
+         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
+         const unsigned dst_byte_offset =
+            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+         const unsigned src_byte_offset =
+            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
+
+         if (src_byte_stride > type_sz(inst->src[i].type)) {
+            assert(src_byte_stride >= dst_byte_stride);
+            /* The source is affected by the Xe2+ sub-dword integer regioning
+             * restrictions. For the case of source 0 BSpec#56640 specifies a
+             * number of equations relating the source and destination
+             * sub-register numbers in all cases where a source stride of
+             * 32bits is allowed. These equations have the form:
+             *
+             *   k * Dst.SubReg % m = Src.SubReg / l
+             *
+             * For some constants k, l and m different for each combination of
+             * source and destination types and strides. The expression in
+             * the return statement below computes a valid source offset by
+             * inverting the equation like:
+             *
+             *   Src.SubReg = l * k * (Dst.SubReg % m)
+             *
+             * and then scaling by the element type sizes in order to get an
+             * expression in terms of byte offsets instead of sub-register
+             * numbers. It can be easily verified that in all cases listed on
+             * the hardware spec where the source has a well-defined uniform
+             * stride the product l*k is equal to the ratio between the source
+             * and destination strides.
+             */
+            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
+            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
+         } else {
+            assert(src_byte_stride == type_sz(inst->src[i].type));
+            /* A packed source is required, likely due to the stricter
+             * requirements of the second source region. The source being
+             * packed guarantees that the region of the original instruction
+             * will be valid, but the copy may break the regioning
+             * restrictions. Do our best to try to prevent that from
+             * happening by making sure the offset of the temporary matches
+             * the original source based on the same equation above -- However
+             * that may not be sufficient if the source had a stride larger
+             * than 32bits, lowering the copy recursively may be necessary.
+             */
+            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
+         }
+
+      } else {
+         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
+      }
+   }
+
    /*
     * Return an acceptable byte stride for the destination of an instruction
     * that requires it to have some particular alignment.
@@ -193,43 +286,6 @@
       }
    }
 
-   /*
-    * Return the stride between channels of the specified register in
-    * byte units, or ~0u if the region cannot be represented with a
-    * single one-dimensional stride.
-    */
-   unsigned
-   byte_stride(const fs_reg &reg)
-   {
-      switch (reg.file) {
-      case BAD_FILE:
-      case UNIFORM:
-      case IMM:
-      case VGRF:
-      case ATTR:
-         return reg.stride * type_sz(reg.type);
-      case ARF:
-      case FIXED_GRF:
-         if (reg.is_null()) {
-            return 0;
-         } else {
-            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
-            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
-            const unsigned width = 1 << reg.width;
-
-            if (width == 1) {
-               return vstride * type_sz(reg.type);
-            } else if (hstride * width == vstride) {
-               return hstride * type_sz(reg.type);
-            } else {
-               return ~0u;
-            }
-         }
-      default:
-         unreachable("Invalid register file");
-      }
-   }
-
    /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
@@ -257,10 +313,13 @@ namespace {
       const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
       const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
 
-      return has_dst_aligned_region_restriction(devinfo, inst) &&
-             !is_uniform(inst->src[i]) &&
-             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
-              src_byte_offset != dst_byte_offset);
+      return (has_dst_aligned_region_restriction(devinfo, inst) &&
+              !is_uniform(inst->src[i]) &&
+              (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
+               src_byte_offset != dst_byte_offset)) ||
+             (has_subdword_integer_region_restriction(devinfo, inst) &&
+              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
+               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
    }
 
    /*
@@ -470,13 +529,24 @@ namespace {
    lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
    {
       assert(inst->components_read(i) == 1);
+      const intel_device_info *devinfo = v->devinfo;
       const fs_builder ibld(v, block, inst);
-      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
+      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
          type_sz(inst->src[i].type);
       assert(stride > 0);
-      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
+      /* Calculate the size of the temporary allocation manually instead of
+       * relying on the builder, since we may have to add some amount of
+       * padding mandated by the hardware for Xe2+ instructions with sub-dword
+       * integer regions.
+       */
+      const unsigned size =
+         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
+                      inst->exec_size * stride * type_sz(inst->src[i].type),
+                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
+      fs_reg tmp(VGRF, v->alloc.allocate(size), inst->src[i].type);
       ibld.UNDEF(tmp);
-      tmp = horiz_stride(tmp, stride);
+      tmp = byte_offset(horiz_stride(tmp, stride),
+                        required_src_byte_offset(devinfo, inst, i));
 
       /* Emit a series of 32-bit integer copies with any source modifiers
        * cleaned up (because their semantics are dependent on the type).
@@ -488,8 +558,16 @@ namespace {
       raw_src.negate = false;
       raw_src.abs = false;
 
-      for (unsigned j = 0; j < n; j++)
-         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
+      for (unsigned j = 0; j < n; j++) {
+         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
+                                  subscript(raw_src, raw_type, j));
+         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
+            /* The copy isn't guaranteed to comply with all subdword integer
+             * regioning restrictions in some cases. Lower it recursively.
+             */
+            lower_instruction(v, block, jnst);
+         }
+      }
 
       /* Point the original instruction at the temporary, making sure to keep
        * any source modifiers in the instruction.
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 4861a2ae279..7c196a8b71a 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -720,6 +720,43 @@ is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
                 inst->dst.type == BRW_REGISTER_TYPE_DF));
 }
 
+/*
+ * Return the stride between channels of the specified register in
+ * byte units, or ~0u if the region cannot be represented with a
+ * single one-dimensional stride.
+ */
+static inline unsigned
+byte_stride(const fs_reg &reg)
+{
+   switch (reg.file) {
+   case BAD_FILE:
+   case UNIFORM:
+   case IMM:
+   case VGRF:
+   case ATTR:
+      return reg.stride * type_sz(reg.type);
+   case ARF:
+   case FIXED_GRF:
+      if (reg.is_null()) {
+         return 0;
+      } else {
+         const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
+         const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
+         const unsigned width = 1 << reg.width;
+
+         if (width == 1) {
+            return vstride * type_sz(reg.type);
+         } else if (hstride * width == vstride) {
+            return hstride * type_sz(reg.type);
+         } else {
+            return ~0u;
+         }
+      }
+   default:
+      unreachable("Invalid register file");
+   }
+}
+
 /**
  * Return whether the following regioning restriction applies to the specified
  * instruction. From the Cherryview PRM Vol 7. "Register Region
@@ -768,6 +805,30 @@ has_dst_aligned_region_restriction(const intel_device_info *devinfo,
    return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
 }
 
+/**
+ * Return true if the instruction can be potentially affected by the Xe2+
+ * regioning restrictions that apply to integer types smaller than a dword.
+ * The restriction isn't quoted here due to its length, see BSpec #56640 for
+ * details.
+ */
+static inline bool
+has_subdword_integer_region_restriction(const intel_device_info *devinfo,
+                                        const fs_inst *inst)
+{
+   if (devinfo->ver >= 20 &&
+       brw_reg_type_is_integer(inst->dst.type) &&
+       MAX2(byte_stride(inst->dst), type_sz(inst->dst.type)) < 4) {
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (brw_reg_type_is_integer(inst->src[i].type) &&
+             type_sz(inst->src[i].type) < 4 &&
+             byte_stride(inst->src[i]) >= 4)
+            return true;
+      }
+   }
+
+   return false;
+}
+
 /**
  * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
  * the specified register file into a VGRF.