diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1b582cda0cd..7ebbe47b887 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -31,6 +31,7 @@
 #include "main/macros.h"
 #include "brw_eu.h"
 #include "brw_fs.h"
+#include "brw_fs_live_variables.h"
 #include "brw_nir.h"
 #include "brw_vec4_gs_visitor.h"
 #include "brw_cfg.h"
@@ -7448,6 +7449,9 @@ fs_visitor::optimize()
 
    OPT(lower_logical_sends);
 
+   /* After logical SEND lowering. */
+   OPT(fixup_nomask_control_flow);
+
    if (progress) {
       OPT(opt_copy_propagation);
       /* Only run after logical send lowering because it's easier to implement
@@ -7576,6 +7580,151 @@ fs_visitor::fixup_3src_null_dest()
       invalidate_live_intervals();
 }
 
+/**
+ * Find the first instruction in the program that might start a region of
+ * divergent control flow due to a HALT jump.  There is no
+ * find_halt_control_flow_region_end(), the region of divergence extends until
+ * the only FS_OPCODE_PLACEHOLDER_HALT in the program.
+ */
+static const fs_inst *
+find_halt_control_flow_region_start(const fs_visitor *v)
+{
+   if (brw_wm_prog_data(v->prog_data)->uses_kill) {
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->opcode == FS_OPCODE_DISCARD_JUMP ||
+             inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+            return inst;
+      }
+   }
+
+   return NULL;
+}
+
+/**
+ * Work around the Gen12 hardware bug filed as GEN:BUG:1407528679.  EU fusion
+ * can cause a BB to be executed with all channels disabled, which will lead
+ * to the execution of any NoMask instructions in it, even though any
+ * execution-masked instructions will be correctly shot down.  This may break
+ * assumptions of some NoMask SEND messages whose descriptor depends on data
+ * generated by live invocations of the shader.
+ *
+ * This avoids the problem by predicating certain instructions on an ANY
+ * horizontal predicate that makes sure that their execution is omitted when
+ * all channels of the program are disabled.
+ */
+bool
+fs_visitor::fixup_nomask_control_flow()
+{
+   if (devinfo->gen != 12)
+      return false;
+
+   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
+                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
+                              BRW_PREDICATE_ALIGN1_ANY8H;
+   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
+   unsigned depth = 0;
+   bool progress = false;
+
+   calculate_live_intervals();
+
+   /* Scan the program backwards in order to be able to easily determine
+    * whether the flag register is live at any point.
+    */
+   foreach_block_reverse_safe(block, cfg) {
+      BITSET_WORD flag_liveout = live_intervals->block_data[block->num]
+                                               .flag_liveout[0];
+      STATIC_ASSERT(ARRAY_SIZE(live_intervals->block_data[0].flag_liveout) == 1);
+
+      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+         if (!inst->predicate && inst->exec_size >= 8)
+            flag_liveout &= ~inst->flags_written();
+
+         switch (inst->opcode) {
+         case BRW_OPCODE_DO:
+         case BRW_OPCODE_IF:
+            /* Note that this doesn't handle FS_OPCODE_DISCARD_JUMP since only
+             * the first one in the program closes the region of divergent
+             * control flow due to any HALT instructions -- Instead this is
+             * handled with the halt_start check below.
+             */
+            depth--;
+            break;
+
+         case BRW_OPCODE_WHILE:
+         case BRW_OPCODE_ENDIF:
+         case FS_OPCODE_PLACEHOLDER_HALT:
+            depth++;
+            break;
+
+         default:
+            /* Note that the vast majority of NoMask SEND instructions in the
+             * program are harmless while executed in a block with all
+             * channels disabled, since any instructions with side effects we
+             * could hit here should be execution-masked.
+             *
+             * The main concern is NoMask SEND instructions where the message
+             * descriptor or header depends on data generated by live
+             * invocations of the shader (RESINFO and
+             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
+             * computed surface index seem to be the only examples right now
+             * where this could easily lead to GPU hangs).  Unfortunately we
+             * have no straightforward way to detect that currently, so just
+             * predicate any NoMask SEND instructions we find under control
+             * flow.
+             *
+             * If this proves to have a measurable performance impact it can
+             * be easily extended with a whitelist of messages we know we can
+             * safely omit the predication for.
+             */
+            if (depth && inst->force_writemask_all &&
+                is_send(inst) && !inst->predicate) {
+               /* We need to load the execution mask into the flag register by
+                * using a builder with channel group matching the whole shader
+                * (rather than the default which is derived from the original
+                * instruction), in order to avoid getting a right-shifted
+                * value.
+                */
+               const fs_builder ubld = fs_builder(this, block, inst)
+                                       .exec_all().group(dispatch_width, 0);
+               const fs_reg flag = retype(brw_flag_reg(0, 0),
+                                          BRW_REGISTER_TYPE_UD);
+
+               /* Due to the lack of flag register allocation we need to save
+                * and restore the flag register if it's live.
+                */
+               const bool save_flag = flag_liveout &
+                                      flag_mask(flag, dispatch_width / 8);
+               const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
+
+               if (save_flag)
+                  ubld.group(1, 0).MOV(tmp, flag);
+
+               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
+
+               set_predicate(pred, inst);
+               inst->flag_subreg = 0;
+
+               if (save_flag)
+                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
+
+               progress = true;
+            }
+            break;
+         }
+
+         if (inst == halt_start)
+            depth--;
+
+         flag_liveout |= inst->flags_read(devinfo);
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
 {
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 5784da7535c..dfa1ff4d0e1 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -107,6 +107,7 @@ public:
    void setup_cs_payload();
    bool fixup_sends_duplicate_payload();
    void fixup_3src_null_dest();
+   bool fixup_nomask_control_flow();
    void assign_curb_setup();
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
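For reference, the core transformation of the new pass can be modelled outside the compiler. The following is a minimal standalone C++ sketch, not Mesa API: the Inst/Opcode types are hypothetical stand-ins for fs_inst, and the halt-region tracking, flag liveness/save-restore and the actual FS_OPCODE_LOAD_LIVE_CHANNELS emission from the patch are deliberately omitted. It only illustrates the backward scan that tracks control-flow depth and predicates unpredicated NoMask SENDs found under control flow.

// Toy model of the idea behind fixup_nomask_control_flow(): walk the
// program backwards, track how deeply nested in control flow we are, and
// predicate any unpredicated NoMask SEND found there so it is skipped when
// all channels are disabled.  Types and opcodes are hypothetical stand-ins.
#include <cstdio>
#include <vector>

enum Opcode { OP_ALU, OP_IF, OP_ENDIF, OP_DO, OP_WHILE, OP_SEND };

struct Inst {
   Opcode opcode;
   bool force_writemask_all = false;  /* NoMask */
   bool predicated = false;
};

static bool
fixup_nomask_control_flow_sketch(std::vector<Inst> &prog)
{
   unsigned depth = 0;    /* control-flow nesting, seen from the end */
   bool progress = false;

   for (auto it = prog.rbegin(); it != prog.rend(); ++it) {
      switch (it->opcode) {
      case OP_ENDIF:
      case OP_WHILE:
         depth++;          /* walking backwards, block ends open a region... */
         break;
      case OP_IF:
      case OP_DO:
         depth--;          /* ...and the matching block starts close it */
         break;
      default:
         if (depth && it->opcode == OP_SEND &&
             it->force_writemask_all && !it->predicated) {
            /* The real pass emits FS_OPCODE_LOAD_LIVE_CHANNELS here and sets
             * an ANY8H/ANY16H/ANY32H predicate chosen from dispatch_width. */
            it->predicated = true;
            progress = true;
         }
         break;
      }
   }

   return progress;
}

int main()
{
   std::vector<Inst> prog = {
      {OP_SEND, true},   /* NoMask SEND outside control flow: left alone */
      {OP_IF},
      {OP_SEND, true},   /* NoMask SEND under control flow: gets predicated */
      {OP_ENDIF},
   };

   fixup_nomask_control_flow_sketch(prog);

   for (const Inst &inst : prog)
      printf("opcode=%d predicated=%d\n", inst.opcode, inst.predicated);

   return 0;
}

The real pass scans backwards for a further reason the sketch leaves out: it lets flag_liveout be updated incrementally, so the flag register is only saved and restored around the emitted FS_OPCODE_LOAD_LIVE_CHANNELS when it is actually live at that point.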