diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
index ff46cb9549a..486f55d7bec 100644
--- a/src/intel/compiler/brw_disasm.c
+++ b/src/intel/compiler/brw_disasm.c
@@ -708,6 +708,9 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
       format(file, "mask%d", _reg_nr & 0x0f);
       break;
    case BRW_ARF_MASK_STACK:
+      format(file, "ms%d", _reg_nr & 0x0f);
+      break;
+   case BRW_ARF_MASK_STACK_DEPTH:
       format(file, "msd%d", _reg_nr & 0x0f);
       break;
    case BRW_ARF_STATE:
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 262c527b2e9..9ed481e10e6 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1102,7 +1102,7 @@ brw_inst *brw_WHILE(struct brw_codegen *p);
 
 brw_inst *brw_BREAK(struct brw_codegen *p);
 brw_inst *brw_CONT(struct brw_codegen *p);
-brw_inst *gen6_HALT(struct brw_codegen *p);
+brw_inst *brw_HALT(struct brw_codegen *p);
 
 /* Forward jumps:
  */
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 12042131999..cc1bc8cc13f 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1725,14 +1725,23 @@ brw_CONT(struct brw_codegen *p)
 }
 
 brw_inst *
-gen6_HALT(struct brw_codegen *p)
+brw_HALT(struct brw_codegen *p)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    brw_inst *insn;
 
    insn = next_insn(p, BRW_OPCODE_HALT);
    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
-   if (devinfo->gen < 8) {
+   if (devinfo->gen < 6) {
+      /* From the Gen4 PRM:
+       *
+       *    "IP register must be put (for example, by the assembler) at <dst>
+       *    and <src0> locations.
+       */
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
+   } else if (devinfo->gen < 8) {
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
    } else if (devinfo->gen < 12) {
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 5825e0770d4..bee9816d815 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -224,25 +224,27 @@ public:
 bool
 fs_generator::patch_discard_jumps_to_fb_writes()
 {
-   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+   if (this->discard_halt_patches.is_empty())
       return false;
 
    int scale = brw_jump_scale(p->devinfo);
 
-   /* There is a somewhat strange undocumented requirement of using
-    * HALT, according to the simulator. If some channel has HALTed to
-    * a particular UIP, then by the end of the program, every channel
-    * must have HALTed to that UIP. Furthermore, the tracking is a
-    * stack, so you can't do the final halt of a UIP after starting
-    * halting to a new UIP.
-    *
-    * Symptoms of not emitting this instruction on actual hardware
-    * included GPU hangs and sparkly rendering on the piglit discard
-    * tests.
-    */
-   brw_inst *last_halt = gen6_HALT(p);
-   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
-   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   if (devinfo->gen >= 6) {
+      /* There is a somewhat strange undocumented requirement of using
+       * HALT, according to the simulator. If some channel has HALTed to
+       * a particular UIP, then by the end of the program, every channel
+       * must have HALTed to that UIP. Furthermore, the tracking is a
+       * stack, so you can't do the final halt of a UIP after starting
+       * halting to a new UIP.
+       *
+       * Symptoms of not emitting this instruction on actual hardware
+       * included GPU hangs and sparkly rendering on the piglit discard
+       * tests.
+       */
+      brw_inst *last_halt = brw_HALT(p);
+      brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+      brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   }
 
    int ip = p->nr_insn;
 
@@ -250,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes()
       brw_inst *patch = &p->store[patch_ip->ip];
 
       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
-      /* HALT takes a half-instruction distance from the pre-incremented IP. */
-      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      if (devinfo->gen >= 6) {
+         /* HALT takes a half-instruction distance from the pre-incremented IP. */
+         brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      } else {
+         brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
+      }
    }
 
    this->discard_halt_patches.make_empty();
+
+   if (devinfo->gen < 6) {
+      /* From the g965 PRM:
+       *
+       *    "As DMask is not automatically reloaded into AMask upon completion
+       *     of this instruction, software has to manually restore AMask upon
+       *     completion."
+       *
+       * DMask lives in the bottom 16 bits of sr0.1.
+       */
+      brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
+                                retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
+      brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
+      brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
+      brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
+      brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
+   }
+
+   if (devinfo->gen == 4 && !devinfo->is_g4x) {
+      /* From the g965 PRM:
+       *
+       *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
+       *     reset to zero during graphics reset, however, they are not
+       *     initialized at thread dispatch. These subfields will retain the
+       *     values from the previous thread. Software should make sure the
+       *     mask stack is empty (reset to zero) before terminating the thread.
+       *     In case that this is not practical, software may have to reset the
+       *     mask stack at the beginning of each kernel, which will impact the
+       *     performance."
+       *
+       * Luckily we can rely on:
+       *
+       *    "[DevBW, DevCL] This register access restriction is not
+       *     applicable, hardware does ensure execution pipeline coherency,
+       *     when a mask stack register is used as an explicit source and/or
+       *     destination."
+       */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_2);
+      brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_16);
+      /* Reset the if stack. */
+      brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
+              brw_imm_uw(0));
+
+      brw_pop_insn_state(p);
+   }
+
    return true;
 }
 
@@ -1362,14 +1420,12 @@ fs_generator::generate_ddy(const fs_inst *inst,
 void
 fs_generator::generate_discard_jump(fs_inst *)
 {
-   assert(devinfo->gen >= 6);
-
    /* This HALT will be patched up at FB write time to point UIP at the end of
     * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
-   gen6_HALT(p);
+   brw_HALT(p);
 }
 
 void
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 8916f9b876e..37c1d2f4cc9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3482,13 +3482,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
       cmp->predicate = BRW_PREDICATE_NORMAL;
       cmp->flag_subreg = sample_mask_flag_subreg(this);
 
-      if (devinfo->gen >= 6) {
-         /* Due to the way we implement discard, the jump will only happen
-          * when the whole quad is discarded. So we can do this even for
-          * demote as it won't break its uniformity promises.
-          */
-         emit_discard_jump();
-      }
+      emit_discard_jump();
 
       if (devinfo->gen < 7)
          limit_dispatch_width(
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index 865cd9e0f36..620b8722d92 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -939,6 +939,21 @@ brw_dmask_reg()
    return brw_sr0_reg(2);
 }
 
+static inline struct brw_reg
+brw_mask_stack_reg(unsigned subnr)
+{
+   return suboffset(retype(brw_vec16_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                         BRW_ARF_MASK_STACK, 0),
+                           BRW_REGISTER_TYPE_UB), subnr);
+}
+
+static inline struct brw_reg
+brw_mask_stack_depth_reg(unsigned subnr)
+{
+   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                      BRW_ARF_MASK_STACK_DEPTH, subnr);
+}
+
 static inline struct brw_reg
 brw_message_reg(unsigned nr)
 {