ir3: Rewrite (jp) and branchstack handling
This pass will later also serve as a way to accurately insert physical edges, which is the original motivation. However, it also lets us put branchstack handling on a more solid footing.

There was an off-by-one in the old branchstack handling because it didn't consider that a single if-else actually has two reconvergence points active at the same time, so it undercounted the branchstack by 1 for pretty much every shader. We change the HW formula to produce the same result, which now makes it much more sensible.

We can also delete the physical predecessor handling in ir3_legalize, because it was only needed to handle (jp), which is now handled earlier.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22072>
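As a quick sanity check of the counting and formula change described above, here is a minimal standalone sketch (not part of the commit) for the single divergent if-else case. The old and new HW formulas are taken from the ir3_shader_branchstack_hw() hunk near the end of this diff; DIV_ROUND_UP is defined locally for illustration and clamping against branchstack_size is omitted:

   #include <stdio.h>
   #include <stdint.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   int main(void)
   {
      /* A single divergent if-else: the old counting pushed one entry per
       * if/loop, the new counting sees both reconvergence points of the
       * if-else outstanding at once, so the raw value goes from 1 to 2.
       */
      uint32_t old_count = 1, new_count = 2;

      /* Old HW formula ("branchstack / 2 + 1") vs. the new one. */
      uint32_t old_hw = old_count / 2 + 1;
      uint32_t new_hw = DIV_ROUND_UP(new_count, 2);

      printf("old_hw=%u new_hw=%u\n", old_hw, new_hw); /* both print 1 */
      return 0;
   }

So the value programmed into the hardware stays the same even though the raw branchstack count goes up by one.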
@@ -660,6 +660,8 @@ struct ir3_block {
    uint16_t start_ip, end_ip;

+   bool reconvergence_point;
+
    /* Track instructions which do not write a register but other-
     * wise must not be discarded (such as kill, stg, etc)
     */
@@ -1927,10 +1929,12 @@ soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
    }
 }

 /* unreachable block elimination: */
 bool ir3_remove_unreachable(struct ir3 *ir);

+/* calculate reconvergence information: */
+void ir3_calc_reconvergence(struct ir3_shader_variant *so);
+
 /* dead code elimination: */
 struct ir3_shader_variant;
 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
@@ -2537,14 +2537,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    }
    case nir_intrinsic_elect:
       dst[0] = ir3_ELECT_MACRO(ctx->block);
-      /* This may expand to a divergent if/then, so allocate stack space for
-       * it.
-       */
-      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
       break;
    case nir_intrinsic_preamble_start_ir3:
       dst[0] = ir3_SHPS_MACRO(ctx->block);
-      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
       break;

    case nir_intrinsic_read_invocation_cond_ir3: {
@@ -2555,7 +2550,6 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
       dst[0]->srcs[0]->num = regid(REG_P0, 0);
       array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
-      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
       break;
    }
@@ -2563,7 +2557,6 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
       dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
-      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
       break;
    }
@@ -2579,7 +2572,6 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
       ballot->srcs[0]->num = regid(REG_P0, 0);
       array_insert(ctx->ir, ctx->ir->predicates, ballot);
-      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
    }

    ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
@@ -3747,20 +3739,6 @@ emit_loop(struct ir3_context *ctx, nir_loop *nloop)
    ctx->loop_id = old_loop_id;
 }

-static void
-stack_push(struct ir3_context *ctx)
-{
-   ctx->stack++;
-   ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
-}
-
-static void
-stack_pop(struct ir3_context *ctx)
-{
-   compile_assert(ctx, ctx->stack > 0);
-   ctx->stack--;
-}
-
 static void
 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
 {
@@ -3770,14 +3748,10 @@ emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
       emit_block(ctx, nir_cf_node_as_block(node));
       break;
    case nir_cf_node_if:
-      stack_push(ctx);
       emit_if(ctx, nir_cf_node_as_if(node));
-      stack_pop(ctx);
       break;
    case nir_cf_node_loop:
-      stack_push(ctx);
       emit_loop(ctx, nir_cf_node_as_loop(node));
-      stack_pop(ctx);
       break;
    case nir_cf_node_function:
       ir3_context_error(ctx, "TODO\n");
@@ -3924,13 +3898,9 @@ emit_function(struct ir3_context *ctx, nir_function_impl *impl)
 {
    nir_metadata_require(impl, nir_metadata_block_index);

-   compile_assert(ctx, ctx->stack == 0);
-
    emit_cf_list(ctx, &impl->body);
    emit_block(ctx, impl->end_block);

-   compile_assert(ctx, ctx->stack == 0);
-
    /* at this point, we should have a single empty block,
     * into which we emit the 'end' instruction.
     */
@@ -4687,8 +4657,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
       goto out;
    }

-   so->branchstack = ctx->max_stack;
-
    ir = so->ir = ctx->ir;

    if (gl_shader_stage_is_compute(so->type)) {
@@ -4879,6 +4847,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,

    IR3_PASS(ir, ir3_array_to_ssa);

+   ir3_calc_reconvergence(so);
+
    do {
       progress = false;
@@ -107,11 +107,6 @@ struct ir3_context {

    unsigned num_arrays;

-   /* Tracking for max level of flowcontrol (branchstack) needed
-    * by a5xx+:
-    */
-   unsigned stack, max_stack;
-
    unsigned loop_id;
    unsigned loop_depth;
@@ -523,44 +523,6 @@ remove_unused_block(struct ir3_block *old_target)
 {
    list_delinit(&old_target->node);

-   /* If there are any physical predecessors due to fallthroughs, then they may
-    * fall through to any of the physical successors of this block. But we can
-    * only fit two, so just pick the "earliest" one, i.e. the fallthrough if
-    * possible.
-    *
-    * TODO: we really ought to have unlimited numbers of physical successors,
-    * both because of this and because we currently don't model some scenarios
-    * with nested break/continue correctly.
-    */
-   struct ir3_block *new_target;
-   if (old_target->physical_successors[1] &&
-       old_target->physical_successors[1]->start_ip <
-       old_target->physical_successors[0]->start_ip) {
-      new_target = old_target->physical_successors[1];
-   } else {
-      new_target = old_target->physical_successors[0];
-   }
-
-   for (unsigned i = 0; i < old_target->physical_predecessors_count; i++) {
-      struct ir3_block *pred = old_target->physical_predecessors[i];
-      if (pred->physical_successors[0] == old_target) {
-         if (!new_target) {
-            /* If we remove a physical successor, make sure the only physical
-             * successor is the first one.
-             */
-            pred->physical_successors[0] = pred->physical_successors[1];
-            pred->physical_successors[1] = NULL;
-         } else {
-            pred->physical_successors[0] = new_target;
-         }
-      } else {
-         assert(pred->physical_successors[1] == old_target);
-         pred->physical_successors[1] = new_target;
-      }
-      if (new_target)
-         ir3_block_add_physical_predecessor(new_target, pred);
-   }
-
    /* cleanup dangling predecessors: */
    for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
       if (old_target->successors[i]) {
@@ -568,13 +530,6 @@ remove_unused_block(struct ir3_block *old_target)
          ir3_block_remove_predecessor(succ, old_target);
       }
    }
-
-   for (unsigned i = 0; i < ARRAY_SIZE(old_target->physical_successors); i++) {
-      if (old_target->physical_successors[i]) {
-         struct ir3_block *succ = old_target->physical_successors[i];
-         ir3_block_remove_physical_predecessor(succ, old_target);
-      }
-   }
 }

 static bool
@@ -591,21 +546,16 @@ retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
       cur_block->successors[1] = new_target;
    }

-   /* also update physical_successors: */
-   if (cur_block->physical_successors[0] == old_target) {
-      cur_block->physical_successors[0] = new_target;
-   } else {
-      assert(cur_block->physical_successors[1] == old_target);
-      cur_block->physical_successors[1] = new_target;
-   }
-
    /* update new target's predecessors: */
    ir3_block_add_predecessor(new_target, cur_block);
-   ir3_block_add_physical_predecessor(new_target, cur_block);

    /* and remove old_target's predecessor: */
    ir3_block_remove_predecessor(old_target, cur_block);
-   ir3_block_remove_physical_predecessor(old_target, cur_block);
+
+   /* If we reconverged at the old target, we'll reconverge at the new target
+    * too:
+    */
+   new_target->reconvergence_point |= old_target->reconvergence_point;

    instr->cat0.target = new_target;
@@ -627,6 +577,12 @@ opt_jump(struct ir3 *ir)
       block->index = index++;

    foreach_block (block, &ir->block_list) {
+      /* This pass destroys the physical CFG so don't keep it around to avoid
+       * validation errors.
+       */
+      block->physical_successors[0] = block->physical_successors[1] = NULL;
+      block->physical_predecessors_count = 0;
+
       foreach_instr (instr, &block->instr_list) {
          if (!is_flow(instr) || !instr->cat0.target)
             continue;
@@ -707,51 +663,18 @@ mark_jp(struct ir3_block *block)
       target->flags |= IR3_INSTR_JP;
 }

-/* Mark points where control flow converges or diverges.
+/* Mark points where control flow reconverges.
  *
- * Divergence points could actually be re-convergence points where
- * "parked" threads are recoverged with threads that took the opposite
- * path last time around. Possibly it is easier to think of (jp) as
- * "the execution mask might have changed".
+ * Re-convergence points are where "parked" threads are reconverged with threads
+ * that took the opposite path last time around. We already calculated them, we
+ * just need to mark them with (jp).
  */
 static void
 mark_xvergence_points(struct ir3 *ir)
 {
    foreach_block (block, &ir->block_list) {
-      /* We need to insert (jp) if an entry in the "branch stack" is created for
-       * our block. This happens if there is a predecessor to our block that may
-       * fallthrough to an earlier block in the physical CFG, either because it
-       * ends in a non-uniform conditional branch or because there's a
-       * fallthrough for an block in-between that also starts with (jp) and was
-       * pushed on the branch stack already.
-       */
-      for (unsigned i = 0; i < block->predecessors_count; i++) {
-         struct ir3_block *pred = block->predecessors[i];
-
-         for (unsigned j = 0; j < ARRAY_SIZE(pred->physical_successors); j++) {
-            if (pred->physical_successors[j] != NULL &&
-                pred->physical_successors[j]->start_ip < block->start_ip)
-               mark_jp(block);
-
-            /* If the predecessor just falls through to this block, we still
-             * need to check if it "falls through" by jumping to the block. This
-             * can happen if opt_jump fails and the block ends in two branches,
-             * or if there's an empty if-statement (which currently can happen
-             * with binning shaders after dead-code elimination) and the block
-             * before ends with a conditional branch directly to this block.
-             */
-            if (pred->physical_successors[j] == block) {
-               foreach_instr_rev (instr, &pred->instr_list) {
-                  if (!is_flow(instr))
-                     break;
-                  if (instr->cat0.target == block) {
-                     mark_jp(block);
-                     break;
-                  }
-               }
-            }
-         }
-      }
+      if (block->reconvergence_point)
+         mark_jp(block);
    }
 }
@@ -264,6 +264,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
  * exclusive = reduce;
  * inclusive = src OP exclusive;
  * reduce = inclusive;
  * break;
  * }
+ * footer:
  * }
@@ -280,6 +281,9 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in

    struct ir3_block *footer = ir3_block_create(ir);
    list_add(&footer->node, &exit->node);
+   footer->reconvergence_point = true;
+
+   after_block->reconvergence_point = true;

    link_blocks(before_block, header, 0);
@@ -312,6 +316,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
       before_block->brtype = IR3_BRANCH_GETONE;
       before_block->condition = NULL;
       mov_immed(instr->dsts[0], then_block, 0);
+      after_block->reconvergence_point = true;
       before_block = after_block;
       after_block = split_block(ir, before_block, instr);
       then_block = create_if(ir, before_block, after_block);
@@ -333,6 +338,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
    case OPC_BALLOT_MACRO:
    case OPC_READ_COND_MACRO:
       before_block->brtype = IR3_BRANCH_COND;
+      after_block->reconvergence_point = true;
       break;
    case OPC_ANY_MACRO:
       before_block->brtype = IR3_BRANCH_ANY;
@@ -344,6 +350,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
    case OPC_READ_FIRST_MACRO:
    case OPC_SWZ_SHARED_MACRO:
       before_block->brtype = IR3_BRANCH_GETONE;
+      after_block->reconvergence_point = true;
       break;
    default:
       unreachable("bad opcode");
@@ -489,7 +489,9 @@ print_block(struct ir3_block *block, int lvl)
    struct log_stream *stream = mesa_log_streami();

    tab(stream, lvl);
-   mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
+   mesa_log_stream_printf(stream, "%sblock%u {\n",
+                          block->reconvergence_point ? "(jp)" : "",
+                          block_id(block));

    if (block->predecessors_count > 0) {
       tab(stream, lvl + 1);
src/freedreno/ir3/ir3_reconvergence.c (new file, 300 lines)
@@ -0,0 +1,300 @@
/*
 * Copyright (C) 2023 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* The pass uses information on which branches are divergent in order to
 * determine which blocks are "reconvergence points" where parked threads may
 * become reactivated as well as to add "physical" edges where the machine may
 * fall through to the next reconvergence point. Reconvergence points need a
 * (jp) added in the assembly, and physical edges are needed to model shared
 * register liveness correctly. Reconvergence happens in the following two
 * scenarios:
 *
 * 1. When there is a divergent branch, the later of the two block destinations
 *    becomes a reconvergence point.
 * 2. When a forward edge crosses over a reconvergence point that may be
 *    outstanding at the start of the edge, we need to park the threads that
 *    take the edge and resume execution at the reconvergence point. This means
 *    that there is a physical edge from the start of the edge to the
 *    reconvergence point, and the destination of the edge becomes a new
 *    reconvergence point.
 *
 * For example, consider this simple if-else:
 *
 * bb0:
 *    ...
 *    br p0.x, #bb1, #bb2
 * bb1:
 *    ...
 *    jump bb3
 * bb2:
 *    ...
 *    jump bb3
 * bb3:
 *    ...
 *
 * The divergent branch at the end of bb0 makes bb2 a reconvergence point
 * following (1), which starts being outstanding after the branch at the end of
 * bb1. The jump to bb3 at the end of bb1 goes over bb2 while it is outstanding,
 * so there is a physical edge from bb1 to bb2 and bb3 is a reconvergence point
 * following (2).
 *
 * Note that (2) can apply recursively. To handle this efficiently we build an
 * interval tree of forward edges that cross other blocks and whenever a block
 * becomes a RP we iterate through the edges jumping across it using the tree.
 * We also need to keep track of the range where each RP may be
 * "outstanding." A RP becomes outstanding after a branch to it parks its
 * threads there. This range may increase in size as we discover more and more
 * branches to it that may park their threads there.
 *
 * Finally, we need to compute the branchstack value, which is the maximum
 * number of outstanding reconvergence points. For the if-else, the branchstack
 * is 2, because after the jump at the end of bb1 both reconvergence points are
 * outstanding (although the first is removed immediately afterwards). Because
 * we already computed the range where each RP is outstanding, this part is
 * relatively straightforward.
 */

#include <limits.h>

#include "ir3_shader.h"

#include "util/rb_tree.h"
#include "util/u_worklist.h"
#include "util/ralloc.h"

struct logical_edge {
   struct uinterval_node node;
   struct ir3_block *start_block;
   struct ir3_block *end_block;
};

struct block_data {
   /* For a reconvergence point, the index of the first block where, upon
    * exiting, the RP may be outstanding. Normally this is a predecessor but may
    * be a loop header for loops.
    */
   unsigned first_divergent_pred;

   /* The last processed first_divergent_pred. */
   unsigned first_processed_divergent_pred;

   /* The number of blocks that have this block as a first_divergent_pred. */
   unsigned divergence_count;
};

void
ir3_calc_reconvergence(struct ir3_shader_variant *so)
{
   void *mem_ctx = ralloc_context(NULL);

   /* It's important that the index we use corresponds to the final order blocks
    * are emitted in!
    */
   unsigned index = 0;
   foreach_block (block, &so->ir->block_list) {
      block->index = index++;
   }

   /* Setup the tree of edges */
   unsigned edge_count = 0;
   foreach_block (block, &so->ir->block_list) {
      if (block->successors[0])
         edge_count++;
      if (block->successors[1])
         edge_count++;
   }

   struct rb_tree forward_edges, backward_edges;
   rb_tree_init(&forward_edges);
   rb_tree_init(&backward_edges);

   unsigned edge = 0;
   struct logical_edge *edges =
      ralloc_array(mem_ctx, struct logical_edge, edge_count);
   struct block_data *blocks =
      ralloc_array(mem_ctx, struct block_data, index);
   foreach_block (block, &so->ir->block_list) {
      blocks[block->index].divergence_count = 0;
      blocks[block->index].first_divergent_pred = UINT_MAX;
      blocks[block->index].first_processed_divergent_pred = UINT_MAX;
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (block->successors[i]) {
            if (block->successors[i]->index > block->index + 1) {
               edges[edge] = (struct logical_edge) {
                  .node = {
                     .interval = {
                        block->index + 1,
                        block->successors[i]->index - 1
                     },
                  },
                  .start_block = block,
                  .end_block = block->successors[i],
               };

               uinterval_tree_insert(&forward_edges, &edges[edge++].node);
            } else if (block->successors[i]->index < block->index - 1) {
               edges[edge] = (struct logical_edge) {
                  .node = {
                     .interval = {
                        block->successors[i]->index - 1,
                        block->index + 1
                     },
                  },
                  .start_block = block->successors[i],
                  .end_block = block,
               };

               uinterval_tree_insert(&backward_edges, &edges[edge++].node);
            }
         }
      }
   }

   assert(edge <= edge_count);

   u_worklist worklist;
   u_worklist_init(&worklist, index, mem_ctx);

   /* First, find and mark divergent branches. The later destination will be the
    * reconvergence point.
    */
   foreach_block (block, &so->ir->block_list) {
      if (block->successors[0] && block->successors[1]) {
         unsigned idx = block->successors[0]->index >
            block->successors[1]->index ? 0 : 1;
         block->successors[idx]->reconvergence_point = true;
         blocks[block->successors[idx]->index].first_divergent_pred =
            block->index;
         u_worklist_push_tail(&worklist, block->successors[idx], index);
      }
   }

   while (!u_worklist_is_empty(&worklist)) {
      struct ir3_block *block =
         u_worklist_pop_head(&worklist, struct ir3_block, index);
      assert(block->reconvergence_point);

      /* Iterate over all edges stepping over the block. */
      struct uinterval interval = { block->index, block->index };
      uinterval_tree_foreach (struct logical_edge, edge, interval, &forward_edges,
                              node) {
         /* If "block" definitely isn't outstanding when the branch
          * corresponding to "edge" is taken, then we don't need to park
          * "edge->end_block" and we can ignore this.
          *
          * TODO: add uinterval_tree_foreach_from() and use that instead.
          */
         if (edge->start_block->index <= blocks[block->index].first_divergent_pred)
            continue;

         /* If we've already processed this edge + RP pair, don't process it
          * again. Because edges are ordered by start point, we must have
          * processed every edge after this too.
          */
         if (edge->start_block->index >
             blocks[block->index].first_processed_divergent_pred)
            break;

         edge->end_block->reconvergence_point = true;
         if (blocks[edge->end_block->index].first_divergent_pred >
             edge->start_block->index) {
            blocks[edge->end_block->index].first_divergent_pred =
               edge->start_block->index;
            u_worklist_push_tail(&worklist, edge->end_block, index);
         }

         /* Backwards branches extend the range of divergence. For example, a
          * divergent break creates a reconvergence point after the loop that
          * stays outstanding throughout subsequent iterations, even at points
          * before the break. This takes that into account.
          *
          * More precisely, a backwards edge that originates between the start
          * and end of "edge" extends the divergence range to the beginning of
          * its destination if it is taken, or alternatively to the end of the
          * block before its destination.
          *
          * TODO: in case we ever start accepting weird non-structured control
          * flow, we may also need to handle this above if a divergent branch
          * crosses over a backwards edge.
          */
         struct uinterval interval2 = { edge->start_block->index, edge->start_block->index };
         uinterval_tree_foreach (struct logical_edge, back_edge, interval2, &backward_edges,
                                 node) {
            if (back_edge->end_block->index < edge->end_block->index) {
               if (blocks[edge->end_block->index].first_divergent_pred >
                   back_edge->start_block->index - 1) {
                  blocks[edge->end_block->index].first_divergent_pred =
                     back_edge->start_block->index - 1;
                  u_worklist_push_tail(&worklist, edge->end_block, index);
               }
            }
         }
      }

      blocks[block->index].first_processed_divergent_pred =
         blocks[block->index].first_divergent_pred;
   }

   /* For each reconvergent point p we have an open range
    * (p->first_divergent_pred, p) where p may be outstanding. We need to keep
    * track of the number of outstanding RPs and calculate the maximum.
    */
   foreach_block (block, &so->ir->block_list) {
      if (block->reconvergence_point) {
         blocks[blocks[block->index].first_divergent_pred].divergence_count++;
      }
   }

   unsigned rc_level = 0;
   so->branchstack = 0;
   foreach_block (block, &so->ir->block_list) {
      if (block->reconvergence_point)
         rc_level--;

      /* Account for lowerings that produce divergent control flow. */
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_SCAN_MACRO:
            so->branchstack = MAX2(so->branchstack, rc_level + 2);
            break;
         case OPC_BALLOT_MACRO:
         case OPC_READ_COND_MACRO:
         case OPC_ELECT_MACRO:
         case OPC_READ_FIRST_MACRO:
         case OPC_SWZ_SHARED_MACRO:
            so->branchstack = MAX2(so->branchstack, rc_level + 1);
            break;
         default:
            break;
         }
      }

      rc_level += blocks[block->index].divergence_count;

      so->branchstack = MAX2(so->branchstack, rc_level);
   }
   assert(rc_level == 0);

   ralloc_free(mem_ctx);
}
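For a concrete feel of the final counting loop above, here is a small standalone sketch (not part of the commit) that replays the bb0..bb3 if-else from the file's header comment: bb2's first divergent predecessor is bb0, bb3's is bb1, and the maximum number of simultaneously outstanding reconvergence points comes out as 2. MAX2 and the two arrays are defined locally just for the illustration:

   #include <stdio.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   int main(void)
   {
      /* first_divergent_pred per block; -1 means "not a reconvergence point". */
      const int first_divergent_pred[4] = { -1, -1, /* bb2 */ 0, /* bb3 */ 1 };

      /* How many RPs charge each block as their first divergent predecessor. */
      unsigned divergence_count[4] = { 0, 0, 0, 0 };
      for (int b = 0; b < 4; b++) {
         if (first_divergent_pred[b] >= 0)
            divergence_count[first_divergent_pred[b]]++;
      }

      /* Same walk as the final loop in ir3_calc_reconvergence(). */
      unsigned rc_level = 0, branchstack = 0;
      for (int b = 0; b < 4; b++) {
         if (first_divergent_pred[b] >= 0)
            rc_level--;                   /* reconverge: RP no longer outstanding */
         rc_level += divergence_count[b]; /* RPs that become outstanding after b */
         branchstack = MAX2(branchstack, rc_level);
      }

      printf("branchstack = %u\n", branchstack); /* prints 2 */
      return 0;
   }

The peak of 2 is reached right after bb1, which is exactly the case the commit message says the old per-if/loop counting missed.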
@@ -1243,12 +1243,7 @@ ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
    if (v->compiler->gen < 5)
       return v->branchstack;

-   if (v->branchstack > 0) {
-      uint32_t branchstack = v->branchstack / 2 + 1;
-      return MIN2(branchstack, v->compiler->branchstack_size / 2);
-   } else {
-      return 0;
-   }
+   return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
 }
@@ -426,7 +426,8 @@ ir3_validate(struct ir3 *ir)
          ctx->current_instr = NULL;

          /* Each logical successor should also be a physical successor: */
-         validate_assert(ctx, is_physical_successor(block, block->successors[i]));
+         if (block->physical_successors[0])
+            validate_assert(ctx, is_physical_successor(block, block->successors[i]));
       }
    }
@@ -108,6 +108,7 @@ libfreedreno_ir3_files = files(
   'ir3_ra.c',
   'ir3_ra.h',
   'ir3_ra_validate.c',
+  'ir3_reconvergence.c',
   'ir3_remove_unreachable.c',
   'ir3_sched.c',
   'ir3_shader.c',