radeonsi: move si_build_main_function into si_shader_llvm.c
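
The function is renamed to si_llvm_translate_nir. si_vs_needs_prolog
becomes non-static, because the moved code still calls it, while
si_llvm_get_block_size, si_llvm_declare_compute_memory, and
si_nir_build_llvm become static in si_shader_llvm.c. No functional
changes.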

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7939>
Marek Olšák
2020-12-04 12:17:45 -05:00
committed by Marge Bot
parent 273be1686e
commit 8cd1522622
3 changed files with 229 additions and 227 deletions

src/gallium/drivers/radeonsi/si_shader.c

@@ -1296,9 +1296,9 @@ static void si_optimize_vs_outputs(struct si_shader_context *ctx)
                           &shader->info.nr_param_exports);
 }
 
-static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
-                               const struct si_vs_prolog_bits *prolog_key,
-                               const struct si_shader_key *key, bool ngg_cull_shader)
+bool si_vs_needs_prolog(const struct si_shader_selector *sel,
+                        const struct si_vs_prolog_bits *prolog_key,
+                        const struct si_shader_key *key, bool ngg_cull_shader)
 {
    /* VGPR initialization fixup for Vega10 and Raven is always done in the
     * VS prolog. */
@@ -1307,220 +1307,6 @@ static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
           (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
 }
 
-static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
-                                   struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
-{
-   struct si_shader_selector *sel = shader->selector;
-   const struct si_shader_info *info = &sel->info;
-
-   ctx->shader = shader;
-   ctx->stage = sel->info.stage;
-
-   ctx->num_const_buffers = info->base.num_ubos;
-   ctx->num_shader_buffers = info->base.num_ssbos;
-
-   ctx->num_samplers = util_last_bit(info->base.textures_used);
-   ctx->num_images = info->base.num_images;
-
-   si_llvm_init_resource_callbacks(ctx);
-
-   switch (ctx->stage) {
-   case MESA_SHADER_VERTEX:
-      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
-      break;
-   case MESA_SHADER_TESS_CTRL:
-      si_llvm_init_tcs_callbacks(ctx);
-      break;
-   case MESA_SHADER_TESS_EVAL:
-      si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
-      break;
-   case MESA_SHADER_GEOMETRY:
-      si_llvm_init_gs_callbacks(ctx);
-      break;
-   case MESA_SHADER_FRAGMENT:
-      si_llvm_init_ps_callbacks(ctx);
-      break;
-   case MESA_SHADER_COMPUTE:
-      ctx->abi.load_local_group_size = si_llvm_get_block_size;
-      break;
-   default:
-      assert(!"Unsupported shader type");
-      return false;
-   }
-
-   si_llvm_create_main_func(ctx, ngg_cull_shader);
-
-   if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
-      si_preload_esgs_ring(ctx);
-
-   if (ctx->stage == MESA_SHADER_GEOMETRY)
-      si_preload_gs_rings(ctx);
-   else if (ctx->stage == MESA_SHADER_TESS_EVAL)
-      si_llvm_preload_tes_rings(ctx);
-
-   if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
-      for (unsigned i = 0; i < 6; i++) {
-         ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-      }
-   }
-
-   if (ctx->stage == MESA_SHADER_GEOMETRY) {
-      for (unsigned i = 0; i < 4; i++) {
-         ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-      }
-      if (shader->key.as_ngg) {
-         for (unsigned i = 0; i < 4; ++i) {
-            ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-            ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-         }
-
-         assert(!ctx->gs_ngg_scratch);
-         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
-         ctx->gs_ngg_scratch =
-            LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
-         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
-         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
-
-         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
-            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
-         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
-         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
-      }
-   }
-
-   if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
-      /* Unconditionally declare scratch space base for streamout and
-       * vertex compaction. Whether space is actually allocated is
-       * determined during linking / PM4 creation.
-       *
-       * Add an extra dword per vertex to ensure an odd stride, which
-       * avoids bank conflicts for SoA accesses.
-       */
-      if (!gfx10_is_ngg_passthrough(shader))
-         si_llvm_declare_esgs_ring(ctx);
-
-      /* This is really only needed when streamout and / or vertex
-       * compaction is enabled.
-       */
-      if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
-         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
-         ctx->gs_ngg_scratch =
-            LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
-         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
-         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
-      }
-   }
-
-   /* For GFX9 merged shaders:
-    * - Set EXEC for the first shader. If the prolog is present, set
-    *   EXEC there instead.
-    * - Add a barrier before the second shader.
-    * - In the second shader, reset EXEC to ~0 and wrap the main part in
-    *   an if-statement. This is required for correctness in geometry
-    *   shaders, to ensure that empty GS waves do not send GS_EMIT and
-    *   GS_CUT messages.
-    *
-    * For monolithic merged shaders, the first shader is wrapped in an
-    * if-block together with its prolog in si_build_wrapper_function.
-    *
-    * NGG vertex and tess eval shaders running as the last
-    * vertex/geometry stage handle execution explicitly using
-    * if-statements.
-    */
-   if (ctx->screen->info.chip_class >= GFX9) {
-      if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
-          (ctx->stage == MESA_SHADER_TESS_EVAL ||
-           (ctx->stage == MESA_SHADER_VERTEX &&
-            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
-         si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
-      } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
-                 (shader->key.as_ngg && !shader->key.as_es)) {
-         LLVMValueRef thread_enabled = NULL;
-         bool nested_barrier;
-
-         if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
-                                        !shader->key.as_es && !shader->key.opt.ngg_culling))
-            ac_init_exec_full_mask(&ctx->ac);
-
-         if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
-             shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
-            gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
-
-            /* Build the primitive export at the beginning
-             * of the shader if possible.
-             */
-            if (gfx10_ngg_export_prim_early(shader))
-               gfx10_ngg_build_export_prim(ctx, NULL, NULL);
-         }
-
-         if (ctx->stage == MESA_SHADER_TESS_CTRL) {
-            /* We need the barrier only if TCS inputs are read from LDS. */
-            nested_barrier =
-               !shader->key.opt.same_patch_vertices ||
-               shader->selector->info.base.inputs_read &
-                  ~shader->selector->tcs_vgpr_only_inputs;
-
-            /* The wrapper inserts the conditional for monolithic shaders,
-             * and if this is a monolithic shader, we are already inside
-             * the conditional, so don't insert it.
-             */
-            if (!shader->is_monolithic)
-               thread_enabled = si_is_gs_thread(ctx); /* 2nd shader thread really */
-         } else if (ctx->stage == MESA_SHADER_GEOMETRY) {
-            if (shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_prologue(ctx);
-               nested_barrier = false;
-            } else {
-               nested_barrier = true;
-            }
-
-            thread_enabled = si_is_gs_thread(ctx);
-         } else {
-            thread_enabled = si_is_es_thread(ctx);
-            nested_barrier = false;
-         }
-
-         if (thread_enabled) {
-            ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
-            ctx->merged_wrap_if_label = 11500;
-            ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
-         }
-
-         if (nested_barrier) {
-            /* Execute a barrier before the second shader in
-             * a merged shader.
-             *
-             * Execute the barrier inside the conditional block,
-             * so that empty waves can jump directly to s_endpgm,
-             * which will also signal the barrier.
-             *
-             * This is possible in gfx9, because an empty wave
-             * for the second shader does not participate in
-             * the epilogue. With NGG, empty waves may still
-             * be required to export data (e.g. GS output vertices),
-             * so we cannot let them exit early.
-             *
-             * If the shader is TCS and the TCS epilog is present
-             * and contains a barrier, it will wait there and then
-             * reach s_endpgm.
-             */
-            si_llvm_emit_barrier(ctx);
-         }
-      }
-   }
-
-   bool success = si_nir_build_llvm(ctx, nir);
-   if (free_nir)
-      ralloc_free(nir);
-   if (!success) {
-      fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
-      return false;
-   }
-
-   si_llvm_build_ret(ctx, ctx->return_value);
-   return true;
-}
 
 /**
  * Compute the VS prolog key, which contains all the information needed to
  * build the VS prolog function, and set shader->info bits where needed.
@@ -1682,7 +1468,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
    LLVMValueRef ngg_cull_main_fn = NULL;
 
    if (shader->key.opt.ngg_culling) {
-      if (!si_build_main_function(&ctx, shader, nir, false, true)) {
+      if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) {
          si_llvm_dispose(&ctx);
         return false;
      }
@@ -1690,7 +1476,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
       ctx.main_fn = NULL;
    }
 
-   if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
+   if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) {
       si_llvm_dispose(&ctx);
      return false;
   }
@@ -1763,7 +1549,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
       shader_ls.key.opt = shader->key.opt;
       shader_ls.is_monolithic = true;
 
-      if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
+      if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) {
          si_llvm_dispose(&ctx);
         return false;
      }
@@ -1827,7 +1613,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
       shader_es.key.opt = shader->key.opt;
       shader_es.is_monolithic = true;
 
-      if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
+      if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) {
         si_llvm_dispose(&ctx);
        return false;
     }

src/gallium/drivers/radeonsi/si_shader_internal.h

@@ -202,6 +202,9 @@ void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, u
                         enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
 void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader);
 unsigned si_get_max_workgroup_size(const struct si_shader *shader);
+bool si_vs_needs_prolog(const struct si_shader_selector *sel,
+                        const struct si_vs_prolog_bits *prolog_key,
+                        const struct si_shader_key *key, bool ngg_cull_shader);
 bool si_need_ps_prolog(const union si_shader_part_key *key);
 void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
                           bool separate_prolog);
@@ -251,12 +254,11 @@ void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
 LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                              unsigned bitwidth);
 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
-LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi);
-void si_llvm_declare_compute_memory(struct si_shader_context *ctx);
-bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                                unsigned num_parts, unsigned main_part,
                                unsigned next_shader_first_part, bool same_thread_count);
+bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
+                           struct nir_shader *nir, bool free_nir, bool ngg_cull_shader);
 
 /* si_shader_llvm_gs.c */
 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);

src/gallium/drivers/radeonsi/si_shader_llvm.c

@@ -406,7 +406,7 @@ LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle
    }
 }
 
-LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
+static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
 {
    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
@@ -414,7 +414,7 @@ LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
    return ac_get_arg(&ctx->ac, ctx->block_size);
 }
 
-void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
+static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
 {
    struct si_shader_selector *sel = ctx->shader->selector;
    unsigned lds_size = sel->info.base.cs.shared_size;
@@ -431,7 +431,7 @@ void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
    ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
 }
 
-bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
+static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
 {
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       si_llvm_load_vs_inputs(ctx, nir);
@@ -804,3 +804,217 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
    else
       LLVMBuildRet(builder, ret);
 }
+
+bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
+                           struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
+{
+   struct si_shader_selector *sel = shader->selector;
+   const struct si_shader_info *info = &sel->info;
+
+   ctx->shader = shader;
+   ctx->stage = sel->info.stage;
+
+   ctx->num_const_buffers = info->base.num_ubos;
+   ctx->num_shader_buffers = info->base.num_ssbos;
+
+   ctx->num_samplers = util_last_bit(info->base.textures_used);
+   ctx->num_images = info->base.num_images;
+
+   si_llvm_init_resource_callbacks(ctx);
+
+   switch (ctx->stage) {
+   case MESA_SHADER_VERTEX:
+      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      si_llvm_init_tcs_callbacks(ctx);
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
+      break;
+   case MESA_SHADER_GEOMETRY:
+      si_llvm_init_gs_callbacks(ctx);
+      break;
+   case MESA_SHADER_FRAGMENT:
+      si_llvm_init_ps_callbacks(ctx);
+      break;
+   case MESA_SHADER_COMPUTE:
+      ctx->abi.load_local_group_size = si_llvm_get_block_size;
+      break;
+   default:
+      assert(!"Unsupported shader type");
+      return false;
+   }
+
+   si_llvm_create_main_func(ctx, ngg_cull_shader);
+
+   if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
+      si_preload_esgs_ring(ctx);
+
+   if (ctx->stage == MESA_SHADER_GEOMETRY)
+      si_preload_gs_rings(ctx);
+   else if (ctx->stage == MESA_SHADER_TESS_EVAL)
+      si_llvm_preload_tes_rings(ctx);
+
+   if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
+      for (unsigned i = 0; i < 6; i++) {
+         ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+      }
+   }
+
+   if (ctx->stage == MESA_SHADER_GEOMETRY) {
+      for (unsigned i = 0; i < 4; i++) {
+         ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+      }
+      if (shader->key.as_ngg) {
+         for (unsigned i = 0; i < 4; ++i) {
+            ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+            ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+         }
+
+         assert(!ctx->gs_ngg_scratch);
+         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
+         ctx->gs_ngg_scratch =
+            LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
+         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+
+         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
+            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
+         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
+         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
+      }
+   }
+
+   if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
+      /* Unconditionally declare scratch space base for streamout and
+       * vertex compaction. Whether space is actually allocated is
+       * determined during linking / PM4 creation.
+       *
+       * Add an extra dword per vertex to ensure an odd stride, which
+       * avoids bank conflicts for SoA accesses.
+       */
+      if (!gfx10_is_ngg_passthrough(shader))
+         si_llvm_declare_esgs_ring(ctx);
+
+      /* This is really only needed when streamout and / or vertex
+       * compaction is enabled.
+       */
+      if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
+         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
+         ctx->gs_ngg_scratch =
+            LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
+         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+      }
+   }
+
+   /* For GFX9 merged shaders:
+    * - Set EXEC for the first shader. If the prolog is present, set
+    *   EXEC there instead.
+    * - Add a barrier before the second shader.
+    * - In the second shader, reset EXEC to ~0 and wrap the main part in
+    *   an if-statement. This is required for correctness in geometry
+    *   shaders, to ensure that empty GS waves do not send GS_EMIT and
+    *   GS_CUT messages.
+    *
+    * For monolithic merged shaders, the first shader is wrapped in an
+    * if-block together with its prolog in si_build_wrapper_function.
+    *
+    * NGG vertex and tess eval shaders running as the last
+    * vertex/geometry stage handle execution explicitly using
+    * if-statements.
+    */
+   if (ctx->screen->info.chip_class >= GFX9) {
+      if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
+          (ctx->stage == MESA_SHADER_TESS_EVAL ||
+           (ctx->stage == MESA_SHADER_VERTEX &&
+            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
+         si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
+      } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
+                 (shader->key.as_ngg && !shader->key.as_es)) {
+         LLVMValueRef thread_enabled = NULL;
+         bool nested_barrier;
+
+         if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
+                                        !shader->key.as_es && !shader->key.opt.ngg_culling))
+            ac_init_exec_full_mask(&ctx->ac);
+
+         if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
+             shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
+            gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
+
+            /* Build the primitive export at the beginning
+             * of the shader if possible.
+             */
+            if (gfx10_ngg_export_prim_early(shader))
+               gfx10_ngg_build_export_prim(ctx, NULL, NULL);
+         }
+
+         if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+            /* We need the barrier only if TCS inputs are read from LDS. */
+            nested_barrier =
+               !shader->key.opt.same_patch_vertices ||
+               shader->selector->info.base.inputs_read &
+                  ~shader->selector->tcs_vgpr_only_inputs;
+
+            /* The wrapper inserts the conditional for monolithic shaders,
+             * and if this is a monolithic shader, we are already inside
+             * the conditional, so don't insert it.
+             */
+            if (!shader->is_monolithic)
+               thread_enabled = si_is_gs_thread(ctx); /* 2nd shader thread really */
+         } else if (ctx->stage == MESA_SHADER_GEOMETRY) {
+            if (shader->key.as_ngg) {
+               gfx10_ngg_gs_emit_prologue(ctx);
+               nested_barrier = false;
+            } else {
+               nested_barrier = true;
+            }
+
+            thread_enabled = si_is_gs_thread(ctx);
+         } else {
+            thread_enabled = si_is_es_thread(ctx);
+            nested_barrier = false;
+         }
+
+         if (thread_enabled) {
+            ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
+            ctx->merged_wrap_if_label = 11500;
+            ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
+         }
+
+         if (nested_barrier) {
+            /* Execute a barrier before the second shader in
+             * a merged shader.
+             *
+             * Execute the barrier inside the conditional block,
+             * so that empty waves can jump directly to s_endpgm,
+             * which will also signal the barrier.
+             *
+             * This is possible in gfx9, because an empty wave
+             * for the second shader does not participate in
+             * the epilogue. With NGG, empty waves may still
+             * be required to export data (e.g. GS output vertices),
+             * so we cannot let them exit early.
+             *
+             * If the shader is TCS and the TCS epilog is present
+             * and contains a barrier, it will wait there and then
+             * reach s_endpgm.
+             */
+            si_llvm_emit_barrier(ctx);
+         }
+      }
+   }
+
+   bool success = si_nir_build_llvm(ctx, nir);
+   if (free_nir)
+      ralloc_free(nir);
+   if (!success) {
+      fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
+      return false;
+   }
+
+   si_llvm_build_ret(ctx, ctx->return_value);
+   return true;
+}