ac/llvm: fix the remaining s_barriers for LLVM 15

LLVM 15 doesn't insert s_waitcnt before barriers, so emit an explicit
LGKM s_waitcnt in front of each remaining s_barrier.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16304>
2022-05-02 22:08:43 -04:00
parent b48d183633
commit 20bb85e2ec
4 changed files with 38 additions and 6 deletions
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -4025,6 +4025,7 @@ void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan
 void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 {
   ac_build_wg_wavescan_top(ctx, ws);
+   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
   ac_build_s_barrier(ctx, ws->stage);
   ac_build_wg_wavescan_bottom(ctx, ws);
 }
@@ -4087,6 +4088,7 @@ void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 {
   ac_build_wg_scan_top(ctx, ws);
+   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
   ac_build_s_barrier(ctx, ws->stage);
   ac_build_wg_scan_bottom(ctx, ws);
 }
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1338,6 +1338,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)

         if (ctx->stage == MESA_SHADER_VERTEX) {
            /* Wait for GS stores to finish. */
+            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
            ac_build_s_barrier(&ctx->ac, ctx->stage);

            tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
@@ -1384,6 +1385,7 @@ gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
   LLVMBuildBr(ctx->ac.builder, merge_block);
   LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);
 }

@@ -1459,6 +1461,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp, tmp2;

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   const LLVMValueRef tid = get_thread_id_in_tg(ctx);
@@ -1565,6 +1568,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
   }
   ac_build_endif(&ctx->ac, 5130);

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   /* Export primitive data */
@@ -2150,6 +2154,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
          * and contains a barrier, it will wait there and then
          * reach s_endpgm.
          */
+         ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
      }

--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -500,6 +500,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
      }
   }

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
@@ -1023,6 +1024,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
      builder, packed_data,
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
@@ -1142,6 +1145,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
      cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr);
   }
   ac_build_endif(&ctx->ac, 16002);
+
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
@@ -1172,6 +1177,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
   }
   ac_build_endif(&ctx->ac, 16008);

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   /* Load the vertex masks and compute the new ES thread count. */
@@ -1263,6 +1269,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
      ac_build_s_endpgm(&ctx->ac);
   }
   ac_build_endif(&ctx->ac, 19202);
+
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   /* Send the final vertex and primitive counts. */
@@ -1408,8 +1416,10 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)

   /* These two also use LDS. */
   if (gfx10_ngg_writes_user_edgeflags(shader) ||
-       (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
+       (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) {
+      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
      ac_build_s_barrier(&ctx->ac, ctx->stage);
+   }

   ctx->return_value = ret;
 }
@@ -1512,8 +1522,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
      assert(!unterminated_es_if_block);

      /* Streamout already inserted the barrier, so don't insert it again. */
-      if (!ctx->so.num_outputs)
+      if (!ctx->so.num_outputs) {
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx->ac, ctx->stage);
+      }

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
@@ -1536,8 +1548,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
      assert(!unterminated_es_if_block);

      /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
-      if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader))
+      if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx->ac, ctx->stage);
+      }

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Extract the PROVOKING_VTX_INDEX field. */
@@ -1630,7 +1644,8 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
         outputs[i].vertex_streams = 0;

         if (ctx->stage == MESA_SHADER_VERTEX) {
-            /* Wait for GS stores to finish. */
+            /* Wait for LDS stores to finish. */
+            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
            ac_build_s_barrier(&ctx->ac, ctx->stage);

            tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
@@ -1862,6 +1877,7 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
      }
   ac_build_endif(&ctx->ac, 15090);

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);
 }

@@ -1926,6 +1942,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)

   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
@@ -2003,8 +2020,10 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
      LLVMValueRef prim_enable = LLVMBuildAnd(builder, live, is_emit, "");

      /* Wait for streamout to finish before we kill primitives. */
-      if (ctx->so.num_outputs)
+      if (ctx->so.num_outputs) {
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx->ac, ctx->stage);
+      }

      ac_build_ifcc(&ctx->ac, prim_enable, 0);
      {
@@ -2062,6 +2081,8 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
         ac_build_endif(&ctx->ac, 0);
      }
      ac_build_endif(&ctx->ac, 0);
+
+      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
      ac_build_s_barrier(&ctx->ac, ctx->stage);
   }

@@ -2131,6 +2152,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
   }
   ac_build_endif(&ctx->ac, 5130);

+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
   ac_build_s_barrier(&ctx->ac, ctx->stage);

   /* Export primitive data */
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -1015,10 +1015,13 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
         /* We need the barrier only if TCS inputs are read from LDS. */
         if (!shader->key.ge.opt.same_patch_vertices ||
             shader->selector->info.base.inputs_read &
-             ~shader->selector->info.tcs_vgpr_only_inputs)
+             ~shader->selector->info.tcs_vgpr_only_inputs) {
+            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
            ac_build_s_barrier(&ctx->ac, ctx->stage);
+         }
      } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
         /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx->ac, ctx->stage);
      }
   }