diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index 06e348e7ae5..c03aaf4c5e8 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -4025,6 +4025,7 @@ void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan
 void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 {
    ac_build_wg_wavescan_top(ctx, ws);
+   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
    ac_build_s_barrier(ctx, ws->stage);
    ac_build_wg_wavescan_bottom(ctx, ws);
 }
@@ -4087,6 +4088,7 @@ void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 {
    ac_build_wg_scan_top(ctx, ws);
+   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
    ac_build_s_barrier(ctx, ws->stage);
    ac_build_wg_scan_bottom(ctx, ws);
 }
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 027ee6ed0da..80df23e144b 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1338,6 +1338,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
 
          if (ctx->stage == MESA_SHADER_VERTEX) {
             /* Wait for GS stores to finish. */
+            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
             ac_build_s_barrier(&ctx->ac, ctx->stage);
 
             tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
@@ -1384,6 +1385,7 @@ gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
    LLVMBuildBr(ctx->ac.builder, merge_block);
    LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 }
 
@@ -1459,6 +1461,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
    LLVMBuilderRef builder = ctx->ac.builder;
    LLVMValueRef tmp, tmp2;
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    const LLVMValueRef tid = get_thread_id_in_tg(ctx);
@@ -1565,6 +1568,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
    }
    ac_build_endif(&ctx->ac, 5130);
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    /* Export primitive data */
@@ -2150,6 +2154,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
           * and contains a barrier, it will wait there and then
           * reach s_endpgm.
           */
+         ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
          ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
       }
 
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index f66e2593574..07879ba9b89 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -500,6 +500,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
       }
    }
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
@@ -1023,6 +1024,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
       builder, packed_data,
       ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
    ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
@@ -1142,6 +1145,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
       cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr);
    }
    ac_build_endif(&ctx->ac, 16002);
+
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
@@ -1172,6 +1177,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
    }
    ac_build_endif(&ctx->ac, 16008);
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    /* Load the vertex masks and compute the new ES thread count. */
@@ -1263,6 +1269,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
       ac_build_s_endpgm(&ctx->ac);
    }
    ac_build_endif(&ctx->ac, 19202);
+
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    /* Send the final vertex and primitive counts. */
@@ -1408,8 +1416,10 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
 
    /* These two also use LDS. */
    if (gfx10_ngg_writes_user_edgeflags(shader) ||
-       (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
+       (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) {
+      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
       ac_build_s_barrier(&ctx->ac, ctx->stage);
+   }
 
    ctx->return_value = ret;
 }
@@ -1512,8 +1522,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
       assert(!unterminated_es_if_block);
 
       /* Streamout already inserted the barrier, so don't insert it again. */
-      if (!ctx->so.num_outputs)
+      if (!ctx->so.num_outputs) {
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
          ac_build_s_barrier(&ctx->ac, ctx->stage);
+      }
 
       ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
       /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
@@ -1536,8 +1548,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
       assert(!unterminated_es_if_block);
 
       /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
-      if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader))
+      if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
          ac_build_s_barrier(&ctx->ac, ctx->stage);
+      }
 
       ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
       /* Extract the PROVOKING_VTX_INDEX field. */
@@ -1630,7 +1644,8 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
          outputs[i].vertex_streams = 0;
 
          if (ctx->stage == MESA_SHADER_VERTEX) {
-            /* Wait for GS stores to finish. */
+            /* Wait for LDS stores to finish. */
+            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
             ac_build_s_barrier(&ctx->ac, ctx->stage);
 
             tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
@@ -1862,6 +1877,7 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
       }
    ac_build_endif(&ctx->ac, 15090);
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 }
 
@@ -1926,6 +1942,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 
    ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
@@ -2003,8 +2020,10 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
       LLVMValueRef prim_enable = LLVMBuildAnd(builder, live, is_emit, "");
 
       /* Wait for streamout to finish before we kill primitives. */
-      if (ctx->so.num_outputs)
+      if (ctx->so.num_outputs) {
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
          ac_build_s_barrier(&ctx->ac, ctx->stage);
+      }
 
       ac_build_ifcc(&ctx->ac, prim_enable, 0);
       {
@@ -2062,6 +2081,8 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
          ac_build_endif(&ctx->ac, 0);
       }
       ac_build_endif(&ctx->ac, 0);
+
+      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
       ac_build_s_barrier(&ctx->ac, ctx->stage);
    }
 
@@ -2131,6 +2152,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
    }
    ac_build_endif(&ctx->ac, 5130);
 
+   ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
    ac_build_s_barrier(&ctx->ac, ctx->stage);
 
    /* Export primitive data */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 883a7372d3c..4229d90f2b2 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -1015,10 +1015,13 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
          /* We need the barrier only if TCS inputs are read from LDS. */
          if (!shader->key.ge.opt.same_patch_vertices ||
              shader->selector->info.base.inputs_read &
-             ~shader->selector->info.tcs_vgpr_only_inputs)
+             ~shader->selector->info.tcs_vgpr_only_inputs) {
+            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
             ac_build_s_barrier(&ctx->ac, ctx->stage);
+         }
       } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
          /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
+         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
          ac_build_s_barrier(&ctx->ac, ctx->stage);
       }
    }