diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 06e348e7ae5..c03aaf4c5e8 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -4025,6 +4025,7 @@ void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { ac_build_wg_wavescan_top(ctx, ws); + ac_build_waitcnt(ctx, AC_WAIT_LGKM); ac_build_s_barrier(ctx, ws->stage); ac_build_wg_wavescan_bottom(ctx, ws); } @@ -4087,6 +4088,7 @@ void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { ac_build_wg_scan_top(ctx, ws); + ac_build_waitcnt(ctx, AC_WAIT_LGKM); ac_build_s_barrier(ctx, ws->stage); ac_build_wg_scan_bottom(ctx, ws); } diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 027ee6ed0da..80df23e144b 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -1338,6 +1338,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) if (ctx->stage == MESA_SHADER_VERTEX) { /* Wait for GS stores to finish. */ + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx)); @@ -1384,6 +1385,7 @@ gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx) LLVMBuildBr(ctx->ac.builder, merge_block); LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); } @@ -1459,6 +1461,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp, tmp2; + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); const LLVMValueRef tid = get_thread_id_in_tg(ctx); @@ -1565,6 +1568,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) } ac_build_endif(&ctx->ac, 5130); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); /* Export primitive data */ @@ -2150,6 +2154,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, * and contains a barrier, it will wait there and then * reach s_endpgm. */ + ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage); } diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index f66e2593574..07879ba9b89 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -500,6 +500,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout } } + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */ @@ -1023,6 +1024,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) builder, packed_data, ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); LLVMValueRef tid = ac_get_thread_id(&ctx->ac); @@ -1142,6 +1145,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr); } ac_build_endif(&ctx->ac, 16002); + + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); @@ -1172,6 +1177,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) } ac_build_endif(&ctx->ac, 16008); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); /* Load the vertex masks and compute the new ES thread count. */ @@ -1263,6 +1269,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) ac_build_s_endpgm(&ctx->ac); } ac_build_endif(&ctx->ac, 19202); + + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); /* Send the final vertex and primitive counts. */ @@ -1408,8 +1416,10 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) /* These two also use LDS. */ if (gfx10_ngg_writes_user_edgeflags(shader) || - (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) + (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); + } ctx->return_value = ret; } @@ -1512,8 +1522,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) assert(!unterminated_es_if_block); /* Streamout already inserted the barrier, so don't insert it again. */ - if (!ctx->so.num_outputs) + if (!ctx->so.num_outputs) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); + } ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ @@ -1536,8 +1548,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) assert(!unterminated_es_if_block); /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ - if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) + if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); + } ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Extract the PROVOKING_VTX_INDEX field. */ @@ -1630,7 +1644,8 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) outputs[i].vertex_streams = 0; if (ctx->stage == MESA_SHADER_VERTEX) { - /* Wait for GS stores to finish. */ + /* Wait for LDS stores to finish. */ + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx)); @@ -1862,6 +1877,7 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx) } ac_build_endif(&ctx->ac, 15090); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); } @@ -1926,6 +1942,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx); @@ -2003,8 +2020,10 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) LLVMValueRef prim_enable = LLVMBuildAnd(builder, live, is_emit, ""); /* Wait for streamout to finish before we kill primitives. */ - if (ctx->so.num_outputs) + if (ctx->so.num_outputs) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); + } ac_build_ifcc(&ctx->ac, prim_enable, 0); { @@ -2062,6 +2081,8 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, 0); } ac_build_endif(&ctx->ac, 0); + + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); } @@ -2131,6 +2152,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) } ac_build_endif(&ctx->ac, 5130); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); /* Export primitive data */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 883a7372d3c..4229d90f2b2 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -1015,10 +1015,13 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* We need the barrier only if TCS inputs are read from LDS. */ if (!shader->key.ge.opt.same_patch_vertices || shader->selector->info.base.inputs_read & - ~shader->selector->info.tcs_vgpr_only_inputs) + ~shader->selector->info.tcs_vgpr_only_inputs) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); + } } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */ + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); } }