ac/llvm: fix the remaining s_barriers for LLVM 15

LLVM 15 doesn't insert s_waitcnt before barriers.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16304>
This commit is contained in:
Marek Olšák
2022-05-02 22:08:43 -04:00
committed by Marge Bot
parent b48d183633
commit 20bb85e2ec
4 changed files with 38 additions and 6 deletions

View File

@@ -4025,6 +4025,7 @@ void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan
/*
 * Emit a complete workgroup wave scan in one call: the "top" half, an
 * s_barrier, then the "bottom" half.
 *
 * ctx: LLVM build context that all emitted instructions go into.
 * ws:  scan state handed to both halves; ws->stage is passed to the barrier.
 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
ac_build_wg_wavescan_top(ctx, ws);
/* LLVM 15 doesn't insert s_waitcnt before barriers, so wait on LGKM
 * explicitly before the barrier.
 * NOTE(review): presumably this covers LDS accesses made by the top half --
 * confirm against ac_build_wg_wavescan_top.
 */
ac_build_waitcnt(ctx, AC_WAIT_LGKM);
ac_build_s_barrier(ctx, ws->stage);
ac_build_wg_wavescan_bottom(ctx, ws);
}
@@ -4087,6 +4088,7 @@ void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
/*
 * Emit a complete workgroup scan in one call: the "top" half, an s_barrier,
 * then the "bottom" half.
 *
 * ctx: LLVM build context that all emitted instructions go into.
 * ws:  scan state handed to both halves; ws->stage is passed to the barrier.
 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
ac_build_wg_scan_top(ctx, ws);
/* LLVM 15 doesn't insert s_waitcnt before barriers, so wait on LGKM
 * explicitly before the barrier.
 * NOTE(review): presumably this covers LDS accesses made by the top half --
 * confirm against ac_build_wg_scan_top.
 */
ac_build_waitcnt(ctx, AC_WAIT_LGKM);
ac_build_s_barrier(ctx, ws->stage);
ac_build_wg_scan_bottom(ctx, ws);
}

View File

@@ -1338,6 +1338,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
if (ctx->stage == MESA_SHADER_VERTEX) {
/* Wait for GS stores to finish. */
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
@@ -1384,6 +1385,7 @@ gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
LLVMBuildBr(ctx->ac.builder, merge_block);
LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
@@ -1459,6 +1461,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
LLVMBuilderRef builder = ctx->ac.builder;
LLVMValueRef tmp, tmp2;
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
const LLVMValueRef tid = get_thread_id_in_tg(ctx);
@@ -1565,6 +1568,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
}
ac_build_endif(&ctx->ac, 5130);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Export primitive data */
@@ -2150,6 +2154,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
* and contains a barrier, it will wait there and then
* reach s_endpgm.
*/
ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
}

View File

@@ -500,6 +500,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
}
}
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
@@ -1023,6 +1024,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
builder, packed_data,
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
@@ -1142,6 +1145,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr);
}
ac_build_endif(&ctx->ac, 16002);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
@@ -1172,6 +1177,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
}
ac_build_endif(&ctx->ac, 16008);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Load the vertex masks and compute the new ES thread count. */
@@ -1263,6 +1269,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
ac_build_s_endpgm(&ctx->ac);
}
ac_build_endif(&ctx->ac, 19202);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Send the final vertex and primitive counts. */
@@ -1408,8 +1416,10 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
/* These two also use LDS. */
if (gfx10_ngg_writes_user_edgeflags(shader) ||
(ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
(ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
ctx->return_value = ret;
}
@@ -1512,8 +1522,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
assert(!unterminated_es_if_block);
/* Streamout already inserted the barrier, so don't insert it again. */
if (!ctx->so.num_outputs)
if (!ctx->so.num_outputs) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
/* Load edge flags from ES threads and store them into VGPRs in GS threads. */
@@ -1536,8 +1548,10 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
assert(!unterminated_es_if_block);
/* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader))
if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
/* Extract the PROVOKING_VTX_INDEX field. */
@@ -1630,7 +1644,8 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
outputs[i].vertex_streams = 0;
if (ctx->stage == MESA_SHADER_VERTEX) {
/* Wait for GS stores to finish. */
/* Wait for LDS stores to finish. */
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
@@ -1862,6 +1877,7 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
}
ac_build_endif(&ctx->ac, 15090);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
@@ -1926,6 +1942,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
@@ -2003,8 +2020,10 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
LLVMValueRef prim_enable = LLVMBuildAnd(builder, live, is_emit, "");
/* Wait for streamout to finish before we kill primitives. */
if (ctx->so.num_outputs)
if (ctx->so.num_outputs) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
ac_build_ifcc(&ctx->ac, prim_enable, 0);
{
@@ -2062,6 +2081,8 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
ac_build_endif(&ctx->ac, 0);
}
ac_build_endif(&ctx->ac, 0);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
@@ -2131,6 +2152,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
}
ac_build_endif(&ctx->ac, 5130);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Export primitive data */

View File

@@ -1015,10 +1015,13 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
/* We need the barrier only if TCS inputs are read from LDS. */
if (!shader->key.ge.opt.same_patch_vertices ||
shader->selector->info.base.inputs_read &
~shader->selector->info.tcs_vgpr_only_inputs)
~shader->selector->info.tcs_vgpr_only_inputs) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
} else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
/* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
}