radv: remove NGG streamout support in LLVM

It has never really been used due to various issues with GDS in the
past and it will be lowered in NIR at some point.

The driver support is still there because it can likely be re-used.
This implementation can also be used as a reference point.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12695>
This commit is contained in:
Samuel Pitoiset
2021-09-02 09:55:22 +02:00
parent c552d99b09
commit 607a14b870
4 changed files with 6 additions and 555 deletions

View File

@@ -1410,8 +1410,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, boo
sizeof(outinfo->vs_output_param_offset));
outinfo->pos_exports = 0;
if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs &&
!ctx->args->is_gs_copy_shader) {
if (ctx->args->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) {
/* The GS copy shader emission already emits streamout. */
radv_emit_streamout(ctx, 0);
}
@@ -1500,13 +1499,6 @@ ngg_get_prim_cnt(struct radv_shader_context *ctx)
false);
}
/* Bitfield-extract from the gs_tg_info shader argument using the constants 0 and 12
 * (presumably offset 0, width 12, unsigned -- confirm against ac_build_bfe's signature). */
static LLVMValueRef
ngg_get_ordered_id(struct radv_shader_context *ctx)
{
   LLVMValueRef tg_info = ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info);
   LLVMValueRef twelve = LLVMConstInt(ctx->ac.i32, 12, false);

   return ac_build_bfe(&ctx->ac, tg_info, ctx->ac.i32_0, twelve, false);
}
static LLVMValueRef
ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
{
@@ -1608,475 +1600,6 @@ ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef verte
return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}
/* Linear search of so->outputs[0 .. num_outputs) for the first entry whose location equals
 * `location`. Returns a pointer to that entry, or NULL when no entry matches. */
static struct radv_stream_output *
radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
{
   struct radv_stream_output *match = NULL;
   unsigned idx = 0;

   while (!match && idx < so->num_outputs) {
      if (so->outputs[idx].location == location)
         match = &so->outputs[idx];
      idx++;
   }

   return match;
}
/*
 * Emit the streamout stores for a single vertex belonging to `stream`.
 *
 * so_buffer     per-buffer values forwarded unchanged to radv_emit_stream_output()
 * wg_offset_dw  per-buffer base offsets; a NULL entry means the buffer is skipped
 * stream        vertex stream whose outputs are written
 * offset_vtx    vertex index that is scaled by each buffer's stride
 * vertexptr     pointer the output values are loaded from
 */
static void
build_streamout_vertex(struct radv_shader_context *ctx, LLVMValueRef *so_buffer,
                       LLVMValueRef *wg_offset_dw, unsigned stream, LLVMValueRef offset_vtx,
                       LLVMValueRef vertexptr)
{
   struct radv_streamout_info *so = &ctx->args->shader_info->so;
   LLVMBuilderRef b = ctx->ac.builder;
   LLVMValueRef write_offset[4] = {0};

   /* write_offset = (wg_offset_dw + offset_vtx * stride) << 2 for every buffer that has a
    * base offset (the shift presumably converts dwords to bytes, going by the _dw naming). */
   for (unsigned buf = 0; buf < 4; ++buf) {
      if (wg_offset_dw[buf]) {
         LLVMValueRef stride = LLVMConstInt(ctx->ac.i32, so->strides[buf], false);
         LLVMValueRef dw = LLVMBuildMul(b, offset_vtx, stride, "");

         dw = LLVMBuildAdd(b, wg_offset_dw[buf], dw, "");
         write_offset[buf] = LLVMBuildShl(b, dw, LLVMConstInt(ctx->ac.i32, 2, false), "");
      }
   }

   if (ctx->stage != MESA_SHADER_GEOMETRY) {
      /* Non-GS: streamout output number idx keeps its components at 4 * idx + c. */
      for (unsigned idx = 0; idx < so->num_outputs; ++idx) {
         struct radv_stream_output *so_out = &ctx->args->shader_info->so.outputs[idx];

         if (so_out->stream == stream) {
            struct radv_shader_output_values vals = {0};

            for (unsigned c = 0; c < 4; c++) {
               if (so_out->component_mask & (1 << c)) {
                  LLVMValueRef src = ac_build_gep0(
                     &ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * idx + c, false));
                  vals.values[c] = LLVMBuildLoad(b, src, "");
               }
            }

            radv_emit_stream_output(ctx, so_buffer, write_offset, so_out, &vals);
         }
      }
      return;
   }

   /* GS: first gather the values of every written output slot of this stream ... */
   struct radv_shader_output_values gathered[AC_LLVM_MAX_OUTPUTS];
   unsigned num_gathered = 0;
   unsigned src_idx = 0;

   for (unsigned slot = 0; slot < AC_LLVM_MAX_OUTPUTS; ++slot) {
      unsigned usage = ctx->args->shader_info->gs.output_usage_mask[slot];
      uint8_t slot_stream = ctx->args->shader_info->gs.output_streams[slot];

      if ((ctx->output_mask & (1ull << slot)) && slot_stream == stream) {
         struct radv_shader_output_values *dst = &gathered[num_gathered];
         unsigned used = util_last_bit(usage);

         dst->slot_name = slot;
         dst->slot_index = slot == VARYING_SLOT_CLIP_DIST1;
         dst->usage_mask = usage;

         /* src_idx advances for every component below the last used one, including
          * components that are masked out and therefore not loaded. */
         for (unsigned c = 0; c < used; c++) {
            if (usage & (1 << c)) {
               LLVMValueRef src =
                  ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, src_idx, false));
               dst->values[c] = LLVMBuildLoad(b, src, "");
            }
            src_idx++;
         }

         for (unsigned c = used; c < 4; c++)
            dst->values[c] = LLVMGetUndef(ctx->ac.f32);

         num_gathered++;
      }
   }

   /* ... then emit those that have a streamout entry for the same stream. */
   for (unsigned g = 0; g < num_gathered; g++) {
      struct radv_stream_output *so_out = radv_get_stream_output_by_loc(so, gathered[g].slot_name);

      if (so_out && so_out->stream == stream) {
         struct radv_shader_output_values vals = {0};

         for (unsigned c = 0; c < 4; c++)
            vals.values[c] = gathered[g].values[c];

         radv_emit_stream_output(ctx, so_buffer, write_offset, so_out, &vals);
      }
   }
}
/* Inputs and outputs of build_streamout(). Callers zero-initialise it, fill in the input
 * fields and read `emit` back afterwards. */
struct ngg_streamout {
/* Vertices per primitive as an LLVM value; build_streamout() multiplies it with the buffer
 * strides and with the per-thread primitive index. */
LLVMValueRef num_vertices;
/* per-thread data */
LLVMValueRef prim_enable[4]; /* i1 per stream */
/* Entries that are left NULL are not written; build_streamout() counts vertices[1] and
 * vertices[2] to get its compile-time vertex bound. */
LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
/* Output */
LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};
/**
* Build streamout logic.
*
* Implies a barrier.
*
* gs_ngg_scratch usage depends on the stage:
*  - geometry shader: emitted primitive counts go to gs_ngg_scratch[4:7], the per-buffer
*    offsets to gs_ngg_scratch[8:11], and the per-stream workgroup scans use scratch
*    starting at index 12 + 8 * stream;
*  - other stages: emitted primitive counts go to gs_ngg_scratch[0:3] and the per-buffer
*    offsets to gs_ngg_scratch[4:7].
*
* \param ctx    shader context
* \param nggso  in: num_vertices, prim_enable[] and vertices[];
*               out: emit[] for every stream that has streamout components
*/
static void
build_streamout(struct radv_shader_context *ctx, struct ngg_streamout *nggso)
{
struct radv_streamout_info *so = &ctx->args->shader_info->so;
LLVMBuilderRef builder = ctx->ac.builder;
LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
LLVMValueRef tid = get_thread_id_in_tg(ctx);
LLVMValueRef cond, tmp, tmp2;
LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
LLVMValueRef so_buffer[4] = {0};
/* Compile-time upper bound on vertices per primitive: vertices[0] is counted
 * unconditionally, vertices[1] and vertices[2] only when they are non-NULL. */
unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
LLVMValueRef prim_stride_dw[4] = {0};
LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
/* -1 marks a buffer that no streamout output writes to. */
int stream_for_buffer[4] = {-1, -1, -1, -1};
unsigned bufmask_for_stream[4] = {0};
bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
/* Scratch indices of the emit counts and buffer offsets; both are shifted up by 4 for GS. */
unsigned scratch_emit_base = isgs ? 4 : 0;
LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
unsigned scratch_offset_base = isgs ? 8 : 4;
LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);
/* Determine the mapping of streamout buffers to vertex streams. */
for (unsigned i = 0; i < so->num_outputs; ++i) {
unsigned buf = so->outputs[i].buffer;
unsigned stream = so->outputs[i].stream;
/* A buffer may only ever be fed by one stream. */
assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
stream_for_buffer[buf] = stream;
bufmask_for_stream[stream] |= 1 << buf;
}
/* For every used buffer: compute the per-primitive stride (stride * num_vertices), mirror it
 * into lane `buffer` of prim_stride_dw_vgpr, and load the buffer's entry from buf_ptr. */
for (unsigned buffer = 0; buffer < 4; ++buffer) {
if (stream_for_buffer[buffer] == -1)
continue;
assert(so->strides[buffer]);
LLVMValueRef stride_for_buffer = LLVMConstInt(ctx->ac.i32, so->strides[buffer], false);
prim_stride_dw[buffer] = LLVMBuildMul(builder, stride_for_buffer, nggso->num_vertices, "");
prim_stride_dw_vgpr =
ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
LLVMConstInt(ctx->ac.i32, buffer, false));
LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false);
so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
}
/* Everything up to the matching endif (label 5200) only runs in wave 0 of the workgroup. */
cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
ac_build_ifcc(&ctx->ac, cond, 5200);
{
LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
/* Advance the streamout offsets in GDS. */
LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
/* Only the first four lanes take part here. */
cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
ac_build_ifcc(&ctx->ac, cond, 5210);
{
/* Fetch the number of generated primitives and keep it in
 * generated_by_stream_vgpr for later use: GS reads gs_ngg_scratch[tid],
 * other stages put ngg_get_prim_cnt() in lane 0 and zero elsewhere.
*/
if (isgs) {
tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
tmp = LLVMBuildLoad(builder, tmp, "");
} else {
tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
}
LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
unsigned swizzle[4];
int unused_stream = -1;
/* NOTE(review): this reads gs.num_stream_output_components even when !isgs --
 * confirm the field is populated for non-GS stages. */
for (unsigned stream = 0; stream < 4; ++stream) {
if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) {
unused_stream = stream;
break;
}
}
/* Each buffer selects the stream that feeds it; unused buffers select a stream
 * without streamout components. */
for (unsigned buffer = 0; buffer < 4; ++buffer) {
if (stream_for_buffer[buffer] >= 0) {
swizzle[buffer] = stream_for_buffer[buffer];
} else {
assert(unused_stream >= 0);
swizzle[buffer] = unused_stream;
}
}
/* Presumably moves each stream's count into the lane of its buffer (confirm
 * ac_build_quad_swizzle semantics), then scales it by the per-primitive stride. */
tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
LLVMValueRef args[] = {
LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
tmp,
ctx->ac.i32_0, // ordering
ctx->ac.i32_0, // scope
ctx->ac.i1false, // isVolatile
LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
ctx->ac.i1true, // wave release
ctx->ac.i1true, // wave done
};
/* The intrinsic's result is used below as this workgroup's offset per buffer. */
tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
ARRAY_SIZE(args), 0);
/* Keep offsets in a VGPR for quick retrieval via readlane by
* the first wave for bounds checking, and also store in LDS
* for retrieval by all waves later. */
LLVMBuildStore(builder, tmp, offsets_vgpr);
tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
LLVMBuildStore(builder, tmp, tmp2);
}
ac_build_endif(&ctx->ac, 5210);
/* Determine the max emit per buffer. This is done via the SALU, in part
* because LLVM can't generate divide-by-multiply if we try to do this
* via VALU with one lane per buffer.
*/
LLVMValueRef max_emit[4] = {0};
for (unsigned buffer = 0; buffer < 4; ++buffer) {
if (stream_for_buffer[buffer] == -1)
continue;
/* Compute the streamout buffer size in DWORD. */
LLVMValueRef bufsize_dw = LLVMBuildLShr(
builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");
/* Read this buffer's offset back from lane `buffer` of offsets_vgpr. */
tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
LLVMValueRef offset_dw =
ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));
/* Compute the remaining size to emit. */
LLVMValueRef remaining_dw = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
tmp = LLVMBuildUDiv(builder, remaining_dw, prim_stride_dw[buffer], "");
/* Nothing fits when the offset is already past the end of the buffer. */
cond = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
max_emit[buffer] = LLVMBuildSelect(builder, cond, ctx->ac.i32_0, tmp, "");
}
/* Determine the number of emitted primitives per stream and fixup the
* GDS counter if necessary.
*
* This is complicated by the fact that a single stream can emit to
* multiple buffers (but luckily not vice versa).
*/
LLVMValueRef emit_vgpr = ctx->ac.i32_0;
for (unsigned stream = 0; stream < 4; ++stream) {
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
continue;
/* Reload the generated primitive counts saved in generated_by_stream_vgpr
 * and pick the lane of the given stream.
*/
tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
LLVMValueRef generated =
ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));
/* Compute the number of emitted primitives: the generated count clamped to the
 * smallest max_emit of all buffers fed by this stream. */
LLVMValueRef emit = generated;
for (unsigned buffer = 0; buffer < 4; ++buffer) {
if (stream_for_buffer[buffer] == stream)
emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
}
/* Store the number of emitted primitives for that
* stream.
*/
emit_vgpr =
ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));
/* Fixup the offset using a plain GDS atomic if we overflowed. */
cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */
/* Select the lanes whose index is a buffer written by this stream. */
tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
ac_get_thread_id(&ctx->ac), "");
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
ac_build_ifcc(&ctx->ac, tmp, 5222);
{
/* Subtract the space of the primitives that did not fit from gdsbase[tid]. */
tmp = LLVMBuildSub(builder, generated, emit, "");
tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
LLVMAtomicOrderingMonotonic, false);
}
ac_build_endif(&ctx->ac, 5222);
ac_build_endif(&ctx->ac, 5221);
}
/* Store the number of emitted primitives to LDS for later use. */
cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
ac_build_ifcc(&ctx->ac, cond, 5225);
{
tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
LLVMBuildStore(builder, emit_vgpr, tmp);
}
ac_build_endif(&ctx->ac, 5225);
}
ac_build_endif(&ctx->ac, 5200);
/* Determine the workgroup-relative per-thread / primitive offset into
* the streamout buffers */
struct ac_wg_scan primemit_scan[4] = {0};
if (isgs) {
/* GS only: start an exclusive add-scan of prim_enable for every stream with streamout
 * components; it is finished by ac_build_wg_scan_bottom() after the barrier. */
for (unsigned stream = 0; stream < 4; ++stream) {
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
continue;
primemit_scan[stream].enable_exclusive = true;
primemit_scan[stream].op = nir_op_iadd;
primemit_scan[stream].src = nggso->prim_enable[stream];
primemit_scan[stream].scratch = ac_build_gep0(
&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
primemit_scan[stream].numwaves = get_tgsize(ctx);
primemit_scan[stream].maxwaves = 8;
ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
}
}
ac_build_s_barrier(&ctx->ac);
/* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
LLVMValueRef wgoffset_dw[4] = {0};
{
/* Each lane loads gs_ngg_scratch[lane]; the individual values are then picked with
 * readlane at the scratch indices chosen at the top of the function. */
LLVMValueRef scratch_vgpr;
tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
for (unsigned buffer = 0; buffer < 4; ++buffer) {
if (stream_for_buffer[buffer] >= 0) {
wgoffset_dw[buffer] =
ac_build_readlane(&ctx->ac, scratch_vgpr,
LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
}
}
for (unsigned stream = 0; stream < 4; ++stream) {
if (ctx->args->shader_info->gs.num_stream_output_components[stream]) {
nggso->emit[stream] =
ac_build_readlane(&ctx->ac, scratch_vgpr,
LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
}
}
}
/* Write out primitive data */
for (unsigned stream = 0; stream < 4; ++stream) {
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
continue;
/* GS uses the scan result as the primitive index; other stages use the thread ID. */
if (isgs) {
ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
} else {
primemit_scan[stream].result_exclusive = tid;
}
/* Only threads with an enabled primitive whose index is below the emit count write. */
cond = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
nggso->emit[stream], "");
cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], "");
ac_build_ifcc(&ctx->ac, cond, 5240);
{
/* Index of the primitive's first vertex; incremented by one per vertex below. */
LLVMValueRef offset_vtx =
LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");
for (unsigned i = 0; i < max_num_vertices; ++i) {
/* Runtime check, since num_vertices may be below the compile-time bound. */
cond = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
nggso->num_vertices, "");
ac_build_ifcc(&ctx->ac, cond, 5241);
build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
nggso->vertices[i]);
ac_build_endif(&ctx->ac, 5241);
offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
}
}
ac_build_endif(&ctx->ac, 5240);
}
}
/* Number of i32 elements in one vertex's LDS array on the non-GS NGG path: four per
 * streamout output plus one extra element, or zero when there are no streamout outputs. */
static unsigned
ngg_nogs_vertex_size(struct radv_shader_context *ctx)
{
   return ctx->args->shader_info->so.num_outputs
             ? 4 * ctx->args->shader_info->so.num_outputs + 1
             : 0;
}
/**
 * Reinterpret the ES/GS ring as an array of per-vertex `[N x i32]` blocks and return the
 * `[N x i32] addrspace(LDS)*` of the block selected by vtxid, i.e. contiguous LDS storage
 * for that vertex's outputs.
 */
static LLVMValueRef
ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vtxid)
{
   /* N already contains the extra dword that is used to avoid LDS bank conflicts. */
   LLVMTypeRef vertex_type = LLVMArrayType(ctx->ac.i32, ngg_nogs_vertex_size(ctx));
   LLVMTypeRef vertex_ptr_type = LLVMPointerType(vertex_type, AC_ADDR_SPACE_LDS);
   LLVMValueRef ring = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, vertex_ptr_type, "");

   return LLVMBuildGEP(ctx->ac.builder, ring, &vtxid, 1, "");
}
/* For NGG vertex / tess-eval shaders with streamout outputs: copy every enabled component of
 * every streamout output from ctx->abi.outputs into this thread's LDS vertex block, as an
 * integer, at element 4 * <streamout output index> + <component>. No-op without outputs. */
static void
handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
{
   struct radv_streamout_info *so = &ctx->args->shader_info->so;
   LLVMBuilderRef b = ctx->ac.builder;

   assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          !ctx->args->is_gs_copy_shader);

   if (!so->num_outputs)
      return;

   LLVMValueRef lds_vertex = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   for (unsigned idx = 0; idx < so->num_outputs; ++idx) {
      struct radv_stream_output *so_out = &so->outputs[idx];
      unsigned loc = so_out->location;

      for (unsigned c = 0; c < 4; c++) {
         if (so_out->component_mask & (1 << c)) {
            LLVMValueRef dst =
               ac_build_gep0(&ctx->ac, lds_vertex, LLVMConstInt(ctx->ac.i32, 4 * idx + c, false));
            LLVMValueRef val = LLVMBuildLoad(b, ctx->abi.outputs[4 * loc + c], "");

            val = ac_to_integer(&ctx->ac, val);
            LLVMBuildStore(b, val, dst);
         }
      }
   }
}
static void
handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
{
@@ -2102,12 +1625,8 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
/* Determine the number of vertices per primitive. */
unsigned num_vertices;
LLVMValueRef num_vertices_val;
if (ctx->stage == MESA_SHADER_VERTEX) {
LLVMValueRef outprim_val =
LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false);
num_vertices_val = LLVMBuildAdd(builder, outprim_val, ctx->ac.i32_1, "");
num_vertices = 3; /* TODO: optimize for points & lines */
} else {
assert(ctx->stage == MESA_SHADER_TESS_EVAL);
@@ -2118,30 +1637,12 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
num_vertices = 2;
else
num_vertices = 3;
num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false);
}
/* Streamout */
if (ctx->args->shader_info->so.num_outputs) {
struct ngg_streamout nggso = {0};
nggso.num_vertices = num_vertices_val;
nggso.prim_enable[0] = is_gs_thread;
for (unsigned i = 0; i < num_vertices; ++i)
nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
build_streamout(ctx, &nggso);
}
/* Copy Primitive IDs from GS threads to the LDS address corresponding
* to the ES thread of the provoking vertex.
*/
if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.export_prim_id) {
if (ctx->args->shader_info->so.num_outputs)
ac_build_s_barrier(&ctx->ac);
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);
@@ -2343,33 +1844,6 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
const LLVMValueRef tid = get_thread_id_in_tg(ctx);
LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
/* Streamout */
if (ctx->args->shader_info->so.num_outputs) {
struct ngg_streamout nggso = {0};
nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
for (unsigned stream = 0; stream < 4; ++stream) {
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
continue;
tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
}
for (unsigned i = 0; i < verts_per_prim; ++i) {
tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
"");
tmp = ngg_gs_vertex_ptr(ctx, tmp);
nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
}
build_streamout(ctx, &nggso);
}
/* Write shader query data. */
tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
@@ -2759,7 +2233,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVM
else if (ctx->args->options->key.vs_common_out.as_es)
break; /* Lowered in NIR */
else if (ctx->args->options->key.vs_common_out.as_ngg)
handle_ngg_outputs_post_1(ctx);
break;
else
handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
ctx->args->options->key.vs_common_out.export_clip_dists,
@@ -2777,7 +2251,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVM
if (ctx->args->options->key.vs_common_out.as_es)
break; /* Lowered in NIR */
else if (ctx->args->options->key.vs_common_out.as_ngg)
handle_ngg_outputs_post_1(ctx);
break;
else
handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
ctx->args->options->key.vs_common_out.export_clip_dists,
@@ -3028,17 +2502,6 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co
if (!args->options->key.vs_common_out.as_ngg_passthrough)
declare_esgs_ring(&ctx);
/* This is really only needed when streamout and / or vertex
* compaction is enabled.
*/
if (args->shader_info->so.num_outputs) {
LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8);
ctx.gs_ngg_scratch =
LLVMAddGlobalInAddressSpace(ctx.ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32));
LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
}
/* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
if (ctx.ac.chip_class == GFX10 && shader_count == 1)
ac_build_s_barrier(&ctx.ac);
@@ -3059,11 +2522,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co
ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
}
unsigned scratch_size = 8;
if (args->shader_info->so.num_outputs)
scratch_size = 44;
LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size);
LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
ctx.gs_ngg_scratch =
LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
@@ -3302,7 +2761,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
LLVMValueRef stream_id;
/* Fetch the vertex stream ID. */
if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs) {
if (ctx->args->shader_info->so.num_outputs) {
stream_id =
ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
} else {
@@ -3364,7 +2823,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
}
}
if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs)
if (ctx->args->shader_info->so.num_outputs)
radv_emit_streamout(ctx, stream);
if (stream == 0) {