radv: remove NGG streamout support in LLVM
It has never really been used due to various issues with GDS in the past and it will be lowered in NIR at some point. The driver support is still there because it can likely be re-used. This implementation can also be used as a reference point.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12695>
This commit is contained in:
@@ -1410,8 +1410,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, boo
|
||||
sizeof(outinfo->vs_output_param_offset));
|
||||
outinfo->pos_exports = 0;
|
||||
|
||||
if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs &&
|
||||
!ctx->args->is_gs_copy_shader) {
|
||||
if (ctx->args->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) {
|
||||
/* The GS copy shader emission already emits streamout. */
|
||||
radv_emit_streamout(ctx, 0);
|
||||
}
|
||||
@@ -1500,13 +1499,6 @@ ngg_get_prim_cnt(struct radv_shader_context *ctx)
|
||||
false);
|
||||
}
|
||||
|
||||
static LLVMValueRef
|
||||
ngg_get_ordered_id(struct radv_shader_context *ctx)
|
||||
{
|
||||
return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info), ctx->ac.i32_0,
|
||||
LLVMConstInt(ctx->ac.i32, 12, false), false);
|
||||
}
|
||||
|
||||
static LLVMValueRef
|
||||
ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
|
||||
{
|
||||
@@ -1608,475 +1600,6 @@ ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef verte
|
||||
return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
|
||||
}
|
||||
|
||||
static struct radv_stream_output *
|
||||
radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
|
||||
{
|
||||
for (unsigned i = 0; i < so->num_outputs; ++i) {
|
||||
if (so->outputs[i].location == location)
|
||||
return &so->outputs[i];
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
build_streamout_vertex(struct radv_shader_context *ctx, LLVMValueRef *so_buffer,
|
||||
LLVMValueRef *wg_offset_dw, unsigned stream, LLVMValueRef offset_vtx,
|
||||
LLVMValueRef vertexptr)
|
||||
{
|
||||
struct radv_streamout_info *so = &ctx->args->shader_info->so;
|
||||
LLVMBuilderRef builder = ctx->ac.builder;
|
||||
LLVMValueRef offset[4] = {0};
|
||||
LLVMValueRef tmp;
|
||||
|
||||
for (unsigned buffer = 0; buffer < 4; ++buffer) {
|
||||
if (!wg_offset_dw[buffer])
|
||||
continue;
|
||||
|
||||
tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->strides[buffer], false),
|
||||
"");
|
||||
tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
|
||||
offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
|
||||
}
|
||||
|
||||
if (ctx->stage == MESA_SHADER_GEOMETRY) {
|
||||
struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
|
||||
unsigned noutput = 0;
|
||||
unsigned out_idx = 0;
|
||||
|
||||
for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
|
||||
unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
|
||||
uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
|
||||
|
||||
if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
|
||||
continue;
|
||||
|
||||
outputs[noutput].slot_name = i;
|
||||
outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
|
||||
outputs[noutput].usage_mask = output_usage_mask;
|
||||
|
||||
int length = util_last_bit(output_usage_mask);
|
||||
|
||||
for (unsigned j = 0; j < length; j++, out_idx++) {
|
||||
if (!(output_usage_mask & (1 << j)))
|
||||
continue;
|
||||
|
||||
tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, out_idx, false));
|
||||
outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");
|
||||
}
|
||||
|
||||
for (unsigned j = length; j < 4; j++)
|
||||
outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
|
||||
|
||||
noutput++;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < noutput; i++) {
|
||||
struct radv_stream_output *output =
|
||||
radv_get_stream_output_by_loc(so, outputs[i].slot_name);
|
||||
|
||||
if (!output || output->stream != stream)
|
||||
continue;
|
||||
|
||||
struct radv_shader_output_values out = {0};
|
||||
|
||||
for (unsigned j = 0; j < 4; j++) {
|
||||
out.values[j] = outputs[i].values[j];
|
||||
}
|
||||
|
||||
radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
|
||||
}
|
||||
} else {
|
||||
for (unsigned i = 0; i < so->num_outputs; ++i) {
|
||||
struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];
|
||||
|
||||
if (stream != output->stream)
|
||||
continue;
|
||||
|
||||
struct radv_shader_output_values out = {0};
|
||||
|
||||
for (unsigned comp = 0; comp < 4; comp++) {
|
||||
if (!(output->component_mask & (1 << comp)))
|
||||
continue;
|
||||
|
||||
tmp =
|
||||
ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
|
||||
out.values[comp] = LLVMBuildLoad(builder, tmp, "");
|
||||
}
|
||||
|
||||
radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ngg_streamout {
|
||||
LLVMValueRef num_vertices;
|
||||
|
||||
/* per-thread data */
|
||||
LLVMValueRef prim_enable[4]; /* i1 per stream */
|
||||
LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
|
||||
|
||||
/* Output */
|
||||
LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
|
||||
};
|
||||
|
||||
/**
|
||||
* Build streamout logic.
|
||||
*
|
||||
* Implies a barrier.
|
||||
*
|
||||
* Writes number of emitted primitives to gs_ngg_scratch[4:7].
|
||||
*
|
||||
* Clobbers gs_ngg_scratch[8:].
|
||||
*/
|
||||
static void
|
||||
build_streamout(struct radv_shader_context *ctx, struct ngg_streamout *nggso)
|
||||
{
|
||||
struct radv_streamout_info *so = &ctx->args->shader_info->so;
|
||||
LLVMBuilderRef builder = ctx->ac.builder;
|
||||
LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
|
||||
LLVMValueRef tid = get_thread_id_in_tg(ctx);
|
||||
LLVMValueRef cond, tmp, tmp2;
|
||||
LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
|
||||
LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
|
||||
LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
|
||||
LLVMValueRef so_buffer[4] = {0};
|
||||
unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
|
||||
LLVMValueRef prim_stride_dw[4] = {0};
|
||||
LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
|
||||
int stream_for_buffer[4] = {-1, -1, -1, -1};
|
||||
unsigned bufmask_for_stream[4] = {0};
|
||||
bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
|
||||
unsigned scratch_emit_base = isgs ? 4 : 0;
|
||||
LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
|
||||
unsigned scratch_offset_base = isgs ? 8 : 4;
|
||||
LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
|
||||
|
||||
ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);
|
||||
|
||||
/* Determine the mapping of streamout buffers to vertex streams. */
|
||||
for (unsigned i = 0; i < so->num_outputs; ++i) {
|
||||
unsigned buf = so->outputs[i].buffer;
|
||||
unsigned stream = so->outputs[i].stream;
|
||||
assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
|
||||
stream_for_buffer[buf] = stream;
|
||||
bufmask_for_stream[stream] |= 1 << buf;
|
||||
}
|
||||
|
||||
for (unsigned buffer = 0; buffer < 4; ++buffer) {
|
||||
if (stream_for_buffer[buffer] == -1)
|
||||
continue;
|
||||
|
||||
assert(so->strides[buffer]);
|
||||
|
||||
LLVMValueRef stride_for_buffer = LLVMConstInt(ctx->ac.i32, so->strides[buffer], false);
|
||||
prim_stride_dw[buffer] = LLVMBuildMul(builder, stride_for_buffer, nggso->num_vertices, "");
|
||||
prim_stride_dw_vgpr =
|
||||
ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
|
||||
LLVMConstInt(ctx->ac.i32, buffer, false));
|
||||
|
||||
LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false);
|
||||
so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
|
||||
}
|
||||
|
||||
cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
|
||||
ac_build_ifcc(&ctx->ac, cond, 5200);
|
||||
{
|
||||
LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
|
||||
LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
|
||||
|
||||
/* Advance the streamout offsets in GDS. */
|
||||
LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
|
||||
LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
|
||||
|
||||
cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
|
||||
ac_build_ifcc(&ctx->ac, cond, 5210);
|
||||
{
|
||||
/* Fetch the number of generated primitives and store
|
||||
* it in GDS for later use.
|
||||
*/
|
||||
if (isgs) {
|
||||
tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
|
||||
tmp = LLVMBuildLoad(builder, tmp, "");
|
||||
} else {
|
||||
tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
|
||||
}
|
||||
LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
|
||||
|
||||
unsigned swizzle[4];
|
||||
int unused_stream = -1;
|
||||
for (unsigned stream = 0; stream < 4; ++stream) {
|
||||
if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) {
|
||||
unused_stream = stream;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (unsigned buffer = 0; buffer < 4; ++buffer) {
|
||||
if (stream_for_buffer[buffer] >= 0) {
|
||||
swizzle[buffer] = stream_for_buffer[buffer];
|
||||
} else {
|
||||
assert(unused_stream >= 0);
|
||||
swizzle[buffer] = unused_stream;
|
||||
}
|
||||
}
|
||||
|
||||
tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
|
||||
tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
|
||||
|
||||
LLVMValueRef args[] = {
|
||||
LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
|
||||
tmp,
|
||||
ctx->ac.i32_0, // ordering
|
||||
ctx->ac.i32_0, // scope
|
||||
ctx->ac.i1false, // isVolatile
|
||||
LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
|
||||
ctx->ac.i1true, // wave release
|
||||
ctx->ac.i1true, // wave done
|
||||
};
|
||||
|
||||
tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
|
||||
ARRAY_SIZE(args), 0);
|
||||
|
||||
/* Keep offsets in a VGPR for quick retrieval via readlane by
|
||||
* the first wave for bounds checking, and also store in LDS
|
||||
* for retrieval by all waves later. */
|
||||
LLVMBuildStore(builder, tmp, offsets_vgpr);
|
||||
|
||||
tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
|
||||
tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
|
||||
LLVMBuildStore(builder, tmp, tmp2);
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5210);
|
||||
|
||||
/* Determine the max emit per buffer. This is done via the SALU, in part
|
||||
* because LLVM can't generate divide-by-multiply if we try to do this
|
||||
* via VALU with one lane per buffer.
|
||||
*/
|
||||
LLVMValueRef max_emit[4] = {0};
|
||||
for (unsigned buffer = 0; buffer < 4; ++buffer) {
|
||||
if (stream_for_buffer[buffer] == -1)
|
||||
continue;
|
||||
|
||||
/* Compute the streamout buffer size in DWORD. */
|
||||
LLVMValueRef bufsize_dw = LLVMBuildLShr(
|
||||
builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");
|
||||
|
||||
/* Load the streamout buffer offset from GDS. */
|
||||
tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
|
||||
LLVMValueRef offset_dw =
|
||||
ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));
|
||||
|
||||
/* Compute the remaining size to emit. */
|
||||
LLVMValueRef remaining_dw = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
|
||||
tmp = LLVMBuildUDiv(builder, remaining_dw, prim_stride_dw[buffer], "");
|
||||
|
||||
cond = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
|
||||
max_emit[buffer] = LLVMBuildSelect(builder, cond, ctx->ac.i32_0, tmp, "");
|
||||
}
|
||||
|
||||
/* Determine the number of emitted primitives per stream and fixup the
|
||||
* GDS counter if necessary.
|
||||
*
|
||||
* This is complicated by the fact that a single stream can emit to
|
||||
* multiple buffers (but luckily not vice versa).
|
||||
*/
|
||||
LLVMValueRef emit_vgpr = ctx->ac.i32_0;
|
||||
|
||||
for (unsigned stream = 0; stream < 4; ++stream) {
|
||||
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
|
||||
continue;
|
||||
|
||||
/* Load the number of generated primitives from GDS and
|
||||
* determine that number for the given stream.
|
||||
*/
|
||||
tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
|
||||
LLVMValueRef generated =
|
||||
ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));
|
||||
|
||||
/* Compute the number of emitted primitives. */
|
||||
LLVMValueRef emit = generated;
|
||||
for (unsigned buffer = 0; buffer < 4; ++buffer) {
|
||||
if (stream_for_buffer[buffer] == stream)
|
||||
emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
|
||||
}
|
||||
|
||||
/* Store the number of emitted primitives for that
|
||||
* stream.
|
||||
*/
|
||||
emit_vgpr =
|
||||
ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));
|
||||
|
||||
/* Fixup the offset using a plain GDS atomic if we overflowed. */
|
||||
cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
|
||||
ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */
|
||||
tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
|
||||
ac_get_thread_id(&ctx->ac), "");
|
||||
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
|
||||
ac_build_ifcc(&ctx->ac, tmp, 5222);
|
||||
{
|
||||
tmp = LLVMBuildSub(builder, generated, emit, "");
|
||||
tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
|
||||
tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
|
||||
LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
|
||||
LLVMAtomicOrderingMonotonic, false);
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5222);
|
||||
ac_build_endif(&ctx->ac, 5221);
|
||||
}
|
||||
|
||||
/* Store the number of emitted primitives to LDS for later use. */
|
||||
cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
|
||||
ac_build_ifcc(&ctx->ac, cond, 5225);
|
||||
{
|
||||
tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
|
||||
tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
|
||||
LLVMBuildStore(builder, emit_vgpr, tmp);
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5225);
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5200);
|
||||
|
||||
/* Determine the workgroup-relative per-thread / primitive offset into
|
||||
* the streamout buffers */
|
||||
struct ac_wg_scan primemit_scan[4] = {0};
|
||||
|
||||
if (isgs) {
|
||||
for (unsigned stream = 0; stream < 4; ++stream) {
|
||||
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
|
||||
continue;
|
||||
|
||||
primemit_scan[stream].enable_exclusive = true;
|
||||
primemit_scan[stream].op = nir_op_iadd;
|
||||
primemit_scan[stream].src = nggso->prim_enable[stream];
|
||||
primemit_scan[stream].scratch = ac_build_gep0(
|
||||
&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
|
||||
primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
|
||||
primemit_scan[stream].numwaves = get_tgsize(ctx);
|
||||
primemit_scan[stream].maxwaves = 8;
|
||||
ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
|
||||
}
|
||||
}
|
||||
|
||||
ac_build_s_barrier(&ctx->ac);
|
||||
|
||||
/* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
|
||||
LLVMValueRef wgoffset_dw[4] = {0};
|
||||
|
||||
{
|
||||
LLVMValueRef scratch_vgpr;
|
||||
|
||||
tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
|
||||
scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
|
||||
|
||||
for (unsigned buffer = 0; buffer < 4; ++buffer) {
|
||||
if (stream_for_buffer[buffer] >= 0) {
|
||||
wgoffset_dw[buffer] =
|
||||
ac_build_readlane(&ctx->ac, scratch_vgpr,
|
||||
LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned stream = 0; stream < 4; ++stream) {
|
||||
if (ctx->args->shader_info->gs.num_stream_output_components[stream]) {
|
||||
nggso->emit[stream] =
|
||||
ac_build_readlane(&ctx->ac, scratch_vgpr,
|
||||
LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Write out primitive data */
|
||||
for (unsigned stream = 0; stream < 4; ++stream) {
|
||||
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
|
||||
continue;
|
||||
|
||||
if (isgs) {
|
||||
ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
|
||||
} else {
|
||||
primemit_scan[stream].result_exclusive = tid;
|
||||
}
|
||||
|
||||
cond = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
|
||||
nggso->emit[stream], "");
|
||||
cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], "");
|
||||
ac_build_ifcc(&ctx->ac, cond, 5240);
|
||||
{
|
||||
LLVMValueRef offset_vtx =
|
||||
LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");
|
||||
|
||||
for (unsigned i = 0; i < max_num_vertices; ++i) {
|
||||
cond = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
|
||||
nggso->num_vertices, "");
|
||||
ac_build_ifcc(&ctx->ac, cond, 5241);
|
||||
build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
|
||||
nggso->vertices[i]);
|
||||
ac_build_endif(&ctx->ac, 5241);
|
||||
offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
|
||||
}
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5240);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned
|
||||
ngg_nogs_vertex_size(struct radv_shader_context *ctx)
|
||||
{
|
||||
unsigned lds_vertex_size = 0;
|
||||
|
||||
if (ctx->args->shader_info->so.num_outputs)
|
||||
lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1;
|
||||
|
||||
return lds_vertex_size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
|
||||
* for the vertex outputs.
|
||||
*/
|
||||
static LLVMValueRef
|
||||
ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vtxid)
|
||||
{
|
||||
/* The extra dword is used to avoid LDS bank conflicts. */
|
||||
unsigned vertex_size = ngg_nogs_vertex_size(ctx);
|
||||
LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
|
||||
LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
|
||||
LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
|
||||
return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
|
||||
}
|
||||
|
||||
static void
|
||||
handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
|
||||
{
|
||||
struct radv_streamout_info *so = &ctx->args->shader_info->so;
|
||||
LLVMBuilderRef builder = ctx->ac.builder;
|
||||
LLVMValueRef vertex_ptr = NULL;
|
||||
LLVMValueRef tmp, tmp2;
|
||||
|
||||
assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
|
||||
!ctx->args->is_gs_copy_shader);
|
||||
|
||||
if (!ctx->args->shader_info->so.num_outputs)
|
||||
return;
|
||||
|
||||
vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
|
||||
|
||||
for (unsigned i = 0; i < so->num_outputs; ++i) {
|
||||
struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];
|
||||
|
||||
unsigned loc = output->location;
|
||||
|
||||
for (unsigned comp = 0; comp < 4; comp++) {
|
||||
if (!(output->component_mask & (1 << comp)))
|
||||
continue;
|
||||
|
||||
tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
|
||||
tmp2 = LLVMBuildLoad(builder, ctx->abi.outputs[4 * loc + comp], "");
|
||||
tmp2 = ac_to_integer(&ctx->ac, tmp2);
|
||||
LLVMBuildStore(builder, tmp2, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
|
||||
{
|
||||
@@ -2102,12 +1625,8 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
|
||||
|
||||
/* Determine the number of vertices per primitive. */
|
||||
unsigned num_vertices;
|
||||
LLVMValueRef num_vertices_val;
|
||||
|
||||
if (ctx->stage == MESA_SHADER_VERTEX) {
|
||||
LLVMValueRef outprim_val =
|
||||
LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false);
|
||||
num_vertices_val = LLVMBuildAdd(builder, outprim_val, ctx->ac.i32_1, "");
|
||||
num_vertices = 3; /* TODO: optimize for points & lines */
|
||||
} else {
|
||||
assert(ctx->stage == MESA_SHADER_TESS_EVAL);
|
||||
@@ -2118,30 +1637,12 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
|
||||
num_vertices = 2;
|
||||
else
|
||||
num_vertices = 3;
|
||||
|
||||
num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false);
|
||||
}
|
||||
|
||||
/* Streamout */
|
||||
if (ctx->args->shader_info->so.num_outputs) {
|
||||
struct ngg_streamout nggso = {0};
|
||||
|
||||
nggso.num_vertices = num_vertices_val;
|
||||
nggso.prim_enable[0] = is_gs_thread;
|
||||
|
||||
for (unsigned i = 0; i < num_vertices; ++i)
|
||||
nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
|
||||
|
||||
build_streamout(ctx, &nggso);
|
||||
}
|
||||
|
||||
/* Copy Primitive IDs from GS threads to the LDS address corresponding
|
||||
* to the ES thread of the provoking vertex.
|
||||
*/
|
||||
if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.export_prim_id) {
|
||||
if (ctx->args->shader_info->so.num_outputs)
|
||||
ac_build_s_barrier(&ctx->ac);
|
||||
|
||||
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
|
||||
|
||||
LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);
|
||||
@@ -2343,33 +1844,6 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
|
||||
const LLVMValueRef tid = get_thread_id_in_tg(ctx);
|
||||
LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
|
||||
|
||||
/* Streamout */
|
||||
if (ctx->args->shader_info->so.num_outputs) {
|
||||
struct ngg_streamout nggso = {0};
|
||||
|
||||
nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
|
||||
|
||||
LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
|
||||
for (unsigned stream = 0; stream < 4; ++stream) {
|
||||
if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
|
||||
continue;
|
||||
|
||||
tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
|
||||
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
|
||||
tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
|
||||
nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < verts_per_prim; ++i) {
|
||||
tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
|
||||
"");
|
||||
tmp = ngg_gs_vertex_ptr(ctx, tmp);
|
||||
nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
|
||||
}
|
||||
|
||||
build_streamout(ctx, &nggso);
|
||||
}
|
||||
|
||||
/* Write shader query data. */
|
||||
tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
|
||||
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
|
||||
@@ -2759,7 +2233,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVM
|
||||
else if (ctx->args->options->key.vs_common_out.as_es)
|
||||
break; /* Lowered in NIR */
|
||||
else if (ctx->args->options->key.vs_common_out.as_ngg)
|
||||
handle_ngg_outputs_post_1(ctx);
|
||||
break;
|
||||
else
|
||||
handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
|
||||
ctx->args->options->key.vs_common_out.export_clip_dists,
|
||||
@@ -2777,7 +2251,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVM
|
||||
if (ctx->args->options->key.vs_common_out.as_es)
|
||||
break; /* Lowered in NIR */
|
||||
else if (ctx->args->options->key.vs_common_out.as_ngg)
|
||||
handle_ngg_outputs_post_1(ctx);
|
||||
break;
|
||||
else
|
||||
handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
|
||||
ctx->args->options->key.vs_common_out.export_clip_dists,
|
||||
@@ -3028,17 +2502,6 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co
|
||||
if (!args->options->key.vs_common_out.as_ngg_passthrough)
|
||||
declare_esgs_ring(&ctx);
|
||||
|
||||
/* This is really only needed when streamout and / or vertex
|
||||
* compaction is enabled.
|
||||
*/
|
||||
if (args->shader_info->so.num_outputs) {
|
||||
LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8);
|
||||
ctx.gs_ngg_scratch =
|
||||
LLVMAddGlobalInAddressSpace(ctx.ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
|
||||
LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32));
|
||||
LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
|
||||
}
|
||||
|
||||
/* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
|
||||
if (ctx.ac.chip_class == GFX10 && shader_count == 1)
|
||||
ac_build_s_barrier(&ctx.ac);
|
||||
@@ -3059,11 +2522,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co
|
||||
ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
|
||||
}
|
||||
|
||||
unsigned scratch_size = 8;
|
||||
if (args->shader_info->so.num_outputs)
|
||||
scratch_size = 44;
|
||||
|
||||
LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size);
|
||||
LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
|
||||
ctx.gs_ngg_scratch =
|
||||
LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
|
||||
LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
|
||||
@@ -3302,7 +2761,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
|
||||
LLVMValueRef stream_id;
|
||||
|
||||
/* Fetch the vertex stream ID. */
|
||||
if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs) {
|
||||
if (ctx->args->shader_info->so.num_outputs) {
|
||||
stream_id =
|
||||
ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
|
||||
} else {
|
||||
@@ -3364,7 +2823,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
|
||||
}
|
||||
}
|
||||
|
||||
if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs)
|
||||
if (ctx->args->shader_info->so.num_outputs)
|
||||
radv_emit_streamout(ctx, stream);
|
||||
|
||||
if (stream == 0) {
|
||||
|
Reference in New Issue
Block a user