nir,ac,radv: add primitive count add intrinsics
radeonsi uses a shader buffer, but radv uses GDS for the query result storage. Reviewed-by: Marek Olšák <marek.olsak@amd.com> Signed-off-by: Qiang Yu <yuq825@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17457>
This commit is contained in:
@@ -456,11 +456,11 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def
|
|||||||
{
|
{
|
||||||
/* Number of active GS threads. Each has 1 output primitive. */
|
/* Number of active GS threads. Each has 1 output primitive. */
|
||||||
nir_ssa_def *num_gs_threads = nir_bit_count(b, nir_ballot(b, 1, st->wave_size, nir_imm_bool(b, true)));
|
nir_ssa_def *num_gs_threads = nir_bit_count(b, nir_ballot(b, 1, st->wave_size, nir_imm_bool(b, true)));
|
||||||
/* Activate only 1 lane and add the number of primitives to GDS. */
|
/* Activate only 1 lane and add the number of primitives to query result. */
|
||||||
nir_if *if_elected = nir_push_if(b, nir_elect(b, 1));
|
nir_if *if_elected = nir_push_if(b, nir_elect(b, 1));
|
||||||
{
|
{
|
||||||
/* Add to stream 0 primitive generated counter. */
|
/* Add to stream 0 primitive generated counter. */
|
||||||
nir_gds_atomic_add_amd(b, 32, num_gs_threads, nir_imm_int(b, 4), nir_imm_int(b, 0x100));
|
nir_atomic_add_gen_prim_count_amd(b, num_gs_threads, .stream_id = 0);
|
||||||
}
|
}
|
||||||
nir_pop_if(b, if_elected);
|
nir_pop_if(b, if_elected);
|
||||||
}
|
}
|
||||||
@@ -2107,28 +2107,21 @@ ngg_gs_shader_query(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_st
|
|||||||
num_prims_in_wave = nir_reduce(b, prm_cnt, .reduction_op = nir_op_iadd);
|
num_prims_in_wave = nir_reduce(b, prm_cnt, .reduction_op = nir_op_iadd);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Store the query result to GDS using an atomic add. */
|
/* Add this wave's primitive count to the query result using an atomic add. */
|
||||||
nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
|
nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
|
||||||
{
|
{
|
||||||
/* GDS counters:
|
|
||||||
* offset 0 - pipeline statistics counter for all streams
|
|
||||||
* offset 4|8|12|16 - generated primitive counter for stream 0|1|2|3
|
|
||||||
*/
|
|
||||||
|
|
||||||
nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled);
|
nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled);
|
||||||
{
|
{
|
||||||
/* Add all streams' number to the same counter. */
|
/* Add all streams' number to the same counter. */
|
||||||
nir_gds_atomic_add_amd(b, 32, num_prims_in_wave, nir_imm_int(b, 0),
|
nir_atomic_add_gs_emit_prim_count_amd(b, num_prims_in_wave);
|
||||||
nir_imm_int(b, 0x100));
|
|
||||||
}
|
}
|
||||||
nir_pop_if(b, if_pipeline_query);
|
nir_pop_if(b, if_pipeline_query);
|
||||||
|
|
||||||
nir_if *if_prim_gen_query = nir_push_if(b, prim_gen_query_enabled);
|
nir_if *if_prim_gen_query = nir_push_if(b, prim_gen_query_enabled);
|
||||||
{
|
{
|
||||||
/* Add to the counter for this stream. */
|
/* Add to the counter for this stream. */
|
||||||
nir_gds_atomic_add_amd(b, 32, num_prims_in_wave,
|
nir_atomic_add_gen_prim_count_amd(
|
||||||
nir_imm_int(b, 4 + nir_intrinsic_stream_id(intrin) * 4),
|
b, num_prims_in_wave, .stream_id = nir_intrinsic_stream_id(intrin));
|
||||||
nir_imm_int(b, 0x100));
|
|
||||||
}
|
}
|
||||||
nir_pop_if(b, if_prim_gen_query);
|
nir_pop_if(b, if_prim_gen_query);
|
||||||
}
|
}
|
||||||
|
@@ -4425,6 +4425,21 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
|
|||||||
result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
|
result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
|
||||||
|
ctx->abi->atomic_add_prim_count(ctx->abi, ~0U, get_src(ctx, instr->src[0]),
|
||||||
|
ac_prim_count_gs_emit);
|
||||||
|
break;
|
||||||
|
case nir_intrinsic_atomic_add_gen_prim_count_amd:
|
||||||
|
case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
|
||||||
|
LLVMValueRef prim_count = get_src(ctx, instr->src[0]);
|
||||||
|
unsigned stream = nir_intrinsic_stream_id(instr);
|
||||||
|
enum ac_prim_count count_type =
|
||||||
|
instr->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
|
||||||
|
ac_prim_count_gen : ac_prim_count_xfb;
|
||||||
|
|
||||||
|
ctx->abi->atomic_add_prim_count(ctx->abi, stream, prim_count, count_type);
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
fprintf(stderr, "Unknown intrinsic: ");
|
fprintf(stderr, "Unknown intrinsic: ");
|
||||||
nir_print_instr(&instr->instr, stderr);
|
nir_print_instr(&instr->instr, stderr);
|
||||||
|
@@ -34,6 +34,12 @@
|
|||||||
|
|
||||||
#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
|
#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
|
||||||
|
|
||||||
|
enum ac_prim_count { /* Selects which primitive counter the atomic_add_prim_count ABI callback adds to. */
|
||||||
|
ac_prim_count_gs_emit, /* passed for nir_intrinsic_atomic_add_gs_emit_prim_count_amd */
|
||||||
|
ac_prim_count_gen, /* passed for nir_intrinsic_atomic_add_gen_prim_count_amd */
|
||||||
|
ac_prim_count_xfb, /* passed for nir_intrinsic_atomic_add_xfb_prim_count_amd */
|
||||||
|
};
|
||||||
|
|
||||||
/* Document the shader ABI during compilation. This is what allows radeonsi and
|
/* Document the shader ABI during compilation. This is what allows radeonsi and
|
||||||
* radv to share a compiler backend.
|
* radv to share a compiler backend.
|
||||||
*/
|
*/
|
||||||
@@ -69,6 +75,9 @@ struct ac_shader_abi {
|
|||||||
void (*emit_vertex_with_counter)(struct ac_shader_abi *abi, unsigned stream,
|
void (*emit_vertex_with_counter)(struct ac_shader_abi *abi, unsigned stream,
|
||||||
LLVMValueRef vertexidx, LLVMValueRef *addrs);
|
LLVMValueRef vertexidx, LLVMValueRef *addrs);
|
||||||
|
|
||||||
|
void (*atomic_add_prim_count)(struct ac_shader_abi *abi, unsigned stream,
|
||||||
|
LLVMValueRef prim_count, enum ac_prim_count count_type);
|
||||||
|
|
||||||
LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi,
|
LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi,
|
||||||
unsigned driver_location, unsigned component,
|
unsigned driver_location, unsigned component,
|
||||||
unsigned num_components, unsigned vertex_index,
|
unsigned num_components, unsigned vertex_index,
|
||||||
|
@@ -78,11 +78,14 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
|
|||||||
b->cursor = nir_before_instr(instr);
|
b->cursor = nir_before_instr(instr);
|
||||||
|
|
||||||
nir_ssa_def *replacement = NULL;
|
nir_ssa_def *replacement = NULL;
|
||||||
|
bool progress = true;
|
||||||
|
|
||||||
switch (intrin->intrinsic) {
|
switch (intrin->intrinsic) {
|
||||||
case nir_intrinsic_load_ring_tess_factors_amd:
|
case nir_intrinsic_load_ring_tess_factors_amd:
|
||||||
if (s->use_llvm)
|
if (s->use_llvm) {
|
||||||
|
progress = false;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
replacement = load_ring(b, RING_HS_TESS_FACTOR, s);
|
replacement = load_ring(b, RING_HS_TESS_FACTOR, s);
|
||||||
break;
|
break;
|
||||||
@@ -90,8 +93,10 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
|
|||||||
replacement = ac_nir_load_arg(b, &s->args->ac, s->args->ac.tcs_factor_offset);
|
replacement = ac_nir_load_arg(b, &s->args->ac, s->args->ac.tcs_factor_offset);
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_load_ring_tess_offchip_amd:
|
case nir_intrinsic_load_ring_tess_offchip_amd:
|
||||||
if (s->use_llvm)
|
if (s->use_llvm) {
|
||||||
|
progress = false;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
replacement = load_ring(b, RING_HS_TESS_OFFCHIP, s);
|
replacement = load_ring(b, RING_HS_TESS_OFFCHIP, s);
|
||||||
break;
|
break;
|
||||||
@@ -111,8 +116,10 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_load_ring_esgs_amd:
|
case nir_intrinsic_load_ring_esgs_amd:
|
||||||
if (s->use_llvm)
|
if (s->use_llvm) {
|
||||||
|
progress = false;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
replacement = load_ring(b, stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS, s);
|
replacement = load_ring(b, stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS, s);
|
||||||
break;
|
break;
|
||||||
@@ -322,14 +329,34 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
|
|||||||
replacement = nir_imm_int(b, provoking_vertex);
|
replacement = nir_imm_int(b, provoking_vertex);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* GDS counters:
|
||||||
|
* offset 0 - pipeline statistics counter for all streams
|
||||||
|
* offset 4|8|12|16 - generated primitive counter for stream 0|1|2|3
|
||||||
|
*/
|
||||||
|
case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
|
||||||
|
nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa, nir_imm_int(b, 0), nir_imm_int(b, 0x100));
|
||||||
|
break;
|
||||||
|
case nir_intrinsic_atomic_add_gen_prim_count_amd:
|
||||||
|
nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa,
|
||||||
|
nir_imm_int(b, 4 + nir_intrinsic_stream_id(intrin) * 4),
|
||||||
|
nir_imm_int(b, 0x100));
|
||||||
|
break;
|
||||||
|
case nir_intrinsic_atomic_add_xfb_prim_count_amd:
|
||||||
|
/* No-op for RADV. */
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
progress = false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!replacement)
|
if (!progress)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
|
if (replacement)
|
||||||
|
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
|
||||||
|
|
||||||
nir_instr_remove(instr);
|
nir_instr_remove(instr);
|
||||||
nir_instr_free(instr);
|
nir_instr_free(instr);
|
||||||
|
|
||||||
|
@@ -1457,6 +1457,16 @@ intrinsic("ordered_xfb_counter_add_amd", dest_comp=0, src_comp=[1, 0], indices=[
|
|||||||
# Provoking vertex index in a primitive
|
# Provoking vertex index in a primitive
|
||||||
system_value("provoking_vtx_in_prim_amd", 1)
|
system_value("provoking_vtx_in_prim_amd", 1)
|
||||||
|
|
||||||
|
# Atomically add current wave's primitive count to query result
|
||||||
|
# * GS emitted primitive is primitive emitted by any GS stream
|
||||||
|
# * generated primitive is primitive that has been produced for that stream by VS/TES/GS
|
||||||
|
# * streamout primitive is primitive that has been written to xfb buffer, may be different
|
||||||
|
# than generated primitive when xfb buffer is too small to hold more primitives
|
||||||
|
# src[] = { primitive_count }.
|
||||||
|
intrinsic("atomic_add_gs_emit_prim_count_amd", [1])
|
||||||
|
intrinsic("atomic_add_gen_prim_count_amd", [1], indices=[STREAM_ID])
|
||||||
|
intrinsic("atomic_add_xfb_prim_count_amd", [1], indices=[STREAM_ID])
|
||||||
|
|
||||||
# V3D-specific intrinsic for tile buffer color reads.
|
# V3D-specific intrinsic for tile buffer color reads.
|
||||||
#
|
#
|
||||||
# The hardware requires that we read the samples and components of a pixel
|
# The hardware requires that we read the samples and components of a pixel
|
||||||
|
Reference in New Issue
Block a user