diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 7cbaceced13..2d133642bca 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -1752,6 +1752,8 @@ ngg_build_streamout_buffer_info(nir_builder *b, nir_ssa_def *buffer_offsets_ret[4], nir_ssa_def *emit_prim_ret[4]) { + nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); + /* For radeonsi which pass this value by arg when VS. Streamout need accurate * num-vert-per-prim for writing correct amount of data to buffer. */ @@ -1785,7 +1787,7 @@ ngg_build_streamout_buffer_info(nir_builder *b, workgroup_buffer_sizes[buffer] = nir_bcsel(b, buffer_valid, inc_buffer_size, nir_imm_int(b, 0)); } else - workgroup_buffer_sizes[buffer] = nir_ssa_undef(b, 1, 32); + workgroup_buffer_sizes[buffer] = undef; } nir_ssa_def *ordered_id = nir_load_ordered_id_amd(b); @@ -1801,6 +1803,9 @@ ngg_build_streamout_buffer_info(nir_builder *b, nir_ssa_def *emit_prim[4]; memcpy(emit_prim, gen_prim, 4 * sizeof(nir_ssa_def *)); + nir_ssa_def *any_overflow = nir_imm_bool(b, false); + nir_ssa_def *overflow_amount[4] = {undef, undef, undef, undef}; + for (unsigned buffer = 0; buffer < 4; buffer++) { if (!(info->buffers_written & BITFIELD_BIT(buffer))) continue; @@ -1811,6 +1816,10 @@ ngg_build_streamout_buffer_info(nir_builder *b, nir_ssa_def *remain_prim = nir_idiv(b, remain_size, prim_stride_ret[buffer]); nir_ssa_def *overflow = nir_ilt(b, buffer_size, buffer_offset); + any_overflow = nir_ior(b, any_overflow, overflow); + overflow_amount[buffer] = nir_imax(b, nir_imm_int(b, 0), + nir_isub(b, buffer_offset, buffer_size)); + unsigned stream = info->buffer_to_stream[buffer]; /* when previous workgroup overflow, we can't emit any primitive */ emit_prim[stream] = nir_bcsel( @@ -1822,9 +1831,16 @@ ngg_build_streamout_buffer_info(nir_builder *b, nir_store_shared(b, buffer_offset, scratch_base, .base = buffer * 4); } - /* No need to fixup the global buffer offset once we overflowed, - * because 
following workgroups overflow for sure. + /* We have to fix up the streamout offsets if we overflowed because they determine + * the vertex count for DrawTransformFeedback. */ + nir_if *if_any_overflow = nir_push_if(b, any_overflow); + { + nir_build_xfb_counter_sub_amd(b, nir_vec(b, overflow_amount, 4), + /* mask of buffers to update */ + .write_mask = info->buffers_written); + } + nir_pop_if(b, if_any_overflow); /* Save to LDS for being accessed by other waves in this workgroup. */ for (unsigned stream = 0; stream < 4; stream++) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index c2f2423f2c1..73971c5d439 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -9132,6 +9132,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) emit_split_vector(ctx, dst, instr->num_components); break; } + case nir_intrinsic_xfb_counter_sub_amd: + /* TODO: implement this. Until then this case selects no instructions, so the streamout offset subtraction requested by the intrinsic is not performed on this path. */ + break; case nir_intrinsic_memory_barrier_buffer: { wait_imm wait; wait.lgkm = 0; diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 0389682ff0f..3e109fcce6e 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -4232,6 +4232,27 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components); break; } + case nir_intrinsic_xfb_counter_sub_amd: { + /* Must be called in a single lane of a workgroup. For each component i enabled in the write mask, atomically subtracts component i of src[0] from the i32 at index i relative to GDS address 0. 
*/ + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, ""); + LLVMValueRef sub_vec = get_src(ctx, instr->src[0]); + unsigned write_mask = nir_intrinsic_write_mask(instr); + + for (unsigned i = 0; i < instr->num_components; i++) { + if (write_mask & (1 << i)) { + LLVMValueRef value = + LLVMBuildExtractElement(ctx->ac.builder, sub_vec, + LLVMConstInt(ctx->ac.i32, i, false), ""); + + LLVMValueRef gds_ptr = + ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0)); + LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value, + LLVMAtomicOrderingMonotonic, false); + } + } + break; + } case nir_intrinsic_export_amd: { unsigned flags = nir_intrinsic_flags(instr); unsigned target = nir_intrinsic_base(instr); diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index e57d2bc008c..78087e9d4ec 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -672,6 +672,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) case nir_intrinsic_load_topology_id_intel: case nir_intrinsic_load_scratch_base_ptr: case nir_intrinsic_ordered_xfb_counter_add_amd: + case nir_intrinsic_xfb_counter_sub_amd: case nir_intrinsic_load_stack: case nir_intrinsic_load_ray_launch_id: case nir_intrinsic_load_ray_instance_custom_index: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 30ee3818d9f..29ba0c7ca47 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1529,10 +1529,15 @@ intrinsic("load_streamout_buffer_amd", dest_comp=4, indices=[BASE], bit_sizes=[3 # An ID for each workgroup ordered by primitve sequence system_value("ordered_id_amd", 1) -# Add to global streamout buffer counter in specified order +# Add src1 to global streamout buffer offsets in the specified 
order # src[] = { ordered_id, counter } # WRITE_MASK = mask for counter channel to update intrinsic("ordered_xfb_counter_add_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32]) +# Subtract from global streamout buffer offsets. Used to fix up the offsets +# when we overflow streamout buffers. +# src[] = { offsets } +# WRITE_MASK = mask of offsets to subtract +intrinsic("xfb_counter_sub_amd", src_comp=[0], indices=[WRITE_MASK], bit_sizes=[32]) # Provoking vertex index in a primitive system_value("provoking_vtx_in_prim_amd", 1)