diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 84344209aa2..95f890f7b18 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -256,6 +256,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_unit_test_uniform_amd:
    case nir_intrinsic_load_global_constant_uniform_block_intel:
    case nir_intrinsic_load_debug_log_desc_amd:
+   case nir_intrinsic_load_xfb_state_address_gfx12_amd:
    case nir_intrinsic_cmat_length:
    case nir_intrinsic_load_vs_primitive_stride_ir3:
    case nir_intrinsic_load_vs_vertex_stride_ir3:
@@ -716,6 +717,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_topology_id_intel:
    case nir_intrinsic_load_scratch_base_ptr:
    case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
+   case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd:
    case nir_intrinsic_xfb_counter_sub_gfx11_amd:
    case nir_intrinsic_unit_test_divergent_amd:
    case nir_intrinsic_load_stack:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 08df13eb8f0..f1c20db0f2e 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1559,6 +1559,8 @@ system_value("pipeline_stat_query_enabled_amd", dest_comp=1, bit_sizes=[1])
 system_value("prim_gen_query_enabled_amd", dest_comp=1, bit_sizes=[1])
 # Whether NGG should execute shader query for primitive streamouted.
 system_value("prim_xfb_query_enabled_amd", dest_comp=1, bit_sizes=[1])
+# 64-bit memory address to struct {uint32_t ordered_id; uint32_t dwords_written;}[4]
+system_value("xfb_state_address_gfx12_amd", dest_comp=1, bit_sizes=[64])
 # Merged wave info. Bits 0-7 are the ES thread count, 8-15 are the GS thread count, 16-24 is the
 # GS Wave ID, 24-27 is the wave index in the workgroup, and 28-31 is the workgroup size in waves.
 system_value("merged_wave_info_amd", dest_comp=1)
@@ -1686,6 +1688,24 @@ system_value("ordered_id_amd", 1)
 # WRITE_MASK = mask for counter channel to update
 intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0],
           indices=[WRITE_MASK], bit_sizes=[32])
+# Add dwords_written[] to global streamout offsets.
+# * Exactly 4 lanes must be active, one for each buffer binding.
+# * Disabled buffers must set dwords_written=0 for their lane, but the lane
+#   must be enabled.
+# * This is implemented with inline assembly, which is why some parameters
+#   appear trivial or redundant.
+#
+# Inputs:
+#   exec = 0xf (set by the caller using nir_push_if)
+#   src[0] = 64-bit SGPR atomic base address (use nir_load_xfb_state_address_gfx12_amd)
+#   src[1] = 32-bit VGPR voffset, set in 4 lanes (always local_invocation_index * 8)
+#   src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd)
+#   src[3] = 64-bit VGPR atomic src, set in 4 lanes
+#            (always pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer))
+#
+# dst = 32-bit VGPR of the previous value of dwords_writtenN in memory, returned in 4 lanes
+intrinsic("ordered_xfb_counter_add_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
+
 # Subtract from global streamout buffer offsets. Used to fix up the offsets
 # when we overflow streamout buffers.
 # src[] = { offsets }
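
Note (not part of the patch): a minimal nir_builder sketch of how a lowering pass might emit the new intrinsic, written only against the comment block above. It is not the actual Mesa lowering code. The wrapper name emit_gfx12_xfb_counter_add and the dwords_written parameter are hypothetical; the helpers nir_load_xfb_state_address_gfx12_amd and nir_load_ordered_id_amd are the ones the comment itself points at.

#include "nir_builder.h"

/* Hypothetical helper: emit ordered_xfb_counter_add_gfx12_amd with exec = 0xf
 * as documented above. "dwords_written" is assumed to already hold the
 * per-buffer dword count in lanes 0-3 (0 for disabled buffers).
 */
static nir_def *
emit_gfx12_xfb_counter_add(nir_builder *b, nir_def *dwords_written)
{
   nir_def *tid = nir_load_local_invocation_index(b);
   nir_def *zero = nir_imm_int(b, 0);

   /* Exactly 4 lanes must be active, one per buffer binding (exec = 0xf). */
   nir_if *nif = nir_push_if(b, nir_ult(b, tid, nir_imm_int(b, 4)));

   /* src[0]: 64-bit SGPR base address of the xfb state struct array. */
   nir_def *addr = nir_load_xfb_state_address_gfx12_amd(b);
   /* src[1]: 32-bit VGPR voffset = local_invocation_index * 8. */
   nir_def *voffset = nir_imul_imm(b, tid, 8);
   /* src[2]: 32-bit SGPR ordered ID. */
   nir_def *ordered_id = nir_load_ordered_id_amd(b);
   /* src[3]: 64-bit VGPR atomic source = {ordered_id, dwords_written}. */
   nir_def *atomic_src = nir_pack_64_2x32_split(b, ordered_id, dwords_written);

   nir_intrinsic_instr *intrin =
      nir_intrinsic_instr_create(b->shader,
                                 nir_intrinsic_ordered_xfb_counter_add_gfx12_amd);
   intrin->src[0] = nir_src_for_ssa(addr);
   intrin->src[1] = nir_src_for_ssa(voffset);
   intrin->src[2] = nir_src_for_ssa(ordered_id);
   intrin->src[3] = nir_src_for_ssa(atomic_src);
   nir_def_init(&intrin->instr, &intrin->def, 1, 32);
   nir_builder_instr_insert(b, &intrin->instr);

   /* dst: previous dwords_written value for this lane's buffer. */
   nir_def *prev = &intrin->def;

   nir_pop_if(b, nif);

   /* Lanes >= 4 never executed the atomic; give them a defined 0. */
   return nir_if_phi(b, prev, zero);
}

The nir_push_if/nir_pop_if pair around the emission is what the comment means by "exec = 0xf (set by the caller using nir_push_if)"; the trailing nir_if_phi merely gives the inactive lanes a defined value for this sketch, since the divergence pass above marks the result divergent either way.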