nir: add streamout intrinsics for AMD GFX12
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-By: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28889>
This commit is contained in:
@@ -256,6 +256,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||
case nir_intrinsic_unit_test_uniform_amd:
|
||||
case nir_intrinsic_load_global_constant_uniform_block_intel:
|
||||
case nir_intrinsic_load_debug_log_desc_amd:
|
||||
case nir_intrinsic_load_xfb_state_address_gfx12_amd:
|
||||
case nir_intrinsic_cmat_length:
|
||||
case nir_intrinsic_load_vs_primitive_stride_ir3:
|
||||
case nir_intrinsic_load_vs_vertex_stride_ir3:
|
||||
@@ -716,6 +717,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||
case nir_intrinsic_load_topology_id_intel:
|
||||
case nir_intrinsic_load_scratch_base_ptr:
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd:
|
||||
case nir_intrinsic_xfb_counter_sub_gfx11_amd:
|
||||
case nir_intrinsic_unit_test_divergent_amd:
|
||||
case nir_intrinsic_load_stack:
|
||||
|
@@ -1559,6 +1559,8 @@ system_value("pipeline_stat_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
||||
system_value("prim_gen_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
||||
# Whether NGG should execute the shader query for primitives written by streamout.
|
||||
system_value("prim_xfb_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
||||
# 64-bit memory address to struct {uint32_t ordered_id; uint32_t dwords_written;}[4]
|
||||
system_value("xfb_state_address_gfx12_amd", dest_comp=1, bit_sizes=[64])
|
||||
# Merged wave info. Bits 0-7 are the ES thread count, 8-15 are the GS thread count, 16-23 is the
|
||||
# GS Wave ID, 24-27 is the wave index in the workgroup, and 28-31 is the workgroup size in waves.
|
||||
system_value("merged_wave_info_amd", dest_comp=1)
|
||||
@@ -1686,6 +1688,24 @@ system_value("ordered_id_amd", 1)
|
||||
# WRITE_MASK = mask for counter channel to update
|
||||
intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
|
||||
|
||||
# Add dwords_written[] to global streamout offsets.
|
||||
# * Exactly 4 lanes must be active, one for each buffer binding.
|
||||
# * Disabled buffers must set dwords_written=0 for their lane, but the lane
|
||||
# must be enabled.
|
||||
# * This is implemented with inline assembly, which is why some parameters
|
||||
# appear trivial or redundant.
|
||||
#
|
||||
# Inputs:
|
||||
# exec = 0xf (set by the caller using nir_push_if)
|
||||
# src[0] = 64-bit SGPR atomic base address (use nir_load_xfb_state_address_gfx12_amd)
|
||||
# src[1] = 32-bit VGPR voffset, set in 4 lanes (always local_invocation_index * 8)
|
||||
# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd)
|
||||
# src[3] = 64-bit VGPR atomic src, set in 4 lanes
|
||||
# (always pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer))
|
||||
#
|
||||
# dst = 32-bit VGPR of the previous value of dwords_writtenN in memory, returned in 4 lanes
|
||||
intrinsic("ordered_xfb_counter_add_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
|
||||
|
||||
# Subtract from global streamout buffer offsets. Used to fix up the offsets
|
||||
# when we overflow streamout buffers.
|
||||
# src[] = { offsets }
|
||||
|
Reference in New Issue
Block a user