nir: add streamout intrinsics for AMD GFX12
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-By: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28889>
This commit is contained in:
@@ -256,6 +256,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||||||
case nir_intrinsic_unit_test_uniform_amd:
|
case nir_intrinsic_unit_test_uniform_amd:
|
||||||
case nir_intrinsic_load_global_constant_uniform_block_intel:
|
case nir_intrinsic_load_global_constant_uniform_block_intel:
|
||||||
case nir_intrinsic_load_debug_log_desc_amd:
|
case nir_intrinsic_load_debug_log_desc_amd:
|
||||||
|
case nir_intrinsic_load_xfb_state_address_gfx12_amd:
|
||||||
case nir_intrinsic_cmat_length:
|
case nir_intrinsic_cmat_length:
|
||||||
case nir_intrinsic_load_vs_primitive_stride_ir3:
|
case nir_intrinsic_load_vs_primitive_stride_ir3:
|
||||||
case nir_intrinsic_load_vs_vertex_stride_ir3:
|
case nir_intrinsic_load_vs_vertex_stride_ir3:
|
||||||
@@ -716,6 +717,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||||||
case nir_intrinsic_load_topology_id_intel:
|
case nir_intrinsic_load_topology_id_intel:
|
||||||
case nir_intrinsic_load_scratch_base_ptr:
|
case nir_intrinsic_load_scratch_base_ptr:
|
||||||
case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
||||||
|
case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd:
|
||||||
case nir_intrinsic_xfb_counter_sub_gfx11_amd:
|
case nir_intrinsic_xfb_counter_sub_gfx11_amd:
|
||||||
case nir_intrinsic_unit_test_divergent_amd:
|
case nir_intrinsic_unit_test_divergent_amd:
|
||||||
case nir_intrinsic_load_stack:
|
case nir_intrinsic_load_stack:
|
||||||
|
@@ -1559,6 +1559,8 @@ system_value("pipeline_stat_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
|||||||
system_value("prim_gen_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
system_value("prim_gen_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
||||||
# Whether NGG should execute the shader query for streamed-out primitives.
|
# Whether NGG should execute the shader query for streamed-out primitives.
|
||||||
system_value("prim_xfb_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
system_value("prim_xfb_query_enabled_amd", dest_comp=1, bit_sizes=[1])
|
||||||
|
# 64-bit memory address to struct {uint32_t ordered_id; uint32_t dwords_written;}[4]
|
||||||
|
system_value("xfb_state_address_gfx12_amd", dest_comp=1, bit_sizes=[64])
|
||||||
# Merged wave info. Bits 0-7 are the ES thread count, 8-15 are the GS thread count, 16-23 is the
|
# Merged wave info. Bits 0-7 are the ES thread count, 8-15 are the GS thread count, 16-23 is the
|
||||||
# GS Wave ID, 24-27 is the wave index in the workgroup, and 28-31 is the workgroup size in waves.
|
# GS Wave ID, 24-27 is the wave index in the workgroup, and 28-31 is the workgroup size in waves.
|
||||||
system_value("merged_wave_info_amd", dest_comp=1)
|
system_value("merged_wave_info_amd", dest_comp=1)
|
||||||
@@ -1686,6 +1688,24 @@ system_value("ordered_id_amd", 1)
|
|||||||
# WRITE_MASK = mask for counter channel to update
|
# WRITE_MASK = mask for counter channel to update
|
||||||
intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
|
intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
|
||||||
|
|
||||||
|
# Add dwords_written[] to global streamout offsets.
|
||||||
|
# * Exactly 4 lanes must be active, one for each buffer binding.
|
||||||
|
# * Disabled buffers must set dwords_written=0 for their lane, but the lane
|
||||||
|
# must be enabled.
|
||||||
|
# * This is implemented with inline assembly, which is why some parameters
|
||||||
|
# appear trivial or redundant.
|
||||||
|
#
|
||||||
|
# Inputs:
|
||||||
|
# exec = 0xf (set by the caller using nir_push_if)
|
||||||
|
# src[0] = 64-bit SGPR atomic base address (use nir_load_xfb_state_address_gfx12_amd)
|
||||||
|
# src[1] = 32-bit VGPR voffset, set in 4 lanes (always local_invocation_index * 8)
|
||||||
|
# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd)
|
||||||
|
# src[3] = 64-bit VGPR atomic src, set in 4 lanes
|
||||||
|
# (always pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer))
|
||||||
|
#
|
||||||
|
# dst = 32-bit VGPR of the previous value of dwords_writtenN in memory, returned in 4 lanes
|
||||||
|
intrinsic("ordered_xfb_counter_add_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
|
||||||
|
|
||||||
# Subtract from global streamout buffer offsets. Used to fix up the offsets
|
# Subtract from global streamout buffer offsets. Used to fix up the offsets
|
||||||
# when we overflow streamout buffers.
|
# when we overflow streamout buffers.
|
||||||
# src[] = { offsets }
|
# src[] = { offsets }
|
||||||
|
Reference in New Issue
Block a user