nir: rename ordered_xfb_counter_add_gfx12_amd -> ordered_add_loop_gfx12_amd
because it can also be used by compute. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30063>
This commit is contained in:
@@ -1940,8 +1940,8 @@ ngg_build_streamout_buffer_info(nir_builder *b,
|
||||
*/
|
||||
if (use_gfx12_xfb_intrinsic) {
|
||||
buffer_offset_per_lane =
|
||||
nir_ordered_xfb_counter_add_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id,
|
||||
atomic_src);
|
||||
nir_ordered_add_loop_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id,
|
||||
atomic_src);
|
||||
} else {
|
||||
/* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. */
|
||||
enum { NUM_ATOMICS_IN_FLIGHT = 6 };
|
||||
|
@@ -3600,20 +3600,20 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd: {
|
||||
case nir_intrinsic_ordered_add_loop_gfx12_amd: {
|
||||
const unsigned num_atomics = 6; /* max 8, using v0..v15 as temporaries */
|
||||
char code[2048];
|
||||
char *ptr = code;
|
||||
|
||||
/* Assembly outputs:
|
||||
* i32 VGPR $0 = dwordsWritten (set in 4 lanes)
|
||||
* i32 VGPR $0 = previous value in memory
|
||||
*
|
||||
* Assembly inputs:
|
||||
* EXEC = 0xf (4 lanes, set by nir_push_if())
|
||||
* EXEC = one lane per counter (use nir_push_if, streamout should always enable 4 lanes)
|
||||
* i64 SGPR $1 = atomic base address
|
||||
* i32 VGPR $2 = voffset = 8 * threadIDInGroup
|
||||
* i32 VGPR $2 = 32-bit VGPR voffset (streamout should set local_invocation_index * 8)
|
||||
* i32 SGPR $3 = orderedID
|
||||
* i64 VGPR $4 = {orderedID, numDwords} (set in 4 lanes)
|
||||
* i64 VGPR $4 = 64-bit VGPR atomic src (streamout should set {orderedID, numDwords})
|
||||
*/
|
||||
|
||||
/* Issue (num_atomics - 1) atomics to initialize the results.
|
||||
@@ -3639,13 +3639,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
|
||||
unsigned issue_index = (num_atomics - 1 + i) % num_atomics;
|
||||
unsigned read_index = i;
|
||||
|
||||
/* result = dwords_written */
|
||||
ptr += sprintf(ptr,
|
||||
/* Issue (or repeat) the attempt. */
|
||||
"global_atomic_ordered_add_b64 v[%u:%u], $2, $4, $1 th:TH_ATOMIC_RETURN\n"
|
||||
"s_wait_loadcnt 0x%x\n"
|
||||
/* if (result[check_index].ordered_id == ordered_id) {
|
||||
* dwords_written = result[check_index].dwords_written;
|
||||
* return_value = result[check_index].value;
|
||||
* break;
|
||||
* }
|
||||
*/
|
||||
|
@@ -721,7 +721,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||
case nir_intrinsic_load_topology_id_intel:
|
||||
case nir_intrinsic_load_scratch_base_ptr:
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd:
|
||||
case nir_intrinsic_ordered_add_loop_gfx12_amd:
|
||||
case nir_intrinsic_xfb_counter_sub_gfx11_amd:
|
||||
case nir_intrinsic_unit_test_divergent_amd:
|
||||
case nir_intrinsic_load_stack:
|
||||
|
@@ -1711,23 +1711,26 @@ system_value("ordered_id_amd", 1)
|
||||
# WRITE_MASK = mask for counter channel to update
|
||||
intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
|
||||
|
||||
# Add dwords_written[] to global streamout offsets.
|
||||
# Execute the atomic ordered add loop. This does what ds_ordered_count did in previous generations.
|
||||
# This is implemented with inline assembly to get the most optimal code.
|
||||
#
|
||||
# Inputs:
|
||||
# exec = one lane per counter (use nir_push_if, streamout should always enable 4 lanes)
|
||||
# src[0] = 64-bit SGPR atomic base address (streamout should use nir_load_xfb_state_address_gfx12_amd)
|
||||
# src[1] = 32-bit VGPR voffset (streamout should set local_invocation_index * 8)
|
||||
# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd for streamout, compute shaders
|
||||
# should generate it manually)
|
||||
# src[3] = 64-bit VGPR atomic src, use pack_64_2x32_split(ordered_id, value), streamout should do:
|
||||
# pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer)
|
||||
#
|
||||
# dst = 32-bit VGPR holding the previous 32-bit value in memory, returned for all enabled lanes
|
||||
|
||||
# Example - streamout: It's used to add dwords_written[] to global streamout offsets.
|
||||
# * Exactly 4 lanes must be active, one for each buffer binding.
|
||||
# * Disabled buffers must set dwords_written=0 for their lane, but the lane
|
||||
# must be enabled.
|
||||
# * This is implemented with inline assembly, which is why some parameters
|
||||
# appear trivial or redundant.
|
||||
#
|
||||
# Inputs:
|
||||
# exec = 0xf (set by the caller using nir_push_if)
|
||||
# src[0] = 64-bit SGPR atomic base address (use nir_load_xfb_state_address_gfx12_amd)
|
||||
# src[1] = 32-bit VGPR voffset, set in 4 lanes (always local_invocation_index * 8)
|
||||
# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd)
|
||||
# src[3] = 64-bit VGPR atomic src, set in 4 lanes
|
||||
# (always pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer))
|
||||
#
|
||||
# dst = 32-bit VGPR of the previous value of dwords_writtenN in memory, returned in 4 lanes
|
||||
intrinsic("ordered_xfb_counter_add_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
|
||||
intrinsic("ordered_add_loop_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
|
||||
|
||||
# Subtract from global streamout buffer offsets. Used to fix up the offsets
|
||||
# when we overflow streamout buffers.
|
||||
|
Reference in New Issue
Block a user