nir: rename ordered_xfb_counter_add_gfx12_amd -> ordered_add_loop_gfx12_amd
because it can also be used by compute. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30063>
This commit is contained in:
@@ -1940,8 +1940,8 @@ ngg_build_streamout_buffer_info(nir_builder *b,
|
||||
*/
|
||||
if (use_gfx12_xfb_intrinsic) {
|
||||
buffer_offset_per_lane =
|
||||
nir_ordered_xfb_counter_add_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id,
|
||||
atomic_src);
|
||||
nir_ordered_add_loop_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id,
|
||||
atomic_src);
|
||||
} else {
|
||||
/* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. */
|
||||
enum { NUM_ATOMICS_IN_FLIGHT = 6 };
|
||||
|
@@ -3600,20 +3600,20 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd: {
|
||||
case nir_intrinsic_ordered_add_loop_gfx12_amd: {
|
||||
const unsigned num_atomics = 6; /* max 8, using v0..v15 as temporaries */
|
||||
char code[2048];
|
||||
char *ptr = code;
|
||||
|
||||
/* Assembly outputs:
|
||||
* i32 VGPR $0 = dwordsWritten (set in 4 lanes)
|
||||
* i32 VGPR $0 = previous value in memory
|
||||
*
|
||||
* Assembly inputs:
|
||||
* EXEC = 0xf (4 lanes, set by nir_push_if())
|
||||
* EXEC = one lane per counter (use nir_push_if, streamout should always enable 4 lanes)
|
||||
* i64 SGPR $1 = atomic base address
|
||||
* i32 VGPR $2 = voffset = 8 * threadIDInGroup
|
||||
* i32 VGPR $2 = 32-bit VGPR voffset (streamout should set local_invocation_index * 8)
|
||||
* i32 SGPR $3 = orderedID
|
||||
* i64 VGPR $4 = {orderedID, numDwords} (set in 4 lanes)
|
||||
* i64 VGPR $4 = 64-bit VGPR atomic src (streamout should set {orderedID, numDwords})
|
||||
*/
|
||||
|
||||
/* Issue (num_atomics - 1) atomics to initialize the results.
|
||||
@@ -3639,13 +3639,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
|
||||
unsigned issue_index = (num_atomics - 1 + i) % num_atomics;
|
||||
unsigned read_index = i;
|
||||
|
||||
/* result = dwords_written */
|
||||
ptr += sprintf(ptr,
|
||||
/* Issue (or repeat) the attempt. */
|
||||
"global_atomic_ordered_add_b64 v[%u:%u], $2, $4, $1 th:TH_ATOMIC_RETURN\n"
|
||||
"s_wait_loadcnt 0x%x\n"
|
||||
/* if (result[check_index].ordered_id == ordered_id) {
|
||||
* dwords_written = result[check_index].dwords_written;
|
||||
* return_value = result[check_index].value;
|
||||
* break;
|
||||
* }
|
||||
*/
|
||||
|
@@ -721,7 +721,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||
case nir_intrinsic_load_topology_id_intel:
|
||||
case nir_intrinsic_load_scratch_base_ptr:
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
||||
case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd:
|
||||
case nir_intrinsic_ordered_add_loop_gfx12_amd:
|
||||
case nir_intrinsic_xfb_counter_sub_gfx11_amd:
|
||||
case nir_intrinsic_unit_test_divergent_amd:
|
||||
case nir_intrinsic_load_stack:
|
||||
|
@@ -1711,23 +1711,26 @@ system_value("ordered_id_amd", 1)
|
||||
# WRITE_MASK = mask for counter channel to update
|
||||
intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
|
||||
|
||||
# Add dwords_written[] to global streamout offsets.
|
||||
# Execute the atomic ordered add loop. This does what ds_ordered_count did in previous generations.
|
||||
# This is implemented with inline assembly to get the most optimal code.
|
||||
#
|
||||
# Inputs:
|
||||
# exec = one lane per counter (use nir_push_if, streamout should always enable 4 lanes)
|
||||
# src[0] = 64-bit SGPR atomic base address (streamout should use nir_load_xfb_state_address_gfx12_amd)
|
||||
# src[1] = 32-bit VGPR voffset (streamout should set local_invocation_index * 8)
|
||||
# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd for streamout, compute shaders
|
||||
# should generate it manually)
|
||||
# src[3] = 64-bit VGPR atomic src, use pack_64_2x32_split(ordered_id, value), streamout should do:
|
||||
# pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer)
|
||||
#
|
||||
# dst = 32-bit VGPR holding the previous 32-bit value in memory, returned for all enabled lanes
|
||||
|
||||
# Example - streamout: It's used to add dwords_written[] to global streamout offsets.
|
||||
# * Exactly 4 lanes must be active, one for each buffer binding.
|
||||
# * Disabled buffers must set dwords_written=0 for their lane, but the lane
|
||||
# must be enabled.
|
||||
# * This is implemented with inline assembly, which is why some parameters
|
||||
# appear trivial or redundant.
|
||||
#
|
||||
# Inputs:
|
||||
# exec = 0xf (set by the caller using nir_push_if)
|
||||
# src[0] = 64-bit SGPR atomic base address (use nir_load_xfb_state_address_gfx12_amd)
|
||||
# src[1] = 32-bit VGPR voffset, set in 4 lanes (always local_invocation_index * 8)
|
||||
# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd)
|
||||
# src[3] = 64-bit VGPR atomic src, set in 4 lanes
|
||||
# (always pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer))
|
||||
#
|
||||
# dst = 32-bit VGPR of the previous value of dwords_writtenN in memory, returned in 4 lanes
|
||||
intrinsic("ordered_xfb_counter_add_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
|
||||
intrinsic("ordered_add_loop_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32])
|
||||
|
||||
# Subtract from global streamout buffer offsets. Used to fix up the offsets
|
||||
# when we overflow streamout buffers.
|
||||
|
Reference in New Issue
Block a user