From 4f2e2e10bc2982271b8f951bbad5b4fec0de64f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 17 Dec 2024 04:02:00 -0500
Subject: [PATCH] ac/nir: vectorize streamout stores for legacy pipeline
 optimally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

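Walk the whole vertex stride, relying on the XFB info being sorted by
offset. Gather individual components from the same or different outputs,
and once 4 have been gathered, store them as a single vec4. Fewer
components are stored when there is a hole in the layout, when the
buffer changes, or when flushing what remains at the end.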

This change also removes the ACCESS_COHERENT flag from the VMEM stores,
because NGG streamout doesn't use it either and I don't think it's needed.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32686>
---
 src/amd/common/ac_nir.c | 66 ++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c
index f07bb081c31..e27f19460cc 100644
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -1233,7 +1233,10 @@ emit_streamout(nir_builder *b, unsigned stream, nir_xfb_info *info, ac_nir_prera
       so_write_offset[i] = offset;
    }
 
-   nir_def *undef = nir_undef(b, 1, 32);
+   nir_def *zero = nir_imm_int(b, 0);
+   unsigned num_values = 0, store_offset = 0, store_buffer_index = 0;
+   nir_def *values[4];
+
    for (unsigned i = 0; i < info->output_count; i++) {
       const nir_xfb_output_info *output = info->outputs + i;
       if (stream != info->buffer_to_stream[output->buffer])
@@ -1243,35 +1246,50 @@ emit_streamout(nir_builder *b, unsigned stream, nir_xfb_info *info, ac_nir_prera
       nir_def **output_data =
          get_output_and_type(out, output->location, output->high_16bits, &output_type);
 
-      nir_def *vec[4] = {undef, undef, undef, undef};
-      uint8_t mask = 0;
-      u_foreach_bit(j, output->component_mask) {
-         nir_def *data = output_data[j];
+      u_foreach_bit(out_comp, output->component_mask) {
+         if (!output_data[out_comp])
+            continue;
 
-         if (data) {
-            if (data->bit_size < 32) {
-               /* we need output type to convert non-32bit output to 32bit */
-               assert(output_type);
+         nir_def *data = output_data[out_comp];
 
-               nir_alu_type base_type = nir_alu_type_get_base_type(output_type[j]);
-               data = nir_convert_to_bit_size(b, data, base_type, 32);
-            }
+         if (data->bit_size < 32) {
+            /* Convert the 16-bit output to 32 bits. */
+            assert(output_type);
 
-            unsigned comp = j - output->component_offset;
-            vec[comp] = data;
-            mask |= 1 << comp;
+            nir_alu_type base_type = nir_alu_type_get_base_type(output_type[out_comp]);
+            data = nir_convert_to_bit_size(b, data, base_type, 32);
          }
+
+         assert(out_comp >= output->component_offset);
+         const unsigned store_comp = out_comp - output->component_offset;
+         const unsigned store_comp_offset = output->offset + store_comp * 4;
+         const bool has_hole = store_offset + num_values * 4 != store_comp_offset;
+
+         /* Flush the gathered components to memory as a vec4 store or less if there is a hole. */
+         if (num_values && (num_values == 4 || store_buffer_index != output->buffer || has_hole)) {
+            nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffers[store_buffer_index],
+                                 so_write_offset[store_buffer_index], zero, zero,
+                                 .base = store_offset,
+                                 .access = ACCESS_NON_TEMPORAL);
+            num_values = 0;
+         }
+
+         /* Initialize the buffer index and offset if we are beginning a new vec4 store. */
+         if (num_values == 0) {
+            store_buffer_index = output->buffer;
+            store_offset = store_comp_offset;
+         }
+
+         values[num_values++] = data;
       }
+   }
 
-      if (!mask)
-         continue;
-
-      unsigned buffer = output->buffer;
-      nir_def *data = nir_vec(b, vec, util_last_bit(mask));
-      nir_def *zero = nir_imm_int(b, 0);
-      nir_store_buffer_amd(b, data, so_buffers[buffer], so_write_offset[buffer], zero, zero,
-                           .base = output->offset, .write_mask = mask,
-                           .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL);
+   if (num_values) {
+      /* Flush the remaining components to memory (as an up to vec4 store) */
+      nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffers[store_buffer_index],
+                           so_write_offset[store_buffer_index], zero, zero,
+                           .base = store_offset,
+                           .access = ACCESS_NON_TEMPORAL);
    }
 
    nir_pop_if(b, NULL);