ac/nir/ngg: Refactor LDS instructions in NGG GS vertex emit and export.

Change the NGG GS emit vertex code to emit combined shared stores, and change the export vertex code to emit combined shared loads. This results in more optimal code generation, i.e. fewer LDS instructions are generated.

GS vertices are stored using an odd stride to minimize the chance of bank conflicts, which means that unfortunately we still can't use an alignment higher than 4 here, so the best we can get are some ds_read2_b32 instructions.

Fossil DB stats on Navi 21 (formerly Sienna Cichlid):

Totals from 135 (0.10% of 128653) affected shaders:
VGPRs: 6416 -> 6512 (+1.50%)
CodeSize: 529436 -> 503792 (-4.84%)
MaxWaves: 2952 -> 2924 (-0.95%)
Instrs: 93384 -> 90176 (-3.44%)
Latency: 290283 -> 293611 (+1.15%); split: -0.36%, +1.50%
InvThroughput: 81218 -> 82598 (+1.70%)
Copies: 6603 -> 6606 (+0.05%)
PreVGPRs: 5037 -> 5076 (+0.77%)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11425>
2022-06-21 16:06:04 +02:00
parent 8ab1e9826d
commit 2ac3e921e3
1 changed files with 43 additions and 28 deletions
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -71,11 +71,11 @@ typedef struct

 typedef struct
 {
-   /* bitsize of this component (max 32), or 0 if it's never written at all */
-   uint8_t bit_size : 6;
+   /* Bitmask of components used: 4 bits per slot, 1 bit per component. */
+   uint8_t components_mask : 4;
   /* output stream index  */
   uint8_t stream : 2;
-} gs_output_component_info;
+} gs_output_info;

 typedef struct
 {
@@ -93,7 +93,7 @@ typedef struct
   bool found_out_vtxcnt[4];
   bool output_compile_time_known;
   bool provoking_vertex_last;
-   gs_output_component_info output_component_info[VARYING_SLOT_MAX][4];
+   gs_output_info output_info[VARYING_SLOT_MAX];
 } lower_ngg_gs_state;

 /* LDS layout of Mesh Shader workgroup info. */
@@ -1637,16 +1637,18 @@ lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg
      if (num_consumed_components > 1)
         element = nir_extract_bits(b, &element, 1, 0, num_consumed_components, 32);

+      /* Save output usage info. */
+      gs_output_info *info = &s->output_info[io_sem.location];
+      /* The same output should always belong to the same stream. */
+      assert(!info->components_mask || info->stream == stream);
+      info->stream = stream;
+      info->components_mask |= BITFIELD_BIT(component_offset + comp * num_consumed_components);
+
      for (unsigned c = 0; c < num_consumed_components; ++c) {
         unsigned component_index =  (comp * num_consumed_components) + c + component_offset;
         unsigned base_index = base + base_offset + component_index / 4;
         component_index %= 4;

-         /* Save output usage info */
-         gs_output_component_info *info = &s->output_component_info[base_index][component_index];
-         info->bit_size = MAX2(info->bit_size, MIN2(store_val->bit_size, 32));
-         info->stream = stream;
-
         /* Store the current component element */
         nir_ssa_def *component_element = element;
         if (num_consumed_components > 1)
@@ -1679,21 +1681,26 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri

   for (unsigned slot = 0; slot < VARYING_SLOT_MAX; ++slot) {
      unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
+      gs_output_info *info = &s->output_info[slot];
+      if (info->stream != stream || !info->components_mask)
+         continue;

-      for (unsigned comp = 0; comp < 4; ++comp) {
-         gs_output_component_info *info = &s->output_component_info[slot][comp];
-         if (info->stream != stream || !info->bit_size)
-            continue;
+      unsigned mask = info->components_mask;
+      while (mask) {
+         int start, count;
+         u_bit_scan_consecutive_range(&mask, &start, &count);
+         nir_ssa_def *values[4] = {0};
+         for (int c = start; c < start + count; ++c) {
+            /* Load output from variable. */
+            values[c - start] = nir_load_var(b, s->output_vars[slot][c]);
+            /* Clear the variable (it is undefined after emit_vertex) */
+            nir_store_var(b, s->output_vars[slot][c], nir_ssa_undef(b, 1, 32), 0x1);
+         }

-         /* Store the output to LDS */
-         nir_ssa_def *out_val = nir_load_var(b, s->output_vars[slot][comp]);
-         if (info->bit_size != 32)
-            out_val = nir_u2u(b, out_val, info->bit_size);
-
-         nir_store_shared(b, out_val, gs_emit_vtx_addr, .base = packed_location * 16 + comp * 4);
-
-         /* Clear the variable that holds the output */
-         nir_store_var(b, s->output_vars[slot][comp], nir_ssa_undef(b, 1, 32), 0x1u);
+         nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
+         nir_store_shared(b, store_val, gs_emit_vtx_addr,
+                          .base = packed_location * 16 + start * 4,
+                          .align_mul = 4);
      }
   }

@@ -1834,16 +1841,24 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
      if (!(b->shader->info.outputs_written & BITFIELD64_BIT(slot)))
         continue;

+      gs_output_info *info = &s->output_info[slot];
+      if (!info->components_mask || info->stream != 0)
+         continue;
+
      unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
      nir_io_semantics io_sem = { .location = slot, .num_slots = 1 };

-      for (unsigned comp = 0; comp < 4; ++comp) {
-         gs_output_component_info *info = &s->output_component_info[slot][comp];
-         if (info->stream != 0 || info->bit_size == 0)
-            continue;
+      unsigned mask = info->components_mask;
+      while (mask) {
+         int start, count;
+         u_bit_scan_consecutive_range(&mask, &start, &count);
+         nir_ssa_def *load =
+            nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
+                            .base = packed_location * 16 + start * 4,
+                            .align_mul = 4);

-         nir_ssa_def *load = nir_load_shared(b, 1, info->bit_size, exported_out_vtx_lds_addr, .base = packed_location * 16u + comp * 4u, .align_mul = 4u);
-         nir_store_output(b, load, nir_imm_int(b, 0), .base = slot, .component = comp, .io_semantics = io_sem);
+         nir_store_output(b, load, nir_imm_int(b, 0), .base = slot, .io_semantics = io_sem,
+                          .component = start, .write_mask = BITFIELD_MASK(count));
      }
   }