ac: Add NIR passes to lower ES->GS I/O to memory accesses.
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9201>
@@ -45,6 +45,7 @@ AMD_COMMON_FILES = \
	common/ac_msgpack.c \
	common/ac_msgpack.h \
	common/ac_nir.h \
	common/ac_nir_lower_esgs_io_to_mem.c \
	common/ac_nir_lower_tess_io_to_mem.c \
	common/ac_surface.c \
	common/ac_surface.h \

@@ -72,6 +72,16 @@ ac_nir_lower_tess_to_const(nir_shader *shader,
                           unsigned tcs_num_patches,
                           unsigned options);

void
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
                               enum chip_class chip_class,
                               unsigned num_reserved_es_outputs);

void
ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
                              enum chip_class chip_class,
                              unsigned num_reserved_es_outputs);

#ifdef __cplusplus
}
#endif

src/amd/common/ac_nir_lower_esgs_io_to_mem.c (new file, 261 lines)
@@ -0,0 +1,261 @@
/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "ac_nir.h"
#include "nir_builder.h"

/*
 * Lower NIR cross-stage I/O intrinsics into the memory accesses that actually happen on the HW.
 *
 * These HW stages are used only when a Geometry Shader is used.
 * The Export Shader (ES) runs the SW stage that comes before GS; it can be either VS or TES.
 *
 * * GFX6-8:
 *   ES and GS are separate HW stages.
 *   I/O is passed between them through VRAM.
 * * GFX9+:
 *   ES and GS are merged into a single HW stage.
 *   I/O is passed between them through LDS.
 *
 */

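/* Illustrative layout sketch (an editor's assumption for clarity, not code
 * from this patch): each reserved ES output slot occupies a 16-byte vec4,
 * so with num_reserved_es_outputs = 4, vertex V writes component C of slot S
 * to LDS byte offset V * 64 + S * 16 + C * 4 on GFX9+.
 */
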
typedef struct {
   /* Which hardware generation we're dealing with */
   enum chip_class chip_class;

   /* Number of ES outputs for which memory should be reserved.
    * When compacted, this should be the number of linked ES outputs.
    */
   unsigned num_reserved_es_outputs;
} lower_esgs_io_state;

static nir_ssa_def *
emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                       unsigned component_stride, unsigned num_components, unsigned bit_size)
{
   unsigned total_bytes = num_components * bit_size / 8u;
   unsigned full_dwords = total_bytes / 4u;
   unsigned remaining_bytes = total_bytes - full_dwords * 4u;
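
   /* Worked example: a 3-component 16-bit load is 6 bytes in total, which
    * splits into one full dword load plus one 16-bit load for the tail.
    */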

   /* Accommodate max number of split 64-bit loads */
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];

   /* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
   if (remaining_bytes == 3) {
      remaining_bytes = 0;
      full_dwords++;
   }

   for (unsigned i = 0; i < full_dwords; ++i)
      comps[i] = nir_build_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
                                           .base = component_stride * i, .memory_modes = nir_var_shader_in);

   if (remaining_bytes)
      comps[full_dwords] = nir_build_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
                                                     .base = component_stride * full_dwords, .memory_modes = nir_var_shader_in);

   return nir_extract_bits(b, comps, full_dwords + !!remaining_bytes, 0, num_components, bit_size);
}

static void
emit_split_buffer_store(nir_builder *b, nir_ssa_def *d, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                        unsigned component_stride, unsigned num_components, unsigned bit_size,
                        unsigned writemask, bool swizzled, bool slc)
{
   while (writemask) {
      int start, count;
      u_bit_scan_consecutive_range(&writemask, &start, &count);
      assert(start >= 0 && count >= 0);

      unsigned bytes = count * bit_size / 8u;
      unsigned start_byte = start * bit_size / 8u;

      while (bytes) {
         unsigned store_bytes = MIN2(bytes, 4u);
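         /* Keep each store naturally aligned: an odd start offset allows
          * only single-byte stores, and a start offset that is 2 mod 4
          * allows at most a 2-byte store.
          */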
         if ((start_byte % 4) == 1 || (start_byte % 4) == 3)
            store_bytes = MIN2(store_bytes, 1);
         else if ((start_byte % 4) == 2)
            store_bytes = MIN2(store_bytes, 2);

         nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
         nir_build_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
                                    .base = start_byte, .write_mask = 1u, .memory_modes = nir_var_shader_out);

         start_byte += store_bytes;
         bytes -= store_bytes;
      }
   }
}

static bool
lower_es_output_store(nir_builder *b,
                      nir_instr *instr,
                      void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   lower_esgs_io_state *st = (lower_esgs_io_state *) state;
   unsigned write_mask = nir_intrinsic_write_mask(intrin);

   b->cursor = nir_before_instr(instr);
   nir_ssa_def *io_off = nir_build_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u);

   if (st->chip_class <= GFX8) {
      /* GFX6-8: ES is a separate HW stage, data is passed from ES to GS in VRAM. */
      nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
      nir_ssa_def *es2gs_off = nir_build_load_ring_es2gs_offset_amd(b);
      emit_split_buffer_store(b, intrin->src[0].ssa, ring, io_off, es2gs_off, 4u,
                              intrin->src[0].ssa->num_components, intrin->src[0].ssa->bit_size,
                              write_mask, true, true);
   } else {
      /* GFX9+: ES is merged into GS, data is passed through LDS. */
      unsigned esgs_itemsize = st->num_reserved_es_outputs * 16u;
      nir_ssa_def *vertex_idx = nir_build_load_local_invocation_index(b);
      nir_ssa_def *off = nir_iadd(b, nir_imul_imm(b, vertex_idx, esgs_itemsize), io_off);
      nir_build_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
                             .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
   }

   nir_instr_remove(instr);
   return true;
}

static nir_ssa_def *
gs_per_vertex_input_vertex_offset_gfx6(nir_builder *b, nir_src *vertex_src)
{
   if (nir_src_is_const(*vertex_src))
      return nir_build_load_gs_vertex_offset_amd(b, .base = nir_src_as_uint(*vertex_src));

   nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

   for (unsigned i = 1; i < b->shader->info.gs.vertices_in; ++i) {
      nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
      nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i);
      vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
   }

   return vertex_offset;
}

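/* On the merged GFX9+ path, two 16-bit ES vertex offsets are packed into
 * each 32-bit gs_vertex_offset input; the function below extracts the right
 * half-word for the requested vertex (ubfe for a constant index, bcsel plus
 * shift for a dynamic one).
 */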
static nir_ssa_def *
gs_per_vertex_input_vertex_offset_gfx9(nir_builder *b, nir_src *vertex_src)
{
   if (nir_src_is_const(*vertex_src)) {
      unsigned vertex = nir_src_as_uint(*vertex_src);
      return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u * 2u),
                      nir_imm_int(b, (vertex % 2u) * 16u), nir_imm_int(b, 16u));
   }

   nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

   for (unsigned i = 1; i < b->shader->info.gs.vertices_in; i++) {
      nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
      nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i / 2u * 2u);
      if (i % 2u)
         elem = nir_ishr_imm(b, elem, 16u);

      vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
   }

   return nir_iand_imm(b, vertex_offset, 0xffffu);
}

static nir_ssa_def *
gs_per_vertex_input_offset(nir_builder *b,
                           lower_esgs_io_state *st,
                           nir_intrinsic_instr *instr)
{
   nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
   nir_ssa_def *vertex_offset = st->chip_class >= GFX9
      ? gs_per_vertex_input_vertex_offset_gfx9(b, vertex_src)
      : gs_per_vertex_input_vertex_offset_gfx6(b, vertex_src);

   unsigned base_stride = st->chip_class >= GFX9 ? 1 : 64 /* Wave size on GFX6-8 */;
   nir_ssa_def *io_off = nir_build_calc_io_offset(b, instr, nir_imm_int(b, base_stride * 4u), base_stride);
   nir_ssa_def *off = nir_iadd(b, io_off, vertex_offset);
   return nir_imul_imm(b, off, 4u);
}

static nir_ssa_def *
lower_gs_per_vertex_input_load(nir_builder *b,
                               nir_instr *instr,
                               void *state)
{
   lower_esgs_io_state *st = (lower_esgs_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   nir_ssa_def *off = gs_per_vertex_input_offset(b, st, intrin);

   if (st->chip_class >= GFX9)
      return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
                                   .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);

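   /* GFX6-8: the ESGS ring in VRAM is swizzled so that consecutive dwords of
    * the same vertex sit one full wave apart, hence the 4 * wave_size byte
    * component stride passed to the split load below.
    */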
   unsigned wave_size = 64u; /* GFX6-8 only support wave64 */
   nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
   return emit_split_buffer_load(b, ring, off, nir_imm_zero(b, 1, 32), 4u * wave_size,
                                 intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size);
}

static bool
filter_load_per_vertex_input(const nir_instr *instr, UNUSED const void *state)
{
   return instr->type == nir_instr_type_intrinsic && nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_per_vertex_input;
}

void
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
                               enum chip_class chip_class,
                               unsigned num_reserved_es_outputs)
{
   lower_esgs_io_state state = {
      .chip_class = chip_class,
      .num_reserved_es_outputs = num_reserved_es_outputs,
   };

   nir_shader_instructions_pass(shader,
                                lower_es_output_store,
                                nir_metadata_block_index | nir_metadata_dominance,
                                &state);
}

void
ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
                              enum chip_class chip_class,
                              unsigned num_reserved_es_outputs)
{
   lower_esgs_io_state state = {
      .chip_class = chip_class,
      .num_reserved_es_outputs = num_reserved_es_outputs,
   };

   nir_shader_lower_instructions(shader,
                                 filter_load_per_vertex_input,
                                 lower_gs_per_vertex_input_load,
                                 &state);
}

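For context: a driver is expected to run the first pass on the ES-stage NIR
(VS or TES) and the second on the GS NIR, with the same num_reserved_es_outputs
on both sides so the offsets agree. A minimal usage sketch; the surrounding
variable names are assumptions, not part of this patch:

   /* es_nir / gs_nir: the linked ES-stage and GS shaders; chip_class comes
    * from the device info. Both calls must agree on the reserved slot count.
    */
   unsigned num_es_outputs = util_bitcount64(es_nir->info.outputs_written);
   ac_nir_lower_es_outputs_to_mem(es_nir, chip_class, num_es_outputs);
   ac_nir_lower_gs_inputs_to_mem(gs_nir, chip_class, num_es_outputs);
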
@@ -88,6 +88,7 @@ amd_common_files = files(
  'ac_msgpack.h',
  'ac_rgp_elf_object_pack.c',
  'ac_nir.h',
  'ac_nir_lower_esgs_io_to_mem.c',
  'ac_nir_lower_tess_io_to_mem.c',
)