nak: Add NIR lowering for attribute I/O

This adds 4 NIR intrinsics for attribute I/O which match the Turing
hardware instructions, as well as a lowering pass to lower
load/store[_per_vertex]_input/output to these intrinsics.  This greatly
simplifies nak_from_nir.rs.  This pass is also able to handle a number
of cases that the current code in nak_from_nir.rs can't:

 - Misaligned access (e.g., a vec3 load at 0x0f4)

 - Write masks on store[_per_vertex]_output

 - Indirect load/store where we need to use AL2P to get physical
   addresses, including scalarizing those cases

It also handles the cases where we need to use ISBERD on vertex indices
in the same pass.  When we switch to this, we'll rip out the dedicated
per_vertex lowering pass.
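
As a sketch of the misaligned case (illustrative pseudo-NIR, not actual
compiler output): a vec3 load at 0x0f4 is only 4-byte aligned, so the
pass splits it on vec2/vec4 alignment boundaries into a scalar ALD plus
a vec2 ALD and re-assembles the result:

   %a = @ald_nv(%vtx, %off) (base=0x0f4, comps=1)
   %b = @ald_nv(%vtx, %off) (base=0x0f8, comps=2)
   %v = vec3(%a, %b.x, %b.y)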

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24998>
Author: Faith Ekstrand
Date:   2023-10-03 18:51:05 -05:00
Committed-by: Marge Bot
Parent: c1ffdb3ee9
Commit: abe9c1fea2
3 changed files with 211 additions and 0 deletions


@@ -16,6 +16,7 @@ libnak_c_files = files(
  'nak.h',
  'nak_nir.c',
  'nak_nir_lower_tex.c',
  'nak_nir_lower_vtg_io.c',
)

libnak_rs_files = files(


@@ -0,0 +1,201 @@
/*
 * Copyright © 2023 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "nak_private.h"
#include "nir_builder.h"

static bool
lower_vtg_io_intrin(nir_builder *b,
                    nir_intrinsic_instr *intrin,
                    void *cb_data)
{
   nir_def *vtx = NULL, *offset = NULL, *data = NULL;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
      offset = intrin->src[0].ssa;
      break;

   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_per_vertex_output:
      vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_output:
      data = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_per_vertex_output:
      data = intrin->src[0].ssa;
      vtx = intrin->src[1].ssa;
      offset = intrin->src[2].ssa;
      break;

   default:
      return false;
   }

   const bool is_store = data != NULL;

   unsigned base = nir_intrinsic_base(intrin);
   unsigned range = nir_intrinsic_range(intrin);
   unsigned component = nir_intrinsic_component(intrin);

   bool is_output;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      is_output = false;
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      is_output = true;
      break;

   default:
      unreachable("Unknown NIR I/O intrinsic");
   }

   bool is_patch;
   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_GEOMETRY:
      is_patch = false;
      break;

   case MESA_SHADER_TESS_CTRL:
      is_patch = is_output && vtx == NULL;
      break;

   case MESA_SHADER_TESS_EVAL:
      is_patch = !is_output && vtx == NULL;
      break;

   default:
      unreachable("Unknown shader stage");
   }

   nir_component_mask_t mask;
   if (is_store)
      mask = nir_intrinsic_write_mask(intrin);
   else
      mask = nir_component_mask(intrin->num_components);

   b->cursor = nir_before_instr(&intrin->instr);
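
   /* (Added note) For per-vertex inputs, the logical vertex index has to
    * be turned into an internal stage buffer entry address with ISBERD
    * before ALD/AST can consume it.
    */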
   if (vtx != NULL && !is_output) {
      nir_def *info = nir_load_sysval_nv(b, 32,
                                         .base = NAK_SV_INVOCATION_INFO,
                                         .access = ACCESS_CAN_REORDER);
      nir_def *lo = nir_extract_u8_imm(b, info, 0);
      nir_def *hi = nir_extract_u8_imm(b, info, 2);
      nir_def *idx = nir_iadd(b, nir_imul(b, lo, hi), vtx);
      vtx = nir_isberd_nv(b, idx);
   }

   if (vtx == NULL)
      vtx = nir_imm_int(b, 0);

   unsigned addr = base + 4 * component;

   const bool offset_is_const = nir_src_is_const(nir_src_for_ssa(offset));
   if (offset_is_const) {
      unsigned const_offset = nir_src_as_uint(nir_src_for_ssa(offset));
      assert(const_offset % 16 == 0);
      addr += const_offset;

      /* Tighten the range */
      base = addr;
      range = 4 * intrin->num_components;

      if (const_offset != 0)
         offset = nir_imm_int(b, 0);
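
      /* Illustrative example (hypothetical values, not from the original
       * code): a vec2 load_input with base=0x80, component=1, and a
       * constant offset of 0x10 folds to addr = 0x94 above, with the
       * range tightened to 8 bytes.
       */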
   }

   const struct nak_nir_attr_io_flags flags = {
      .output = is_output,
      .patch = is_patch,
      .phys = !offset_is_const && !is_patch,
   };

   uint32_t flags_u32;
   STATIC_ASSERT(sizeof(flags_u32) == sizeof(flags));
   memcpy(&flags_u32, &flags, sizeof(flags_u32));

   nir_def *dst_comps[NIR_MAX_VEC_COMPONENTS];
   while (mask) {
      const unsigned c = ffs(mask) - 1;

      unsigned comps = ffs(~(mask >> c)) - 1;
      assert(comps > 0);

      unsigned c_addr = addr + 4 * c;

      /* vec2 has to be vec2 aligned, vec3/4 have to be vec4 aligned.  We
       * don't have actual alignment information on these intrinsics but we
       * can assume that the indirect offset (if any) is a multiple of 16 so
       * we don't need to worry about that and can just look at c_addr.
       */
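      /* (Added illustration) For example, a vec3 at c_addr = 0xf4 gets
       * clamped to a single component here (0xf4 & 0x7 != 0) and the
       * remaining two components come back around as a vec2 at 0xf8 on
       * the next trip through the loop.
       */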
      comps = MIN2(comps, 4);
      if (c_addr & 0xf)
         comps = MIN2(comps, 2);
      if (c_addr & 0x7)
         comps = 1;
      assert(!(c_addr & 0x3));

      nir_def *c_offset = offset;
      if (flags.phys) {
         /* Physical addressing has to be scalar */
         comps = 1;

         /* Use al2p to compute a physical address */
         c_offset = nir_al2p_nv(b, offset, .base = c_addr,
                                .flags = flags_u32);
         c_addr = 0;
      }

      if (is_store) {
         nir_def *c_data = nir_channels(b, data, BITFIELD_RANGE(c, comps));
         nir_ast_nv(b, c_data, vtx, c_offset,
                    .base = c_addr,
                    .flags = flags_u32,
                    .range_base = base,
                    .range = range);
      } else {
         uint32_t access = flags.output ? 0 : ACCESS_CAN_REORDER;
         nir_def *c_data = nir_ald_nv(b, comps, vtx, c_offset,
                                      .base = c_addr,
                                      .flags = flags_u32,
                                      .range_base = base,
                                      .range = range,
                                      .access = access);
         for (unsigned i = 0; i < comps; i++)
            dst_comps[c + i] = nir_channel(b, c_data, i);
      }

      mask &= ~BITFIELD_RANGE(c, comps);
   }

   if (!is_store) {
      nir_def *dst = nir_vec(b, dst_comps, intrin->num_components);
      nir_def_rewrite_uses(&intrin->def, dst);
   }
   nir_instr_remove(&intrin->instr);

   return true;
}

bool
nak_nir_lower_vtg_io(nir_shader *nir, const struct nak_compiler *nak)
{
   return nir_shader_intrinsics_pass(nir, lower_vtg_io_intrin,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     NULL);
}
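
Note that this commit only adds the pass; nothing in the diff calls it
yet.  As a rough sketch (assuming the usual Mesa NIR_PASS helper; the
actual call site lands in a later change), it would be invoked as:

   /* Hypothetical call site in NAK's NIR postprocessing */
   NIR_PASS(_, nir, nak_nir_lower_vtg_io, nak);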


@@ -133,6 +133,15 @@ struct nak_nir_tex_flags {
bool nak_nir_lower_tex(nir_shader *nir, const struct nak_compiler *nak);
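
/* (Added note) Flag bits carried on the new ald/ast/al2p intrinsics.
 * The lowering pass memcpy()s this struct into the intrinsic's 32-bit
 * flags index, so it must stay exactly 32 bits (the pass
 * STATIC_ASSERT()s this).
 */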
struct nak_nir_attr_io_flags {
   bool output : 1;
   bool patch : 1;
   bool phys : 1;
   uint32_t pad : 29;
};

bool nak_nir_lower_vtg_io(nir_shader *nir, const struct nak_compiler *nak);

enum nak_fs_out {
   NAK_FS_OUT_COLOR0 = 0x00,
   NAK_FS_OUT_COLOR1 = 0x10,