ntt: Switch to new-style registers and modifiers

Use all the nir_legacy.h features to transition away from the deprecated
structures. shader-db is quite happy. I assume that's a mix of more aggressive
source mod usage and better coalescing (nir-to-tgsi doesn't copyprop).

   total instructions in shared programs: 900179 -> 887156 (-1.45%)
   instructions in affected programs: 562077 -> 549054 (-2.32%)
   helped: 5198
   HURT: 470
   Instructions are helped.

   total temps in shared programs: 91165 -> 91162 (<.01%)
   temps in affected programs: 398 -> 395 (-0.75%)
   helped: 21
   HURT: 18
   Inconclusive result (value mean confidence interval includes 0).
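
For a concrete (hypothetical) illustration of the source mod win: where NTT previously had to emit a negate as its own instruction, MOV TEMP[1], -TEMP[0] followed by ADD TEMP[2], TEMP[1], TEMP[3], folding the fneg into a source modifier lets the consumer read it directly as ADD TEMP[2], -TEMP[0], TEMP[3].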

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Tested-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24116>
commit 5f5be9e4e1 (parent d54aa28b97)
Author: Alyssa Rosenzweig
Date:   2023-05-30 11:41:12 -04:00
Committed-by: Marge Bot


@@ -23,6 +23,7 @@
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_deref.h"
+#include "compiler/nir/nir_legacy.h"
 #include "compiler/nir/nir_worklist.h"
 #include "nir/nir_to_tgsi.h"
 #include "pipe/p_screen.h"
@@ -564,8 +565,9 @@ ntt_extract_const_src_offset(nir_src *src)
    if (alu->op == nir_op_iadd) {
       for (int i = 0; i < 2; i++) {
+         assert(!alu->src[i].negate && !alu->src[i].abs);
          nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
-         if (v && !alu->src[i].negate && !alu->src[i].abs) {
+         if (v != NULL) {
             *src = alu->src[1 - i].src;
             return v[alu->src[i].swizzle[s.comp]].u32;
          }
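
The guard above leans on nir_legacy_trivialize having run: ALU sources feeding address math can no longer carry modifiers, so the old condition became an assert. A minimal sketch of the resulting match (match_iadd_const is a hypothetical name, not from the commit):

    static bool
    match_iadd_const(nir_alu_instr *alu, unsigned comp,
                     nir_src *other, uint32_t *imm)
    {
       if (alu->op != nir_op_iadd)
          return false;

       for (int i = 0; i < 2; i++) {
          /* Trivialized NIR: no source modifiers on address math. */
          assert(!alu->src[i].negate && !alu->src[i].abs);

          nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
          if (v != NULL) {
             *other = alu->src[1 - i].src;
             *imm = v[alu->src[i].swizzle[comp]].u32;
             return true;
          }
       }

       return false;
    }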
@@ -757,13 +759,10 @@ ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *fra
    return ureg_writemask(out, write_mask);
 }
 
-/* If this reg or SSA def is used only for storing an output, then in the simple
- * cases we can write directly to the TGSI output instead of having store_output
- * emit its own MOV.
- */
 static bool
-ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
-                             struct list_head *uses)
+ntt_try_store_in_tgsi_output_with_use(struct ntt_compile *c,
+                                      struct ureg_dst *dst,
+                                      nir_src *src)
 {
    *dst = ureg_dst_undef();
@@ -779,10 +778,6 @@ ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
       return false;
    }
 
-   if (!list_is_singular(uses))
-      return false;
-
-   nir_src *src = list_first_entry(uses, nir_src, use_link);
    if (src->is_if)
       return false;
@@ -802,6 +797,56 @@ ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
    return frac == 0;
 }
 
+/* If this reg is used only for storing an output, then in the simple
+ * cases we can write directly to the TGSI output instead of having
+ * store_output emit its own MOV.
+ */
+static bool
+ntt_try_store_reg_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
+                                 nir_intrinsic_instr *reg_decl)
+{
+   assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
+
+   *dst = ureg_dst_undef();
+
+   /* Look for a single use for try_store_in_tgsi_output */
+   nir_src *use = NULL;
+
+   nir_foreach_reg_load(src, reg_decl) {
+      nir_intrinsic_instr *load = nir_instr_as_intrinsic(src->parent_instr);
+
+      nir_foreach_use_including_if(load_use, &load->dest.ssa) {
+         /* We can only have one use */
+         if (use != NULL)
+            return false;
+
+         use = load_use;
+      }
+   }
+
+   if (use == NULL)
+      return false;
+
+   return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
+}
+
+/* If this SSA def is used only for storing an output, then in the simple
+ * cases we can write directly to the TGSI output instead of having
+ * store_output emit its own MOV.
+ */
+static bool
+ntt_try_store_ssa_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
+                                 nir_ssa_def *def)
+{
+   *dst = ureg_dst_undef();
+
+   if (!list_is_singular(&def->uses))
+      return false;
+
+   nir_foreach_use_including_if(use, def) {
+      return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
+   }
+
+   unreachable("We have one use");
+}
+
 static void
 ntt_setup_inputs(struct ntt_compile *c)
 {
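
With registers now represented as decl_reg/load_reg/store_reg intrinsics, "the single use of a register" means chasing through its loads, as the reg variant above does. A minimal self-contained sketch of that walk, assuming the iterators used above (the helper name is hypothetical):

    static bool
    reg_has_single_chained_use(nir_intrinsic_instr *decl)
    {
       assert(decl->intrinsic == nir_intrinsic_decl_reg);

       unsigned uses = 0;

       /* Every read of the register goes through a load_reg intrinsic;
        * count the uses of each load's SSA value. */
       nir_foreach_reg_load(src, decl) {
          nir_intrinsic_instr *load = nir_instr_as_intrinsic(src->parent_instr);

          nir_foreach_use_including_if(use, &load->dest.ssa)
             uses++;
       }

       return uses == 1;
    }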
@@ -1083,13 +1128,17 @@ static void
 ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
 {
    assert(c->num_temps == 0);
 
-   /* Permanently allocate all the array regs at the start. */
-   foreach_list_typed(nir_register, nir_reg, node, list) {
-      if (nir_reg->num_array_elems != 0) {
-         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems, true);
-         c->reg_temp[nir_reg->index] = decl;
+   nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
+      /* Permanently allocate all the array regs at the start. */
+      unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
+      unsigned index = nir_reg->dest.ssa.index;
+
+      if (num_array_elems != 0) {
+         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
+         c->reg_temp[index] = decl;
          assert(c->num_temps == decl.Index);
-         c->num_temps += nir_reg->num_array_elems;
+         c->num_temps += num_array_elems;
       }
    }
    c->first_non_array_temp = c->num_temps;
@@ -1097,15 +1146,22 @@ ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
    /* After that, allocate non-array regs in our virtual space that we'll
     * register-allocate before ureg emit.
     */
-   foreach_list_typed(nir_register, nir_reg, node, list) {
-      if (nir_reg->num_array_elems == 0) {
+   nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
+      unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
+      unsigned num_components = nir_intrinsic_num_components(nir_reg);
+      unsigned bit_size = nir_intrinsic_bit_size(nir_reg);
+      unsigned index = nir_reg->dest.ssa.index;
+
+      /* We already handled arrays */
+      if (num_array_elems == 0) {
          struct ureg_dst decl;
-         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
-         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses)) {
-            if (nir_reg->bit_size == 64) {
-               if (nir_reg->num_components > 2) {
+         uint32_t write_mask = BITFIELD_MASK(num_components);
+         if (!ntt_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
+            if (bit_size == 64) {
+               if (num_components > 2) {
                   fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
-                          nir_reg->num_components, nir_reg->index);
+                          num_components, index);
                }
 
                write_mask = ntt_64bit_write_mask(write_mask);
@@ -1113,7 +1169,7 @@ ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
             decl = ureg_writemask(ntt_temp(c), write_mask);
          }
-         c->reg_temp[nir_reg->index] = decl;
+         c->reg_temp[index] = decl;
       }
    }
 }
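
Every piece of nir_register state the old loops read now lives on the decl_reg intrinsic instead. A minimal sketch of the mapping, using the index getters from the hunks above (struct and function names are hypothetical):

    struct ntt_reg_info {
       unsigned num_components;   /* was nir_reg->num_components */
       unsigned bit_size;         /* was nir_reg->bit_size */
       unsigned num_array_elems;  /* was nir_reg->num_array_elems; 0 = non-array */
       unsigned index;            /* was nir_reg->index; now the decl's SSA index */
    };

    static struct ntt_reg_info
    ntt_get_reg_info(nir_intrinsic_instr *decl)
    {
       assert(decl->intrinsic == nir_intrinsic_decl_reg);

       return (struct ntt_reg_info){
          .num_components = nir_intrinsic_num_components(decl),
          .bit_size = nir_intrinsic_bit_size(decl),
          .num_array_elems = nir_intrinsic_num_array_elems(decl),
          .index = decl->dest.ssa.index,
       };
    }

This is also why ntt_emit_impl below sizes reg_temp by ssa_alloc rather than reg_alloc: registers are indexed by the SSA index of their decl_reg handle.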
@@ -1169,21 +1225,24 @@ ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
    return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
 }
 
+/* Forward declare for recursion with indirects */
 static struct ureg_src
-ntt_get_src(struct ntt_compile *c, nir_src src)
+ntt_get_src(struct ntt_compile *c, nir_src src);
+
+static struct ureg_src
+ntt_get_chased_src(struct ntt_compile *c, nir_legacy_src *src)
 {
-   if (src.is_ssa) {
-      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
-         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
+   if (src->is_ssa) {
+      if (src->ssa->parent_instr->type == nir_instr_type_load_const)
+         return ntt_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
 
-      return c->ssa_temp[src.ssa->index];
+      return c->ssa_temp[src->ssa->index];
    } else {
-      nir_register *reg = src.reg.reg;
-      struct ureg_dst reg_temp = c->reg_temp[reg->index];
-      reg_temp.Index += src.reg.base_offset;
+      struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
+      reg_temp.Index += src->reg.base_offset;
 
-      if (src.reg.indirect) {
-         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
+      if (src->reg.indirect) {
+         struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(src->reg.indirect));
          return ureg_src_indirect(ureg_src(reg_temp),
                                   ntt_reladdr(c, offset, 0));
       } else {
@@ -1192,17 +1251,32 @@ ntt_get_src(struct ntt_compile *c, nir_src src)
       }
    }
 }
 
+static struct ureg_src
+ntt_get_src(struct ntt_compile *c, nir_src src)
+{
+   nir_legacy_src chased = nir_legacy_chase_src(&src);
+   return ntt_get_chased_src(c, &chased);
+}
+
 static struct ureg_src
 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
 {
-   nir_alu_src src = instr->src[i];
-   struct ureg_src usrc = ntt_get_src(c, src.src);
+   /* We only support 32-bit float modifiers. The only other modifier type
+    * officially supported by TGSI is 32-bit integer negates, but even those are
+    * broken on virglrenderer, so skip lowering all integer and f64 float mods.
+    *
+    * The options->lower_fabs requests that we not have native source modifiers
+    * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
+    */
+   nir_legacy_alu_src src =
+      nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
+   struct ureg_src usrc = ntt_get_chased_src(c, &src.src);
 
    /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
    * channels. We skip this for undefs, as those don't get split to vec2s (but
    * the specific swizzles from an undef don't matter)
    */
-   if (nir_src_bit_size(src.src) == 64 &&
+   if (nir_src_bit_size(instr->src[i].src) == 64 &&
       !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_ssa_undef)) {
       int chan0 = 0, chan1 = 1;
       if (nir_op_infos[instr->op].input_sizes[i] == 0) {
@@ -1224,9 +1298,9 @@ ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
                          src.swizzle[3]);
    }
 
-   if (src.abs)
+   if (src.fabs)
       usrc = ureg_abs(usrc);
-   if (src.negate)
+   if (src.fneg)
       usrc = ureg_negate(usrc);
 
    return usrc;
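
The effect of the chased source path above, as a self-contained sketch (hypothetical helper; semantics as I read nir_legacy_chase_alu_src): given b = fneg(a) and c = fadd(b, d), chasing fadd's first source yields a with fneg set, so the ADD reads -a through a modifier and the fneg itself may need no instruction at all.

    static struct ureg_src
    ntt_get_float_src(struct ntt_compile *c, nir_alu_instr *alu,
                      unsigned i, bool fuse_fabs)
    {
       nir_legacy_alu_src chased =
          nir_legacy_chase_alu_src(&alu->src[i], fuse_fabs);

       struct ureg_src usrc = ntt_get_chased_src(c, &chased.src);

       if (chased.fabs)
          usrc = ureg_abs(usrc);    /* |x| as a TGSI source modifier */
       if (chased.fneg)
          usrc = ureg_negate(usrc); /* -x as a TGSI source modifier */

       return usrc;
    }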
@@ -1255,7 +1329,7 @@ ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
       writemask = ntt_64bit_write_mask(writemask);
 
    struct ureg_dst dst;
-   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses))
+   if (!ntt_try_store_ssa_in_tgsi_output(c, &dst, ssa))
       dst = ntt_temp(c);
 
    c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
@@ -1264,24 +1338,24 @@ ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
 }
 
 static struct ureg_dst
-ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
+ntt_get_chased_dest_decl(struct ntt_compile *c, nir_legacy_dest *dest)
 {
    if (dest->is_ssa)
-      return ntt_get_ssa_def_decl(c, &dest->ssa);
+      return ntt_get_ssa_def_decl(c, dest->ssa);
    else
-      return c->reg_temp[dest->reg.reg->index];
+      return c->reg_temp[dest->reg.handle->index];
 }
 
 static struct ureg_dst
-ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
+ntt_get_chased_dest(struct ntt_compile *c, nir_legacy_dest *dest)
 {
-   struct ureg_dst dst = ntt_get_dest_decl(c, dest);
+   struct ureg_dst dst = ntt_get_chased_dest_decl(c, dest);
 
    if (!dest->is_ssa) {
       dst.Index += dest->reg.base_offset;
 
       if (dest->reg.indirect) {
-         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
+         struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(dest->reg.indirect));
          dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
       }
    }
@@ -1289,6 +1363,35 @@ ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
    return dst;
 }
 
+static struct ureg_dst
+ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
+{
+   nir_legacy_dest chased = nir_legacy_chase_dest(dest);
+   return ntt_get_chased_dest(c, &chased);
+}
+
+static struct ureg_dst
+ntt_get_alu_dest(struct ntt_compile *c, nir_dest *dest)
+{
+   nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(dest);
+   struct ureg_dst dst = ntt_get_chased_dest(c, &chased.dest);
+
+   if (chased.fsat)
+      dst.Saturate = true;
+
+   /* Only registers get write masks */
+   if (chased.dest.is_ssa)
+      return dst;
+
+   int dst_64 = nir_dest_bit_size(*dest) == 64;
+   unsigned write_mask = chased.write_mask;
+
+   if (dst_64)
+      return ureg_writemask(dst, ntt_64bit_write_mask(write_mask));
+   else
+      return ureg_writemask(dst, write_mask);
+}
+
 /* For an SSA dest being populated by a constant src, replace the storage with
  * a copy of the ureg_src.
  */
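
ntt_get_alu_dest is the destination-side mirror of the source chasing: a lone downstream fsat folds into TGSI's per-instruction saturate flag instead of becoming its own saturated MOV. Note the asymmetry in write masks: only registers get one here, since SSA defs were already narrowed via the swizzle stored by ntt_get_ssa_def_decl. A reduced sketch (hypothetical helper name):

    static struct ureg_dst
    ntt_get_sat_dest(struct ntt_compile *c, nir_alu_instr *alu)
    {
       nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(&alu->dest.dest);
       struct ureg_dst dst = ntt_get_chased_dest(c, &chased.dest);

       /* fsat(x) consumed here means no separate saturated MOV later. */
       if (chased.fsat)
          dst.Saturate = true;

       return dst;
    }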
@@ -1312,10 +1415,12 @@ ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
 static void
 ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
 {
-   if (dest->is_ssa)
-      ntt_store_def(c, &dest->ssa, src);
+   nir_legacy_dest chased = nir_legacy_chase_dest(dest);
+
+   if (chased.is_ssa)
+      ntt_store_def(c, chased.ssa, src);
    else {
-      struct ureg_dst dst = ntt_get_dest(c, dest);
+      struct ureg_dst dst = ntt_get_chased_dest(c, &chased);
       ntt_MOV(c, dst, src);
    }
 }
@@ -1353,6 +1458,10 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
    int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
    int num_srcs = nir_op_infos[instr->op].num_inputs;
 
+   /* Don't try to translate folded fsat since their source won't be valid */
+   if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
+      return;
+
    c->precise = instr->exact;
 
    assert(num_srcs <= ARRAY_SIZE(src));
@@ -1361,15 +1470,7 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
    for (; i < ARRAY_SIZE(src); i++)
       src[i] = ureg_src_undef();
 
-   dst = ntt_get_dest(c, &instr->dest.dest);
-
-   if (instr->dest.saturate)
-      dst.Saturate = true;
-
-   if (dst_64)
-      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
-   else
-      dst = ureg_writemask(dst, instr->dest.write_mask);
+   dst = ntt_get_alu_dest(c, &instr->dest.dest);
 
    static enum tgsi_opcode op_map[][2] = {
       [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
@@ -1522,6 +1623,10 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
       break;
 
    case nir_op_fabs:
+      /* Try to eliminate */
+      if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
+         break;
+
       if (c->options->lower_fabs)
          ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
       else
@@ -1538,6 +1643,10 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
       break;
 
    case nir_op_fneg:
+      /* Try to eliminate */
+      if (nir_legacy_float_mod_folds(instr))
+         break;
+
       ntt_MOV(c, dst, ureg_negate(src[0]));
       break;
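
These guards are the producer side of the folding contract: the consumers (ntt_get_alu_src / ntt_get_alu_dest) apply the modifier, so an fneg/fabs/fsat whose uses all fold must not emit anything itself. The three checks consolidated as one predicate (a sketch; the name is hypothetical):

    static bool
    ntt_alu_fully_folded(const struct ntt_compile *c, nir_alu_instr *instr)
    {
       switch (instr->op) {
       case nir_op_fneg:
          return nir_legacy_float_mod_folds(instr);
       case nir_op_fabs:
          /* With options->lower_fabs there is no native |x| modifier, so
           * fabs can never fold and is emitted as MAX(a, -a) instead. */
          return !c->options->lower_fabs && nir_legacy_float_mod_folds(instr);
       case nir_op_fsat:
          return nir_legacy_fsat_folds(instr);
       default:
          return false;
       }
    }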
@@ -2564,6 +2673,14 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
       ntt_CLOCK(c, ntt_get_dest(c, &instr->dest));
       break;
 
+   case nir_intrinsic_decl_reg:
+   case nir_intrinsic_load_reg:
+   case nir_intrinsic_load_reg_indirect:
+   case nir_intrinsic_store_reg:
+   case nir_intrinsic_store_reg_indirect:
+      /* fully consumed */
+      break;
+
    default:
       fprintf(stderr, "Unknown intrinsic: ");
       nir_print_instr(&instr->instr, stderr);
@@ -3050,7 +3167,7 @@ ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
    c->impl = impl;
 
    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
-   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
+   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
 
    /* Set up the struct ntt_blocks to put insns in */
    c->blocks = _mesa_pointer_hash_table_create(c);
@@ -3855,25 +3972,17 @@ const void *nir_to_tgsi_options(struct nir_shader *s,
    NIR_PASS_V(s, nir_opt_move, move_all);
 
-   /* We're fine lowering only 32-bit floats. TGSI officially supports 32-bit
-    * integer negates, but even those are broken on virglrenderer, so we don't
-    * use integer or f64 float mods.
-    *
-    * The options->lower_fabs requests that we not have native source modifiers
-    * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
-    */
-   nir_lower_to_source_mods_flags source_mods = nir_lower_fneg_source_mods;
-   if (!options->lower_fabs)
-      source_mods |= nir_lower_fabs_source_mods;
-   NIR_PASS_V(s, nir_lower_to_source_mods, source_mods);
-
-   NIR_PASS_V(s, nir_convert_from_ssa, true, false);
-   NIR_PASS_V(s, nir_lower_vec_to_movs, ntt_vec_to_mov_writemask_cb, NULL);
-
-   /* locals_to_regs will leave dead derefs that are good to clean up. */
-   NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
+   NIR_PASS_V(s, nir_convert_from_ssa, true, true);
+   NIR_PASS_V(s, nir_lower_vec_to_regs, ntt_vec_to_mov_writemask_cb, NULL);
+
+   /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
+    */
+   NIR_PASS_V(s, nir_lower_locals_to_reg_intrinsics, 32);
    NIR_PASS_V(s, nir_opt_dce);
 
+   /* See comment in ntt_get_alu_src for supported modifiers */
+   NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
+
    if (NIR_DEBUG(TGSI)) {
       fprintf(stderr, "NIR before translation to TGSI:\n");
       nir_print_shader(s, stderr);
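
Taken together, the new out-of-SSA tail of the pipeline reads roughly as below (a condensed sketch of the passes added in this hunk). As I understand nir_legacy_trivialize, it rewrites any fneg/fabs/fsat that could not legally fold, so the backend may chase modifiers unconditionally:

    NIR_PASS_V(s, nir_convert_from_ssa, true, true);   /* new-style registers */
    NIR_PASS_V(s, nir_lower_vec_to_regs, ntt_vec_to_mov_writemask_cb, NULL);
    NIR_PASS_V(s, nir_lower_locals_to_reg_intrinsics, 32);
    NIR_PASS_V(s, nir_opt_dce);
    NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);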