agx: Make p_combine take a dynamic src count

This is needed to build vectors with more than four components.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18525>
2022-09-09 14:32:01 -04:00
parent ef31dceee8
commit 4f85a7be8c
5 changed files with 69 additions and 35 deletions
--- a/src/asahi/compiler/agx_builder.h.py
+++ b/src/asahi/compiler/agx_builder.h.py
@@ -50,6 +50,10 @@ agx_${opcode}${suffix}(agx_builder *b
   , agx_index dst${dest}
 % endfor

+% if op.variable_srcs:
+   , unsigned nr_srcs
+% endif
+
 % for src in range(srcs):
   , agx_index src${src}
 % endfor
@@ -65,7 +69,10 @@ agx_${opcode}${suffix}(agx_builder *b
   I->dest[${dest}] = dst${dest};
 % endfor

-% if srcs > 0:
+% if op.variable_srcs:
+   I->src = ralloc_array(I, agx_index, nr_srcs);
+   I->nr_srcs = nr_srcs;
+% elif srcs > 0:
   I->src = ralloc_array(I, agx_index, ${srcs});
   I->nr_srcs = ${srcs};

@@ -82,7 +89,7 @@ agx_${opcode}${suffix}(agx_builder *b
   return I;
 }

-% if dests == 1:
+% if dests == 1 and not op.variable_srcs:
 static inline agx_index
 agx_${opcode}(agx_builder *b

--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -120,16 +120,14 @@ agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
 }

 static void
-agx_cache_combine(agx_builder *b, agx_index dst,
-                  agx_index s0, agx_index s1, agx_index s2, agx_index s3)
+agx_cache_combine(agx_builder *b, agx_index dst, unsigned nr_srcs,
+                  agx_index *srcs)
 {
   /* Lifetime of a hash table entry has to be at least as long as the table */
-   agx_index *channels = ralloc_array(b->shader, agx_index, 4);
+   agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);

-   channels[0] = s0;
-   channels[1] = s1;
-   channels[2] = s2;
-   channels[3] = s3;
+   for (unsigned i = 0; i < nr_srcs; ++i)
+      channels[i] = srcs[i];

   _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
                               channels);
@@ -142,11 +140,34 @@ agx_cache_combine(agx_builder *b, agx_index dst,
 * To optimize vector extractions, we record the individual channels
 */
 static agx_instr *
-agx_emit_combine_to(agx_builder *b, agx_index dst,
-                    agx_index s0, agx_index s1, agx_index s2, agx_index s3)
+agx_emit_combine_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
+                    agx_index *srcs)
 {
-   agx_cache_combine(b, dst, s0, s1, s2, s3);
-   return agx_p_combine_to(b, dst, s0, s1, s2, s3);
+   agx_cache_combine(b, dst, nr_srcs, srcs); /* srcs has nr_srcs entries */
+   agx_instr *I = agx_p_combine_to(b, dst, nr_srcs);
+
+   agx_foreach_src(I, s)
+      I->src[s] = srcs[s];
+
+   return I;
+}
+
+static agx_index
+agx_vec4(agx_builder *b, agx_index s0, agx_index s1, agx_index s2, agx_index s3)
+{
+   agx_index channels[4] = { s0, s1, s2, s3 };
+   agx_index vec = agx_temp(b->shader, s0.size); /* sized like channel 0 */
+   agx_emit_combine_to(b, vec, 4, channels);
+   return vec;
+}
+
+static agx_index
+agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
+{
+   agx_index pair[2] = { s0, s1 };
+   agx_index vec = agx_temp(b->shader, s0.size); /* sized like channel 0 */
+   agx_emit_combine_to(b, vec, 2, pair);
+   return vec;
 }

 static void
@@ -197,7 +218,7 @@ agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
 {
   agx_index dests[4] = { agx_null(), agx_null(), agx_null(), agx_null() };
   agx_emit_split(b, dests, vec, n);
-   agx_cache_combine(b, vec, dests[0], dests[1], dests[2], dests[3]);
+   agx_cache_combine(b, vec, n, dests);
 }

 static void
@@ -654,7 +675,7 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
   * If only individual components are accessed, this combine will be dead code
   * eliminated.
   */
-  return agx_emit_combine_to(b, dst, dests[0], dests[1], dests[2], dests[3]);
+  return agx_emit_combine_to(b, dst, 4, dests);
 }

 static agx_index
@@ -926,7 +947,10 @@ agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
-      return agx_emit_combine_to(b, dst, s0, s1, s2, s3);
+   {
+      agx_index idx[] = { s0, s1, s2, s3 };
+      return agx_emit_combine_to(b, dst, 4, idx);
+   }

   case nir_op_vec8:
   case nir_op_vec16:
@@ -1049,7 +1073,7 @@ agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
            agx_mov_to(b, layer32, layer);

            channels[nr - 1] = layer32;
-            coords = agx_p_combine(b, channels[0], channels[1], channels[2], channels[3]);
+            coords = agx_vec4(b, channels[0], channels[1], channels[2], channels[3]);
         } else {
            coords = index;
         }
--- a/src/asahi/compiler/agx_opcodes.py
+++ b/src/asahi/compiler/agx_opcodes.py
@@ -25,11 +25,16 @@ opcodes = {}
 immediates = {}
 enums = {}

+VARIABLE = ~0
+
 class Opcode(object):
-   def __init__(self, name, dests, srcs, imms, is_float, can_eliminate, encoding_16, encoding_32):
+   def __init__(self, name, dests, srcs, imms, is_float, can_eliminate,
+           encoding_16, encoding_32):
      self.name = name
-      self.dests = dests
-      self.srcs = srcs
+      self.dests = dests if dests != VARIABLE else 0
+      self.srcs = srcs if srcs != VARIABLE else 0
+      self.variable_srcs = (srcs == VARIABLE)
+      self.variable_dests = (dests == VARIABLE)
      self.imms = imms
      self.is_float = is_float
      self.can_eliminate = can_eliminate
@@ -57,7 +62,8 @@ class Encoding(object):
      if self.extensible:
         assert(length_long == length_short + (4 if length_short > 8 else 2))

-def op(name, encoding_32, dests = 1, srcs = 0, imms = [], is_float = False, can_eliminate = True, encoding_16 = None):
+def op(name, encoding_32, dests = 1, srcs = 0, imms = [], is_float = False,
+        can_eliminate = True, encoding_16 = None):
   encoding_16 = Encoding(encoding_16) if encoding_16 is not None else None
   encoding_32 = Encoding(encoding_32) if encoding_32 is not None else None

@@ -258,7 +264,7 @@ op("or", _, srcs = 2)
 # Indicates the logical end of the block, before final branches/control flow
 op("p_logical_end", _, dests = 0, srcs = 0, can_eliminate = False)

-op("p_combine", _, srcs = 4)
+op("p_combine", _, srcs = VARIABLE)
 op("p_split", _, srcs = 1, dests = 4)

 # Phis are special-cased in the IR as they (uniquely) can take an unbounded
--- a/src/asahi/compiler/agx_register_allocate.c
+++ b/src/asahi/compiler/agx_register_allocate.c
@@ -47,16 +47,7 @@ agx_write_registers(agx_instr *I, unsigned d)
   case AGX_OPCODE_LDCF:
      return 6;
   case AGX_OPCODE_P_COMBINE:
-   {
-      unsigned components = 0;
-
-      for (unsigned i = 0; i < 4; ++i) {
-         if (!agx_is_null(I->src[i]))
-            components = i + 1;
-      }
-
-      return components * size;
-   }
+      return I->nr_srcs * size;
   default:
      return size;
   }
@@ -325,11 +316,11 @@ agx_ra(agx_context *ctx)
         unsigned base = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
         unsigned width = agx_size_align_16(ins->dest[0].size);

-         struct agx_copy copies[4];
+         struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
         unsigned n = 0;

         /* Move the sources */
-         for (unsigned i = 0; i < 4; ++i) {
+         agx_foreach_src(ins, i) {
            if (agx_is_null(ins->src[i])) continue;
            assert(ins->src[i].size == ins->dest[0].size);

--- a/src/asahi/compiler/test/test-optimizer.cpp
+++ b/src/asahi/compiler/test/test-optimizer.cpp
@@ -105,7 +105,13 @@ TEST_F(Optimizer, Copyprop)

 TEST_F(Optimizer, InlineHazards)
 {
-   NEGCASE(agx_p_combine_to(b, wx, agx_mov_imm(b, AGX_SIZE_32, 0), wy, wz, wz));
+   NEGCASE({
+         agx_instr *I = agx_p_combine_to(b, wx, 4); /* 4 srcs, set below */
+         I->src[0] = agx_mov_imm(b, AGX_SIZE_32, 0);
+         I->src[1] = wy;
+         I->src[2] = wz;
+         I->src[3] = wz;
+   });
 }

 TEST_F(Optimizer, CopypropRespectsAbsNeg)