agx: Preload vertex/instance ID only at start
This means we don't reserve the registers, which improves RA considerably.
Using a special preload pseudo-op instead of a regular move allows us to
constrain semantics and guarantee coalescing.

shader-db on glmark2 subset:

total instructions in shared programs: 6448 -> 6442 (-0.09%)
instructions in affected programs: 230 -> 224 (-2.61%)
helped: 4
HURT: 0

total bytes in shared programs: 42232 -> 42196 (-0.09%)
bytes in affected programs: 1530 -> 1494 (-2.35%)
helped: 4
HURT: 0

total halfregs in shared programs: 2291 -> 1926 (-15.93%)
halfregs in affected programs: 2185 -> 1820 (-16.70%)
helped: 75
HURT: 0

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18804>
Committed by: Marge Bot
Parent: f665229d77
Commit: c9a96d4615
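For readers skimming the diff, here is a minimal stand-alone sketch (not Mesa code; every name in it is an invented stand-in) of the lazy caching that agx_cached_preload() below implements: the first request for a preloaded value emits one preload at the start of the shader and caches the result, and every later request reuses that cached value instead of reading the fixed register again.

/* Hypothetical illustration only -- the types and helpers are stand-ins
 * for the AGX IR structures, not the real API. */
#include <stdbool.h>
#include <stdio.h>

typedef struct { bool valid; unsigned reg; } value;            /* ~ agx_index */
typedef struct { value vertex_id, instance_id; int emitted; } shader_ctx;

static value
cached_preload(shader_ctx *ctx, value *cache, unsigned reg)
{
   /* Emit the preload only on the first use; later callers get the cached
    * value instead of another copy of the register. */
   if (!cache->valid) {
      ctx->emitted++;                      /* stands in for agx_preload() */
      *cache = (value){ .valid = true, .reg = reg };
   }
   return *cache;
}

int main(void)
{
   shader_ctx ctx = {0};
   cached_preload(&ctx, &ctx.vertex_id, 10);   /* first use: emits preload */
   cached_preload(&ctx, &ctx.vertex_id, 10);   /* second use: cache hit */
   cached_preload(&ctx, &ctx.instance_id, 12); /* distinct value: emits */
   printf("preloads emitted: %d\n", ctx.emitted); /* prints 2 */
   return 0;
}

In the real compiler the cached value is an SSA agx_index whose defining preload RA later coalesces back onto the source register (see the AGX_OPCODE_PRELOAD handling further down).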
@@ -52,6 +52,30 @@ int agx_debug = 0;
       fprintf(stderr, "%s:%d: "fmt, \
               __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
 
+static agx_index
+agx_cached_preload(agx_context *ctx, agx_index *cache, unsigned base, enum agx_size size)
+{
+   if (agx_is_null(*cache)) {
+      agx_block *block = agx_start_block(ctx);
+      agx_builder b = agx_init_builder(ctx, agx_before_block(block));
+      *cache = agx_preload(&b, agx_register(base, size));
+   }
+
+   return *cache;
+}
+
+static agx_index
+agx_vertex_id(agx_builder *b)
+{
+   return agx_cached_preload(b->shader, &b->shader->vertex_id, 10, AGX_SIZE_32);
+}
+
+static agx_index
+agx_instance_id(agx_builder *b)
+{
+   return agx_cached_preload(b->shader, &b->shader->instance_id, 12, AGX_SIZE_32);
+}
+
 static agx_index
 agx_get_cf(agx_context *ctx, bool smooth, bool perspective,
            gl_varying_slot slot, unsigned offset, unsigned count)
@@ -327,13 +351,10 @@ agx_emit_load_attr(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
    agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
    agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
 
-   agx_index vertex_id = agx_register(10, AGX_SIZE_32);
-   agx_index instance_id = agx_register(12, AGX_SIZE_32);
-
    /* A nonzero divisor requires dividing the instance ID. A zero divisor
     * specifies per-instance data. */
-   agx_index element_id = (attrib.divisor == 0) ? vertex_id :
-                          agx_udiv_const(b, instance_id, attrib.divisor);
+   agx_index element_id = (attrib.divisor == 0) ? agx_vertex_id(b) :
+                          agx_udiv_const(b, agx_instance_id(b), attrib.divisor);
 
    agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
 
@@ -683,10 +704,10 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
            AGX_PUSH_TEXTURE_BASE, AGX_SIZE_64, 0, 4));
 
    case nir_intrinsic_load_vertex_id:
-      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));
+      return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
 
    case nir_intrinsic_load_instance_id:
-      return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));
+      return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
 
    case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
    case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
@@ -396,6 +396,11 @@ typedef struct {
     * components, populated by a split. */
    struct hash_table_u64 *allocated_vec;
 
+   /* During instruction selection, preloaded values,
+    * or NULL if it hasn't been preloaded
+    */
+   agx_index vertex_id, instance_id;
+
    /* Stats for shader-db */
    unsigned loop_count;
    unsigned spills;
@@ -456,6 +461,20 @@ agx_vec_for_intr(agx_context *ctx, nir_intrinsic_instr *instr)
    return agx_vec_for_dest(ctx, &instr->dest);
 }
 
+static inline unsigned
+agx_num_predecessors(agx_block *block)
+{
+   return util_dynarray_num_elements(&block->predecessors, agx_block *);
+}
+
+static inline agx_block *
+agx_start_block(agx_context *ctx)
+{
+   agx_block *first = list_first_entry(&ctx->blocks, agx_block, link);
+   assert(agx_num_predecessors(first) == 0);
+   return first;
+}
+
 /* Iterators for AGX IR */
 
 #define agx_foreach_block(ctx, v) \
@@ -650,6 +669,25 @@ agx_after_block_logical(agx_block *block)
    return agx_after_block(block);
 }
 
+
+static inline agx_cursor
+agx_before_nonempty_block(agx_block *block)
+{
+   agx_instr *I = list_first_entry(&block->instructions, agx_instr, link);
+   assert(I != NULL);
+
+   return agx_before_instr(I);
+}
+
+static inline agx_cursor
+agx_before_block(agx_block *block)
+{
+   if (list_is_empty(&block->instructions))
+      return agx_after_block(block);
+   else
+      return agx_before_nonempty_block(block);
+}
+
 /* IR builder in terms of cursor infrastructure */
 
 typedef struct {
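A note on the cursor helpers just added: agx_before_block() is what lets the preload land at the very top of the start block. If the block already has instructions, the cursor points before the first one; if the block happens to be empty, pointing "after" the empty block is equivalent, which is why that case falls back to agx_after_block() instead of dereferencing an empty list.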
@@ -281,3 +281,7 @@ op("split", _, srcs = 1, dests = VARIABLE)
 op("phi", _, srcs = VARIABLE)
 
 op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False)
+
+# Like mov, but takes a register and can only appear at the start. Gauranteed
+# to be coalesced during RA, rather than lowered to a real move.
+op("preload", _, srcs = 1)
@@ -111,10 +111,6 @@ agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps)
    }
 
    BITSET_SET(used_regs, 0); // control flow writes r0l
-   BITSET_SET(used_regs, 5*2); // TODO: precolouring, don't overwrite vertex ID
-   BITSET_SET(used_regs, (5*2 + 1));
-   BITSET_SET(used_regs, (6*2 + 0));
-   BITSET_SET(used_regs, (6*2 + 1));
 
    agx_foreach_instr_in_block(block, I) {
       /* Optimization: if a split contains the last use of a vector, the split
@@ -152,6 +148,21 @@ agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps)
             ssa_to_reg[I->dest[d].value] = reg + offset;
          }
 
          continue;
+      } else if (I->op == AGX_OPCODE_PRELOAD) {
+         /* We must coalesce all preload moves */
+         assert(I->dest[0].type == AGX_INDEX_NORMAL);
+         assert(I->dest[0].size == I->src[0].size);
+         assert(I->src[0].type == AGX_INDEX_REGISTER);
+
+         unsigned base = I->src[0].value;
+
+         for (unsigned i = 0; i < agx_size_align_16(I->src[0].size); ++i) {
+            assert(!BITSET_TEST(used_regs, base + i));
+            BITSET_SET(used_regs, base + i);
+         }
+
+         ssa_to_reg[I->dest[0].value] = base;
+         continue;
       }
 
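A unit note that may help when reading the PRELOAD case above: register indices are tracked in 16-bit halves, so the 32-bit vertex and instance IDs at indices 10 and 12 occupy halfreg pairs 10/11 and 12/13, exactly the slots the removed BITSET_SET(used_regs, 5*2 ...) lines reserved unconditionally. A small stand-alone sketch of that bookkeeping (invented names, plain bit operations rather than Mesa's BITSET macros):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 256 halfregs tracked as a plain bitset. */
static uint64_t used_regs[256 / 64];

static void
reserve(unsigned base, unsigned halfregs)
{
   for (unsigned i = 0; i < halfregs; ++i) {
      unsigned r = base + i;
      assert(!(used_regs[r / 64] & (1ull << (r % 64)))); /* must still be free */
      used_regs[r / 64] |= 1ull << (r % 64);
   }
}

int main(void)
{
   reserve(10, 2); /* 32-bit vertex ID at halfreg 10 -> claims 10 and 11 */
   reserve(12, 2); /* 32-bit instance ID at halfreg 12 -> claims 12 and 13 */
   printf("used_regs[0] = 0x%llx\n", (unsigned long long)used_regs[0]); /* 0x3c00 */
   return 0;
}

With the preload pseudo-op, these halfregs are only claimed in shaders that actually contain a preload, which is where the 15.93% halfreg reduction in the commit message comes from.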
@@ -363,18 +374,29 @@ agx_ra(agx_context *ctx)
       agx_insert_parallel_copies(ctx, block);
    }
 
-   /* Phi nodes can be removed now */
    agx_foreach_instr_global_safe(ctx, I) {
-      if (I->op == AGX_OPCODE_PHI || I->op == AGX_OPCODE_LOGICAL_END)
+      switch (I->op) {
+      /* Pseudoinstructions for RA must be removed now */
+      case AGX_OPCODE_PHI:
+      case AGX_OPCODE_LOGICAL_END:
+      case AGX_OPCODE_PRELOAD:
          agx_remove_instruction(I);
+         break;
 
-      /* Remove identity moves */
-      if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_REGISTER &&
-          I->dest[0].size == I->src[0].size && I->src[0].value == I->dest[0].value) {
+      /* Coalesced moves can be removed */
+      case AGX_OPCODE_MOV:
+         if (I->src[0].type == AGX_INDEX_REGISTER &&
+             I->dest[0].size == I->src[0].size &&
+             I->src[0].value == I->dest[0].value) {
 
          assert(I->dest[0].type == AGX_INDEX_REGISTER);
          agx_remove_instruction(I);
       }
+         break;
+
+      default:
+         break;
+      }
    }
 
    free(ssa_to_reg);
@@ -165,3 +165,11 @@ TEST_F(Optimizer, IntCopypropDoesntConvert)
       agx_xor_to(b, out, cvt, wy);
    });
 }
+
+TEST_F(Optimizer, SkipPreloads)
+{
+   NEGCASE32({
+      agx_index preload = agx_preload(b, agx_register(0, AGX_SIZE_32));
+      agx_xor_to(b, out, preload, wy);
+   });
+}
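The new SkipPreloads test is a negative case: NEGCASE32, as used elsewhere in this test file, appears to assert that running the optimizer leaves the program unchanged. In other words, copy propagation must not look through the preload and substitute the raw register into the xor, since that would reintroduce the fixed-register read the preload pseudo-op exists to avoid.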