asahi: Arrange VS varyings in the correct order

The GPU ABI requires varyings to be grouped as follows:

- Position
- Smooth shaded fp32
- Flat shaded fp32
- Linear shaded fp32
- Smooth shaded fp16
- Flat shaded fp16
- Linear shaded fp16
- Point size

Use the flat shaded mask info we now have in the vertex shader key to sort
things properly, and pass the counts to the hardware.

FP16 is still TODO.

Signed-off-by: Asahi Lina <lina@asahilina.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23998>
2023-06-28 19:15:28 +09:00
parent 2055e03243
commit 1140bdb783
3 changed files with 58 additions and 6 deletions
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2018,7 +2018,8 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)

 /* ABI: position first, then user, then psiz */
 static void
-agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
+agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings,
+                      struct agx_shader_key *key)
 {
   unsigned base = 0;

@@ -2033,16 +2034,47 @@ agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
   varyings->slots[VARYING_SLOT_POS] = base;
   base += 4;

-   u_foreach_bit64(loc, nir->info.outputs_written) {
+   assert(!(key->vs.outputs_flat_shaded & key->vs.outputs_linear_shaded));
+
+   /* Smooth 32-bit user bindings go next */
+   u_foreach_bit64(loc, nir->info.outputs_written &
+                           ~key->vs.outputs_flat_shaded &
+                           ~key->vs.outputs_linear_shaded) {
      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
         continue;

      varyings->slots[loc] = base;
      base += 4;
+      varyings->num_32_smooth += 4;
+   }
+
+   /* Flat 32-bit user bindings go next */
+   u_foreach_bit64(loc,
+                   nir->info.outputs_written & key->vs.outputs_flat_shaded) {
+      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
+         continue;
+
+      varyings->slots[loc] = base;
+      base += 4;
+      varyings->num_32_flat += 4;
+   }
+
+   /* Linear 32-bit user bindings go next */
+   u_foreach_bit64(loc,
+                   nir->info.outputs_written & key->vs.outputs_linear_shaded) {
+      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
+         continue;
+
+      varyings->slots[loc] = base;
+      base += 4;
+      varyings->num_32_linear += 4;
   }

   /* TODO: Link FP16 varyings */
   varyings->base_index_fp16 = base;
+   varyings->num_16_smooth = 0;
+   varyings->num_16_flat = 0;
+   varyings->num_16_linear = 0;

   if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
      varyings->slots[VARYING_SLOT_PSIZ] = base;
@@ -2471,7 +2503,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,

   /* Must be last since NIR passes can remap driver_location freely */
   if (nir->info.stage == MESA_SHADER_VERTEX)
-      agx_remap_varyings_vs(nir, &out->varyings.vs);
+      agx_remap_varyings_vs(nir, &out->varyings.vs, key);

   if (agx_should_dump(nir, AGX_DBG_SHADERS))
      nir_print_shader(nir, stdout);
--- a/src/asahi/compiler/agx_compile.h
+++ b/src/asahi/compiler/agx_compile.h
@@ -10,6 +10,17 @@
 #include "util/u_dynarray.h"

 struct agx_varyings_vs {
+   /* The number of user varyings of each type. The varyings must be allocated
+    * in this order ({smooth, flat, linear} × {32, 16}), which may require
+    * remapping.
+    */
+   unsigned num_32_smooth;
+   unsigned num_32_flat;
+   unsigned num_32_linear;
+   unsigned num_16_smooth;
+   unsigned num_16_flat;
+   unsigned num_16_linear;
+
   /* The first index used for FP16 varyings. Indices less than this are treated
    * as FP32. This may require remapping slots to guarantee.
    */
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -2212,7 +2212,6 @@ agx_batch_init_state(struct agx_batch *batch)
   struct agx_ppp_update ppp =
      agx_new_ppp_update(&batch->pool, (struct AGX_PPP_HEADER){
                                          .w_clamp = true,
-                                          .varying_counts_16 = true,
                                          .cull_2 = true,
                                          .occlusion_query_2 = true,
                                          .output_unknown = true,
@@ -2221,7 +2220,6 @@ agx_batch_init_state(struct agx_batch *batch)

   /* clang-format off */
   agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
-   agx_ppp_push(&ppp, VARYING_COUNTS, cfg);
   agx_ppp_push(&ppp, CULL_2, cfg);
   agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
   agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
@@ -2416,6 +2414,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
      .fragment_back_stencil = IS_DIRTY(ZS),
      .output_select = IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG),
      .varying_counts_32 = IS_DIRTY(VS_PROG),
+      .varying_counts_16 = IS_DIRTY(VS_PROG),
      .cull = IS_DIRTY(RS),
      .fragment_shader =
         IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
@@ -2506,9 +2505,19 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
      }
   }

+   assert(dirty.varying_counts_32 == dirty.varying_counts_16);
+
   if (dirty.varying_counts_32) {
      agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
-         cfg.smooth = agx_num_general_outputs(&ctx->vs->info.varyings.vs);
+         cfg.smooth = vs->info.varyings.vs.num_32_smooth;
+         cfg.flat = vs->info.varyings.vs.num_32_flat;
+         cfg.linear = vs->info.varyings.vs.num_32_linear;
+      }
+
+      agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
+         cfg.smooth = vs->info.varyings.vs.num_16_smooth;
+         cfg.flat = vs->info.varyings.vs.num_16_flat;
+         cfg.linear = vs->info.varyings.vs.num_16_linear;
      }
   }