From b200e5765cb51bcfefac583f811b3d246b919282 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20Briano?=
Date: Wed, 23 Aug 2023 11:09:01 -0700
Subject: [PATCH] anv: use a simpler MUE layout for fast linked libraries

The compaction introduced in a2521233631 ("intel/compiler/mesh:
compactify MUE layout") is not suitable for the case where graphics
pipeline libraries are fast linked, as the fragment shader won't
receive the mue_map to know where to locate its inputs. For that case,
keep doing what we did before and lay things down in the order varyings
are defined, which is also how it works for the non-mesh case.

Fixes dEQP-VK.fragment_shading_rate.*fast_linked_library*.ms

Reviewed-by: Caio Oliveira
Part-of: 
---
 src/intel/compiler/brw_compiler.h |   3 +
 src/intel/compiler/brw_mesh.cpp   | 174 +++++++++++++++++++++---------
 src/intel/vulkan/anv_pipeline.c   |  13 ++-
 3 files changed, 137 insertions(+), 53 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index b1deca378e8..c39c8f5809a 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -426,6 +426,9 @@ struct brw_task_prog_key
 struct brw_mesh_prog_key
 {
    struct brw_base_prog_key base;
+
+   bool compact_mue:1;
+   unsigned padding:31;
 };
 
 enum brw_sf_primitive {
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index 759a5336487..0c7eacd753a 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -763,7 +763,7 @@ brw_compute_mue_layout(const struct brw_compiler *compiler,
 static void
 brw_compute_mue_map(const struct brw_compiler *compiler,
                     struct nir_shader *nir, struct brw_mue_map *map,
-                    enum brw_mesh_index_format index_format)
+                    enum brw_mesh_index_format index_format, bool compact_mue)
 {
    memset(map, 0, sizeof(*map));
 
@@ -823,21 +823,17 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
       ~(per_primitive_header_bits | per_vertex_header_bits);
 
    /* packing into prim header is possible only if prim header is present */
-   map->user_data_in_primitive_header =
+   map->user_data_in_primitive_header = compact_mue &&
          (outputs_written & per_primitive_header_bits) != 0;
 
    /* Packing into vert header is always possible, but we allow it only
    * if full vec4 is available (so point size is not used) and there's
    * nothing between it and normal vertex data (so no clip distances).
    */
-   map->user_data_in_vertex_header =
+   map->user_data_in_vertex_header = compact_mue &&
          (outputs_written & per_vertex_header_bits) ==
             BITFIELD64_BIT(VARYING_SLOT_POS);
 
-   brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
-                          &map->user_data_in_primitive_header,
-                          &map->user_data_in_vertex_header);
-
    if (outputs_written & per_primitive_header_bits) {
       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
          map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
@@ -871,39 +867,80 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
    map->per_primitive_data_size_dw = 0;
 
-   unsigned start_dw = map->per_primitive_start_dw;
-   if (map->user_data_in_primitive_header)
-      start_dw += 4; /* first 4 dwords are used */
-   else
-      start_dw += map->per_primitive_header_size_dw;
-   unsigned header_used_dw = 0;
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
+                             &map->user_data_in_primitive_header,
+                             &map->user_data_in_vertex_header);
+
+      unsigned start_dw = map->per_primitive_start_dw;
+      if (map->user_data_in_primitive_header)
+         start_dw += 4; /* first 4 dwords are used */
+      else
+         start_dw += map->per_primitive_header_size_dw;
+      unsigned header_used_dw = 0;
+
+      for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
+         int location = (*it).location;
+         if (location < 0) {
+            start_dw += (*it).dwords;
+            if (map->user_data_in_primitive_header && header_used_dw < 4)
+               header_used_dw += (*it).dwords;
+            else
+               map->per_primitive_data_size_dw += (*it).dwords;
+            assert(header_used_dw <= 4);
+            continue;
+         }
+
+         assert(map->start_dw[location] == -1);
+
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
+
+         brw_mue_assign_position(&*it, map, start_dw);
 
-   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
-      int location = (*it).location;
-      if (location < 0) {
          start_dw += (*it).dwords;
          if (map->user_data_in_primitive_header && header_used_dw < 4)
             header_used_dw += (*it).dwords;
          else
             map->per_primitive_data_size_dw += (*it).dwords;
          assert(header_used_dw <= 4);
-         continue;
+         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
       }
+   } else {
+      unsigned start_dw = map->per_primitive_start_dw +
+                          map->per_primitive_header_size_dw;
 
-      assert(map->start_dw[location] == -1);
+      uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
+      while (per_prim_outputs) {
+         uint64_t location = ffsl(per_prim_outputs) - 1;
 
-      assert(location == VARYING_SLOT_PRIMITIVE_ID ||
-             location >= VARYING_SLOT_VAR0);
+         assert(map->start_dw[location] == -1);
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
 
-      brw_mue_assign_position(&*it, map, start_dw);
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type     = brw_nir_get_var_type(nir, var);
+         d.dwords   = glsl_count_dword_slots(d.type, false);
+         d.slots    = glsl_count_attribute_slots(d.type, false);
 
-      start_dw += (*it).dwords;
-      if (map->user_data_in_primitive_header && header_used_dw < 4)
-         header_used_dw += (*it).dwords;
-      else
-         map->per_primitive_data_size_dw += (*it).dwords;
-      assert(header_used_dw <= 4);
-      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
+      }
    }
 
    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
@@ -951,15 +988,40 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
    map->per_vertex_data_size_dw = 0;
 
-   start_dw = map->per_vertex_start_dw;
-   if (!map->user_data_in_vertex_header)
-      start_dw += map->per_vertex_header_size_dw;
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      unsigned start_dw = map->per_vertex_start_dw;
+      if (!map->user_data_in_vertex_header)
+         start_dw += map->per_vertex_header_size_dw;
+
+      unsigned header_used_dw = 0;
+      for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
+         for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
+            int location = (*it).location;
+            if (location < 0) {
+               start_dw += (*it).dwords;
+               if (map->user_data_in_vertex_header && header_used_dw < 4) {
+                  header_used_dw += (*it).dwords;
+                  assert(header_used_dw <= 4);
+                  if (header_used_dw == 4)
+                     start_dw += 4; /* jump over gl_position */
+               } else {
+                  map->per_vertex_data_size_dw += (*it).dwords;
+               }
+               continue;
+            }
+
+            assert(map->start_dw[location] == -1);
+
+            assert(location >= VARYING_SLOT_VAR0);
+
+            brw_mue_assign_position(&*it, map, start_dw);
 
-   header_used_dw = 0;
-   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
-      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
-         int location = (*it).location;
-         if (location < 0) {
             start_dw += (*it).dwords;
             if (map->user_data_in_vertex_header && header_used_dw < 4) {
                header_used_dw += (*it).dwords;
@@ -969,25 +1031,36 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
             } else {
                map->per_vertex_data_size_dw += (*it).dwords;
             }
-            continue;
+            outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
          }
+      }
+   } else {
+      unsigned start_dw = map->per_vertex_start_dw +
+                          map->per_vertex_header_size_dw;
+
+      uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
+      while (per_vertex_outputs) {
+         uint64_t location = ffsl(per_vertex_outputs) - 1;
 
          assert(map->start_dw[location] == -1);
 
-         assert(location >= VARYING_SLOT_VAR0);
 
-         brw_mue_assign_position(&*it, map, start_dw);
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type     = brw_nir_get_var_type(nir, var);
+         d.dwords   = glsl_count_dword_slots(d.type, false);
+         d.slots    = glsl_count_attribute_slots(d.type, false);
 
-         start_dw += (*it).dwords;
-         if (map->user_data_in_vertex_header && header_used_dw < 4) {
-            header_used_dw += (*it).dwords;
-            assert(header_used_dw <= 4);
-            if (header_used_dw == 4)
-               start_dw += 4; /* jump over gl_position */
-         } else {
-            map->per_vertex_data_size_dw += (*it).dwords;
-         }
-         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
       }
    }
 
@@ -1435,7 +1508,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
 
    brw_nir_lower_tue_inputs(nir, params->tue_map);
 
-   brw_compute_mue_map(compiler, nir, &prog_data->map, prog_data->index_format);
+   brw_compute_mue_map(compiler, nir, &prog_data->map,
+                       prog_data->index_format, key->compact_mue);
    brw_nir_lower_mue_outputs(nir, &prog_data->map);
 
    brw_simd_selection_state simd_state{
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 5bbcd6440fe..5e9ea5ed1c7 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -554,11 +554,14 @@ populate_task_prog_key(struct anv_pipeline_stage *stage,
 
 static void
 populate_mesh_prog_key(struct anv_pipeline_stage *stage,
-                       const struct anv_device *device)
+                       const struct anv_device *device,
+                       bool compact_mue)
 {
    memset(&stage->key, 0, sizeof(stage->key));
 
    populate_base_prog_key(stage, device);
+
+   stage->key.mesh.compact_mue = compact_mue;
 }
 
 static uint32_t
@@ -1737,9 +1740,13 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
          populate_task_prog_key(&stages[s], device);
          break;
 
-      case MESA_SHADER_MESH:
-         populate_mesh_prog_key(&stages[s], device);
+      case MESA_SHADER_MESH: {
+         const bool compact_mue =
+            !(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
+              !anv_pipeline_base_has_stage(pipeline, MESA_SHADER_FRAGMENT));
+         populate_mesh_prog_key(&stages[s], device, compact_mue);
          break;
+      }
 
       default:
          unreachable("Invalid graphics shader stage");
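
The layout rule the new non-compacted path follows can be summarized in a
small standalone sketch. This is only an illustration under simplifying
assumptions: it collapses the separate per-primitive and per-vertex sections
into one generic section, and the names simple_mue_layout, layout_simple_mue
and dwords_per_location are hypothetical, not Mesa identifiers. The point it
shows is the one the commit relies on: when every written output gets a
vec4-aligned chunk after the fixed header, in ascending varying-location
order, a fragment shader compiled separately during fast linking can
recompute the same offsets without ever seeing the mesh shader's brw_mue_map.

/* Hypothetical, simplified model of the non-compacted layout; not Mesa code. */
#include <stdint.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

struct simple_mue_layout {
   int start_dw[64];       /* offset of each varying location, -1 if absent */
   unsigned data_size_dw;  /* total user data size in dwords */
};

static void
layout_simple_mue(uint64_t outputs_written,
                  const unsigned dwords_per_location[64],
                  unsigned header_size_dw,
                  struct simple_mue_layout *layout)
{
   for (unsigned i = 0; i < 64; i++)
      layout->start_dw[i] = -1;
   layout->data_size_dw = 0;

   unsigned start_dw = header_size_dw;
   while (outputs_written) {
      /* Lowest location first; GCC/Clang builtin standing in for the
       * patch's ffsl(per_*_outputs) - 1.
       */
      unsigned location = __builtin_ctzll(outputs_written);

      layout->start_dw[location] = start_dw;

      /* Each output takes a vec4-aligned chunk, as in ALIGN(d.dwords, 4). */
      unsigned size = ALIGN(dwords_per_location[location], 4);
      layout->data_size_dw += size;
      start_dw += size;

      outputs_written &= ~(1ull << location);
   }
}

The compacted path (compact_mue = true), by contrast, reorders varyings and
may pack them into gaps in the primitive/vertex headers via
brw_compute_mue_layout(), which is only acceptable when the fragment shader
is compiled together with the mesh shader and receives the resulting map.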