anv: use a simpler MUE layout for fast linked libraries

The compaction introduced in a252123363 ("intel/compiler/mesh: compactify MUE layout")
is not suitable for the case where graphics pipeline libraries are fast
linked, as the fragment shader won't receive the mue_map to know where
to locate its inputs.
For that case, keep doing what we did before and lay things down in the
order varyings are defined, which is also how it works for the non-mesh
case.

Fixes dEQP-VK.fragment_shading_rate.*fast_linked_library*.ms

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25047>
Author: Iván Briano
Date: 2023-08-23 11:09:01 -07:00
Committed by: Marge Bot
Parent: bcde58ea86
Commit: b200e5765c
3 changed files with 137 additions and 53 deletions
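
The "simpler" layout in the title is essentially the pre-a252123363 behaviour: every output gets a vec4-aligned slice right after the header, in increasing varying-location order, so a fragment shader compiled later (without access to the brw_mue_map) can recompute the same offsets from the output mask alone. Below is a minimal, self-contained sketch of that idea; the names (simple_layout, attr_slice, dwords_per_location) are invented for illustration and are not part of the patch.

#include <stdint.h>

#define ALIGN4(x) (((x) + 3u) & ~3u)

struct attr_slice {
   int start_dw;   /* offset of the output inside the entry, in dwords */
   int size_dw;    /* dwords the output actually occupies */
};

/* Lay outputs down in increasing location order, one vec4-aligned slice
 * each, starting right after a fixed-size header.  Producer and consumer
 * only need the same 64-bit output mask to agree on every offset. */
static unsigned
simple_layout(uint64_t outputs_written, unsigned header_size_dw,
              const uint8_t dwords_per_location[64],
              struct attr_slice slices[64])
{
   unsigned start_dw = header_size_dw;

   for (int location = 0; location < 64; location++) {
      if (!(outputs_written & (UINT64_C(1) << location)))
         continue;

      slices[location].start_dw = start_dw;
      slices[location].size_dw = dwords_per_location[location];

      start_dw += ALIGN4(dwords_per_location[location]);
   }

   return start_dw;   /* header plus per-output data footprint, in dwords */
}

The compacted layout, by contrast, reorders outputs and can move small user data into spare header dwords, so its offsets exist only in the map produced by the mesh compile.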

src/intel/compiler/brw_compiler.h

@@ -426,6 +426,9 @@ struct brw_task_prog_key
 struct brw_mesh_prog_key
 {
    struct brw_base_prog_key base;
+
+   bool compact_mue:1;
+   unsigned padding:31;
 };
 
 enum brw_sf_primitive {
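
Program keys like this one are hashed and compared when looking up previously compiled shaders, which is why the new flag comes with an explicit padding bit-field (and why populate_mesh_prog_key clears the whole key before filling it in). A hedged sketch of a compile-time check one could write, assuming a C11 compiler and a typical ABI where both bit-fields share one 32-bit unit; the struct and assert below are illustrative, not part of the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Stand-in for the fields added to brw_mesh_prog_key. */
struct example_mesh_key_bits {
   bool compact_mue:1;
   unsigned padding:31;
};

/* On common ABIs the 1-bit flag and the 31 bits of padding pack into a
 * single dword, so adding the flag leaves no uninitialized holes that
 * could perturb key hashing or comparison. */
static_assert(sizeof(struct example_mesh_key_bits) == sizeof(uint32_t),
              "flag and padding expected to pack into one 32-bit unit");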

src/intel/compiler/brw_mesh.cpp

@@ -763,7 +763,7 @@ brw_compute_mue_layout(const struct brw_compiler *compiler,
 static void
 brw_compute_mue_map(const struct brw_compiler *compiler,
                     struct nir_shader *nir, struct brw_mue_map *map,
-                    enum brw_mesh_index_format index_format)
+                    enum brw_mesh_index_format index_format, bool compact_mue)
 {
    memset(map, 0, sizeof(*map));
@@ -823,21 +823,17 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
          ~(per_primitive_header_bits | per_vertex_header_bits);
 
    /* packing into prim header is possible only if prim header is present */
-   map->user_data_in_primitive_header =
+   map->user_data_in_primitive_header = compact_mue &&
          (outputs_written & per_primitive_header_bits) != 0;
 
    /* Packing into vert header is always possible, but we allow it only
    * if full vec4 is available (so point size is not used) and there's
    * nothing between it and normal vertex data (so no clip distances).
    */
-   map->user_data_in_vertex_header =
+   map->user_data_in_vertex_header = compact_mue &&
          (outputs_written & per_vertex_header_bits) ==
                BITFIELD64_BIT(VARYING_SLOT_POS);
 
-   brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
-                          &map->user_data_in_primitive_header,
-                          &map->user_data_in_vertex_header);
-
    if (outputs_written & per_primitive_header_bits) {
       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
          map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
@@ -871,39 +867,80 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
    map->per_primitive_data_size_dw = 0;
 
-   unsigned start_dw = map->per_primitive_start_dw;
-   if (map->user_data_in_primitive_header)
-      start_dw += 4; /* first 4 dwords are used */
-   else
-      start_dw += map->per_primitive_header_size_dw;
-   unsigned header_used_dw = 0;
-
-   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
-      int location = (*it).location;
-      if (location < 0) {
-         start_dw += (*it).dwords;
-         if (map->user_data_in_primitive_header && header_used_dw < 4)
-            header_used_dw += (*it).dwords;
-         else
-            map->per_primitive_data_size_dw += (*it).dwords;
-         assert(header_used_dw <= 4);
-         continue;
-      }
-
-      assert(map->start_dw[location] == -1);
-
-      assert(location == VARYING_SLOT_PRIMITIVE_ID ||
-             location >= VARYING_SLOT_VAR0);
-
-      brw_mue_assign_position(&*it, map, start_dw);
-
-      start_dw += (*it).dwords;
-      if (map->user_data_in_primitive_header && header_used_dw < 4)
-         header_used_dw += (*it).dwords;
-      else
-         map->per_primitive_data_size_dw += (*it).dwords;
-      assert(header_used_dw <= 4);
-      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
+                             &map->user_data_in_primitive_header,
+                             &map->user_data_in_vertex_header);
+
+      unsigned start_dw = map->per_primitive_start_dw;
+      if (map->user_data_in_primitive_header)
+         start_dw += 4; /* first 4 dwords are used */
+      else
+         start_dw += map->per_primitive_header_size_dw;
+      unsigned header_used_dw = 0;
+
+      for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
+         int location = (*it).location;
+         if (location < 0) {
+            start_dw += (*it).dwords;
+            if (map->user_data_in_primitive_header && header_used_dw < 4)
+               header_used_dw += (*it).dwords;
+            else
+               map->per_primitive_data_size_dw += (*it).dwords;
+            assert(header_used_dw <= 4);
+            continue;
+         }
+
+         assert(map->start_dw[location] == -1);
+
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
+
+         brw_mue_assign_position(&*it, map, start_dw);
+
+         start_dw += (*it).dwords;
+         if (map->user_data_in_primitive_header && header_used_dw < 4)
+            header_used_dw += (*it).dwords;
+         else
+            map->per_primitive_data_size_dw += (*it).dwords;
+         assert(header_used_dw <= 4);
+         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+      }
+   } else {
+      unsigned start_dw = map->per_primitive_start_dw +
+                          map->per_primitive_header_size_dw;
+
+      uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
+      while (per_prim_outputs) {
+         uint64_t location = ffsl(per_prim_outputs) - 1;
+
+         assert(map->start_dw[location] == -1);
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
+
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type = brw_nir_get_var_type(nir, var);
+         d.dwords = glsl_count_dword_slots(d.type, false);
+         d.slots = glsl_count_attribute_slots(d.type, false);
+
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
+      }
    }
 
    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
@@ -951,15 +988,40 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
    map->per_vertex_data_size_dw = 0;
 
-   start_dw = map->per_vertex_start_dw;
-   if (!map->user_data_in_vertex_header)
-      start_dw += map->per_vertex_header_size_dw;
-
-   header_used_dw = 0;
-   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
-      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
-         int location = (*it).location;
-         if (location < 0) {
-            start_dw += (*it).dwords;
-            if (map->user_data_in_vertex_header && header_used_dw < 4) {
-               header_used_dw += (*it).dwords;
-               assert(header_used_dw <= 4);
-               if (header_used_dw == 4)
-                  start_dw += 4; /* jump over gl_position */
-            } else {
-               map->per_vertex_data_size_dw += (*it).dwords;
-            }
-            continue;
-         }
-
-         assert(map->start_dw[location] == -1);
-         assert(location >= VARYING_SLOT_VAR0);
-
-         brw_mue_assign_position(&*it, map, start_dw);
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      unsigned start_dw = map->per_vertex_start_dw;
+      if (!map->user_data_in_vertex_header)
+         start_dw += map->per_vertex_header_size_dw;
+
+      unsigned header_used_dw = 0;
+      for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
+         for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
+            int location = (*it).location;
+            if (location < 0) {
+               start_dw += (*it).dwords;
+               if (map->user_data_in_vertex_header && header_used_dw < 4) {
+                  header_used_dw += (*it).dwords;
+                  assert(header_used_dw <= 4);
+                  if (header_used_dw == 4)
+                     start_dw += 4; /* jump over gl_position */
+               } else {
+                  map->per_vertex_data_size_dw += (*it).dwords;
+               }
+               continue;
+            }
+
+            assert(map->start_dw[location] == -1);
+            assert(location >= VARYING_SLOT_VAR0);
+
+            brw_mue_assign_position(&*it, map, start_dw);
@@ -969,25 +1031,36 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
-         start_dw += (*it).dwords;
-         if (map->user_data_in_vertex_header && header_used_dw < 4) {
-            header_used_dw += (*it).dwords;
-            assert(header_used_dw <= 4);
-            if (header_used_dw == 4)
-               start_dw += 4; /* jump over gl_position */
-         } else {
-            map->per_vertex_data_size_dw += (*it).dwords;
-         }
-         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+            start_dw += (*it).dwords;
+            if (map->user_data_in_vertex_header && header_used_dw < 4) {
+               header_used_dw += (*it).dwords;
+               assert(header_used_dw <= 4);
+               if (header_used_dw == 4)
+                  start_dw += 4; /* jump over gl_position */
+            } else {
+               map->per_vertex_data_size_dw += (*it).dwords;
+            }
+            outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         }
       }
-   }
+   } else {
+      unsigned start_dw = map->per_vertex_start_dw +
+                          map->per_vertex_header_size_dw;
+
+      uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
+      while (per_vertex_outputs) {
+         uint64_t location = ffsl(per_vertex_outputs) - 1;
+
+         assert(map->start_dw[location] == -1);
+         assert(location >= VARYING_SLOT_VAR0);
+
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type = brw_nir_get_var_type(nir, var);
+         d.dwords = glsl_count_dword_slots(d.type, false);
+         d.slots = glsl_count_attribute_slots(d.type, false);
+
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
+      }
+   }
@@ -1435,7 +1508,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
    brw_nir_lower_tue_inputs(nir, params->tue_map);
 
-   brw_compute_mue_map(compiler, nir, &prog_data->map, prog_data->index_format);
+   brw_compute_mue_map(compiler, nir, &prog_data->map,
+                       prog_data->index_format, key->compact_mue);
    brw_nir_lower_mue_outputs(nir, &prog_data->map);
 
    brw_simd_selection_state simd_state{
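
A concrete, purely hypothetical example of why the two paths diverge: take a mesh shader whose per-vertex outputs are VAR0 (a vec4, 4 dwords) and VAR1 (a float, 1 dword) behind an 8-dword vertex header. The numbers below are illustrative only and not taken from the patch.

/* Simple layout (compact_mue == false): the offsets follow from the output
 * list alone, so a fragment shader compiled later can recompute them. */
enum {
   HEADER_DW          = 8,
   VAR0_START_DW      = HEADER_DW,       /* dwords 8..11 */
   VAR1_START_DW      = HEADER_DW + 4,   /* dword 12, VAR0 padded to a vec4 */
   PER_VERTEX_DATA_DW = 4 + 4,           /* VAR1 padded to a vec4 as well */
};

/* Compact layout (compact_mue == true): brw_compute_mue_layout() may reorder
 * VAR0/VAR1 (e.g. flat vs. smooth interpolation) and user_data_in_vertex_header
 * may move VAR1 into a spare header dword next to gl_Position, so the final
 * offsets live only in the brw_mue_map produced by this mesh compile. */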

src/intel/vulkan/anv_pipeline.c

@@ -554,11 +554,14 @@ populate_task_prog_key(struct anv_pipeline_stage *stage,
 
 static void
 populate_mesh_prog_key(struct anv_pipeline_stage *stage,
-                       const struct anv_device *device)
+                       const struct anv_device *device,
+                       bool compact_mue)
 {
    memset(&stage->key, 0, sizeof(stage->key));
 
    populate_base_prog_key(stage, device);
+
+   stage->key.mesh.compact_mue = compact_mue;
 }
 
 static uint32_t
@@ -1737,9 +1740,13 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
       populate_task_prog_key(&stages[s], device);
       break;
 
-   case MESA_SHADER_MESH:
-      populate_mesh_prog_key(&stages[s], device);
+   case MESA_SHADER_MESH: {
+      const bool compact_mue =
+         !(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
+           !anv_pipeline_base_has_stage(pipeline, MESA_SHADER_FRAGMENT));
+      populate_mesh_prog_key(&stages[s], device, compact_mue);
       break;
+   }
 
    default:
       unreachable("Invalid graphics shader stage");
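
The hunk above is where anv decides which layout to use: a pipeline library built without its fragment stage is the fast-link case, since the eventual fragment shader will be compiled elsewhere, without this pipeline's MUE map. Reduced to plain booleans, the policy looks like the sketch below; the helper name is invented for illustration, the real code inlines the expression.

/* Compact only when the fragment shader is guaranteed to be compiled with
 * knowledge of this mesh shader's MUE map. */
static bool
want_compact_mue(bool is_graphics_lib, bool has_fragment_stage)
{
   /* A GPL library missing its fragment stage may be fast-linked later, so
    * fall back to the simple varying-order layout in that case. */
   return !(is_graphics_lib && !has_fragment_stage);
}

So a complete pipeline, or a library that already contains its fragment shader, keeps the compacted MUE layout.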