intel/compiler/mesh: compactify MUE layout

Instead of using 4 dwords for each output slot, use only the amount
of memory actually needed by each variable.

There are some complications from this "obvious" idea:
- flat and non-flat variables can't be merged into the same vec4 slot,
  because the flat-inputs mask operates at a vec4 stride
- multi-slot variables can have different layout:
   float[N] requires N 1-dword slots, but
   i64vec3 requires 1 fully occupied 4-dword slot followed by 2-dword slot
- some output variables occur both in single-channel/component split
  and combined variants
- crossing vec4 boundary requires generating more writes, so avoiding them
  if possible is beneficial

This patch fixes some issues with arrays in per-vertex and per-primitive data
(func.mesh.ext.outputs.*.indirect_array.q0 in crucible)
and, by reducing the size of a single MUE, it allows more threads to be
spawned at the same time.

Note: this patch doesn't improve vk_meshlet_cadscene performance because
default layout is already optimal enough.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
This commit is contained in:
Marcin Ślusarz
2022-12-21 15:40:07 +01:00
committed by Marge Bot
parent fb765a65c8
commit a252123363
8 changed files with 478 additions and 118 deletions

View File

@@ -1022,6 +1022,7 @@ struct brw_wm_prog_data {
* For varying slots that are not used by the FS, the value is -1.
*/
int urb_setup[VARYING_SLOT_MAX];
int urb_setup_channel[VARYING_SLOT_MAX];
/**
* Cache structure into the urb_setup array above that contains the
@@ -1625,6 +1626,7 @@ struct brw_tue_map {
struct brw_mue_map {
int32_t start_dw[VARYING_SLOT_MAX];
uint32_t len_dw[VARYING_SLOT_MAX];
uint32_t per_primitive_indices_dw;
uint32_t size_dw;

View File

@@ -1764,10 +1764,10 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
const nir_shader *nir,
const struct brw_mue_map *mue_map)
{
memset(prog_data->urb_setup, -1,
sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
int urb_next = 0;
int urb_next = 0; /* in vec4s */
const uint64_t inputs_read =
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
@@ -1782,6 +1782,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
uint64_t per_prim_inputs_read =
nir->info.inputs_read & nir->info.per_primitive_inputs;
unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
/* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
* are always at the beginning, because they come from MUE
* Primitive Header, not Per-Primitive Attributes.
@@ -1789,8 +1792,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
VARYING_BIT_LAYER |
VARYING_BIT_PRIMITIVE_SHADING_RATE;
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
if (per_prim_inputs_read & primitive_header_bits) {
if (reads_header) {
/* Primitive Shading Rate, Layer and Viewport live in the same
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
* is dword 2).
@@ -1804,23 +1808,30 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
/* 3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputRead[Offset|Length]
* are in full GRFs (8 dwords) and MUE Primitive Header is 8 dwords,
* so next per-primitive attribute must be placed in slot 2 (each slot
* is 4 dwords long).
*/
urb_next = 2;
per_prim_inputs_read &= ~primitive_header_bits;
} else {
/* If fs doesn't need primitive header, then it won't be made
* available through SBE_MESH, so we have to skip them when
* calculating offset from start of per-prim data.
*/
per_prim_start_dw += mue_map->per_primitive_header_size_dw;
per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
}
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
u_foreach_bit64(i, per_prim_inputs_read) {
int start = mue_map->start_dw[i];
assert(start >= 0);
assert(mue_map->len_dw[i] > 0);
assert(unsigned(start) >= per_prim_start_dw);
unsigned pos_dw = unsigned(start) - per_prim_start_dw;
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
prog_data->urb_setup_channel[i] = pos_dw % 4;
}
/* The actual setup attributes later must be aligned to a full GRF. */
urb_next = ALIGN(urb_next, 2);
urb_next = per_prim_size_dw / 4;
prog_data->num_per_primitive_inputs = urb_next;
}
@@ -1835,21 +1846,43 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
unique_fs_attrs &= ~clip_dist_bits;
}
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
/* Per-Vertex header is never available to fragment shader. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
/* In Mesh, CLIP_DIST slots are always at the beginning, because
* they come from MUE Vertex Header, not Per-Vertex Attributes.
*/
if (inputs_read & clip_dist_bits) {
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
} else if (mue_map->per_vertex_header_size_dw > 8) {
/* Clip distances are in MUE, but we are not reading them in FS. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
}
/* Per-Vertex attributes are laid out in order. Because we always link
 * Mesh and Fragment shaders, the slots written and read by each of
 * them will match. */
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
if (unique_fs_attrs & BITFIELD64_BIT(i))
prog_data->urb_setup[i] = urb_next++;
u_foreach_bit64(i, unique_fs_attrs) {
int start = mue_map->start_dw[i];
assert(start >= 0);
assert(mue_map->len_dw[i] > 0);
assert(unsigned(start) >= per_vertex_start_dw);
unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
prog_data->urb_setup_channel[i] = pos_dw % 4;
}
urb_next += per_vertex_size_dw / 4;
} else if (devinfo->ver >= 6) {
uint64_t vue_header_bits =
VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;

View File

@@ -438,7 +438,7 @@ public:
fs_reg get_timestamp(const brw::fs_builder &bld);
fs_reg interp_reg(int location, int channel);
fs_reg per_primitive_reg(int location);
fs_reg per_primitive_reg(int location, unsigned comp);
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
virtual void dump_instructions_to_file(FILE *file) const;

View File

@@ -3489,7 +3489,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(component(per_primitive_reg(base), comp + i), dest.type));
retype(per_primitive_reg(base, comp + i), dest.type));
}
} else {
for (unsigned int i = 0; i < num_components; i++) {

View File

@@ -126,6 +126,7 @@ fs_visitor::interp_reg(int location, int channel)
assert(prog_data->urb_setup[location] >= 0);
unsigned nr = prog_data->urb_setup[location];
channel += prog_data->urb_setup_channel[location];
/* Adjust so we start counting from the first per_vertex input. */
assert(nr >= prog_data->num_per_primitive_inputs);
@@ -142,19 +143,22 @@ fs_visitor::interp_reg(int location, int channel)
* generate_code() time.
*/
fs_reg
fs_visitor::per_primitive_reg(int location)
fs_visitor::per_primitive_reg(int location, unsigned comp)
{
assert(stage == MESA_SHADER_FRAGMENT);
assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
comp += prog_data->urb_setup_channel[location];
assert(prog_data->urb_setup[location] >= 0);
const unsigned regnr = prog_data->urb_setup[location];
const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
assert(regnr < prog_data->num_per_primitive_inputs);
return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
}
/** Emits the interpolation for the varying inputs. */

View File

@@ -21,6 +21,8 @@
* IN THE SOFTWARE.
*/
#include <list>
#include <vector>
#include "brw_compiler.h"
#include "brw_fs.h"
#include "brw_nir.h"
@@ -414,6 +416,224 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
nir_address_format_32bit_offset);
}
/* Attribute types. Flat attributes have to be a separate class because
* flat and interpolated attributes can't share the same vec4 slot
* (see 3DSTATE_SBE.ConstantInterpolationEnable).
*/
enum {
PRIM, /* per primitive */
VERT, /* per vertex interpolated */
VERT_FLAT, /* per vertex flat */
};
struct attr_desc {
int location;
const struct glsl_type *type;
unsigned dwords;
unsigned slots;
};
struct attr_type_info {
/* order of attributes, negative values are holes */
std::list<struct attr_desc> *order;
/* attributes after which there's hole of size equal to array index */
std::list<int> holes[4];
};
/* Record the MUE position of one attribute: fill in start_dw/len_dw for
 * each varying slot the attribute covers, starting at start_dw.
 *
 * Arrays use equally sized slots (dwords evenly divided across slots);
 * non-array multi-slot variables fill full vec4 (4-dword) slots with a
 * possibly shorter tail slot.
 */
static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
{
   const bool arrayed = glsl_type_is_array(attr->type);
   const int base_loc = attr->location;
   unsigned dwords_left = attr->dwords;
   unsigned offset_dw = start_dw;

   for (unsigned s = 0; s < attr->slots; ++s) {
      unsigned slot_dw;

      if (arrayed) {
         /* Every array slot is the same size. */
         assert(attr->dwords % attr->slots == 0);
         slot_dw = attr->dwords / attr->slots;
      } else {
         /* Full vec4 slots first, then whatever remains. */
         slot_dw = MIN2(dwords_left, 4);
      }

      map->start_dw[base_loc + s] = offset_dw;
      map->len_dw[base_loc + s] = slot_dw;

      offset_dw += slot_dw;
      dwords_left -= slot_dw;
   }
}
/* Return the widest (most dwords) shader variable at the given location,
 * or NULL if none exists. Outputs may appear both as split (per-component)
 * and combined variables at the same location; the widest one describes
 * the full storage the location needs.
 */
static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
{
   nir_variable *widest = NULL;
   unsigned widest_dwords = 0;

   nir_foreach_variable_with_modes(var, shader, mode) {
      if (var->data.location != location)
         continue;

      const unsigned dwords = glsl_count_dword_slots(var->type, false);
      if (dwords > widest_dwords) {
         widest = var;
         widest_dwords = dwords;
      }
   }

   return widest;
}
/* Finds order of outputs which require minimum size, without splitting
 * of URB read/write messages (which operate on vec4-aligned memory).
 *
 * Greedy first-fit packing: attributes whose size is not a multiple of
 * 4 dwords leave a "hole" to the next vec4 boundary; later attributes
 * small enough to fit are placed into the smallest available hole
 * instead of growing the layout. Three independent layouts are built
 * (per-primitive, per-vertex interpolated, per-vertex flat) because
 * those classes can't share vec4 slots.
 */
static void
brw_compute_mue_layout(std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir)
{
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];

   /* BRW_MESH_COMPACTION=false disables hole reuse: every attribute is
    * padded to a vec4 boundary (the old, sparser layout). */
   bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

   u_foreach_bit64(location, outputs_written) {
      /* Multi-slot variables clear the bits of their trailing slots from
       * outputs_written below; skip locations already consumed by an
       * earlier variable. */
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;

      /* At this point there are both complete and split variables as
       * outputs. We need the complete variable to compute the required
       * size.
       */
      nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);

      struct attr_desc d;
      d.location = location;
      d.type = brw_nir_get_var_type(nir, var);
      d.dwords = glsl_count_dword_slots(d.type, false);
      d.slots = glsl_count_attribute_slots(d.type, false);

      /* Pick the layout class; flat and interpolated per-vertex
       * attributes must not share a vec4 slot. */
      struct attr_type_info *type_data;

      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
         type_data = &data[PRIM];
      else if (var->data.interpolation == INTERP_MODE_FLAT)
         type_data = &data[VERT_FLAT];
      else
         type_data = &data[VERT];

      std::list<struct attr_desc> *order = type_data->order;
      std::list<int> *holes = type_data->holes;

      /* Mark all slots of this variable as consumed. */
      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

      /* Attributes ending on a vec4 boundary leave no hole. */
      int mod = d.dwords % 4;
      if (mod == 0) {
         order->push_back(d);
         continue;
      }

      /* Describe the padding to the next vec4 boundary as a hole
       * (location < 0 marks a hole in the order list). */
      struct attr_desc h;
      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
      h.slots = 0;

      if (no_compact) {
         order->push_back(d);
         order->push_back(h);
         continue;
      }

      if (d.dwords > 4) {
         /* Too big to fit into any hole; append and remember the new
          * trailing hole for later, smaller attributes. */
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(d.dwords < 4);

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
      for (unsigned sz = d.dwords; sz < 4; sz++) {
         if (!holes[sz].empty()) {
            found = sz;
            break;
         }
      }

      /* append at the end if not found */
      if (found == 0) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(found < 4);
      assert(!holes[found].empty());

      /* holes[found] stores the locations of the attributes the holes
       * follow; take the most recent one. */
      int after_loc = holes[found].back();
      holes[found].pop_back();

      bool inserted_back = false;

      for (auto it = order->begin(); it != order->end(); ++it) {
         if ((*it).location != after_loc)
            continue;

         /* The hole immediately follows the attribute it was recorded
          * against. */
         ++it;
         /* must be a hole */
         assert((*it).location < 0);
         /* and it must be big enough */
         assert(d.dwords <= (*it).dwords);

         if (d.dwords == (*it).dwords) {
            /* exact size, just replace */
            *it = d;
         } else {
            /* inexact size, shrink hole */
            (*it).dwords -= d.dwords;
            /* and insert new attribute before it */
            order->insert(it, d);

            /* Insert shrunk hole in a spot so that the order of attributes
             * is preserved. The hole now follows the newly inserted
             * attribute, so it is keyed by "location".
             */
            std::list<int> &hole_list = holes[(*it).dwords];
            std::list<int>::iterator insert_before = hole_list.end();
            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
               if ((*it2) >= (int)location) {
                  insert_before = it2;
                  break;
               }
            }
            hole_list.insert(insert_before, location);
         }

         inserted_back = true;
         break;
      }

      assert(inserted_back);
   }
}
/* Mesh URB Entry consists of an initial section
*
* - Primitive Count
@@ -443,8 +663,8 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
{
memset(map, 0, sizeof(*map));
for (int i = 0; i < VARYING_SLOT_MAX; i++)
map->start_dw[i] = -1;
memset(&map->start_dw[0], -1, sizeof(map->start_dw));
memset(&map->len_dw[0], 0, sizeof(map->len_dw));
unsigned vertices_per_primitive =
num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
@@ -454,16 +674,6 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
uint64_t outputs_written = nir->info.outputs_written;
/* Assign initial section. */
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
}
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
}
/* One dword for primitives count then K extra dwords for each primitive. */
switch (index_format) {
case BRW_INDEX_FORMAT_U32:
@@ -479,86 +689,157 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
map->max_primitives + 1, 8);
/* TODO(mesh): Multiview. */
map->per_primitive_header_size_dw =
(nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) |
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0;
/* Assign initial section. */
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
}
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
map->per_primitive_indices_dw * map->max_primitives;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
}
map->per_primitive_data_size_dw = 0;
u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) {
assert(map->start_dw[location] == -1);
const uint64_t per_primitive_header_bits =
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
BITFIELD64_BIT(VARYING_SLOT_LAYER) |
BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
unsigned start;
switch (location) {
case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
start = map->per_primitive_start_dw + 0;
break;
case VARYING_SLOT_LAYER:
start = map->per_primitive_start_dw + 1; /* RTAIndex */
break;
case VARYING_SLOT_VIEWPORT:
start = map->per_primitive_start_dw + 2;
break;
case VARYING_SLOT_CULL_PRIMITIVE:
start = map->per_primitive_start_dw + 3;
break;
default:
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
location >= VARYING_SLOT_VAR0);
start = map->per_primitive_start_dw +
map->per_primitive_header_size_dw +
map->per_primitive_data_size_dw;
map->per_primitive_data_size_dw += 4;
break;
const uint64_t per_vertex_header_bits =
BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
BITFIELD64_BIT(VARYING_SLOT_POS) |
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
std::list<struct attr_desc> orders[3];
uint64_t regular_outputs = outputs_written &
~(per_primitive_header_bits | per_vertex_header_bits);
brw_compute_mue_layout(orders, regular_outputs, nir);
if (outputs_written & per_primitive_header_bits) {
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
map->per_primitive_start_dw + 0;
map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
}
map->start_dw[location] = start;
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
map->start_dw[VARYING_SLOT_LAYER] =
map->per_primitive_start_dw + 1; /* RTAIndex */
map->len_dw[VARYING_SLOT_LAYER] = 1;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
map->start_dw[VARYING_SLOT_VIEWPORT] =
map->per_primitive_start_dw + 2;
map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
map->per_primitive_start_dw + 3;
map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
}
map->per_primitive_header_size_dw = 8;
outputs_written &= ~per_primitive_header_bits;
} else {
map->per_primitive_header_size_dw = 0;
}
map->per_primitive_data_size_dw = 0;
unsigned start_dw = map->per_primitive_start_dw +
map->per_primitive_header_size_dw;
for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
int location = (*it).location;
if (location < 0) {
start_dw += (*it).dwords;
map->per_primitive_data_size_dw += (*it).dwords;
continue;
}
assert(map->start_dw[location] == -1);
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
location >= VARYING_SLOT_VAR0);
brw_mue_assign_position(&*it, map, start_dw);
start_dw += (*it).dwords;
map->per_primitive_data_size_dw += (*it).dwords;
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
}
map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
map->per_primitive_data_size_dw, 8);
map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
map->per_primitive_pitch_dw * map->max_primitives, 8);
map->per_primitive_pitch_dw *
map->max_primitives, 8);
/* TODO(mesh): Multiview. */
unsigned fixed_header_size = 8;
map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
nir->info.clip_distance_array_size +
nir->info.cull_distance_array_size, 8);
map->per_vertex_data_size_dw = 0;
u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) {
assert(map->start_dw[location] == -1);
unsigned start;
switch (location) {
case VARYING_SLOT_PSIZ:
start = map->per_vertex_start_dw + 3;
break;
case VARYING_SLOT_POS:
start = map->per_vertex_start_dw + 4;
break;
case VARYING_SLOT_CLIP_DIST0:
start = map->per_vertex_start_dw + fixed_header_size + 0;
break;
case VARYING_SLOT_CLIP_DIST1:
start = map->per_vertex_start_dw + fixed_header_size + 4;
break;
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_CULL_DIST1:
unreachable("cull distances should be lowered earlier");
break;
default:
assert(location >= VARYING_SLOT_VAR0);
start = map->per_vertex_start_dw +
map->per_vertex_header_size_dw +
map->per_vertex_data_size_dw;
map->per_vertex_data_size_dw += 4;
break;
if (outputs_written & per_vertex_header_bits) {
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
map->len_dw[VARYING_SLOT_PSIZ] = 1;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
map->len_dw[VARYING_SLOT_POS] = 4;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
map->start_dw[VARYING_SLOT_CLIP_DIST0] =
map->per_vertex_start_dw + fixed_header_size + 0;
map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
map->start_dw[VARYING_SLOT_CLIP_DIST1] =
map->per_vertex_start_dw + fixed_header_size + 4;
map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
}
outputs_written &= ~per_vertex_header_bits;
}
/* cull distances should be lowered earlier */
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
map->per_vertex_data_size_dw = 0;
start_dw = map->per_vertex_start_dw +
map->per_vertex_header_size_dw;
for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
int location = (*it).location;
if (location < 0) {
start_dw += (*it).dwords;
map->per_vertex_data_size_dw += (*it).dwords;
continue;
}
assert(map->start_dw[location] == -1);
assert(location >= VARYING_SLOT_VAR0);
brw_mue_assign_position(&*it, map, start_dw);
start_dw += (*it).dwords;
map->per_vertex_data_size_dw += (*it).dwords;
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
}
map->start_dw[location] = start;
}
map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
@@ -571,14 +852,18 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
}
static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
{
fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
map->size_dw, map->max_primitives, map->max_vertices);
fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_COUNT\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]);
fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_INDICES\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]);
fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
map->per_primitive_start_dw,
@@ -589,13 +874,20 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (map->start_dw[i] < 0)
continue;
const unsigned offset = map->start_dw[i];
if (offset >= map->per_primitive_start_dw &&
offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) {
fprintf(fp, " %4d: %s\n", offset,
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH));
}
const unsigned len = map->len_dw[i];
if (offset < map->per_primitive_start_dw ||
offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
continue;
const char *name =
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH);
fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
name, i);
}
fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
@@ -607,13 +899,24 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (map->start_dw[i] < 0)
continue;
const unsigned offset = map->start_dw[i];
if (offset >= map->per_vertex_start_dw &&
offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) {
fprintf(fp, " %4d: %s\n", offset,
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH));
}
const unsigned len = map->len_dw[i];
if (offset < map->per_vertex_start_dw ||
offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
continue;
nir_variable *var =
nir_find_variable_with_location(nir, nir_var_shader_out, i);
bool flat = var->data.interpolation == INTERP_MODE_FLAT;
const char *name =
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH);
fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
name, i, flat ? " (flat)" : "");
}
fprintf(fp, "\n");
@@ -1070,7 +1373,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
brw_print_tue_map(stderr, params->tue_map);
}
fprintf(stderr, "Mesh Output ");
brw_print_mue_map(stderr, &prog_data->map);
brw_print_mue_map(stderr, &prog_data->map, nir);
}
fs_generator g(compiler, &params->base, &prog_data->base.base,

View File

@@ -2085,6 +2085,21 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
return sysval;
}
/* Return the effective GLSL type of a shader variable: interface_type
 * when set, otherwise the variable's own type with the outer array
 * stripped for arrayed I/O or per-view variables.
 */
const struct glsl_type *
brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
{
   /* If the variable carries an interface type, that takes precedence. */
   if (var->interface_type)
      return var->interface_type;

   const struct glsl_type *t = var->type;

   /* Arrayed I/O and per-view variables wrap the real type in an outer
    * array; peel it off. */
   if (nir_is_arrayed_io(var, nir->info.stage) || var->data.per_view) {
      assert(glsl_type_is_array(t));
      t = glsl_get_array_element(t);
   }

   return t;
}
bool
brw_nir_pulls_at_sample(nir_shader *shader)
{

View File

@@ -283,6 +283,9 @@ nir_ssa_def *brw_nir_load_global_const(nir_builder *b,
nir_ssa_def *base_addr,
unsigned off);
const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
nir_variable *var);
#ifdef __cplusplus
}
#endif