intel/compiler,anv: put some vertex and primitive data in headers

Both per-primitive and per-vertex space are allocated in MUE in 8-dword chunks and those 8-dword chunks (granularity of 3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputReadLength) are passed to fragment shaders as inputs (either non-interpolated for per-primitive and flat vertex attributes or interpolated for non-flat vertex attributes). Some attributes have a special meaning and must be placed in a separate 8/16-dword slot called Primitive Header or Vertex Header. Primitive Header contains 4 such attributes (Cull Primitive, ViewportIndex, RTAIndex, CPS), leaving 4 dwords (the rest of the 8-dword slot) potentially unused. Vertex Header is similar - it starts with 3 unused dwords, 1 dword for Point Size (but if we declare that shader doesn't produce Point Size then we can reuse it), followed by 4 dwords for Position and optionally 8 dwords for clip distances. This means we have an interesting optimization problem - we can put some user attributes into holes in Primitive and Vertex Headers, which may lead to smaller MUE size and potentially more mesh threads running in parallel, but we have to be careful to use those holes only when we need them, otherwise we could force HW to pass too much data to fragment shader. Example 1: Let's assume that Primitive Header is enabled and user defined 12 dwords of per-primitive attributes. Without packing we would consume 8 + ALIGN(12, 8) = 24 dwords of MUE space and pass ALIGN(12, 8) = 16 dwords to fragment shader. With packing, we'll consume 4 + 4 + ALIGN(12 - 4, 8) = 16 dwords of MUE space and pass ALIGN(4, 8) + ALIGN(12 - 4, 8) = 16 dwords to fragment shader. 16/16 is better than 24/16, so packing makes sense. Example 2: Now let's assume that Primitive Header is enabled and user defined 16 dwords of per-primitive attributes. Without packing we would consume 8 + ALIGN(16, 8) = 24 dwords of MUE space and pass ALIGN(16, 8) = 16 dwords to fragment shader. 
With packing, we'll consume 4 + 4 + ALIGN(16 - 4, 8) = 24 dwords of MUE space and pass ALIGN(4, 8) + ALIGN(16 - 4, 8) = 24 dwords to fragment shader. 24/24 is worse than 24/16, so packing doesn't make sense. This change doesn't affect vk_meshlet_cadscene in default configuration, but it speeds it up by up to 25% with "-extraattributes N", where N is some small value divisible by 2 (by default N == 1) and we are bound by URB size. Reviewed-by: Ivan Briano <ivan.briano@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
2022-12-21 15:42:55 +01:00
parent a252123363
commit c1685f08dd
4 changed files with 193 additions and 20 deletions
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -1636,12 +1636,14 @@ struct brw_mue_map {
   uint32_t per_primitive_header_size_dw;
   uint32_t per_primitive_data_size_dw;
   uint32_t per_primitive_pitch_dw;
+   bool user_data_in_primitive_header;

   uint32_t max_vertices;
   uint32_t per_vertex_start_dw;
   uint32_t per_vertex_header_size_dw;
   uint32_t per_vertex_data_size_dw;
   uint32_t per_vertex_pitch_dw;
+   bool user_data_in_vertex_header;
 };

 struct brw_task_prog_data {
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1794,7 +1794,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
                                                VARYING_BIT_PRIMITIVE_SHADING_RATE;
         bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;

-         if (reads_header) {
+         if (reads_header || mue_map->user_data_in_primitive_header) {
            /* Primitive Shading Rate, Layer and Viewport live in the same
             * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
             * is dword 2).
@@ -1849,9 +1849,13 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
      unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
      unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;

-      /* Per-Vertex header is never available to fragment shader. */
-      per_vertex_start_dw += 8;
-      per_vertex_size_dw -= 8;
+      /* Per-Vertex header is available to fragment shader only if there's
+       * user data there.
+       */
+      if (!mue_map->user_data_in_vertex_header) {
+         per_vertex_start_dw += 8;
+         per_vertex_size_dw -= 8;
+      }

      /* In Mesh, CLIP_DIST slots are always at the beginning, because
       * they come from MUE Vertex Header, not Per-Vertex Attributes.
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -438,7 +438,7 @@ struct attr_type_info {
   std::list<struct attr_desc> *order;

   /* attributes after which there's hole of size equal to array index */
-   std::list<int> holes[4];
+   std::list<int> holes[5];
 };

 static void
@@ -490,22 +490,71 @@ brw_nir_find_complete_variable_with_location(nir_shader *shader,
   return best_var;
 }

+static unsigned
+brw_sum_size(const std::list<struct attr_desc> &orders)
+{
+   unsigned sz = 0; /* running total of the dwords fields seen so far */
+   for (auto it = orders.cbegin(); it != orders.cend(); ++it)
+      sz += (*it).dwords; /* every entry is counted; location is not checked */
+   return sz;
+}
+
 /* Finds order of outputs which require minimum size, without splitting
 * of URB read/write messages (which operate on vec4-aligned memory).
 */
 static void
 brw_compute_mue_layout(std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
-                       struct nir_shader *nir)
+                       struct nir_shader *nir,
+                       bool *pack_prim_data_into_header,
+                       bool *pack_vert_data_into_header)
 {
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];
   bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);
+   unsigned header_packing = (unsigned)debug_get_num_option("BRW_MESH_HEADER_PACKING", 3);
+
+   if ((header_packing & 1) == 0)
+      *pack_prim_data_into_header = false;
+   if ((header_packing & 2) == 0)
+      *pack_vert_data_into_header = false;

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

+   /* If packing into header is enabled, add a hole of size 4 and add
+    * a virtual location to keep the algorithm happy (it expects holes
+    * to be preceded by some location). We'll remove those virtual
+    * locations at the end.
+    */
+   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
+   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
+
+   struct attr_desc d;
+   d.location = virtual_header_location;
+   d.type = NULL;
+   d.dwords = 0;
+   d.slots = 0;
+
+   struct attr_desc h;
+   h.location = -1;
+   h.type = NULL;
+   h.dwords = 4;
+   h.slots = 0;
+
+   if (*pack_prim_data_into_header) {
+      orders[PRIM].push_back(d);
+      orders[PRIM].push_back(h);
+      data[PRIM].holes[4].push_back(virtual_header_location);
+   }
+
+   if (*pack_vert_data_into_header) {
+      orders[VERT].push_back(d);
+      orders[VERT].push_back(h);
+      data[VERT].holes[4].push_back(virtual_header_location);
+   }
+
   u_foreach_bit64(location, outputs_written) {
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;
@@ -519,7 +568,6 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
                                                         nir_var_shader_out,
                                                         location);

-      struct attr_desc d;
      d.location = location;
      d.type     = brw_nir_get_var_type(nir, var);
      d.dwords   = glsl_count_dword_slots(d.type, false);
@@ -539,13 +587,26 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,

      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

+      /* special case to use hole of size 4 */
+      if (d.dwords == 4 && !holes[4].empty()) {
+         holes[4].pop_back();
+
+         assert(order->front().location == virtual_header_location);
+         order->pop_front();
+
+         assert(order->front().location == -1);
+         assert(order->front().dwords == 4);
+         order->front() = d;
+
+         continue;
+      }
+
      int mod = d.dwords % 4;
      if (mod == 0) {
         order->push_back(d);
         continue;
      }

-      struct attr_desc h;
      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
@@ -568,7 +629,7 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
-      for (unsigned sz = d.dwords; sz < 4; sz++){
+      for (unsigned sz = d.dwords; sz <= 4; sz++){
         if (!holes[sz].empty()) {
            found = sz;
            break;
@@ -584,7 +645,7 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
         continue;
      }

-      assert(found < 4);
+      assert(found <= 4);
      assert(!holes[found].empty());
      int after_loc = holes[found].back();
      holes[found].pop_back();
@@ -632,6 +693,61 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,

      assert(inserted_back);
   }
+
+   if (*pack_prim_data_into_header) {
+      if (orders[PRIM].front().location == virtual_header_location)
+         orders[PRIM].pop_front();
+
+      if (!data[PRIM].holes[4].empty()) {
+         *pack_prim_data_into_header = false;
+
+         assert(orders[PRIM].front().location == -1);
+         assert(orders[PRIM].front().dwords == 4);
+         orders[PRIM].pop_front();
+      }
+
+      if (*pack_prim_data_into_header) {
+         unsigned sz = brw_sum_size(orders[PRIM]);
+
+         if (sz % 8 == 0 || sz % 8 > 4)
+            *pack_prim_data_into_header = false;
+      }
+   }
+
+   if (*pack_vert_data_into_header) {
+      if (orders[VERT].front().location == virtual_header_location)
+         orders[VERT].pop_front();
+
+      if (!data[VERT].holes[4].empty()) {
+         *pack_vert_data_into_header = false;
+
+         assert(orders[VERT].front().location == -1);
+         assert(orders[VERT].front().dwords == 4);
+         orders[VERT].pop_front();
+      }
+
+      if (*pack_vert_data_into_header) {
+         unsigned sz = brw_sum_size(orders[VERT]) +
+                       brw_sum_size(orders[VERT_FLAT]);
+
+         if (sz % 8 == 0 || sz % 8 > 4)
+            *pack_vert_data_into_header = false;
+      }
+   }
+
+
+   if (INTEL_DEBUG(DEBUG_MESH)) {
+      fprintf(stderr, "MUE attribute order:\n");
+      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
+         if (!orders[i].empty())
+            fprintf(stderr, "%d: ", i);
+         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
+            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
+         }
+         if (!orders[i].empty())
+            fprintf(stderr, "\n");
+      }
+   }
 }

 /* Mesh URB Entry consists of an initial section
@@ -717,7 +833,22 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
   std::list<struct attr_desc> orders[3];
   uint64_t regular_outputs = outputs_written &
         ~(per_primitive_header_bits | per_vertex_header_bits);
-   brw_compute_mue_layout(orders, regular_outputs, nir);
+
+   /* packing into prim header is possible only if prim header is present */
+   map->user_data_in_primitive_header =
+         (outputs_written & per_primitive_header_bits) != 0;
+
+   /* Packing into vert header is always possible, but we allow it only
+    * if full vec4 is available (so point size is not used) and there's
+    * nothing between it and normal vertex data (so no clip distances).
+    */
+   map->user_data_in_vertex_header =
+         (outputs_written & per_vertex_header_bits) ==
+               BITFIELD64_BIT(VARYING_SLOT_POS);
+
+   brw_compute_mue_layout(orders, regular_outputs, nir,
+                          &map->user_data_in_primitive_header,
+                          &map->user_data_in_vertex_header);

   if (outputs_written & per_primitive_header_bits) {
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
@@ -752,13 +883,22 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,

   map->per_primitive_data_size_dw = 0;

-   unsigned start_dw = map->per_primitive_start_dw +
-                       map->per_primitive_header_size_dw;
+   unsigned start_dw = map->per_primitive_start_dw;
+   if (map->user_data_in_primitive_header)
+      start_dw += 4; /* first 4 dwords are used */
+   else
+      start_dw += map->per_primitive_header_size_dw;
+   unsigned header_used_dw = 0;
+
   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
      int location = (*it).location;
      if (location < 0) {
         start_dw += (*it).dwords;
-         map->per_primitive_data_size_dw += (*it).dwords;
+         if (map->user_data_in_primitive_header && header_used_dw < 4)
+            header_used_dw += (*it).dwords;
+         else
+            map->per_primitive_data_size_dw += (*it).dwords;
+         assert(header_used_dw <= 4);
         continue;
      }

@@ -770,7 +910,11 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
      brw_mue_assign_position(&*it, map, start_dw);

      start_dw += (*it).dwords;
-      map->per_primitive_data_size_dw += (*it).dwords;
+      if (map->user_data_in_primitive_header && header_used_dw < 4)
+         header_used_dw += (*it).dwords;
+      else
+         map->per_primitive_data_size_dw += (*it).dwords;
+      assert(header_used_dw <= 4);
      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
   }

@@ -819,14 +963,24 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,

   map->per_vertex_data_size_dw = 0;

-   start_dw = map->per_vertex_start_dw +
-              map->per_vertex_header_size_dw;
+   start_dw = map->per_vertex_start_dw;
+   if (!map->user_data_in_vertex_header)
+      start_dw += map->per_vertex_header_size_dw;
+
+   header_used_dw = 0;
   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
         int location = (*it).location;
         if (location < 0) {
            start_dw += (*it).dwords;
-            map->per_vertex_data_size_dw += (*it).dwords;
+            if (map->user_data_in_vertex_header && header_used_dw < 4) {
+               header_used_dw += (*it).dwords;
+               assert(header_used_dw <= 4);
+               if (header_used_dw == 4)
+                  start_dw += 4; /* jump over gl_position */
+            } else {
+               map->per_vertex_data_size_dw += (*it).dwords;
+            }
            continue;
         }

@@ -837,7 +991,14 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
         brw_mue_assign_position(&*it, map, start_dw);

         start_dw += (*it).dwords;
-         map->per_vertex_data_size_dw += (*it).dwords;
+         if (map->user_data_in_vertex_header && header_used_dw < 4) {
+            header_used_dw += (*it).dwords;
+            assert(header_used_dw <= 4);
+            if (header_used_dw == 4)
+               start_dw += 4; /* jump over gl_position */
+         } else {
+            map->per_vertex_data_size_dw += (*it).dwords;
+         }
         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
      }
   }
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -558,6 +558,11 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
         }

+         if (mue->user_data_in_vertex_header) {
+            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+         }
+
         assert(mue->per_primitive_header_size_dw % 8 == 0);
         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
         sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
@@ -569,7 +574,8 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
          */
         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0) {
+             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+             mue->user_data_in_primitive_header) {
            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;