intel/compiler/mesh: optimize indirect writes

Our hardware requires that we write to URB using full vec4s at aligned addresses. It gives us the ability to mask off dwords within a vec4 we don't want to write, but we have to know their positions at compile time. Let's assume that: - V represents one dword we want to write - ? is an uninitialized value - "|" is a vec4 boundary. When we want to write a 2-dword value at offset 0 we generate 1 write message: | V1 V2 ? ? | with mask: | 1 1 0 0 | When we want to write a 4-dword value at offset 2 we generate 2 write messages: | ? ? V1 V2 | V3 V4 ? ? | with mask: | 0 0 1 1 | 1 1 0 0 | However if we don't know the offset within vec4 at *compile time* we currently generate 4 write messages: | V1 V1 V1 V1 | | 0 0 1 0 | | V2 V2 V2 V2 | | 0 0 0 1 | | V3 V3 V3 V3 | | 1 0 0 0 | | V4 V4 V4 V4 | | 0 1 0 0 | where masks are determined at *run time*. This is quite wasteful and slow. However, if we could determine the offset modulo 4 statically at compile time, we could generate only 1 or 2 write messages (1 if the value does not cross a vec4 boundary, e.g. when the modulo is 0) instead of 4. This is what this patch does: it analyzes the addressing expression for its modulo 4 value and if it can determine it at compile time, we generate 1 or 2 writes, and if it can't we fall back to the old 4 writes method. In mesh shaders, the value of offset modulo 4 should be known for all outputs, with the exception of primitive indices. The modulo value should be known because of MUE layout restrictions, which require that user per-primitive and per-vertex data start at an address aligned to 8 dwords and we should statically always know the offset from this base. There can be some cases where the offset from the base is more dynamic (e.g. indirect array access inside a per-vertex value), so we always do the analysis. Primitive indices are an exception, because they form vec3s (for triangles), which means that the offset will not be easy to analyse. 
When U888X index format lands, primitive indices will use only one dword per triangle, which means that we'll always write them using one message. Task shaders don't have any predetermined structure of output memory, so we always do the analysis. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20050>
2022-11-10 20:29:54 +01:00
parent 2255375c4d
commit 3131c2fc7a
1 changed files with 99 additions and 8 deletions
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -1004,6 +1004,82 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
   }
 }

+static void /* Emit one URB write per 8-channel group, storing `comps` components of `src` into a single vec4 at a run-time offset. */
+emit_urb_indirect_vec4_write(const fs_builder &bld,
+                             const fs_reg &offset_src, /* per-channel run-time offset; `base` is added to it below */
+                             unsigned base, /* compile-time constant added to offset_src */
+                             const fs_reg &src,
+                             fs_reg urb_handle,
+                             unsigned src_comp_offset, /* index of the first component of src to write */
+                             unsigned dst_comp_offset, /* number of undefined payload slots placed before the data */
+                             unsigned comps, /* number of components of src copied into the payload */
+                             unsigned mask) /* channel mask; shifted left by 16 before being passed to the message */
+{
+   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { /* one message per group of 8 channels */
+      fs_builder bld8 = bld.group(8, q);
+
+      fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      bld8.MOV(off, quarter(offset_src, q));
+      bld8.ADD(off, off, brw_imm_ud(base));
+      bld8.SHR(off, off, brw_imm_ud(2)); /* off = (offset_src + base) / 4; presumably dwords -> vec4 units, confirm */
+
+      fs_reg payload_srcs[4]; /* callers must keep dst_comp_offset + comps <= 4 */
+      unsigned length = 0;
+
+      for (unsigned i = 0; i < dst_comp_offset; i++)
+         payload_srcs[length++] = reg_undef; /* leading slots carry no data; callers are expected to mask them off */
+
+      for (unsigned c = 0; c < comps; c++)
+         payload_srcs[length++] = quarter(offset(src, bld, c + src_comp_offset), q); /* this group's slice of component c + src_comp_offset */
+
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
+      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); /* NOTE(review): bit 16 is presumably where the message expects the mask - confirm */
+      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
+                                          BRW_REGISTER_TYPE_F);
+      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
+
+      fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
+                                reg_undef, srcs, ARRAY_SIZE(srcs));
+      inst->mlen = 3 + length; /* NOTE(review): the 3 presumably covers the non-data message registers - confirm */
+      inst->offset = 0; /* base was already folded into `off` above */
+   }
+}
+
+static void /* Write a 32-bit store source of up to 4 components using at most two vec4 URB writes, given its position `mod` within a vec4. */
+emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
+                             const fs_reg &src, const fs_reg &offset_src,
+                             fs_reg urb_handle, unsigned mod) /* mod: component position within the first vec4; must be < 4 for the arithmetic below */
+{
+   assert(nir_src_bit_size(instr->src[0]) == 32); /* only 32-bit components are handled */
+
+   const unsigned comps = nir_src_num_components(instr->src[0]);
+   assert(comps <= 4); /* so the data can span at most two vec4s */
+
+   const unsigned mask = nir_intrinsic_write_mask(instr);
+   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
+                                   component_from_intrinsic(instr);
+
+   const unsigned comp_shift   = mod; /* slot of component 0 inside the first vec4 */
+   const unsigned first_comps  = MIN2(comps, 4 - comp_shift); /* components that fit in the first vec4 */
+   const unsigned second_comps = comps - first_comps; /* remainder that spills into the next vec4 */
+   const unsigned first_mask   = (mask << comp_shift) & 0xF; /* write mask moved to its slots, clipped to one vec4 */
+   const unsigned second_mask  = (mask >> (4 - comp_shift)) & 0xF; /* write-mask bits of the spilled components */
+
+   if (first_mask > 0) { /* skip the message when nothing in this vec4 is enabled */
+      emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
+                                   urb_handle, 0, comp_shift, first_comps,
+                                   first_mask);
+   }
+
+   if (second_mask > 0) { /* only needed when enabled components cross the vec4 boundary */
+      emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords + 4, src, /* + 4 moves the base by 4, i.e. one vec4 if base is in dwords */
+                                   urb_handle, first_comps, 0, second_comps,
+                                   second_mask);
+   }
+}
+
 static void
 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
                         const fs_reg &src, const fs_reg &offset_src,
@@ -1184,15 +1260,30 @@ fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *ins
   ubld8.MOV(h, urb_handle);
   ubld8.AND(h, h, brw_imm_ud(0xFFFF));

-   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
-    * the non-array-index offset, we could use to decide if we can perform
-    * either one or (at most) two writes instead one per component.
-    */
-
-   if (nir_src_is_const(*offset_nir_src))
+   if (nir_src_is_const(*offset_nir_src)) {
      emit_urb_direct_writes(bld, instr, src, h);
-   else
-      emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h);
+   } else {
+      bool use_mod = false;
+      unsigned mod;
+
+      if (offset_nir_src->is_ssa) {
+         /* Try to calculate the value of (offset + base) % 4. If we can do
+          * this, then we can do indirect writes using only up to 2 URB
+          * writes (1 if modulo + num_comps is <= 4).
+          */
+         use_mod = nir_mod_analysis(nir_get_ssa_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
+         if (use_mod) {
+            mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
+            mod %= 4;
+         }
+      }
+
+      if (use_mod) {
+         emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), h, mod);
+      } else {
+         emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h);
+      }
+   }
 }

 void