brw: Align and combine constant-offset UBO loads in NIR

The hope here is to replace our backend handling for loading whole
cachelines at a time from UBOs with NIR-based handling, which plays
nicely with the NIR load/store vectorizer.

Rounding offsets down to multiples of 64B allows us to globally CSE
UBO loads across basic blocks, which is really useful.  However,
blindly rounding an offset down to a multiple of 64B can trigger an
anti-pattern: a single unaligned memory load could have fetched all
the necessary data, but rounding it down splits it into two loads.
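
To make that guard concrete, here is a small standalone C sketch
(illustration only, not part of this commit; the helper name and sample
offsets are made up): a vec4 of 32-bit values at offset 72 rebases cleanly
to offset 64, while the same load at offset 56 would need components past
a single 64B block and is left alone.

    #include <stdio.h>

    /* Hypothetical illustration of the 64B rebasing guard. */
    static int
    fits_after_rebase(unsigned offset, unsigned components, unsigned type_bytes)
    {
       const unsigned cacheline_bytes = 64;
       const unsigned block_components = cacheline_bytes / type_bytes; /* 16 for 32-bit */
       const unsigned new_offset = offset / cacheline_bytes * cacheline_bytes;
       const unsigned pad_components = (offset - new_offset) / type_bytes;

       /* Mirrors the "don't split one load into two" check. */
       return components + pad_components <= block_components;
    }

    int
    main(void)
    {
       printf("vec4 at offset 72: %s\n", fits_after_rebase(72, 4, 4) ? "rebase" : "skip");
       printf("vec4 at offset 56: %s\n", fits_after_rebase(56, 4, 4) ? "rebase" : "skip");
       return 0;
    }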

By moving this to NIR, we gain more control over the interplay between
nir_opt_load_store_vectorize and this rebasing and CSE'ing.  The backend
can then simply load the range between nir_def_{first,last}_component_read()
and trust that our NIR has the loads blockified appropriately.
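
As a rough sketch of that backend-side trimming (assuming the
nir_def_{first,last}_component_read() helpers named above and the standard
NIR intrinsic index accessors; the function itself is hypothetical), the
useful byte range of a blockified load could be computed like this:

    #include "nir.h"

    /* Hypothetical: compute the byte range of a blockified UBO load that the
     * backend actually needs to fetch, skipping unread leading and trailing
     * components of the vec16. */
    static void
    trimmed_load_range(nir_intrinsic_instr *load,
                       unsigned *start_byte, unsigned *num_bytes)
    {
       const unsigned type_bytes = load->def.bit_size / 8;
       const unsigned first = nir_def_first_component_read(&load->def);
       const unsigned last = nir_def_last_component_read(&load->def);

       *start_byte = nir_intrinsic_range_base(load) + first * type_bytes;
       *num_bytes = (last - first + 1) * type_bytes;
    }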

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32888>
commit 21636ff9fa (parent 36d0485ae4)
Author:    Kenneth Graunke
Date:      2024-12-26 16:41:30 -08:00
Committed: Marge Bot

3 changed files with 141 additions and 1 deletion

@@ -1665,9 +1665,26 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
     * - reduced register pressure
     */
    nir_divergence_analysis(nir);
-   if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
+   if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo)) {
       OPT(nir_opt_load_store_vectorize, &options);
+
+      OPT(nir_opt_constant_folding);
+      OPT(nir_copy_prop);
+
+      if (OPT(brw_nir_rebase_const_offset_ubo_loads)) {
+         OPT(nir_opt_cse);
+         OPT(nir_copy_prop);
+
+         nir_load_store_vectorize_options ubo_options = {
+            .modes = nir_var_mem_ubo,
+            .callback = brw_nir_should_vectorize_mem,
+            .robust_modes = options.robust_modes & nir_var_mem_ubo,
+         };
+
+         OPT(nir_opt_load_store_vectorize, &ubo_options);
+      }
+   }
 
    nir_lower_mem_access_bit_sizes_options mem_access_options = {
       .modes = nir_var_mem_ssbo |
               nir_var_mem_constant |

@@ -14,6 +14,7 @@ extern "C" {
 struct intel_device_info;
 
 void intel_nir_apply_tcs_quads_workaround(nir_shader *nir);
+bool brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader);
 bool intel_nir_blockify_uniform_loads(nir_shader *shader,
                                       const struct intel_device_info *devinfo);
 bool intel_nir_clamp_image_1d_2d_array_sizes(nir_shader *shader);

@@ -26,6 +26,128 @@
#include "isl/isl.h"
#include "nir_builder.h"
static bool
rebase_const_offset_ubo_loads_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_ubo_uniform_block_intel)
return false;
if (!nir_src_is_const(intrin->src[1]))
return false;
const unsigned type_bytes = intrin->def.bit_size / 8;
const unsigned cacheline_bytes = 64;
const unsigned block_components =
MIN2(cacheline_bytes / type_bytes, NIR_MAX_VEC_COMPONENTS);
const unsigned orig_offset = nir_src_as_uint(intrin->src[1]);
const unsigned new_offset = ROUND_DOWN_TO(orig_offset, cacheline_bytes);
const unsigned orig_def_components = intrin->def.num_components;
const unsigned orig_read_components =
nir_def_last_component_read(&intrin->def) + 1;
const unsigned pad_components = (orig_offset - new_offset) / type_bytes;
/* Don't round down if we'd have to split a single load into two loads */
if (orig_read_components + pad_components > block_components)
return false;
/* Always read a full block so we can CSE reads of different sizes.
* The backend will skip reading unused trailing components anyway.
*/
intrin->def.num_components = block_components;
intrin->num_components = block_components;
nir_intrinsic_set_range_base(intrin, new_offset);
nir_intrinsic_set_range(intrin, block_components * type_bytes);
nir_intrinsic_set_align_offset(intrin, 0);
if (pad_components) {
/* Change the base of the load to the new lower offset, and emit
* moves to read from the now higher vector component locations.
*/
b->cursor = nir_before_instr(instr);
nir_src_rewrite(&intrin->src[1], nir_imm_int(b, new_offset));
}
b->cursor = nir_after_instr(instr);
nir_scalar components[NIR_MAX_VEC_COMPONENTS];
nir_scalar undef = nir_get_scalar(nir_undef(b, 1, type_bytes * 8), 0);
unsigned i = 0;
for (; i < orig_read_components; i++)
components[i] = nir_get_scalar(&intrin->def, pad_components + i);
for (; i < orig_def_components; i++)
components[i] = undef;
nir_def *rebase = nir_vec_scalars(b, components, orig_def_components);
rebase->divergent = false;
nir_def_rewrite_uses_after(&intrin->def, rebase, rebase->parent_instr);
return true;
}
/**
* Shaders commonly contain small UBO loads with a constant offset scattered
* throughout the program. Ideally, we want to vectorize those into larger
* block loads so we can load whole cachelines at a time, or at least fill
* whole 32B registers rather than having empty space.
*
* nir_opt_load_store_vectorize() is terrific for combining small loads into
* nice large block loads. Unfortunately, it only vectorizes within a single
* basic block, and there's a lot of opportunity for optimizing globally.
*
* In the past, our backend loaded whole 64B cachelines at a time (on pre-Xe2,
* two registers) and rounded down constant UBO load offsets to the nearest
* multiple of 64B. This meant multiple loads within the same 64B would be
* CSE'd into the same load, and we could even take advantage of global CSE.
* However, we didn't have a method for shrinking loads from 64B back to 32B
* again, and also didn't have a lot of flexibility in how this interacted
* with the NIR load/store vectorization.
*
* This pass takes a similar approach, but in NIR. The idea is to:
*
* 1. Run load/store vectorization to combine access within a basic block
*
* 2. Find load_ubo_uniform_block_intel intrinsics with constant offsets.
* Round their base down to the nearest multiple of 64B, and also increase
* their returned vector to be a vec16 (64B for 32-bit values). However,
* only do this if a single vec16 load would cover this additional "pad"
* space at the front, and all used components of the existing load. That
* way, we don't blindly turn a single load into two loads.
*
* If we made any progress, then...
*
* 3. Run global CSE. This will coalesce any accesses to the same 64B
* region across subtrees of the CFG.
*
* 4. Run the load/store vectorizer again for UBOs. This will clean up
* any overlapping memory access within a block.
*
* 5. Have the backend only issue loads for components of the vec16 which
* are actually read. We could also shrink this in NIR, but doing it in
* the backend is pretty straightforward.
*
* We could probably do better with a fancier sliding-window type pass
* which looked across blocks to produce optimal loads. However, this
* simple hack using existing passes does a fairly good job for now.
*/
bool
brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader)
{
return nir_shader_instructions_pass(shader,
rebase_const_offset_ubo_loads_instr,
nir_metadata_control_flow |
nir_metadata_live_defs,
NULL);
}
static bool
intel_nir_blockify_uniform_loads_instr(nir_builder *b,
nir_instr *instr,