brw: Combine convergent texture buffer fetches into fewer loads
Borderlands 3 (in both its DX11 and DX12 renderers) has a common
pattern across many shaders:

   con 32x4 %510 = (uint32)txf %2 (handle), %1191 (0x10) (coord), %1 (0x0) (lod), 0 (texture)
   con 32x4 %512 = (uint32)txf %2 (handle), %1511 (0x11) (coord), %1 (0x0) (lod), 0 (texture)
   ...
   con 32x4 %550 = (uint32)txf %2 (handle), %1549 (0x25) (coord), %1 (0x0) (lod), 0 (texture)
   con 32x4 %552 = (uint32)txf %2 (handle), %1551 (0x26) (coord), %1 (0x0) (lod), 0 (texture)

A single basic block contains piles of texelFetches from a 1D buffer
texture, with constant coordinates.  In most cases, only the .x channel
of the result is read.  So we have something on the order of 28 sampler
messages, each asking for... a single uint32_t scalar value.

Because our sampler doesn't have any support for convergent block loads
(like the untyped LSC transpose messages for SSBOs), this means we were
emitting SIMD8/16 (or SIMD16/32 on Xe2) sampler messages for every
single scalar, replicating what's effectively a SIMD1 value to the
entire register.  This is hugely wasteful, both in terms of register
pressure and in the back-and-forth of sending and receiving memory
messages.

The good news is that we can take advantage of our explicit SIMD model
to handle this more efficiently.  This patch adds a new optimization
pass that detects a series of SHADER_OPCODE_TXF_LOGICAL instructions in
the same basic block, with constant offsets, from the same texture.  It
constructs a new divergent coordinate where each channel is one of the
constants (i.e. <0x10, 0x11, 0x12, ..., 0x26> in the above example).
It issues a new NoMask divergent texel fetch which loads N useful
channels in one go, and replaces the original fetches with expansion
MOVs that splat each SIMD1 result back to the full SIMD width.  (These
get copy propagated away.)

We can pick the SIMD size of the load independently of the native
shader width as well.  On Xe2, those 28 convergent loads become a
single SIMD32 ld message.  On earlier hardware, we use two SIMD16
messages.  Or we can use a smaller size when there aren't many loads to
combine.

In fossil-db, this cuts 27% of send messages in affected shaders, 3-6%
of cycles, 2-3% of instructions, and 8-12% of live registers.  On an
A770, this improves performance of Borderlands 3 by roughly 2.5-3.5%.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32573>
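For illustration (this snippet is not from the commit; the declaration
and names are hypothetical), the source-level pattern producing IR like
the above is roughly:

   // Hypothetical GLSL: many texelFetches from one buffer texture at
   // compile-time-constant coordinates, using only .x of each result.
   uniform usamplerBuffer material_params;

   uint flags0 = texelFetch(material_params, 0x10).x;
   uint flags1 = texelFetch(material_params, 0x11).x;
   // ... many more ...
   uint flagsN = texelFetch(material_params, 0x26).x;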
commit 6341b3cd87
parent 22881712c8
committed by Marge Bot
@@ -671,6 +671,7 @@ bool brw_fs_opt_saturate_propagation(fs_visitor &s);
 bool brw_fs_opt_split_sends(fs_visitor &s);
 bool brw_fs_opt_split_virtual_grfs(fs_visitor &s);
 bool brw_fs_opt_zero_samples(fs_visitor &s);
+bool brw_opt_combine_convergent_txf(fs_visitor &s);
 
 bool brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s);
 bool brw_fs_workaround_memory_fence_before_eot(fs_visitor &s);
@@ -85,6 +85,9 @@ brw_fs_optimize(fs_visitor &s)
    progress = false;
    pass_num = 0;
 
+   if (OPT(brw_opt_combine_convergent_txf))
+      OPT(brw_fs_opt_copy_propagation_defs);
+
    if (OPT(brw_fs_lower_pack)) {
       OPT(brw_fs_opt_register_coalesce);
       OPT(brw_fs_opt_dead_code_eliminate);
@@ -552,4 +555,3 @@ brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
 
    return progress;
 }
-
src/intel/compiler/brw_opt_txf_combiner.cpp (new file, 235 lines)
@@ -0,0 +1,235 @@
/*
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

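/* Added commentary (illustrative, assuming Xe2 where reg_unit == 2 and
 * grf_size == 64): a SIMD16 txf returning 32-bit data needs
 * DIV_ROUND_UP(4 * 16, 64) == 1 register per component, so a txf with
 * size_written == 256 bytes covers 4 registers and thus reports 4
 * destination components, while one trimmed to 64 bytes reports 1.
 */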
static unsigned
dest_comps_for_txf(const fs_visitor &s, const fs_inst *txf)
{
   if (!txf)
      return 0;

   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
   const unsigned per_component_regs =
      DIV_ROUND_UP(brw_type_size_bytes(txf->dst.type) *
                   txf->exec_size, grf_size);
   const unsigned dest_regs = txf->size_written / grf_size;
   const unsigned dest_comps = dest_regs / per_component_regs;
   return dest_comps;
}

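/* Added commentary: immediates and absent (BAD_FILE) sources count as
 * defs here since they're trivially immutable; only actual registers
 * need a reaching SSA definition.
 */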
static bool
is_def(const def_analysis &defs, const brw_reg &r)
{
   return r.file == IMM || r.file == BAD_FILE || defs.get(r) != NULL;
}

static bool
is_uniform_def(const def_analysis &defs, const brw_reg &r)
{
   return is_def(defs, r) && is_uniform(r);
}

/**
 * Check if two texture instructions have a matching source (either the same
 * immediate value, or both references to the same immutable SSA def and
 * with matching source modifiers and regions).
 */
static bool
sources_match(ASSERTED const def_analysis &defs,
              const fs_inst *a, const fs_inst *b, enum tex_logical_srcs src)
{
   assert(is_def(defs, a->src[src]));
   assert(is_def(defs, b->src[src]));
   return brw_regs_equal(&a->src[src], &b->src[src]);
}

/**
 * Look for a series of convergent texture buffer fetches within a basic
 * block and combine them into a single divergent load with one lane for
 * each original fetch. For example, this series of convergent fetches:
 *
 *    txf(16) %12:UD, coord = 12d, lod = 0u, handle = %1<0>:D
 *    txf(16) %13:UD, coord = 13d, lod = 0u, handle = %1<0>:D
 *    txf(16) %14:UD, coord = 14d, lod = 0u, handle = %1<0>:D
 *    txf(16) %15:UD, coord = 15d, lod = 0u, handle = %1<0>:D
 *    txf(16) %16:UD, coord = 16d, lod = 0u, handle = %1<0>:D
 *    txf(16) %17:UD, coord = 17d, lod = 0u, handle = %1<0>:D
 *    txf(16) %18:UD, coord = 18d, lod = 0u, handle = %1<0>:D
 *    txf(16) %19:UD, coord = 19d, lod = 0u, handle = %1<0>:D
 *
 * can be combined into a single divergent load and scalar-expansion moves
 * (which can easily be copy propagated away):
 *
 *    load_payload(1) %2:D 12d, 13d, 14d, 15d, 16d, 17d, 18d, 19d
 *    txf(8) %3:UD, coord = %2, lod = 0u, handle = %1<0>:D
 *    mov(16) %12:UD, %3+0.0<0>:UD
 *    ...
 *    mov(16) %19:UD, %3+0.28<0>:UD
 *
 * Our sampler hardware doesn't have any special support for convergent
 * loads (like LSC transpose/block loads), and always performs SIMD8/16/32
 * per-channel loads. But with this trick, we can still combine multiple
 * convergent loads into a single message with fewer round-trips, and much
 * lower register pressure.
 */
bool
brw_opt_combine_convergent_txf(fs_visitor &s)
{
   const def_analysis &defs = s.def_analysis.require();

   const unsigned min_simd = 8 * reg_unit(s.devinfo);
   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);

   bool progress = false;

   foreach_block(block, s.cfg) {
      /* Gather a list of convergent TXFs to the same surface in this block */
      fs_inst *txfs[32] = {};
      unsigned count = 0;

      foreach_inst_in_block(fs_inst, inst, block) {
         if (inst->opcode != SHADER_OPCODE_TXF_LOGICAL)
            continue;

         /* Only handle buffers or single miplevel 1D images for now */
         if (inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud > 1)
            continue;

         if (inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0)
            continue;

         if (inst->predicate || inst->force_writemask_all)
            continue;

         if (!is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_LOD]) ||
             !is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE]) ||
             !is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE]))
            continue;

         /* Only handle immediates for now: we could check is_uniform(),
          * but we'd need to ensure the coordinate's definition reaches
          * txfs[0] which is where we'll insert the combined coordinate.
          */
         if (inst->src[TEX_LOGICAL_SRC_COORDINATE].file != IMM)
            continue;

         /* texelFetch from 1D buffers shouldn't have any of these */
         assert(inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_LOD2].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_MIN_LOD].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_MCS].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_TG4_OFFSET].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
                inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud == 0);

         if (count > 0 &&
             (!sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_LOD) ||
              !sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_SURFACE) ||
              !sources_match(defs, inst, txfs[0],
                             TEX_LOGICAL_SRC_SURFACE_HANDLE)))
            continue;

         txfs[count++] = inst;

         if (count == ARRAY_SIZE(txfs))
            break;
      }

      /* Need at least two things to combine. */
      if (count < 2)
         continue;

      /* Emit divergent TXFs and replace the original ones with MOVs */
      for (unsigned curr = 0; curr < count; curr += 32) {
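         /* Added commentary (illustrative numbers): combining 28 fetches
          * gives lanes == CLAMP(28, min_simd, 32) == 28 and width == 32,
          * i.e. a single logical SIMD32 txf, which SIMD lowering later
          * splits to the hardware's native message width (two SIMD16
          * sends on pre-Xe2).  Combining only 3 fetches still rounds up
          * to min_simd: 8, or 16 on Xe2 where reg_unit == 2.
          */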
         const unsigned lanes = CLAMP(count - curr, min_simd, 32);
         const unsigned width = util_next_power_of_two(lanes);
         const fs_builder ubld =
            fs_builder(&s).at(block, txfs[curr]).exec_all().group(width, 0);
         const fs_builder ubld1 = ubld.group(1, 0);

         enum brw_reg_type coord_type =
            txfs[curr]->src[TEX_LOGICAL_SRC_COORDINATE].type;
         brw_reg coord = ubld.vgrf(coord_type);
         brw_reg coord_comps[32];

         for (unsigned i = 0; i < width; i++) {
            /* Our block size might be larger than the number of convergent
             * loads we're combining. If so, repeat the last component.
             */
            if (txfs[curr+i])
               coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_COORDINATE];
            else
               coord_comps[i] = coord_comps[i-1];
         }
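         /* Added commentary: this emits SIMD1 writes of each constant into
          * successive channels of the new coordinate register, forming the
          * packed <c0, c1, ..., c(width-1)> vector for the divergent fetch.
          */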
         ubld1.VEC(coord, coord_comps, width);

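         /* Added commentary: the gather loop verified that all combined
          * fetches share the same LOD and surface sources, so reuse them
          * (and the first fetch's sampler sources) for the combined load.
          */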
         brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
         srcs[TEX_LOGICAL_SRC_COORDINATE] = coord;
         srcs[TEX_LOGICAL_SRC_LOD] = txfs[0]->src[TEX_LOGICAL_SRC_LOD];
         srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] =
            txfs[0]->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
         srcs[TEX_LOGICAL_SRC_SAMPLER] = txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER];
         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] =
            txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
         srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(1);
         srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
         srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);

         /* Each of our txf may have a reduced response length if some
          * components are never read. Use the maximum of the sizes.
          */
         unsigned new_dest_comps = 0;
         for (unsigned i = 0; i < width; i++) {
            const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
            new_dest_comps = MAX2(new_dest_comps, this_comps);
         }

         /* Emit the new divergent TXF */
         brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
         fs_inst *div_txf =
            ubld.emit(SHADER_OPCODE_TXF_LOGICAL, div, srcs,
                      TEX_LOGICAL_NUM_SRCS);

         /* Update it to also use response length reduction */
         const unsigned per_component_regs =
            DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
                         grf_size);
         div_txf->size_written = new_dest_comps * per_component_regs * grf_size;

         for (unsigned i = 0; i < width; i++) {
            fs_inst *txf = txfs[curr+i];
            if (!txf)
               break;

            const fs_builder ibld = fs_builder(&s, block, txf);

            /* Replace each of the original TXFs with MOVs from our new one */
            const unsigned dest_comps = dest_comps_for_txf(s, txf);
            assert(dest_comps <= 4);

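            /* Added commentary: component(offset(div, ubld, c), i) is a
             * scalar (stride-0) region selecting lane i of channel c, so
             * each mov below splats one fetched value across the original
             * fetch's full execution width; copy propagation then folds
             * these splats away.
             */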
            brw_reg v[4];
            for (unsigned c = 0; c < dest_comps; c++)
               v[c] = component(offset(div, ubld, c), i);
            ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);

            txf->remove(block);
         }

         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
@@ -99,6 +99,7 @@ libintel_compiler_brw_files = files(
   'brw_nir_rt.h',
   'brw_nir_rt.c',
   'brw_nir_rt_builder.h',
+  'brw_opt_txf_combiner.cpp',
   'brw_packed_float.c',
   'brw_print.cpp',
   'brw_prim.h',