intel/fs: handle load_global_constant_uniform_block_intel

Again, load the data just once in GRF, share it across lanes.

Shader-db on dg2:

total instructions in shared programs: 23214555 -> 23215400 (<.01%)
instructions in affected programs: 199977 -> 200822 (0.42%)
helped: 3
HURT: 38
helped stats (abs) min: 5 max: 670 x̄: 283.67 x̃: 176
helped stats (rel) min: 1.34% max: 49.41% x̄: 22.15% x̃: 15.70%
HURT stats (abs) min: 1 max: 185 x̄: 44.63 x̃: 32
HURT stats (rel) min: 0.13% max: 42.86% x̄: 10.25% x̃: 9.30%
95% mean confidence interval for instructions value: -18.65 59.87
95% mean confidence interval for instructions %-change: 3.29% 12.47%
Inconclusive result (value mean confidence interval includes 0).

total loops in shared programs: 5928 -> 5928 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total cycles in shared programs: 851137495 -> 851152449 (<.01%)
cycles in affected programs: 16406137 -> 16421091 (0.09%)
helped: 9
HURT: 32
helped stats (abs) min: 10 max: 13498 x̄: 6443.22 x̃: 5581
helped stats (rel) min: 0.11% max: 4.75% x̄: 1.45% x̃: 0.34%
HURT stats (abs) min: 3 max: 15056 x̄: 2279.47 x̃: 735
HURT stats (rel) min: 0.10% max: 23.71% x̄: 4.58% x̃: 4.65%
95% mean confidence interval for cycles value: -1315.40 2044.87
95% mean confidence interval for cycles %-change: 1.71% 4.80%
Inconclusive result (value mean confidence interval includes 0).
total spills in shared programs: 11856 -> 11825 (-0.26%)
spills in affected programs: 2368 -> 2337 (-1.31%)
helped: 4
HURT: 0

total fills in shared programs: 16258 -> 16207 (-0.31%)
fills in affected programs: 2930 -> 2879 (-1.74%)
helped: 4
HURT: 0

total sends in shared programs: 1038194 -> 1038185 (<.01%)
sends in affected programs: 40 -> 31 (-22.50%)
helped: 4
HURT: 0
helped stats (abs) min: 1 max: 4 x̄: 2.25 x̃: 2
helped stats (rel) min: 10.00% max: 33.33% x̄: 21.46% x̃: 21.25%
95% mean confidence interval for sends value: -4.64 0.14
95% mean confidence interval for sends %-change: -40.41% -2.51%
Inconclusive result (value mean confidence interval includes 0).

LOST: 0
GAINED: 0

Results for some VK/DX titles (on DG2 only): the change mostly adds instructions, except for the unity spaceship demo, where a CS shader is now compiled at a wider SIMD width.

The reason for the additional instructions is that, since we're doing block loads, we need to find the live channels in control flow to select a single lane value that is valid.
aztec_ruins_high:

Totals from 3 (1.12% of 269) affected shaders:
Instrs: 17732 -> 17896 (+0.92%)
Cycles: 796518 -> 819302 (+2.86%)

cyberpunk_2077:

Totals from 17 (0.17% of 10301) affected shaders:
Instrs: 10848 -> 11658 (+7.47%)
Cycles: 248243 -> 259168 (+4.40%); split: -0.57%, +4.97%

fallout_4_dxvk_g2:

Totals from 2 (0.12% of 1638) affected shaders:
Instrs: 3157 -> 3368 (+6.68%)
Cycles: 487807 -> 490426 (+0.54%); split: -0.26%, +0.79%
Max live registers: 139 -> 141 (+1.44%)

red_dead_redemption2:

Totals from 68 (1.14% of 5970) affected shaders:
Instrs: 34871 -> 36486 (+4.63%)
Cycles: 551430 -> 565211 (+2.50%)
Send messages: 2074 -> 2072 (-0.10%)
Max live registers: 5078 -> 5077 (-0.02%)

total_war_warhammer2:

Totals from 5 (1.05% of 478) affected shaders:
Instrs: 6905 -> 6971 (+0.96%); split: -0.16%, +1.12%
Cycles: 97035 -> 97989 (+0.98%); split: -0.07%, +1.05%

unity spaceship demo (instruction count going up due to a CS shader bump from SIMD8->16):

Totals from 53 (9.71% of 546) affected shaders:
Instrs: 223748 -> 233223 (+4.23%); split: -0.01%, +4.25%
Cycles: 23134697 -> 25207080 (+8.96%); split: -0.17%, +9.13%
Subgroup size: 480 -> 488 (+1.67%)
Spill count: 2156 -> 2242 (+3.99%); split: -0.19%, +4.17%
Fill count: 4617 -> 4845 (+4.94%); split: -0.09%, +5.02%
Max live registers: 5991 -> 6050 (+0.98%); split: -0.40%, +1.39%
Max dispatch width: 480 -> 488 (+1.67%)

witcher_3_dxvk_g2:

Totals from 27 (2.51% of 1074) affected shaders:
Instrs: 57067 -> 57677 (+1.07%); split: -0.03%, +1.10%
Cycles: 1397871 -> 1436704 (+2.78%); split: -0.35%, +3.13%

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-08 21:53:02 +03:00
parent 4ee1a8bb9c
commit 6b9f838d62
3 changed files with 58 additions and 1 deletions
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5127,6 +5127,46 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      break;
   }

+   case nir_intrinsic_load_global_constant_uniform_block_intel: {
+      const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4);
+      unsigned loaded_dwords = 0;
+
+      const fs_builder ubld1 = bld.exec_all().group(1, 0);
+      const fs_builder ubld8 = bld.exec_all().group(8, 0);
+      const fs_builder ubld16 = bld.exec_all().group(16, 0);
+
+      const fs_reg packed_consts =
+         ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
+      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));
+
+      while (loaded_dwords < total_dwords) {
+         const unsigned block =
+            choose_oword_block_size_dwords(devinfo,
+                                           total_dwords - loaded_dwords);
+         const unsigned block_bytes = block * 4;
+
+         const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
+
+         fs_reg srcs[A64_LOGICAL_NUM_SRCS];
+         srcs[A64_LOGICAL_ADDRESS] = address;
+         srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
+         srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
+         srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
+         ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
+                   retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
+                   srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
+
+         increment_a64_address(ubld1, address, block_bytes);
+         loaded_dwords += block;
+      }
+
+      for (unsigned c = 0; c < instr->num_components; c++)
+         bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
+                 component(packed_consts, c));
+
+      break;
+   }
+
   case nir_intrinsic_load_ssbo: {
      assert(devinfo->ver >= 7);

--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1310,7 +1310,8 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
   if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
       low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
       low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) {
+       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
+       low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
      if (num_components > 4) {
         if (!util_is_power_of_two_nonzero(num_components))
            return false;
--- a/src/intel/compiler/brw_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/brw_nir_blockify_uniform_loads.c
@@ -87,6 +87,22 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b,
      intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
      return true;

+   case nir_intrinsic_load_global_constant:
+      if (nir_src_is_divergent(intrin->src[0]))
+         return false;
+
+      if (nir_dest_bit_size(intrin->dest) != 32)
+         return false;
+
+      /* Without the LSC, we can only do block loads of at least 4 dwords (1
+       * oword).
+       */
+      if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
+         return false;
+
+      intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
+      return true;
+
   default:
      return false;
   }