intel/fs: New VGRF packing scheme for constant combining

Each block is processed separately. VGRF channels that are allocated to values that are only used in a particular block are made available in other blocks. This is almost always an improvement, but there are some pessimal cases where it goes horribly wrong. Imagine a shader with two blocks. In that shader, the first block has 5 constants that are used in both the first block and the second block. Three other constants are only used in the first block. The second block has 15 constants that are used only in that block. The static VGRF usage is 3 regardless of packing. However, scheduling may be able to shorten the live range of the first VGRF when it only has values that came from the first block (because three of the values are dead on entry to the second block). This used to occur in a Mad Max shader on Broadwell. That shader went from 0:0 spills:fills to 107:52. Some changes over the last year, I'm assuming !13734, have prevented this case from occurring. This change created a lot of churn on Haswell and Ivy Bridge. This seems to be primarily due to all the extra constants used for coissue, but I did not investigate very deeply. On older platforms, there were no changes to spills or fills. As a result, this is only used on Broadwell and newer platforms. v2: Update expected checksum for pixmark-piano-v2.trace on gl-zink-anv-tgl. See #9714 for more details. shader-db results: Tiger Lake total instructions in shared programs: 21101332 -> 21102084 (<.01%) instructions in affected programs: 863686 -> 864438 (0.09%) helped: 463 / HURT: 437 total cycles in shared programs: 790573225 -> 790664391 (0.01%) cycles in affected programs: 92546803 -> 92637969 (0.10%) helped: 558 / HURT: 629 total spills in shared programs: 3959 -> 3951 (-0.20%) spills in affected programs: 184 -> 176 (-4.35%) helped: 2 / HURT: 0 total fills in shared programs: 2639 -> 2631 (-0.30%) fills in affected programs: 184 -> 176 (-4.35%) helped: 2 / HURT: 0 LOST: 1 GAINED: 5 Ice Lake and Skylake had similar results. 
(Ice Lake shown) total instructions in shared programs: 19945216 -> 19944711 (<.01%) instructions in affected programs: 139569 -> 139064 (-0.36%) helped: 66 / HURT: 3 total cycles in shared programs: 858410082 -> 857381323 (-0.12%) cycles in affected programs: 383825958 -> 382797199 (-0.27%) helped: 1012 / HURT: 1055 total spills in shared programs: 6190 -> 6116 (-1.20%) spills in affected programs: 891 -> 817 (-8.31%) helped: 66 / HURT: 3 total fills in shared programs: 7382 -> 7238 (-1.95%) fills in affected programs: 1538 -> 1394 (-9.36%) helped: 66 / HURT: 3 LOST: 5 GAINED: 8 Broadwell total instructions in shared programs: 17820886 -> 17812515 (-0.05%) instructions in affected programs: 800512 -> 792141 (-1.05%) helped: 385 / HURT: 1 total cycles in shared programs: 904482935 -> 903102070 (-0.15%) cycles in affected programs: 422427015 -> 421046150 (-0.33%) helped: 1091 / HURT: 812 total spills in shared programs: 17908 -> 16576 (-7.44%) spills in affected programs: 9459 -> 8127 (-14.08%) helped: 386 / HURT: 0 total fills in shared programs: 25397 -> 22354 (-11.98%) fills in affected programs: 15504 -> 12461 (-19.63%) helped: 385 / HURT: 1 LOST: 2 GAINED: 2 No shader-db changes on Haswell or older platforms. 
fossil-db results: Tiger Lake Instructions in all programs: 156881463 -> 156890970 (+0.0%) Instructions helped: 9033 Instructions hurt: 10285 Cycles in all programs: 7532597466 -> 7529647924 (-0.0%) Cycles helped: 10548 Cycles hurt: 13667 Spills in all programs: 5490 -> 5110 (-6.9%) Spills helped: 100 Spills hurt: 3 Fills in all programs: 6123 -> 5752 (-6.1%) Fills helped: 100 Fills hurt: 3 Gained: 17 Lost: 47 Ice Lake Instructions in all programs: 141309644 -> 141309603 (-0.0%) Instructions helped: 9 Instructions hurt: 4 Cycles in all programs: 9095812690 -> 9097008049 (+0.0%) Cycles helped: 14288 Cycles hurt: 16381 Spills in all programs: 7418 -> 7404 (-0.2%) Spills helped: 9 Spills hurt: 4 Fills in all programs: 8326 -> 8321 (-0.1%) Fills helped: 9 Fills hurt: 4 Skylake Instructions in all programs: 131872347 -> 131870690 (-0.0%) Instructions helped: 111 Instructions hurt: 3 Cycles in all programs: 8800835649 -> 8802483884 (+0.0%) Cycles helped: 9415 Cycles hurt: 9678 Spills in all programs: 6917 -> 6476 (-6.4%) Spills helped: 111 Spills hurt: 3 Fills in all programs: 7584 -> 7354 (-3.0%) Fills helped: 111 Fills hurt: 3 Lost: 5 Tested-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7698>
2020-12-07 22:29:34 -08:00
parent c506d7e511
commit 927a24db14
2 changed files with 184 additions and 13 deletions
--- a/src/gallium/drivers/zink/ci/traces-zink.yml
+++ b/src/gallium/drivers/zink/ci/traces-zink.yml
@@ -35,7 +35,7 @@ traces:
      checksum: 433b69bea68cfe81914b857bbdc60ea5
  gputest/pixmark-piano-v2.trace:
    gl-zink-anv-tgl:
-      checksum: 5bc82f565a6e791e1b8aa7860054e370
+      checksum: b1c96546107d8a7c01efdafdd0eabd21
  gputest/triangle-v2.trace:
    gl-zink-anv-tgl:
      checksum: 5f694874b15bcd7a3689b387c143590b
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -870,6 +870,9 @@ struct imm {
    */
   bool must_promote;

+   /** Is the value used only in a single basic block? */
+   bool used_in_single_block;
+
   uint16_t first_use_ip;
   uint16_t last_use_ip;
 };
@@ -1173,6 +1176,133 @@ add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip,
   }
 }

+struct register_allocation {
+   /** VGRF number holding the values; UINT_MAX until a VGRF is assigned. */
+   unsigned nr;
+
+   /**
+    * Bitmask of currently available slots in this register (set = free).
+    *
+    * Each register has 16 slots of 16 bits each; bit N tracks the slot at
+    * byte offset 2*N.  Values occupy 1, 2, or 4 slots (W, DW, or QW).
+    */
+   uint16_t avail;
+};
+
+static fs_reg
+allocate_slots(struct register_allocation *regs, unsigned num_regs,
+               unsigned bytes, unsigned align_bytes,
+               brw::simple_allocator &alloc)
+{
+   assert(bytes == 2 || bytes == 4 || bytes == 8);
+   assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8);
+
+   /* Slots are 16 bits wide, so work in slot units from here on. */
+   const unsigned num_slots = bytes / 2;
+   const unsigned slot_step = align_bytes / 2;
+   const uint16_t run = (1U << num_slots) - 1;
+
+   /* First fit: registers in order, then aligned start slots in order. */
+   for (unsigned i = 0; i < num_regs; i++) {
+      struct register_allocation *const r = &regs[i];
+      for (unsigned start = 0; start + num_slots <= 16; start += slot_step) {
+         if (((r->avail >> start) & run) != run)
+            continue;
+         /* An entry is given a real VGRF only when it is first used. */
+         if (r->nr == UINT_MAX)
+            r->nr = alloc.allocate(1);
+         r->avail &= ~(run << start);
+
+         fs_reg result(VGRF, r->nr);
+         result.offset = start * 2;
+         return result;
+      }
+   }
+
+   unreachable("No free slots found.");
+}
+
+static void
+deallocate_slots(struct register_allocation *regs, unsigned num_regs,
+                 unsigned reg_nr, unsigned subreg_offset, unsigned bytes)
+{
+   assert(bytes == 2 || bytes == 4 || bytes == 8);
+   assert(subreg_offset % 2 == 0);
+   assert(subreg_offset + bytes <= 32);
+
+   /* One mask bit per 16-bit slot, shifted to the value's first slot. */
+   const uint16_t freed = ((1U << (bytes / 2)) - 1) << (subreg_offset / 2);
+
+   /* Find the first entry backed by VGRF reg_nr and hand the slots back. */
+   unsigned i = 0;
+   while (i < num_regs && regs[i].nr != reg_nr)
+      i++;
+
+   if (i == num_regs)
+      unreachable("No such register found.");
+
+   regs[i].avail |= freed;
+}
+
+static void
+parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block,
+                     struct register_allocation *regs, unsigned num_regs,
+                     brw::simple_allocator &alloc, unsigned ver)
+{
+   /* Each basic block has two sets of constants: those whose uses are all
+    * inside the block, and those that also have uses after the block.
+    *
+    * The work is done in three steps.
+    *
+    * 1. Reserve slots for the values that are used outside this block.
+    *
+    * 2. Reserve slots for the values that are used only in this block.
+    *
+    * 3. Release the slots reserved in step 2 for use by other blocks.
+    */
+   for (unsigned step = 0; step < 2; step++) {
+      const bool want_block_local = step == 1;
+
+      for (unsigned i = 0; i < len; i++) {
+         struct imm *const v = &imm[i];
+
+         if (v->block != cur_block ||
+             v->used_in_single_block != want_block_local)
+            continue;
+
+         /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
+          *
+          *   "In Align16 mode, the channel selects and channel enables apply
+          *    to a pair of half-floats, because these parameters are defined
+          *    for DWord elements ONLY. This is applicable when both source
+          *    and destination are half-floats."
+          *
+          * An Align16 instruction reading a promoted HF immediate through a
+          * <0,1,0>:HF region would therefore read 2 HF slots rather than
+          * replicating the one we want.  To avoid this, both HF slots of the
+          * DWord are always populated with the constant.
+          */
+         const unsigned copies = (ver == 8 && v->is_half_float) ? 2 : 1;
+         const fs_reg slot = allocate_slots(regs, num_regs, v->size * copies,
+                                            get_alignment_for_imm(v), alloc);
+
+         v->nr = slot.nr;
+         v->subreg_offset = slot.offset;
+      }
+   }
+
+   for (unsigned i = 0; i < len; i++) {
+      const struct imm *const v = &imm[i];
+
+      if (v->block != cur_block || !v->used_in_single_block)
+         continue;
+
+      const unsigned copies = (ver == 8 && v->is_half_float) ? 2 : 1;
+      deallocate_slots(regs, num_regs, v->nr, v->subreg_offset,
+                       v->size * copies);
+   }
+}
+
 bool
 fs_visitor::opt_combine_constants()
 {
@@ -1366,10 +1496,14 @@ fs_visitor::opt_combine_constants()
            imm->block = ib->block;
            imm->first_use_ip = ib->ip;
            imm->last_use_ip = ib->ip;
+            imm->used_in_single_block = true;
         } else {
            bblock_t *intersection = idom.intersect(ib->block,
                                                    imm->block);

+            if (ib->block != imm->block)
+               imm->used_in_single_block = false;
+
            if (imm->first_use_ip > ib->ip) {
               imm->first_use_ip = ib->ip;

@@ -1416,9 +1550,48 @@ fs_visitor::opt_combine_constants()
   if (cfg->num_blocks != 1)
      qsort(table.imm, table.len, sizeof(struct imm), compare);

+   if (devinfo->ver > 7) {
+      struct register_allocation *regs =
+         (struct register_allocation *) calloc(table.len, sizeof(regs[0]));
+
+      for (int i = 0; i < table.len; i++) {
+         regs[i].nr = UINT_MAX;
+         regs[i].avail = 0xffff;
+      }
+
+      foreach_block(block, cfg) {
+         parcel_out_registers(table.imm, table.len, block, regs, table.len,
+                              alloc, devinfo->ver);
+      }
+
+      free(regs);
+   } else {
+      fs_reg reg(VGRF, alloc.allocate(1));
+      reg.stride = 0;
+
+      for (int i = 0; i < table.len; i++) {
+         struct imm *imm = &table.imm[i];
+
+         /* Put the immediate in an offset aligned to its size. Some
+          * instructions seem to have additional alignment requirements, so
+          * account for that too.
+          */
+         reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
+
+         /* Ensure we have enough space in the register to copy the immediate */
+         if (reg.offset + imm->size > REG_SIZE) {
+            reg.nr = alloc.allocate(1);
+            reg.offset = 0;
+         }
+
+         imm->nr = reg.nr;
+         imm->subreg_offset = reg.offset;
+
+         reg.offset += imm->size;
+      }
+   }
+
   /* Insert MOVs to load the constant values into GRFs. */
-   fs_reg reg(VGRF, alloc.allocate(1));
-   reg.stride = 0;
   for (int i = 0; i < table.len; i++) {
      struct imm *imm = &table.imm[i];
      /* Insert it either before the instruction that generated the immediate
@@ -1442,24 +1615,22 @@ fs_visitor::opt_combine_constants()
      const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);

+      fs_reg reg(VGRF, imm->nr);
+      reg.offset = imm->subreg_offset;
+      reg.stride = 0;
+
      /* Put the immediate in an offset aligned to its size. Some instructions
       * seem to have additional alignment requirements, so account for that
       * too.
       */
-      reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
+      assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));
+
+      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);

      /* Ensure we have enough space in the register to copy the immediate */
-      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
-      if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {
-         reg.nr = alloc.allocate(1);
-         reg.offset = 0;
-      }
+      assert(reg.offset + type_sz(imm_reg.type) * width <= REG_SIZE);

      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
-      imm->nr = reg.nr;
-      imm->subreg_offset = reg.offset;
-
-      reg.offset += imm->size * width;
   }
   shader_stats.promoted_constants = table.len;