intel/fs: drop FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7

We can lower FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD into other more
generic sends and drop this internal opcode.

The idea behind this change is to allow bindless surfaces to be used
for UBO pulls, which is why it's interesting to be able to reuse
setup_surface_descriptors(). That will come in a later change.

No shader-db changes on TGL & DG2.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20416>
This commit is contained in:
Lionel Landwerlin
2022-12-21 20:16:27 +02:00
committed by Marge Bot
parent 5bc91550d1
commit 13cca48920
9 changed files with 154 additions and 191 deletions

View File

@@ -559,7 +559,6 @@ enum opcode {
FS_OPCODE_PIXEL_X,
FS_OPCODE_PIXEL_Y,
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7,
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4,
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
FS_OPCODE_SET_SAMPLE_ID,
@@ -888,6 +887,17 @@ enum tex_logical_srcs {
TEX_LOGICAL_NUM_SRCS,
};
/**
 * Source slots of the logical FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
 * instruction, consumed by fs_visitor::lower_uniform_pull_constant_loads().
 */
enum pull_uniform_constant_srcs {
/** Surface binding table index */
PULL_UNIFORM_CONSTANT_SRC_SURFACE,
/** Byte offset into the surface (must be an immediate) */
PULL_UNIFORM_CONSTANT_SRC_OFFSET,
/** Size of the pull in bytes (must be an immediate) */
PULL_UNIFORM_CONSTANT_SRC_SIZE,
/** Total number of sources */
PULL_UNIFORM_CONSTANT_SRCS,
};
enum surface_logical_srcs {
/** Surface binding table index */
SURFACE_LOGICAL_SRC_SURFACE,

View File

@@ -248,7 +248,6 @@ fs_inst::is_control_source(unsigned arg) const
{
switch (opcode) {
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
return arg == 0;
@@ -306,9 +305,6 @@ fs_inst::is_payload(unsigned arg) const
case SHADER_OPCODE_BARRIER:
return arg == 0;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
return arg == 1;
case SHADER_OPCODE_SEND:
return arg == 2 || arg == 3;
@@ -864,12 +860,6 @@ fs_inst::size_read(int arg) const
return 1;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
/* The payload is actually stored in src1 */
if (arg == 1)
return mlen * REG_SIZE;
break;
case FS_OPCODE_LINTERP:
if (arg == 1)
return 16;
@@ -2460,8 +2450,14 @@ fs_visitor::lower_constant_loads()
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
const unsigned base = pull_index * 4;
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
srcs, PULL_UNIFORM_CONSTANT_SRCS);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
@@ -3678,106 +3674,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds()
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
/**
* Turns the generic expression-style uniform pull constant load instruction
* into a hardware-specific series of instructions for loading a pull
* constant.
*
* The expression style allows the CSE pass before this to optimize out
* repeated loads from the same offset, and gives the pre-register-allocation
* scheduling full flexibility, while the conversion to native instructions
* allows the post-register-allocation scheduler the best information
* possible.
*
* Note that execution masking for setting up pull constant loads is special:
* the channels that need to be written are unrelated to the current execution
* mask, since a later instruction will use one of the result channels as a
* source operand for all 8 or 16 of its channels.
*/
void
fs_visitor::lower_uniform_pull_constant_loads()
{
foreach_block_and_inst (block, fs_inst, inst, cfg) {
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
continue;
/* src[0] holds the surface binding table index, src[1] the byte offset
 * (required to be an immediate).
 */
const fs_reg& surface = inst->src[0];
const fs_reg& offset_B = inst->src[1];
assert(offset_B.file == IMM);
if (devinfo->has_lsc) {
/* LSC path: rewrite the instruction in place into a SHADER_OPCODE_SEND
 * to the UGM shared function — a single-lane transposed D32 block load
 * of size_written / 4 dwords from an A32 BTI address.
 */
const fs_builder ubld =
fs_builder(this, block, inst).group(8, 0).exec_all();
const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(payload, offset_B);
inst->sfid = GFX12_SFID_UGM;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1 /* simd_size */,
LSC_ADDR_SURFTYPE_BTI,
LSC_ADDR_SIZE_A32,
1 /* num_coordinates */,
LSC_DATA_SIZE_D32,
inst->size_written / 4,
true /* transpose */,
LSC_CACHE_LOAD_L1STATE_L3MOCS,
true /* has_dest */);
fs_reg ex_desc;
if (surface.file == IMM) {
/* Known surface index: encode it directly into an immediate
 * extended descriptor.
 */
ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
} else {
/* We only need the first component for the payload so we can use
 * one of the other components for the extended descriptor
 */
ex_desc = component(payload, 1);
ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
}
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
inst->ex_mlen = 0;
inst->header_size = 0;
inst->send_has_side_effects = false;
inst->send_is_volatile = true;
/* The transposed (block) load executes as a single channel. */
inst->exec_size = 1;
/* Finally, the payload */
inst->resize_sources(3);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = ex_desc;
inst->src[2] = payload;
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else if (devinfo->ver >= 7) {
/* Gfx7+ (pre-LSC): build a one-register message header — a copy of g0
 * with the offset (in 16B oword units, hence / 16) written to dword 2 —
 * and switch to the hardware-specific opcode; the generator emits the
 * actual send.
 */
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
ubld.group(8, 0).MOV(payload,
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
ubld.group(1, 0).MOV(component(payload, 2),
brw_imm_ud(offset_B.ud / 16));
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;
inst->src[1] = payload;
inst->header_size = 1;
inst->mlen = 1;
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else {
/* Before register allocation, we didn't tell the scheduler about the
 * MRF we use. We know it's safe to use this MRF because nothing
 * else does except for register spill/unspill, which generates and
 * uses its MRF within a single IR instruction.
 */
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
inst->mlen = 1;
}
}
}
bool
fs_visitor::lower_load_payload()
{

View File

@@ -621,10 +621,6 @@ private:
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);
void generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
struct brw_reg dst,
struct brw_reg surf_index,
struct brw_reg payload);
void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index);

View File

@@ -1553,67 +1553,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
read_offset, surf_index);
}
/**
 * Generate the pre-LSC Gfx7+ uniform pull constant load: a SEND to the
 * constant cache data port performing an OWord block read into dst.
 *
 * With an immediate surface index the whole message descriptor is encoded
 * statically; otherwise the surface index is masked into a0.0 and the send
 * uses an indirect descriptor.
 */
void
fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index,
struct brw_reg payload)
{
assert(index.type == BRW_REGISTER_TYPE_UD);
assert(payload.file == BRW_GENERAL_REGISTER_FILE);
assert(type_sz(dst.type) == 4);
/* Only reachable on non-LSC platforms. */
assert(!devinfo->has_lsc);
if (index.file == BRW_IMMEDIATE_VALUE) {
const uint32_t surf_index = index.ud;
/* Emit the SEND with execution masking disabled. */
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
brw_pop_insn_state(p);
brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE);
brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
/* One-register message; response length derived from size_written. */
brw_set_desc(p, send,
brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
REG_SIZE), true) |
brw_dp_desc(devinfo, surf_index,
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)));
} else {
/* Capture the current SWSB state so dependencies can be threaded
 * through the two instructions emitted below.
 */
const tgl_swsb swsb = brw_get_default_swsb(p);
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
/* a0.0 = surf_index & 0xff */
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
brw_set_dest(p, insn_and, addr);
brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
/* dst = send(payload, a0.0 | <descriptor>) */
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_send_indirect_message(
p, GFX6_SFID_DATAPORT_CONSTANT_CACHE,
retype(dst, BRW_REGISTER_TYPE_UD),
retype(payload, BRW_REGISTER_TYPE_UD), addr,
brw_message_desc(devinfo, 1,
DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
brw_dp_desc(devinfo, 0 /* surface */,
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)),
false /* EOT */);
brw_pop_insn_state(p);
}
}
void
fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
@@ -2294,12 +2233,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
send_count++;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
assert(inst->force_writemask_all);
generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]);
send_count++;
break;
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
generate_varying_pull_constant_load_gfx4(inst, dst, src[0]);
send_count++;

View File

@@ -4776,9 +4776,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
const unsigned count = MIN2(instr->num_components - c,
(block_sz - base % block_sz) / type_size);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
packed_consts, surf_index,
brw_imm_ud(base & ~(block_sz - 1)));
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surf_index;
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
srcs, PULL_UNIFORM_CONSTANT_SRCS);
const fs_reg consts =
retype(byte_offset(packed_consts, base & (block_sz - 1)),

View File

@@ -1001,7 +1001,6 @@ namespace {
abort();
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
@@ -1036,6 +1035,14 @@ namespace {
case SHADER_OPCODE_SEND:
switch (info.sfid) {
case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
if (devinfo->ver >= 7) {
/* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
} else {
abort();
}
case GFX6_SFID_DATAPORT_RENDER_CACHE:
if (devinfo->ver >= 7) {
switch (brw_dp_desc_msg_type(devinfo, info.desc)) {

View File

@@ -2778,3 +2778,118 @@ fs_visitor::lower_logical_sends()
return progress;
}
/**
* Turns the generic expression-style uniform pull constant load instruction
* into a hardware-specific series of instructions for loading a pull
* constant.
*
* The expression style allows the CSE pass before this to optimize out
* repeated loads from the same offset, and gives the pre-register-allocation
* scheduling full flexibility, while the conversion to native instructions
* allows the post-register-allocation scheduler the best information
* possible.
*
* Note that execution masking for setting up pull constant loads is special:
* the channels that need to be written are unrelated to the current execution
* mask, since a later instruction will use one of the result channels as a
* source operand for all 8 or 16 of its channels.
*/
void
fs_visitor::lower_uniform_pull_constant_loads()
{
foreach_block_and_inst (block, fs_inst, inst, cfg) {
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
continue;
/* Sources follow enum pull_uniform_constant_srcs; offset and size are
 * required to be immediates.
 */
const fs_reg& surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
const fs_reg& offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
const fs_reg& size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
assert(offset_B.file == IMM);
assert(size_B.file == IMM);
if (devinfo->has_lsc) {
/* LSC path: rewrite the instruction in place into a SHADER_OPCODE_SEND
 * to the UGM shared function — a single-lane transposed D32 block load
 * of size_written / 4 dwords from an A32 BTI address.
 */
const fs_builder ubld =
fs_builder(this, block, inst).group(8, 0).exec_all();
const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(payload, offset_B);
inst->sfid = GFX12_SFID_UGM;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1 /* simd_size */,
LSC_ADDR_SURFTYPE_BTI,
LSC_ADDR_SIZE_A32,
1 /* num_coordinates */,
LSC_DATA_SIZE_D32,
inst->size_written / 4,
true /* transpose */,
LSC_CACHE_LOAD_L1STATE_L3MOCS,
true /* has_dest */);
fs_reg ex_desc;
if (surface.file == IMM) {
/* Known surface index: encode it directly into an immediate
 * extended descriptor.
 */
ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
} else {
/* We only need the first component for the payload so we can use
 * one of the other components for the extended descriptor
 */
ex_desc = component(payload, 1);
ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
}
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
inst->ex_mlen = 0;
inst->header_size = 0;
inst->send_has_side_effects = false;
inst->send_is_volatile = true;
/* The transposed (block) load executes as a single channel. */
inst->exec_size = 1;
/* Finally, the payload */
inst->resize_sources(3);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = ex_desc;
inst->src[2] = payload;
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else if (devinfo->ver >= 7) {
/* Gfx7+ (pre-LSC): lower directly to a SHADER_OPCODE_SEND to the
 * constant cache data port, with a one-register header (a copy of g0
 * carrying the oword offset, hence / 16, in dword 2).
 */
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
/* NOTE(review): the header is allocated through the member builder
 * `bld` while the surrounding instructions use `ubld`; vgrf() only
 * allocates a register, so this looks harmless, but confirm `ubld`
 * wasn't intended here.
 */
fs_reg header = bld.exec_all().group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
ubld.group(8, 0).MOV(header,
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
ubld.group(1, 0).MOV(component(header, 2),
brw_imm_ud(offset_B.ud / 16));
inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
inst->opcode = SHADER_OPCODE_SEND;
inst->header_size = 1;
inst->mlen = 1;
/* OWord block read descriptor; size is expressed in dwords. */
uint32_t desc =
brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
size_B.ud / 4, false /* write */);
inst->resize_sources(4);
/* Fills out the send's descriptor sources (presumably src[0]/src[1])
 * from the surface; no bindless surface handle is used here yet.
 */
setup_surface_descriptors(ubld, inst, desc,
inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE],
fs_reg() /* surface_handle */);
inst->src[2] = header;
inst->src[3] = fs_reg(); /* unused for reads */
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else {
/* Before register allocation, we didn't tell the scheduler about the
 * MRF we use. We know it's safe to use this MRF because nothing
 * else does except for register spill/unspill, which generates and
 * uses its MRF within a single IR instruction.
 */
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
inst->mlen = 1;
}
}
}

View File

@@ -326,7 +326,6 @@ schedule_node::set_latency_gfx7(bool is_haswell)
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
case VS_OPCODE_PULL_CONSTANT_LOAD:
/* testing using varying-index pull constants:
*
@@ -399,6 +398,11 @@ schedule_node::set_latency_gfx7(bool is_haswell)
break;
}
case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
/* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
latency = 200;
break;
case GFX6_SFID_DATAPORT_RENDER_CACHE:
switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) {
case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:

View File

@@ -430,8 +430,6 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
return "uniform_pull_const";
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
return "uniform_pull_const_gfx7";
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
return "varying_pull_const_gfx4";
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: