intel/fs: Add a SCRATCH_HEADER opcode

This opcode is responsible for setting up the buffer base address and per-thread scratch space fields of a scratch message header. For the most part, it's a copy of g0 but some messages need us to zero out g0.2 and the bottom bits of g0.5.

This may actually fix a bug when nir_load/store_scratch is used. The docs say that the DWORD scattered messages respect the per-thread scratch size specified in gN.3[3:0] in the message header but we've been leaving it zero. This may mean that we've been ignoring any scratch reads/writes from a load/store_scratch intrinsic above the 1KB mark.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7084>
2020-10-09 04:13:20 -05:00
parent 24b64c8408
commit 06ebf23283
6 changed files with 86 additions and 33 deletions
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -475,6 +475,8 @@ enum opcode {
   SHADER_OPCODE_GEN4_SCRATCH_WRITE,
   SHADER_OPCODE_GEN7_SCRATCH_READ,

+   SHADER_OPCODE_SCRATCH_HEADER,
+
   /**
    * Gen8+ SIMD8 URB Read messages.
    */
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5429,42 +5429,15 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
   if ((devinfo->gen < 9 && is_typed_access) || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld.MOV(header, brw_imm_d(0));
      if (is_stateless) {
-         /* Copy the per-thread scratch from g0 for bounds checking */
-         ubld.group(1, 0).AND(component(header, 3),
-                              retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
-                              brw_imm_ud(0xf));
-
-         /* Both the typed and scattered byte/dword A32 messages take a buffer
-          * base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or
-          * MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d
-          * for more details.)  This is conveniently where the HW places the
-          * scratch surface base address.
-          *
-          * From the SKL PRM Vol. 7 "Per-Thread Scratch Space":
-          *
-          *    "When a thread becomes 'active' it is allocated a portion of
-          *    scratch space, sized according to PerThreadScratchSpace. The
-          *    starting location of each thread’s scratch space allocation,
-          *    ScratchSpaceOffset, is passed in the thread payload in
-          *    R0.5[31:10] and is specified as a 1KB-granular offset from the
-          *    GeneralStateBaseAddress.  The computation of ScratchSpaceOffset
-          *    includes the starting address of the stage’s scratch space
-          *    allocation, as programmed by ScratchSpaceBasePointer."
-          *
-          * The base address is passed in bits R0.5[31:10] and the bottom 10
-          * bits of R0.5 are used for other things.  Therefore, we have to
-          * mask off the bottom 10 bits so that we don't get a garbage base
-          * address.
-          */
-         ubld.group(1, 0).AND(component(header, 5),
-                              retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
-                              brw_imm_ud(0xfffffc00));
-      }
+         assert(!is_surface_access);
+         ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
+      } else {
+         ubld.MOV(header, brw_imm_d(0));
         if (is_surface_access)
            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
      }
+   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   fs_reg payload, payload2;
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -513,6 +513,7 @@ private:
   void generate_scratch_write(fs_inst *inst, struct brw_reg src);
   void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
+   void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
   void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                            struct brw_reg index,
                                            struct brw_reg offset);
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1533,6 +1533,76 @@ fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
 }

+/* The A32 messages take a buffer base address in header.5:[31:0] (See
+ * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
+ * and OWord block messages in the SKL PRM Vol. 2d for more details.)
+ * Unfortunately, there are a number of subtle differences:
+ *
+ * For the block read/write messages:
+ *
+ *   - We always stomp header.2 to fill in the actual scratch address (in
+ *     units of OWORDs) so we don't care what's in there.
+ *
+ *   - They rely on per-thread scratch space value in header.3[3:0] to do
+ *     bounds checking so that needs to be valid.  The upper bits of
+ *     header.3 are ignored, though, so we can copy all of g0.3.
+ *
+ *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
+ *
+ *
+ * For the byte/dword scattered read/write messages:
+ *
+ *   - We want header.2 to be zero because that gets added to the per-channel
+ *     offset in the non-header portion of the message.
+ *
+ *   - Contrary to what the docs claim, they don't do any bounds checking so
+ *     the value of header.3[3:0] doesn't matter.
+ *
+ *   - They consider all of header.5 for the base address and header.5[9:0]
+ *     are not ignored.  This means that we can't copy g0.5 verbatim because
+ *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
+ *     use an AND to mask off the bottom 10 bits.
+ *
+ *
+ * For block messages, just copying g0 gives a valid header because all the
+ * garbage gets ignored except for header.2 which we stomp as part of message
+ * setup.  For byte/dword scattered messages, we can just zero out the header
+ * and copy over the bits we need from g0.5.  This opcode, however, tries to
+ * satisfy the requirements of both by starting with 0 and filling out the
+ * information required by either set of opcodes.
+ */
+void
+fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size == 8 && inst->force_writemask_all);
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
+
+   dst.type = BRW_REGISTER_TYPE_UD; /* every write below treats dst as UD */
+
+   brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0)); /* zero all of dst first */
+   if (devinfo->gen >= 12)
+      brw_set_default_swsb(p, tgl_swsb_null());
+   else
+      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
+
+   /* Copy the per-thread scratch space size from g0.3[3:0] into dst.3 */
+   brw_set_default_exec_size(p, BRW_EXECUTE_1); /* both ANDs below run at exec size 1 */
+   insn = brw_AND(p, suboffset(dst, 3),
+                     retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
+                     brw_imm_ud(INTEL_MASK(3, 0)));
+   if (devinfo->gen < 12) { /* middle of the three writes: gets both no_dd flags */
+      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
+      brw_inst_set_no_dd_check(p->devinfo, insn, true);
+   }
+
+   /* Copy the scratch base address from g0.5[31:10] into dst.5; the mask drops bits 9:0 */
+   insn = brw_AND(p, suboffset(dst, 5),
+                     retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+                     brw_imm_ud(INTEL_MASK(31, 10)));
+   if (devinfo->gen < 12) /* last of the three writes: no_dd_check only */
+      brw_inst_set_no_dd_check(p->devinfo, insn, true);
+}
+
 void
 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
@@ -2265,6 +2335,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
         fill_count++;
 	 break;

+      case SHADER_OPCODE_SCRATCH_HEADER:
+         generate_scratch_header(inst, dst);
+         break;
+
      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -327,6 +327,7 @@ namespace {
      case BRW_OPCODE_LINE:
      case BRW_OPCODE_NOP:
      case SHADER_OPCODE_CLUSTER_BROADCAST:
+      case SHADER_OPCODE_SCRATCH_HEADER:
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
      case FS_OPCODE_DDY_COARSE:
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -349,6 +349,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
      return "gen4_scratch_write";
   case SHADER_OPCODE_GEN7_SCRATCH_READ:
      return "gen7_scratch_read";
+   case SHADER_OPCODE_SCRATCH_HEADER:
+      return "scratch_header";
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return "gen8_urb_write_simd8";
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: