From e666872c751bedd1e4c2e1231644c14ed18639e7 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 20 Sep 2023 12:42:24 -0700
Subject: [PATCH] intel/compiler: Initial bits for DPAS instruction

v2: Add brw_ir_performance.cpp and brw_fs_generator.cpp changes. Fix
overlapping register allocation (via has_source_and_destination_hazard). Fix
incorrect destination register file encoding.

v3: Prevent lower_regioning from trying to "fix" DPAS sources.

v4: Add instruction latency information for scheduling and perf
estimates.

v5: Remove all mention of DPASW. Suggested by Curro and Caio. Update
the comment in fs_inst::has_source_and_destination_hazard. Suggested
by Caio.

v6: Add some comments near the src2 calculation in
fs_inst::size_read. Suggested by Caio.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25994>
---
 src/intel/compiler/brw_eu.c                   |  1 +
 src/intel/compiler/brw_eu.h                   |  4 ++
 src/intel/compiler/brw_eu_defines.h           | 19 ++++++
 src/intel/compiler/brw_eu_emit.c              | 63 +++++++++++++++++++
 src/intel/compiler/brw_eu_validate.c          |  5 +-
 src/intel/compiler/brw_fs.cpp                 | 38 +++++++++++
 src/intel/compiler/brw_fs_builder.h           | 21 +++++++
 src/intel/compiler/brw_fs_generator.cpp       | 19 ++++++
 src/intel/compiler/brw_fs_lower_regioning.cpp |  4 +-
 src/intel/compiler/brw_inst.h                 | 61 ++++++++++++++++++
 src/intel/compiler/brw_ir.h                   | 10 +++
 src/intel/compiler/brw_ir_performance.cpp     | 32 +++++++++-
 .../compiler/brw_schedule_instructions.cpp    | 15 +++++
 src/intel/compiler/brw_shader.cpp             |  8 +++
 14 files changed, 297 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
index 597c316d752..e865300a5c0 100644
--- a/src/intel/compiler/brw_eu.c
+++ b/src/intel/compiler/brw_eu.c
@@ -696,6 +696,7 @@ static const struct opcode_desc opcode_descs[] = {
    { BRW_OPCODE_DP2,      87,  "dp2",     2,    1,    GFX_LT(GFX11) },
    { BRW_OPCODE_DP4A,     88,  "dp4a",    3,    1,    GFX_GE(GFX12) },
    { BRW_OPCODE_LINE,     89,  "line",    2,    1,    GFX_LE(GFX10) },
+   { BRW_OPCODE_DPAS,     89,  "dpas",    3,    1,    GFX_GE(GFX125) },
    { BRW_OPCODE_PLN,      90,  "pln",     2,    1,    GFX_GE(GFX45) & GFX_LE(GFX10) },
    { BRW_OPCODE_MAD,      91,  "mad",     3,    1,    GFX_GE(GFX6) },
    { BRW_OPCODE_LRP,      92,  "lrp",     3,    1,    GFX_GE(GFX6) & GFX_LE(GFX10) },
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 12118286c80..154ec4c9e7a 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1908,6 +1908,10 @@ void brw_CMPN(struct brw_codegen *p,
               struct brw_reg src0,
               struct brw_reg src1);
 
+brw_inst *brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth,
+                   unsigned rcount, struct brw_reg dest, struct brw_reg src0,
+                   struct brw_reg src1, struct brw_reg src2);
+
 void
 brw_untyped_atomic(struct brw_codegen *p,
                    struct brw_reg dst,
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 852ec8c169d..b22bcf38605 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -254,6 +254,7 @@ enum opcode {
    BRW_OPCODE_DP2,
    BRW_OPCODE_DP4A, /**< Gfx12+ */
    BRW_OPCODE_LINE,
+   BRW_OPCODE_DPAS,  /**< Gfx12.5+ */
    BRW_OPCODE_PLN, /**< G45+ */
    BRW_OPCODE_MAD, /**< Gfx6+ */
    BRW_OPCODE_LRP, /**< Gfx6+ */
@@ -1137,6 +1138,24 @@ enum tgl_sbid_mode {
    TGL_SBID_SET = 4
 };
 
+/** Encodings for the DPAS src1/src2 sub-byte precision fields. */
+enum gfx12_sub_byte_precision {
+   BRW_SUB_BYTE_PRECISION_NONE = 0,
+
+   /** 4 bits. Signedness determined by base type */
+   BRW_SUB_BYTE_PRECISION_4BIT = 1,
+
+   /** 2 bits. Signedness determined by base type */
+   BRW_SUB_BYTE_PRECISION_2BIT = 2,
+};
+
+enum gfx12_systolic_depth {
+   BRW_SYSTOLIC_DEPTH_16 = 0, /**< 16 wraps to 0 in the 2-bit sdepth field */
+   BRW_SYSTOLIC_DEPTH_2 = 1,
+   BRW_SYSTOLIC_DEPTH_4 = 2,
+   BRW_SYSTOLIC_DEPTH_8 = 3,
+};
+
 #ifdef __cplusplus
 /**
  * Allow bitwise arithmetic of tgl_sbid_mode enums.
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 93f1f930d1b..0dd7b3ac266 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1016,6 +1016,60 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
    return inst;
 }
 
+/* Emit a DPAS-format three-source instruction via next_insn().  All operands
+ * must be GRF registers, except that src0 may also be the null ARF register.
+ */
+static brw_inst *
+brw_dpas_three_src(struct brw_codegen *p, unsigned opcode,
+                   enum gfx12_systolic_depth sdepth, unsigned rcount,
+                   struct brw_reg dest, struct brw_reg src0,
+                   struct brw_reg src1, struct brw_reg src2)
+{
+   const struct intel_device_info *devinfo = p->devinfo;
+   brw_inst *inst = next_insn(p, opcode);
+
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+   brw_inst_set_dpas_3src_dst_reg_file(devinfo, inst,
+                                       BRW_GENERAL_REGISTER_FILE);
+   brw_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr);
+   brw_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr);
+
+   /* Must precede the type setters below, which assert against exec_type. */
+   brw_inst_set_dpas_3src_exec_type(devinfo, inst,
+                                    brw_reg_type_is_floating_point(dest.type) ?
+                                    BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT :
+                                    BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
+
+   brw_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth);
+   /* The hardware field holds the repeat count minus one. */
+   brw_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1);
+
+   brw_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type);
+   brw_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type);
+   brw_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type);
+   brw_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type);
+
+   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
+          (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+           src0.nr == BRW_ARF_NULL));
+   brw_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file);
+   brw_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr);
+   brw_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr);
+
+   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
+   brw_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file);
+   brw_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr);
+   brw_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr);
+   brw_inst_set_dpas_3src_src1_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE);
+
+   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
+   brw_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file);
+   brw_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr);
+   brw_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr);
+   brw_inst_set_dpas_3src_src2_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE);
+
+   return inst;
+}
 
 /***********************************************************************
  * Convenience routines.
@@ -1248,6 +1302,15 @@ brw_PLN(struct brw_codegen *p, struct brw_reg dest,
    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
 }
 
+brw_inst *
+brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth,
+         unsigned rcount, struct brw_reg dest, struct brw_reg src0,
+         struct brw_reg src1, struct brw_reg src2)
+{
+   return brw_dpas_three_src(p, BRW_OPCODE_DPAS, sdepth, rcount, dest, src0,
+                             src1, src2); /* callee encodes rcount - 1 */
+}
+
 brw_inst *
 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
 {
diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
index d105f6b0ef9..2d30c7fa37e 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -687,7 +687,10 @@ general_restrictions_based_on_operand_types(const struct brw_isa_info *isa,
       return error_msg;
 
    if (devinfo->ver >= 11) {
-      if (num_sources == 3) {
+      /* A register type of B or UB for DPAS actually means 4 bytes packed into
+       * a D or UD, so it is allowed.
+       */
+      if (num_sources == 3 && brw_inst_opcode(isa, inst) != BRW_OPCODE_DPAS) {
          ERROR_IF(brw_reg_type_to_size(brw_inst_3src_a1_src1_type(devinfo, inst)) == 1 ||
                   brw_reg_type_to_size(brw_inst_3src_a1_src2_type(devinfo, inst)) == 1,
                   "Byte data type is not supported for src1/2 register regioning. This includes "
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 3acdb20158f..ca2d18639ae 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -395,6 +395,21 @@ fs_inst::has_source_and_destination_hazard() const
       default:
          return !is_uniform(src[0]);
       }
+   case BRW_OPCODE_DPAS:
+      /* This is overly conservative. The actual hazard is more complicated to
+       * describe. When the repeat count is N, the single instruction behaves
+       * like N instructions with a repeat count of one, but the destination
+       * and source registers are incremented (in somewhat complex ways) for
+       * each instruction.
+       *
+       * This means the source and destination register is actually a range of
+       * registers. The hazard exists if an earlier iteration would write a
+       * register that should be read by a later iteration.
+       *
+       * There may be some advantage to properly modeling this, but for now,
+       * be overly conservative.
+       */
+      return rcount > 1;
    default:
       /* The SIMD16 compressed instruction
        *
@@ -844,6 +859,9 @@ fs_inst::components_read(unsigned i) const
       else
          return 1;
 
+   case BRW_OPCODE_DPAS:
+      unreachable("Do not use components_read() for DPAS.");
+
    default:
       return 1;
    }
@@ -904,6 +922,26 @@ fs_inst::size_read(int arg) const
       }
       break;
 
+   case BRW_OPCODE_DPAS:
+      switch (arg) {
+      case 0:
+         if (src[0].type == BRW_REGISTER_TYPE_HF) {
+            return rcount * REG_SIZE / 2;
+         } else {
+            return rcount * REG_SIZE;
+         }
+      case 1:
+         return sdepth * REG_SIZE;
+      case 2:
+         /* This is simpler than the formula described in the Bspec, but it
+          * covers all of the cases that we support on DG2.
+          */
+         return rcount * REG_SIZE;
+      default:
+         unreachable("Invalid source number.");
+      }
+      break;
+
    case SHADER_OPCODE_TEX:
    case FS_OPCODE_TXB:
    case SHADER_OPCODE_TXD:
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index c9c33f9e79e..63244f0b75b 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -834,6 +834,27 @@ namespace brw {
          return inst;
       }
 
+      /**
+       * Emit a SIMD8, depth-8 DPAS. size_written is rcount * REG_SIZE bytes,
+       * halved for an HF destination.
+       */
+      instruction *
+      DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
+           unsigned sdepth, unsigned rcount) const
+      {
+         assert(_dispatch_width == 8);
+         assert(sdepth == 8);
+         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);
+
+         instruction *dpas = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
+         dpas->sdepth = sdepth;
+         dpas->rcount = rcount;
+         dpas->size_written = rcount * REG_SIZE /
+                              (dst.type == BRW_REGISTER_TYPE_HF ? 2 : 1);
+
+         return dpas;
+      }
+
       fs_visitor *shader;
 
       fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 95b415afa73..53b966f01ec 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1606,6 +1606,19 @@ fs_generator::enable_debug(const char *shader_name)
    this->shader_name = shader_name;
 }
 
+/* Map a valid systolic depth to its hardware encoding, (ffs(d) - 1) & 3. */
+static gfx12_systolic_depth
+translate_systolic_depth(unsigned d)
+{
+   switch (d) {
+   case 2:  return BRW_SYSTOLIC_DEPTH_2;
+   case 4:  return BRW_SYSTOLIC_DEPTH_4;
+   case 8:  return BRW_SYSTOLIC_DEPTH_8;
+   case 16: return BRW_SYSTOLIC_DEPTH_16;
+   default: unreachable("Invalid systolic depth.");
+   }
+}
+
 int
 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                             struct shader_stats shader_stats,
@@ -1791,6 +1804,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          brw_LINE(p, dst, src[0], src[1]);
          break;
 
+      case BRW_OPCODE_DPAS:
+         assert(devinfo->verx10 >= 125);
+         brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount,
+                  dst, src[0], src[1], src[2]);
+         break;
+
       case BRW_OPCODE_MAD:
          assert(devinfo->ver >= 6);
          if (devinfo->ver < 10)
diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index a2c04e3ba5b..3bff7770cd0 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -253,8 +253,10 @@ namespace {
    has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                           unsigned i)
    {
-      if (is_send(inst) || inst->is_math() || inst->is_control_source(i))
+      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
+          inst->opcode == BRW_OPCODE_DPAS) {
          return false;
+      }
 
       /* Empirical testing shows that Broadwell has a bug affecting half-float
        * MAD instructions when any of its sources has a non-zero offset, such
diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index 06cabcd833e..6741dd6b21e 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -524,6 +524,67 @@ brw_inst_set_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo,
 }
 /** @} */
 
+/**
+ * Three-source systolic (DPAS) instruction fields, Gfx12+ encoding only:
+ *  @{
+ */
+F(dpas_3src_src2_reg_nr,    /* 4+ */ -1, -1,   /* 12+ */ 127, 120)
+F(dpas_3src_src2_subreg_nr, /* 4+ */ -1, -1,   /* 12+ */ 119, 115)
+F(dpas_3src_src2_reg_file,  /* 4+ */ -1, -1,   /* 12+ */ 114, 114)
+F(dpas_3src_src1_reg_nr,    /* 4+ */ -1, -1,   /* 12+ */ 111, 104)
+F(dpas_3src_src1_subreg_nr, /* 4+ */ -1, -1,   /* 12+ */ 103, 99)
+F(dpas_3src_src1_reg_file,  /* 4+ */ -1, -1,   /* 12+ */ 98,  98)
+F(dpas_3src_src1_hw_type,   /* 4+ */ -1, -1,   /* 12+ */ 90,  88)
+F(dpas_3src_src1_subbyte,   /* 4+ */ -1, -1,   /* 12+ */ 87,  86)
+F(dpas_3src_src2_subbyte,   /* 4+ */ -1, -1,   /* 12+ */ 85,  84)
+F(dpas_3src_src2_hw_type,   /* 4+ */ -1, -1,   /* 12+ */ 82,  80)
+F(dpas_3src_src0_reg_nr,    /* 4+ */ -1, -1,   /* 12+ */ 79,  72)
+F(dpas_3src_src0_subreg_nr, /* 4+ */ -1, -1,   /* 12+ */ 71,  67)
+F(dpas_3src_src0_reg_file,  /* 4+ */ -1, -1,   /* 12+ */ 66,  66)
+F(dpas_3src_dst_reg_nr,     /* 4+ */ -1, -1,   /* 12+ */ 63,  56)
+F(dpas_3src_dst_subreg_nr,  /* 4+ */ -1, -1,   /* 12+ */ 55,  51)
+F(dpas_3src_dst_reg_file,   /* 4+ */ -1, -1,   /* 12+ */ 50,  50)
+F(dpas_3src_sdepth,         /* 4+ */ -1, -1,   /* 12+ */ 49,  48)
+F(dpas_3src_rcount,         /* 4+ */ -1, -1,   /* 12+ */ 45,  43)
+F(dpas_3src_src0_hw_type,   /* 4+ */ -1, -1,   /* 12+ */ 42,  40)
+F(dpas_3src_exec_type,      /* 4+ */ -1, -1,   /* 12+ */ 39,  39)
+F(dpas_3src_dst_hw_type,    /* 4+ */ -1, -1,   /* 12+ */ 38,  36)
+/** @} */
+/* Typed setter/getter pairs over the DPAS operand hw_type fields. */
+#define REG_TYPE(reg)                                                         \
+static inline void                                                            \
+brw_inst_set_dpas_3src_##reg##_type(const struct intel_device_info *devinfo,  \
+                                    brw_inst *inst, enum brw_reg_type type)   \
+{                                                                             \
+   UNUSED enum gfx10_align1_3src_exec_type exec_type =                        \
+      (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\
+                                                                      inst);  \
+   if (brw_reg_type_is_floating_point(type)) {                                \
+      assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);                   \
+   } else {                                                                   \
+      assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_INT);                     \
+   }                                                                          \
+   unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(devinfo, type);         \
+   brw_inst_set_dpas_3src_##reg##_hw_type(devinfo, inst, hw_type);            \
+}                                                                             \
+                                                                              \
+static inline enum brw_reg_type                                               \
+brw_inst_dpas_3src_##reg##_type(const struct intel_device_info *devinfo,      \
+                              const brw_inst *inst)                           \
+{                                                                             \
+   enum gfx10_align1_3src_exec_type exec_type =                               \
+      (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\
+                                                                      inst);  \
+   unsigned hw_type = brw_inst_dpas_3src_##reg##_hw_type(devinfo, inst);      \
+   return brw_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type);       \
+}
+
+REG_TYPE(dst)
+REG_TYPE(src0)
+REG_TYPE(src1)
+REG_TYPE(src2)
+#undef REG_TYPE
+
 /**
  * Flow control instruction bits:
  *  @{
diff --git a/src/intel/compiler/brw_ir.h b/src/intel/compiler/brw_ir.h
index b77668a5e46..e7f54798303 100644
--- a/src/intel/compiler/brw_ir.h
+++ b/src/intel/compiler/brw_ir.h
@@ -199,6 +199,16 @@ struct backend_instruction {
     */
    unsigned flag_subreg:2;
 
+   /**
+    * Systolic depth used by DPAS instruction. A 4-bit field cannot hold 16.
+    */
+   unsigned sdepth:4;
+
+   /**
+    * Repeat count used by DPAS instruction.
+    */
+   unsigned rcount:4;
+
    /** The number of hardware registers used for a message header. */
    uint8_t header_size;
 };
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp
index b50ef8bd828..9ab7ef563b0 100644
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -148,6 +148,8 @@ namespace {
              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
+
+         rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
       }
 
       instruction_info(const struct brw_isa_info *isa,
@@ -155,7 +157,7 @@ namespace {
          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
          tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
-         desc(inst->desc), sfid(inst->sfid)
+         desc(inst->desc), sfid(inst->sfid), rcount(0)
       {
          /* Compute the maximum source size. */
          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
@@ -195,6 +197,8 @@ namespace {
       uint32_t desc;
       /** Send message shared function ID. */
       uint8_t sfid;
+      /** Repeat count for DPAS instructions. */
+      uint8_t rcount;
    };
 
    /**
@@ -505,6 +509,32 @@ namespace {
          else
             abort();
 
+      case BRW_OPCODE_DPAS: {
+         unsigned ld;
+
+         switch (info.rcount) {
+         case 1:
+            ld = 21;
+            break;
+         case 2:
+            ld = 22;
+            break;
+         case 8:
+         default:
+            ld = 32;
+            break;
+         }
+
+         /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
+          * for la and lf.
+          */
+         if (devinfo->verx10 >= 125)
+            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
+                                  0, ld, UINT_MAX, UINT_MAX, 0, 0);
+         else
+            abort();
+      }
+
       case SHADER_OPCODE_RCP:
       case SHADER_OPCODE_RSQ:
       case SHADER_OPCODE_SQRT:
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index 0855ee9a131..913805f2609 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -627,6 +627,21 @@ schedule_node::set_latency_gfx7(const struct brw_isa_info *isa)
       }
       break;
 
+   case BRW_OPCODE_DPAS:
+      switch (inst->rcount) {
+      case 1:
+         latency = 21;
+         break;
+      case 2:
+         latency = 22;
+         break;
+      case 8:
+      default:
+         latency = 32;
+         break;
+      }
+      break;
+
    default:
       /* 2 cycles:
        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index 5371e5fdcdd..c53a5e4fa6f 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -164,6 +164,13 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
       if (devinfo->ver > 7 && op == BRW_OPCODE_F16TO32)
          return "f16to32";
 
+      /* DPAS instructions may transiently exist on platforms that do not
+       * support DPAS. They will eventually be lowered, but in the meantime it
+       * must be possible to query the instruction name.
+       */
+      if (devinfo->verx10 < 125 && op == BRW_OPCODE_DPAS)
+         return "dpas";
+
       assert(brw_opcode_desc(isa, op)->name);
       return brw_opcode_desc(isa, op)->name;
    case FS_OPCODE_FB_WRITE:
@@ -936,6 +943,7 @@ backend_instruction::can_do_source_mods() const
    case BRW_OPCODE_ROR:
    case BRW_OPCODE_SUBB:
    case BRW_OPCODE_DP4A:
+   case BRW_OPCODE_DPAS:
    case SHADER_OPCODE_BROADCAST:
    case SHADER_OPCODE_CLUSTER_BROADCAST:
    case SHADER_OPCODE_MOV_INDIRECT: