From e666872c751bedd1e4c2e1231644c14ed18639e7 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 20 Sep 2023 12:42:24 -0700 Subject: [PATCH] intel/compiler: Initial bits for DPAS instruction v2: Add brw_ir_performance.cpp and brw_fs_generator.cpp changes. Fix overlapping register allocation (via has_source_and_destination_hazard). Fix incorrect destination register file encoding. v3: Prevent lower_regioning from trying to "fix" DPAS sources. v4: Add instruction latency information for scheduling and perf estimates. v5: Remove all mention of DPASW. Suggested by Curro and Caio. Update the comment in fs_inst::has_source_and_destination_hazard. Suggested by Caio. v6: Add some comments near the src2 calculation in fs_inst::size_read. Suggested by Caio. Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_eu.c | 1 + src/intel/compiler/brw_eu.h | 4 ++ src/intel/compiler/brw_eu_defines.h | 19 ++++++ src/intel/compiler/brw_eu_emit.c | 63 +++++++++++++++++++ src/intel/compiler/brw_eu_validate.c | 5 +- src/intel/compiler/brw_fs.cpp | 38 +++++++++++ src/intel/compiler/brw_fs_builder.h | 21 +++++++ src/intel/compiler/brw_fs_generator.cpp | 19 ++++++ src/intel/compiler/brw_fs_lower_regioning.cpp | 4 +- src/intel/compiler/brw_inst.h | 61 ++++++++++++++++++ src/intel/compiler/brw_ir.h | 10 +++ src/intel/compiler/brw_ir_performance.cpp | 32 +++++++++- .../compiler/brw_schedule_instructions.cpp | 15 +++++ src/intel/compiler/brw_shader.cpp | 8 +++ 14 files changed, 297 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c index 597c316d752..e865300a5c0 100644 --- a/src/intel/compiler/brw_eu.c +++ b/src/intel/compiler/brw_eu.c @@ -696,6 +696,7 @@ static const struct opcode_desc opcode_descs[] = { { BRW_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) }, { BRW_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) }, { BRW_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) }, + { BRW_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) }, { 
BRW_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) }, { BRW_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) }, { BRW_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) }, diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index 12118286c80..154ec4c9e7a 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -1908,6 +1908,10 @@ void brw_CMPN(struct brw_codegen *p, struct brw_reg src0, struct brw_reg src1); +brw_inst *brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth, + unsigned rcount, struct brw_reg dest, struct brw_reg src0, + struct brw_reg src1, struct brw_reg src2); + void brw_untyped_atomic(struct brw_codegen *p, struct brw_reg dst, diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 852ec8c169d..b22bcf38605 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -254,6 +254,7 @@ enum opcode { BRW_OPCODE_DP2, BRW_OPCODE_DP4A, /**< Gfx12+ */ BRW_OPCODE_LINE, + BRW_OPCODE_DPAS, /**< Gfx12.5+ */ BRW_OPCODE_PLN, /**< G45+ */ BRW_OPCODE_MAD, /**< Gfx6+ */ BRW_OPCODE_LRP, /**< Gfx6+ */ @@ -1137,6 +1138,24 @@ enum tgl_sbid_mode { TGL_SBID_SET = 4 }; + +enum gfx12_sub_byte_precision { + BRW_SUB_BYTE_PRECISION_NONE = 0, + + /** 4 bits. Signedness determined by base type */ + BRW_SUB_BYTE_PRECISION_4BIT = 1, + + /** 2 bits. Signedness determined by base type */ + BRW_SUB_BYTE_PRECISION_2BIT = 2, +}; + +enum gfx12_systolic_depth { + BRW_SYSTOLIC_DEPTH_16 = 0, + BRW_SYSTOLIC_DEPTH_2 = 1, + BRW_SYSTOLIC_DEPTH_4 = 2, + BRW_SYSTOLIC_DEPTH_8 = 3, +}; + #ifdef __cplusplus /** * Allow bitwise arithmetic of tgl_sbid_mode enums. 
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 93f1f930d1b..0dd7b3ac266 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -1016,6 +1016,60 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, return inst; } +static brw_inst * +brw_dpas_three_src(struct brw_codegen *p, enum opcode opcode, + enum gfx12_systolic_depth sdepth, unsigned rcount, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *inst = next_insn(p, opcode); + + assert(dest.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_dpas_3src_dst_reg_file(devinfo, inst, + BRW_GENERAL_REGISTER_FILE); + brw_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr); + + if (brw_reg_type_is_floating_point(dest.type)) { + brw_inst_set_dpas_3src_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); + } else { + brw_inst_set_dpas_3src_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_INT); + } + + brw_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth); + brw_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1); + + brw_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type); + brw_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type); + brw_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type); + brw_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type); + + assert(src0.file == BRW_GENERAL_REGISTER_FILE || + (src0.file == BRW_ARCHITECTURE_REGISTER_FILE && + src0.nr == BRW_ARF_NULL)); + + brw_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file); + brw_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr); + brw_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr); + + assert(src1.file == BRW_GENERAL_REGISTER_FILE); + + brw_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file); + brw_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, 
src1.nr); + brw_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr); + brw_inst_set_dpas_3src_src1_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE); + + assert(src2.file == BRW_GENERAL_REGISTER_FILE); + + brw_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file); + brw_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr); + brw_inst_set_dpas_3src_src2_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE); + + return inst; +} /*********************************************************************** * Convenience routines. @@ -1248,6 +1302,15 @@ brw_PLN(struct brw_codegen *p, struct brw_reg dest, return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1); } +brw_inst * +brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth, + unsigned rcount, struct brw_reg dest, struct brw_reg src0, + struct brw_reg src1, struct brw_reg src2) +{ + return brw_dpas_three_src(p, BRW_OPCODE_DPAS, sdepth, rcount, dest, src0, + src1, src2); +} + brw_inst * brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) { diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index d105f6b0ef9..2d30c7fa37e 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -687,7 +687,10 @@ general_restrictions_based_on_operand_types(const struct brw_isa_info *isa, return error_msg; if (devinfo->ver >= 11) { - if (num_sources == 3) { + /* A register type of B or UB for DPAS actually means 4 bytes packed into + * a D or UD, so it is allowed. + */ + if (num_sources == 3 && brw_inst_opcode(isa, inst) != BRW_OPCODE_DPAS) { ERROR_IF(brw_reg_type_to_size(brw_inst_3src_a1_src1_type(devinfo, inst)) == 1 || brw_reg_type_to_size(brw_inst_3src_a1_src2_type(devinfo, inst)) == 1, "Byte data type is not supported for src1/2 register regioning. 
This includes " diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 3acdb20158f..ca2d18639ae 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -395,6 +395,21 @@ fs_inst::has_source_and_destination_hazard() const default: return !is_uniform(src[0]); } + case BRW_OPCODE_DPAS: + /* This is overly conservative. The actual hazard is more complicated to + * describe. When the repeat count is N, the single instruction behaves + * like N instructions with a repeat count of one, but the destination + * and source registers are incremented (in somewhat complex ways) for + * each instruction. + * + * This means the source and destination register is actually a range of + * registers. The hazard exists if an earlier iteration would write a + * register that should be read by a later iteration. + * + * There may be some advantage to properly modeling this, but for now, + * be overly conservative. + */ + return rcount > 1; default: /* The SIMD16 compressed instruction * @@ -844,6 +859,9 @@ fs_inst::components_read(unsigned i) const else return 1; + case BRW_OPCODE_DPAS: + unreachable("Do not use components_read() for DPAS."); + default: return 1; } @@ -904,6 +922,26 @@ fs_inst::size_read(int arg) const } break; + case BRW_OPCODE_DPAS: + switch (arg) { + case 0: + if (src[0].type == BRW_REGISTER_TYPE_HF) { + return rcount * REG_SIZE / 2; + } else { + return rcount * REG_SIZE; + } + case 1: + return sdepth * REG_SIZE; + case 2: + /* This is simpler than the formula described in the Bspec, but it + * covers all of the cases that we support on DG2. 
+ */ + return rcount * REG_SIZE; + default: + unreachable("Invalid source number."); + } + break; + case SHADER_OPCODE_TEX: case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index c9c33f9e79e..63244f0b75b 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -834,6 +834,27 @@ namespace brw { return inst; } + instruction * + DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2, + unsigned sdepth, unsigned rcount) const + { + assert(_dispatch_width == 8); + assert(sdepth == 8); + assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8); + + instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2); + inst->sdepth = sdepth; + inst->rcount = rcount; + + if (dst.type == BRW_REGISTER_TYPE_HF) { + inst->size_written = rcount * REG_SIZE / 2; + } else { + inst->size_written = rcount * REG_SIZE; + } + + return inst; + } + fs_visitor *shader; fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); } diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 95b415afa73..53b966f01ec 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1606,6 +1606,19 @@ fs_generator::enable_debug(const char *shader_name) this->shader_name = shader_name; } +static gfx12_systolic_depth +translate_systolic_depth(unsigned d) +{ + /* Could also return (ffs(d) - 1) & 3. 
*/ + switch (d) { + case 2: return BRW_SYSTOLIC_DEPTH_2; + case 4: return BRW_SYSTOLIC_DEPTH_4; + case 8: return BRW_SYSTOLIC_DEPTH_8; + case 16: return BRW_SYSTOLIC_DEPTH_16; + default: unreachable("Invalid systolic depth."); + } +} + int fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, struct shader_stats shader_stats, @@ -1791,6 +1804,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_LINE(p, dst, src[0], src[1]); break; + case BRW_OPCODE_DPAS: + assert(devinfo->verx10 >= 125); + brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount, + dst, src[0], src[1], src[2]); + break; + case BRW_OPCODE_MAD: assert(devinfo->ver >= 6); if (devinfo->ver < 10) diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp index a2c04e3ba5b..3bff7770cd0 100644 --- a/src/intel/compiler/brw_fs_lower_regioning.cpp +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp @@ -253,8 +253,10 @@ namespace { has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst, unsigned i) { - if (is_send(inst) || inst->is_math() || inst->is_control_source(i)) + if (is_send(inst) || inst->is_math() || inst->is_control_source(i) || + inst->opcode == BRW_OPCODE_DPAS) { return false; + } /* Empirical testing shows that Broadwell has a bug affecting half-float * MAD instructions when any of its sources has a non-zero offset, such diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h index 06cabcd833e..6741dd6b21e 100644 --- a/src/intel/compiler/brw_inst.h +++ b/src/intel/compiler/brw_inst.h @@ -524,6 +524,67 @@ brw_inst_set_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo, } /** @} */ +/** + * Three-source systolic instructions: + * @{ + */ +F(dpas_3src_src2_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 127, 120) +F(dpas_3src_src2_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 119, 115) +F(dpas_3src_src2_reg_file, /* 4+ */ -1, -1, /* 12+ */ 114, 114) +F(dpas_3src_src1_reg_nr, 
/* 4+ */ -1, -1, /* 12+ */ 111, 104) +F(dpas_3src_src1_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 103, 99) +F(dpas_3src_src1_reg_file, /* 4+ */ -1, -1, /* 12+ */ 98, 98) +F(dpas_3src_src1_hw_type, /* 4+ */ -1, -1, /* 12+ */ 90, 88) +F(dpas_3src_src1_subbyte, /* 4+ */ -1, -1, /* 12+ */ 87, 86) +F(dpas_3src_src2_subbyte, /* 4+ */ -1, -1, /* 12+ */ 85, 84) +F(dpas_3src_src2_hw_type, /* 4+ */ -1, -1, /* 12+ */ 82, 80) +F(dpas_3src_src0_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 79, 72) +F(dpas_3src_src0_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 71, 67) +F(dpas_3src_src0_reg_file, /* 4+ */ -1, -1, /* 12+ */ 66, 66) +F(dpas_3src_dst_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 63, 56) +F(dpas_3src_dst_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 55, 51) +F(dpas_3src_dst_reg_file, /* 4+ */ -1, -1, /* 12+ */ 50, 50) +F(dpas_3src_sdepth, /* 4+ */ -1, -1, /* 12+ */ 49, 48) +F(dpas_3src_rcount, /* 4+ */ -1, -1, /* 12+ */ 45, 43) +F(dpas_3src_src0_hw_type, /* 4+ */ -1, -1, /* 12+ */ 42, 40) +F(dpas_3src_exec_type, /* 4+ */ -1, -1, /* 12+ */ 39, 39) +F(dpas_3src_dst_hw_type, /* 4+ */ -1, -1, /* 12+ */ 38, 36) +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + UNUSED enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\ + inst); \ + if (brw_reg_type_is_floating_point(type)) { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); \ + } else { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_INT); \ + } \ + unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(devinfo, type); \ + brw_inst_set_dpas_3src_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) 
brw_inst_dpas_3src_exec_type(devinfo,\ + inst); \ + unsigned hw_type = brw_inst_dpas_3src_##reg##_hw_type(devinfo, inst); \ + return brw_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +REG_TYPE(src2) +#undef REG_TYPE + /** * Flow control instruction bits: * @{ diff --git a/src/intel/compiler/brw_ir.h b/src/intel/compiler/brw_ir.h index b77668a5e46..e7f54798303 100644 --- a/src/intel/compiler/brw_ir.h +++ b/src/intel/compiler/brw_ir.h @@ -199,6 +199,16 @@ struct backend_instruction { */ unsigned flag_subreg:2; + /** + * Systolic depth used by DPAS instruction. + */ + unsigned sdepth:4; + + /** + * Repeat count used by DPAS instruction. + */ + unsigned rcount:4; + /** The number of hardware registers used for a message header. */ uint8_t header_size; }; diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index b50ef8bd828..9ab7ef563b0 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -148,6 +148,8 @@ namespace { !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 && type_sz(inst->src[0].type) == type_sz(inst->src[1].type)) tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D); + + rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0; } instruction_info(const struct brw_isa_info *isa, @@ -155,7 +157,7 @@ namespace { isa(isa), devinfo(isa->devinfo), op(inst->opcode), td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)), tx(get_exec_type(inst)), sx(0), ss(0), sc(0), - desc(inst->desc), sfid(inst->sfid) + desc(inst->desc), sfid(inst->sfid), rcount(0) { /* Compute the maximum source size. */ for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) @@ -195,6 +197,8 @@ namespace { uint32_t desc; /** Send message shared function ID. */ uint8_t sfid; + /** Repeat count for DPAS instructions. 
*/ + uint8_t rcount; }; /** @@ -505,6 +509,32 @@ namespace { else abort(); + case BRW_OPCODE_DPAS: { + unsigned ld; + + switch (info.rcount) { + case 1: + ld = 21; + break; + case 2: + ld = 22; + break; + case 8: + default: + ld = 32; + break; + } + + /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX + * for la and lf. + */ + if (devinfo->verx10 >= 125) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, ld, UINT_MAX, UINT_MAX, 0, 0); + else + abort(); + } + case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: case SHADER_OPCODE_SQRT: diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 0855ee9a131..913805f2609 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -627,6 +627,21 @@ schedule_node::set_latency_gfx7(const struct brw_isa_info *isa) } break; + case BRW_OPCODE_DPAS: + switch (inst->rcount) { + case 1: + latency = 21; + break; + case 2: + latency = 22; + break; + case 8: + default: + latency = 32; + break; + } + break; + default: /* 2 cycles: * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 5371e5fdcdd..c53a5e4fa6f 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -164,6 +164,13 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op) if (devinfo->ver > 7 && op == BRW_OPCODE_F16TO32) return "f16to32"; + /* DPAS instructions may transiently exist on platforms that do not + * support DPAS. They will eventually be lowered, but in the meantime it + * must be possible to query the instruction name. 
+ */ + if (devinfo->verx10 < 125 && op == BRW_OPCODE_DPAS) + return "dpas"; + assert(brw_opcode_desc(isa, op)->name); return brw_opcode_desc(isa, op)->name; case FS_OPCODE_FB_WRITE: @@ -936,6 +943,7 @@ backend_instruction::can_do_source_mods() const case BRW_OPCODE_ROR: case BRW_OPCODE_SUBB: case BRW_OPCODE_DP4A: + case BRW_OPCODE_DPAS: case SHADER_OPCODE_BROADCAST: case SHADER_OPCODE_CLUSTER_BROADCAST: case SHADER_OPCODE_MOV_INDIRECT: