diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 419643af893..f39107fc549 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -818,7 +818,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst, static bool try_constant_propagate(const brw_compiler *compiler, fs_inst *inst, - acp_entry *entry) + acp_entry *entry, int arg) { const struct intel_device_info *devinfo = compiler->devinfo; bool progress = false; @@ -826,321 +826,299 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst, if (type_sz(entry->src.type) > 4) return false; - for (int i = inst->sources - 1; i >= 0; i--) { - if (inst->src[i].file != VGRF) - continue; + if (inst->src[arg].file != VGRF) + return false; - assert(entry->dst.file == VGRF); - if (inst->src[i].nr != entry->dst.nr) - continue; + assert(entry->dst.file == VGRF); + if (inst->src[arg].nr != entry->dst.nr) + return false; - /* Bail if inst is reading a range that isn't contained in the range - * that entry is writing. + /* Bail if inst is reading a range that isn't contained in the range + * that entry is writing. + */ + if (!region_contained_in(inst->src[arg], inst->size_read(arg), + entry->dst, entry->size_written)) + return false; + + /* If the size of the use type is larger than the size of the entry + * type, the entry doesn't contain all of the data that the user is + * trying to use. + */ + if (type_sz(inst->src[arg].type) > type_sz(entry->dst.type)) + return false; + + fs_reg val = entry->src; + + /* If the size of the use type is smaller than the size of the entry, + * clamp the value to the range of the use type. This enables constant + * copy propagation in cases like + * + * + * mov(8) g12<1>UD 0x0000000cUD + * ... + * mul(8) g47<1>D g86<8,8,1>D g12<16,8,2>W + */ + if (type_sz(inst->src[arg].type) < type_sz(entry->dst.type)) { + if (type_sz(inst->src[arg].type) != 2 || type_sz(entry->dst.type) != 4) + return false; + + assert(inst->src[arg].subnr == 0 || inst->src[arg].subnr == 2); + + /* When subnr is 0, we want the lower 16-bits, and when it's 2, we + * want the upper 16-bits. No other values of subnr are valid for a + * UD source. */ - if (!region_contained_in(inst->src[i], inst->size_read(i), - entry->dst, entry->size_written)) - continue; + const uint16_t v = inst->src[arg].subnr == 2 ? val.ud >> 16 : val.ud; - /* If the size of the use type is larger than the size of the entry - * type, the entry doesn't contain all of the data that the user is - * trying to use. + val.ud = v | (uint32_t(v) << 16); + } + + val.type = inst->src[arg].type; + + if (inst->src[arg].abs) { + if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || + !brw_abs_immediate(val.type, &val.as_brw_reg())) { + return false; + } + } + + if (inst->src[arg].negate) { + if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || + !brw_negate_immediate(val.type, &val.as_brw_reg())) { + return false; + } + } + + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case SHADER_OPCODE_LOAD_PAYLOAD: + case FS_OPCODE_PACK: + inst->src[arg] = val; + progress = true; + break; + + case SHADER_OPCODE_POW: + /* Allow constant propagation into src1 (except on Gen 6 which + * doesn't support scalar source math), and let constant combining + * promote the constant on Gen < 8. 
*/ - if (type_sz(inst->src[i].type) > type_sz(entry->dst.type)) - continue; - - fs_reg val = entry->src; - - /* If the size of the use type is smaller than the size of the entry, - * clamp the value to the range of the use type. This enables constant - * copy propagation in cases like - * - * - * mov(8) g12<1>UD 0x0000000cUD - * ... - * mul(8) g47<1>D g86<8,8,1>D g12<16,8,2>W - */ - if (type_sz(inst->src[i].type) < type_sz(entry->dst.type)) { - if (type_sz(inst->src[i].type) != 2 || type_sz(entry->dst.type) != 4) - continue; - - assert(inst->src[i].subnr == 0 || inst->src[i].subnr == 2); - - /* When subnr is 0, we want the lower 16-bits, and when it's 2, we - * want the upper 16-bits. No other values of subnr are valid for a - * UD source. - */ - const uint16_t v = inst->src[i].subnr == 2 ? val.ud >> 16 : val.ud; - - val.ud = v | (uint32_t(v) << 16); - } - - val.type = inst->src[i].type; - - if (inst->src[i].abs) { - if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || - !brw_abs_immediate(val.type, &val.as_brw_reg())) { - continue; - } - } - - if (inst->src[i].negate) { - if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || - !brw_negate_immediate(val.type, &val.as_brw_reg())) { - continue; - } - } - - switch (inst->opcode) { - case BRW_OPCODE_MOV: - case SHADER_OPCODE_LOAD_PAYLOAD: - case FS_OPCODE_PACK: - inst->src[i] = val; - progress = true; + if (devinfo->ver == 6) break; - case SHADER_OPCODE_POW: - /* Allow constant propagation into src1 (except on Gen 6 which - * doesn't support scalar source math), and let constant combining - * promote the constant on Gen < 8. + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } + break; + + case BRW_OPCODE_SUBB: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } + break; + + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0 && inst->src[1].file != IMM) { + /* Don't copy propagate the constant in situations like + * + * mov(8) g8<1>D 0x7fffffffD + * mul(8) g16<1>D g8<8,8,1>D g15<16,8,2>W + * + * On platforms that only have a 32x16 multiplier, this will + * result in lowering the multiply to + * + * mul(8) g15<1>D g14<8,8,1>D 0xffffUW + * mul(8) g16<1>D g14<8,8,1>D 0x7fffUW + * add(8) g15.1<2>UW g15.1<16,8,2>UW g16<16,8,2>UW + * + * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it + * results in + * + * mul(8) g16<1>D g15<16,8,2>W 0x7fffffffD + * + * Volume 2a of the Skylake PRM says: + * + * When multiplying a DW and any lower precision integer, the + * DW operand must on src0. */ - if (devinfo->ver == 6) + if (inst->opcode == BRW_OPCODE_MUL && + type_sz(inst->src[1].type) < 4 && + type_sz(val.type) == 4) break; - if (i == 1) { - inst->src[i] = val; - progress = true; - } + /* Fit this constant in by commuting the operands. + * Exception: we can't do this for 32-bit integer MUL/MACH + * because it's asymmetric. + * + * The BSpec says for Broadwell that + * + * "When multiplying DW x DW, the dst cannot be accumulator." + * + * Integer MUL with a non-accumulator destination will be lowered + * by lower_integer_multiplication(), so don't restrict it. 
+ */ + if (((inst->opcode == BRW_OPCODE_MUL && + inst->dst.is_accumulator()) || + inst->opcode == BRW_OPCODE_MACH) && + (inst->src[1].type == BRW_REGISTER_TYPE_D || + inst->src[1].type == BRW_REGISTER_TYPE_UD)) + break; + inst->src[0] = inst->src[1]; + inst->src[1] = val; + progress = true; + } + break; + + case BRW_OPCODE_ADD3: + /* add3 can have a single imm16 source. Proceed if the source type is + * already W or UW or the value can be coerced to one of those types. + */ + if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW) + ; /* Nothing to do. */ + else if (val.ud <= 0xffff) + val = brw_imm_uw(val.ud); + else if (val.d >= -0x8000 && val.d <= 0x7fff) + val = brw_imm_w(val.d); + else break; - case BRW_OPCODE_SUBB: - if (i == 1) { - inst->src[i] = val; - progress = true; - } - break; + if (arg == 2) { + inst->src[arg] = val; + progress = true; + } else if (inst->src[2].file != IMM) { + inst->src[arg] = inst->src[2]; + inst->src[2] = val; + progress = true; + } - case BRW_OPCODE_MACH: - case BRW_OPCODE_MUL: - case SHADER_OPCODE_MULH: - case BRW_OPCODE_ADD: - case BRW_OPCODE_XOR: - case BRW_OPCODE_ADDC: - if (i == 1) { - inst->src[i] = val; - progress = true; - } else if (i == 0 && inst->src[1].file != IMM) { - /* Don't copy propagate the constant in situations like - * - * mov(8) g8<1>D 0x7fffffffD - * mul(8) g16<1>D g8<8,8,1>D g15<16,8,2>W - * - * On platforms that only have a 32x16 multiplier, this will - * result in lowering the multiply to - * - * mul(8) g15<1>D g14<8,8,1>D 0xffffUW - * mul(8) g16<1>D g14<8,8,1>D 0x7fffUW - * add(8) g15.1<2>UW g15.1<16,8,2>UW g16<16,8,2>UW - * - * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it - * results in - * - * mul(8) g16<1>D g15<16,8,2>W 0x7fffffffD - * - * Volume 2a of the Skylake PRM says: - * - * When multiplying a DW and any lower precision integer, the - * DW operand must on src0. - */ - if (inst->opcode == BRW_OPCODE_MUL && - type_sz(inst->src[1].type) < 4 && - type_sz(val.type) == 4) - break; + break; - /* Fit this constant in by commuting the operands. - * Exception: we can't do this for 32-bit integer MUL/MACH - * because it's asymmetric. - * - * The BSpec says for Broadwell that - * - * "When multiplying DW x DW, the dst cannot be accumulator." - * - * Integer MUL with a non-accumulator destination will be lowered - * by lower_integer_multiplication(), so don't restrict it. + case BRW_OPCODE_CMP: + case BRW_OPCODE_IF: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0 && inst->src[1].file != IMM) { + enum brw_conditional_mod new_cmod; + + new_cmod = brw_swap_cmod(inst->conditional_mod); + if (new_cmod != BRW_CONDITIONAL_NONE) { + /* Fit this constant in by swapping the operands and + * flipping the test */ - if (((inst->opcode == BRW_OPCODE_MUL && - inst->dst.is_accumulator()) || - inst->opcode == BRW_OPCODE_MACH) && - (inst->src[1].type == BRW_REGISTER_TYPE_D || - inst->src[1].type == BRW_REGISTER_TYPE_UD)) - break; inst->src[0] = inst->src[1]; inst->src[1] = val; + inst->conditional_mod = new_cmod; progress = true; } - break; + } + break; - case BRW_OPCODE_ADD3: - /* add3 can have a single imm16 source. Proceed if the source type is - * already W or UW or the value can be coerced to one of those types. - */ - if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW) - ; /* Nothing to do. 
*/ - else if (val.ud <= 0xffff) - val = brw_imm_uw(val.ud); - else if (val.d >= -0x8000 && val.d <= 0x7fff) - val = brw_imm_w(val.d); - else - break; - - if (i == 2) { - inst->src[i] = val; - progress = true; - } else if (inst->src[2].file != IMM) { - inst->src[i] = inst->src[2]; - inst->src[2] = val; - progress = true; - } - - break; - - case BRW_OPCODE_CMP: - case BRW_OPCODE_IF: - if (i == 1) { - inst->src[i] = val; - progress = true; - } else if (i == 0 && inst->src[1].file != IMM) { - enum brw_conditional_mod new_cmod; - - new_cmod = brw_swap_cmod(inst->conditional_mod); - if (new_cmod != BRW_CONDITIONAL_NONE) { - /* Fit this constant in by swapping the operands and - * flipping the test - */ - inst->src[0] = inst->src[1]; - inst->src[1] = val; - inst->conditional_mod = new_cmod; - progress = true; - } - } - break; - - case BRW_OPCODE_SEL: - if (i == 1) { - inst->src[i] = val; - progress = true; - } else if (i == 0) { - if (inst->src[1].file != IMM && - (inst->conditional_mod == BRW_CONDITIONAL_NONE || - /* Only GE and L are commutative. */ - inst->conditional_mod == BRW_CONDITIONAL_GE || - inst->conditional_mod == BRW_CONDITIONAL_L)) { - inst->src[0] = inst->src[1]; - inst->src[1] = val; - - /* If this was predicated, flipping operands means - * we also need to flip the predicate. - */ - if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { - inst->predicate_inverse = - !inst->predicate_inverse; - } - } else { - inst->src[0] = val; - } - - progress = true; - } - break; - - case FS_OPCODE_FB_WRITE_LOGICAL: - /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are - * bit-cast using a strided region so they cannot be immediates. - */ - if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL && - i != FB_WRITE_LOGICAL_SRC_OMASK) { - inst->src[i] = val; - progress = true; - } - break; - - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - /* Allow constant propagation into either source (except on Gen 6 - * which doesn't support scalar source math). Constant combining - * promote the src1 constant on Gen < 8, and it will promote the src0 - * constant on all platforms. 
- */ - if (devinfo->ver == 6) - break; - - FALLTHROUGH; - case BRW_OPCODE_AND: - case BRW_OPCODE_ASR: - case BRW_OPCODE_BFE: - case BRW_OPCODE_BFI1: - case BRW_OPCODE_BFI2: - case BRW_OPCODE_ROL: - case BRW_OPCODE_ROR: - case BRW_OPCODE_SHL: - case BRW_OPCODE_SHR: - case BRW_OPCODE_OR: - case SHADER_OPCODE_TEX_LOGICAL: - case SHADER_OPCODE_TXD_LOGICAL: - case SHADER_OPCODE_TXF_LOGICAL: - case SHADER_OPCODE_TXL_LOGICAL: - case SHADER_OPCODE_TXS_LOGICAL: - case FS_OPCODE_TXB_LOGICAL: - case SHADER_OPCODE_TXF_CMS_LOGICAL: - case SHADER_OPCODE_TXF_CMS_W_LOGICAL: - case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: - case SHADER_OPCODE_TXF_UMS_LOGICAL: - case SHADER_OPCODE_TXF_MCS_LOGICAL: - case SHADER_OPCODE_LOD_LOGICAL: - case SHADER_OPCODE_TG4_LOGICAL: - case SHADER_OPCODE_TG4_OFFSET_LOGICAL: - case SHADER_OPCODE_SAMPLEINFO_LOGICAL: - case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: - case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: - case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: - case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: - case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: - case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: - case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case SHADER_OPCODE_BROADCAST: - case BRW_OPCODE_MAD: - case BRW_OPCODE_LRP: - case FS_OPCODE_PACK_HALF_2x16_SPLIT: - case SHADER_OPCODE_SHUFFLE: - inst->src[i] = val; + case BRW_OPCODE_SEL: + if (arg == 1) { + inst->src[arg] = val; progress = true; + } else if (arg == 0) { + if (inst->src[1].file != IMM && + (inst->conditional_mod == BRW_CONDITIONAL_NONE || + /* Only GE and L are commutative. */ + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L)) { + inst->src[0] = inst->src[1]; + inst->src[1] = val; + + /* If this was predicated, flipping operands means + * we also need to flip the predicate. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { + inst->predicate_inverse = + !inst->predicate_inverse; + } + } else { + inst->src[0] = val; + } + + progress = true; + } + break; + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are + * bit-cast using a strided region so they cannot be immediates. + */ + if (arg != FB_WRITE_LOGICAL_SRC_SRC_STENCIL && + arg != FB_WRITE_LOGICAL_SRC_OMASK) { + inst->src[arg] = val; + progress = true; + } + break; + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* Allow constant propagation into either source (except on Gen 6 + * which doesn't support scalar source math). Constant combining + * promote the src1 constant on Gen < 8, and it will promote the src0 + * constant on all platforms. 
+ */ + if (devinfo->ver == 6) break; - default: - break; - } - } + FALLTHROUGH; + case BRW_OPCODE_AND: + case BRW_OPCODE_ASR: + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_ROL: + case BRW_OPCODE_ROR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_OR: + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_BROADCAST: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + case SHADER_OPCODE_SHUFFLE: + inst->src[arg] = val; + progress = true; + break; - /* ADD3 can only have the immediate as src0. */ - if (progress && inst->opcode == BRW_OPCODE_ADD3) { - if (inst->src[2].file == IMM) { - const auto src0 = inst->src[0]; - inst->src[0] = inst->src[2]; - inst->src[2] = src0; - } - } - - /* If only one of the sources of a 2-source, commutative instruction (e.g., - * AND) is immediate, it must be src1. If both are immediate, opt_algebraic - * should fold it away. - */ - if (progress && inst->sources == 2 && inst->is_commutative() && - inst->src[0].file == IMM && inst->src[1].file != IMM) { - const auto src1 = inst->src[1]; - inst->src[1] = inst->src[0]; - inst->src[0] = src1; + default: + break; } return progress; @@ -1178,25 +1156,50 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, foreach_inst_in_block(fs_inst, inst, block) { /* Try propagating into this instruction. */ - for (int i = 0; i < inst->sources; i++) { + bool instruction_progress = false; + for (int i = inst->sources - 1; i >= 0; i--) { if (inst->src[i].file != VGRF) continue; foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) { if (entry->src.file == IMM) { - if (try_constant_propagate(compiler, inst, entry)) { - progress = true; + if (try_constant_propagate(compiler, inst, entry, i)) { + instruction_progress = true; break; } } else { if (try_copy_propagate(compiler, inst, entry, i, alloc)) { - progress = true; + instruction_progress = true; break; } } } } + if (instruction_progress) { + progress = true; + + /* ADD3 can only have the immediate as src0. */ + if (inst->opcode == BRW_OPCODE_ADD3) { + if (inst->src[2].file == IMM) { + const auto src0 = inst->src[0]; + inst->src[0] = inst->src[2]; + inst->src[2] = src0; + } + } + + /* If only one of the sources of a 2-source, commutative instruction (e.g., + * AND) is immediate, it must be src1. If both are immediate, opt_algebraic + * should fold it away. 
+ */ + if (inst->sources == 2 && inst->is_commutative() && + inst->src[0].file == IMM && inst->src[1].file != IMM) { + const auto src1 = inst->src[1]; + inst->src[1] = inst->src[0]; + inst->src[0] = src1; + } + } + /* kill the destination from the ACP */ if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
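
Note (not part of the patch): the sketch below illustrates the control-flow change this diff makes, under simplified assumptions. try_constant_propagate() now takes the source index `arg` and handles exactly one operand per call, opt_copy_propagation_local() walks the sources from last to first, and the operand fix-ups run once per instruction after the per-source loop instead of inside try_constant_propagate(): an ADD3 whose immediate landed in src2 gets it swapped into src0, and a two-source commutative instruction with a lone immediate in src0 gets it swapped into src1. The Opcode, Operand and Inst types and the canonicalize_immediates() helper below are hypothetical stand-ins for illustration only, not the real fs_inst/fs_reg structures from Mesa.

    // Minimal, self-contained sketch of the caller-side canonicalization that
    // this patch moves into opt_copy_propagation_local(). Types are simplified
    // stand-ins, not Mesa's fs_inst/fs_reg.
    #include <algorithm>
    #include <cassert>

    enum class File { VGRF, IMM };
    enum class Opcode { ADD3, AND, MUL, MOV };

    struct Operand {
       File file = File::VGRF;
       unsigned nr = 0;   /* register number or immediate payload */
    };

    struct Inst {
       Opcode opcode = Opcode::MOV;
       int sources = 0;
       Operand src[3];

       bool is_commutative() const {
          return opcode == Opcode::AND || opcode == Opcode::MUL;
       }
    };

    /* Runs once per instruction, after every source has had its chance to
     * receive a propagated constant (the old code did this per acp entry). */
    static void canonicalize_immediates(Inst *inst)
    {
       /* ADD3 only accepts an immediate as src0, so move it there. */
       if (inst->opcode == Opcode::ADD3 && inst->src[2].file == File::IMM)
          std::swap(inst->src[0], inst->src[2]);

       /* For a 2-source commutative instruction, a lone immediate must be
        * src1. If both sources were immediates, algebraic optimization would
        * fold the instruction away instead.
        */
       if (inst->sources == 2 && inst->is_commutative() &&
           inst->src[0].file == File::IMM && inst->src[1].file != File::IMM)
          std::swap(inst->src[0], inst->src[1]);
    }

    int main()
    {
       Inst inst;
       inst.opcode = Opcode::AND;
       inst.sources = 2;
       inst.src[0] = {File::IMM, 0xff};   /* constant propagated into src0 */
       inst.src[1] = {File::VGRF, 12};

       canonicalize_immediates(&inst);
       assert(inst.src[0].file == File::VGRF && inst.src[1].file == File::IMM);
       return 0;
    }

Deferring the swap until after the per-source loop means the decision is made once, with full knowledge of which operands ended up as immediates, rather than repeatedly while individual entries are still being tried.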