diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index 2c901434e5b..127499ac99e 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -46,6 +46,99 @@ namespace {
          !inst->src[0].abs;
    }
 
+   /*
+    * Return an acceptable byte stride for the specified source of an
+    * instruction affected by a regioning restriction.
+    */
+   unsigned
+   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
+                            unsigned i)
+   {
+      if (has_dst_aligned_region_restriction(devinfo, inst)) {
+         return MAX2(type_sz(inst->dst.type), byte_stride(inst->dst));
+
+      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
+                 type_sz(inst->src[i].type) < 4 && byte_stride(inst->src[i]) >= 4) {
+         /* Use a stride of 32bits if possible, since that will guarantee that
+          * the copy emitted to lower this region won't be affected by the
+          * sub-dword integer region restrictions. This may not be possible
+          * for the second source of an instruction if we're required to use
+          * packed data due to Wa_16012383669.
+          */
+         return (i == 1 ? type_sz(inst->src[i].type) : 4);
+
+      } else {
+         return byte_stride(inst->src[i]);
+      }
+   }
+
+   /*
+    * Return an acceptable byte sub-register offset for the specified source
+    * of an instruction affected by a regioning restriction.
+    */
+   unsigned
+   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
+                            unsigned i)
+   {
+      if (has_dst_aligned_region_restriction(devinfo, inst)) {
+         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+
+      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
+                 type_sz(inst->src[i].type) < 4 && byte_stride(inst->src[i]) >= 4) {
+         const unsigned dst_byte_stride = MAX2(byte_stride(inst->dst),
+                                               type_sz(inst->dst.type));
+         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
+         const unsigned dst_byte_offset =
+            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+         const unsigned src_byte_offset =
+            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
+
+         if (src_byte_stride > type_sz(inst->src[i].type)) {
+            assert(src_byte_stride >= dst_byte_stride);
+            /* The source is affected by the Xe2+ sub-dword integer regioning
+             * restrictions. For the case of source 0 BSpec#56640 specifies a
+             * number of equations relating the source and destination
+             * sub-register numbers in all cases where a source stride of
+             * 32bits is allowed. These equations have the form:
+             *
+             *   k * Dst.SubReg % m = Src.SubReg / l
+             *
+             * For some constants k, l and m different for each combination of
+             * source and destination types and strides. The expression in
+             * the return statement below computes a valid source offset by
+             * inverting the equation like:
+             *
+             *   Src.SubReg = l * k * (Dst.SubReg % m)
+             *
+             * and then scaling by the element type sizes in order to get an
+             * expression in terms of byte offsets instead of sub-register
+             * numbers. It can be easily verified that in all cases listed on
+             * the hardware spec where the source has a well-defined uniform
+             * stride the product l*k is equal to the ratio between the source
+             * and destination strides.
+             */
+            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
+            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
+         } else {
+            assert(src_byte_stride == type_sz(inst->src[i].type));
+            /* A packed source is required, likely due to the stricter
+             * requirements of the second source region. The source being
+             * packed guarantees that the region of the original instruction
+             * will be valid, but the copy may break the regioning
+             * restrictions. Do our best to try to prevent that from
+             * happening by making sure the offset of the temporary matches
+             * the original source based on the same equation above -- However
+             * that may not be sufficient if the source had a stride larger
+             * than 32bits, lowering the copy recursively may be necessary.
+             */
+            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
+         }
+
+      } else {
+         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
+      }
+   }
+
    /*
     * Return an acceptable byte stride for the destination of an instruction
     * that requires it to have some particular alignment.
@@ -193,43 +286,6 @@
       }
    }
 
-   /*
-    * Return the stride between channels of the specified register in
-    * byte units, or ~0u if the region cannot be represented with a
-    * single one-dimensional stride.
-    */
-   unsigned
-   byte_stride(const fs_reg &reg)
-   {
-      switch (reg.file) {
-      case BAD_FILE:
-      case UNIFORM:
-      case IMM:
-      case VGRF:
-      case ATTR:
-         return reg.stride * type_sz(reg.type);
-      case ARF:
-      case FIXED_GRF:
-         if (reg.is_null()) {
-            return 0;
-         } else {
-            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
-            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
-            const unsigned width = 1 << reg.width;
-
-            if (width == 1) {
-               return vstride * type_sz(reg.type);
-            } else if (hstride * width == vstride) {
-               return hstride * type_sz(reg.type);
-            } else {
-               return ~0u;
-            }
-         }
-      default:
-         unreachable("Invalid register file");
-      }
-   }
-
    /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
@@ -257,10 +313,13 @@ namespace {
       const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
       const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
 
-      return has_dst_aligned_region_restriction(devinfo, inst) &&
-             !is_uniform(inst->src[i]) &&
-             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
-              src_byte_offset != dst_byte_offset);
+      return (has_dst_aligned_region_restriction(devinfo, inst) &&
+              !is_uniform(inst->src[i]) &&
+              (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
+               src_byte_offset != dst_byte_offset)) ||
+             (has_subdword_integer_region_restriction(devinfo, inst) &&
+              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
+               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
    }
 
    /*
@@ -470,13 +529,24 @@ namespace {
    lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
    {
       assert(inst->components_read(i) == 1);
+      const intel_device_info *devinfo = v->devinfo;
       const fs_builder ibld(v, block, inst);
-      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
+      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
          type_sz(inst->src[i].type);
       assert(stride > 0);
-      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
+      /* Calculate the size of the temporary allocation manually instead of
+       * relying on the builder, since we may have to add some amount of
+       * padding mandated by the hardware for Xe2+ instructions with sub-dword
+       * integer regions.
+       */
+      const unsigned size =
+         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
+                      inst->exec_size * stride * type_sz(inst->src[i].type),
+                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
+      fs_reg tmp(VGRF, v->alloc.allocate(size), inst->src[i].type);
       ibld.UNDEF(tmp);
-      tmp = horiz_stride(tmp, stride);
+      tmp = byte_offset(horiz_stride(tmp, stride),
+                        required_src_byte_offset(devinfo, inst, i));
 
       /* Emit a series of 32-bit integer copies with any source modifiers
        * cleaned up (because their semantics are dependent on the type).
@@ -488,8 +558,16 @@ namespace {
       raw_src.negate = false;
       raw_src.abs = false;
 
-      for (unsigned j = 0; j < n; j++)
-         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
+      for (unsigned j = 0; j < n; j++) {
+         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
+                                  subscript(raw_src, raw_type, j));
+         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
+            /* The copy isn't guaranteed to comply with all subdword integer
+             * regioning restrictions in some cases. Lower it recursively.
+             */
+            lower_instruction(v, block, jnst);
+         }
+      }
 
       /* Point the original instruction at the temporary, making sure to keep
        * any source modifiers in the instruction.
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 4861a2ae279..7c196a8b71a 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -720,6 +720,43 @@ is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
                 inst->dst.type == BRW_REGISTER_TYPE_DF));
 }
 
+/*
+ * Return the stride between channels of the specified register in
+ * byte units, or ~0u if the region cannot be represented with a
+ * single one-dimensional stride.
+ */
+static inline unsigned
+byte_stride(const fs_reg &reg)
+{
+   switch (reg.file) {
+   case BAD_FILE:
+   case UNIFORM:
+   case IMM:
+   case VGRF:
+   case ATTR:
+      return reg.stride * type_sz(reg.type);
+   case ARF:
+   case FIXED_GRF:
+      if (reg.is_null()) {
+         return 0;
+      } else {
+         const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
+         const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
+         const unsigned width = 1 << reg.width;
+
+         if (width == 1) {
+            return vstride * type_sz(reg.type);
+         } else if (hstride * width == vstride) {
+            return hstride * type_sz(reg.type);
+         } else {
+            return ~0u;
+         }
+      }
+   default:
+      unreachable("Invalid register file");
+   }
+}
+
 /**
  * Return whether the following regioning restriction applies to the specified
  * instruction. From the Cherryview PRM Vol 7. "Register Region
@@ -768,6 +805,30 @@ has_dst_aligned_region_restriction(const intel_device_info *devinfo,
    return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
 }
 
+/**
+ * Return true if the instruction can be potentially affected by the Xe2+
+ * regioning restrictions that apply to integer types smaller than a dword.
+ * The restriction isn't quoted here due to its length, see BSpec #56640 for
+ * details.
+ */
+static inline bool
+has_subdword_integer_region_restriction(const intel_device_info *devinfo,
+                                        const fs_inst *inst)
+{
+   if (devinfo->ver >= 20 &&
+       brw_reg_type_is_integer(inst->dst.type) &&
+       MAX2(byte_stride(inst->dst), type_sz(inst->dst.type)) < 4) {
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (brw_reg_type_is_integer(inst->src[i].type) &&
+             type_sz(inst->src[i].type) < 4 &&
+             byte_stride(inst->src[i]) >= 4)
+            return true;
+      }
+   }
+
+   return false;
+}
+
 /**
  * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
  * the specified register file into a VGRF.