diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index d0cadbdf3d5..709e0cca58d 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -1334,37 +1334,63 @@ special_restrictions_for_mixed_float_mode(const struct brw_isa_info *isa, } /** - * Creates an \p access_mask for an \p exec_size, \p element_size, and a region + * Creates a \p grf_access_mask for an \p exec_size, \p element_size, and a + * region * - * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is - * a bitmask of bytes accessed by the region. + * A \p grf_access_mask is a 32-element array of uint8_t, where each uint8_t + * is a bitmask of grfs accessed by the region. * * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4 * instruction would be * - * access_mask[0] = 0x00000000000000F0 - * access_mask[1] = 0x000000000000F000 - * access_mask[2] = 0x0000000000F00000 - * access_mask[3] = 0x00000000F0000000 + * access_mask[0] = 0x01 (bytes 7-4 of the 1st grf) + * access_mask[1] = 0x01 (bytes 15-12 of the 1st grf) + * access_mask[2] = 0x01 (bytes 23-20 of the 1st grf) + * access_mask[3] = 0x01 (bytes 31-28 of the 1st grf) + * access_mask[4-31] = 0 + * + * Before Xe2, gX<1,1,0>F in an exec_size == 16 would yield: + * + * access_mask[0] = 0x01 (bytes 3-0 of the 1st grf) + * access_mask[1] = 0x01 (bytes 7-4 of the 1st grf) + * ... + * access_mask[7] = 0x01 (bytes 31-28 of the 1st grf) + * access_mask[8] = 0x02 (bytes 3-0 of the 2nd grf) + * ... + * access_mask[15] = 0x02 (bytes 31-28 of the 2nd grf) + * access_mask[16-31] = 0 + * + * Whereas on Xe2, gX<1,1,0>F in an exec_size of 16 would yield: + * + * access_mask[0] = 0x01 (bytes 3-0 of the 1st grf) + * access_mask[1] = 0x01 (bytes 7-4 of the 1st grf) + * ... + * access_mask[7] = 0x01 (bytes 31-28 of the 1st grf) + * access_mask[8] = 0x01 (bytes 35-32 of the 1st grf) + * ... + * access_mask[15] = 0x01 (bytes 63-60 of the 1st grf) * access_mask[4-31] = 0 * - * because the first execution channel accesses bytes 7-4 and the second - * execution channel accesses bytes 15-12, etc. */ static void -align1_access_mask(uint64_t access_mask[static 32], - unsigned exec_size, unsigned element_size, unsigned subreg, - unsigned vstride, unsigned width, unsigned hstride) +grfs_accessed(const struct intel_device_info *devinfo, + uint8_t grf_access_mask[static 32], + unsigned exec_size, unsigned element_size, unsigned subreg, + unsigned vstride, unsigned width, unsigned hstride) { - const uint64_t mask = (1ULL << element_size) - 1; unsigned rowbase = subreg; unsigned element = 0; + assert(util_is_power_of_two_nonzero(reg_unit(devinfo))); + unsigned grf_size_shift = (5 - 1) + ffs(reg_unit(devinfo)); for (int y = 0; y < exec_size / width; y++) { unsigned offset = rowbase; for (int x = 0; x < width; x++) { - access_mask[element++] = mask << (offset % 64); + const unsigned start_grf = (offset >> grf_size_shift) % 8; + const unsigned end_byte = offset + (element_size - 1); + const unsigned end_grf = (end_byte >> grf_size_shift) % 8; + grf_access_mask[element++] = (1 << start_grf) | (1 << end_grf); offset += hstride * element_size; } @@ -1378,19 +1404,14 @@ align1_access_mask(uint64_t access_mask[static 32], * Returns the number of registers accessed according to the \p access_mask */ static int -registers_read(const uint64_t access_mask[static 32]) +registers_read(const uint8_t grfs_accessed[static 32]) { - int regs_read = 0; + uint8_t all_read = 0; - for (unsigned i = 0; i < 32; i++) { - if (access_mask[i] > 0xFFFFFFFF) { - return 2; - } else if (access_mask[i]) { - regs_read = 1; - } - } + for (unsigned i = 0; i < 32; i++) + all_read |= grfs_accessed[i]; - return regs_read; + return util_bitcount(all_read); } /** @@ -1406,7 +1427,7 @@ region_alignment_rules(const struct brw_isa_info *isa, brw_opcode_desc(isa, brw_inst_opcode(isa, inst)); unsigned num_sources = brw_num_sources_from_inst(isa, inst); unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst); - uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32]; + uint8_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32]; struct string error_msg = { .str = NULL, .len = 0 }; if (num_sources == 3) @@ -1445,9 +1466,9 @@ region_alignment_rules(const struct brw_isa_info *isa, type = brw_inst_src ## n ## _type(devinfo, inst); \ element_size = brw_type_size_bytes(type); \ subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst); \ - align1_access_mask(src ## n ## _access_mask, \ - exec_size, element_size, subreg, \ - vstride, width, hstride) + grfs_accessed(devinfo, src ## n ## _access_mask, \ + exec_size, element_size, subreg, \ + vstride, width, hstride) if (i == 0) { DO_SRC(0); @@ -1480,10 +1501,10 @@ region_alignment_rules(const struct brw_isa_info *isa, if (error_msg.str) return error_msg; - align1_access_mask(dst_access_mask, exec_size, element_size, subreg, - exec_size == 1 ? 0 : exec_size * stride, - exec_size == 1 ? 1 : exec_size, - exec_size == 1 ? 0 : stride); + grfs_accessed(devinfo, dst_access_mask, exec_size, element_size, subreg, + exec_size == 1 ? 0 : exec_size * stride, + exec_size == 1 ? 1 : exec_size, + exec_size == 1 ? 0 : stride); unsigned dst_regs = registers_read(dst_access_mask); @@ -1500,10 +1521,10 @@ region_alignment_rules(const struct brw_isa_info *isa, unsigned upper_reg_writes = 0, lower_reg_writes = 0; for (unsigned i = 0; i < exec_size; i++) { - if (dst_access_mask[i] > 0xFFFFFFFF) { + if (dst_access_mask[i] == 2) { upper_reg_writes++; } else { - assert(dst_access_mask[i] != 0); + assert(dst_access_mask[i] == 1); lower_reg_writes++; } }