intel/compiler: Micro optimize regions_overlap
On my Ice Lake laptop (using a locked CPU speed and other measures to prevent thermal throttling, etc.) using a release build, this improves performance of compiling shaders from batman_arkham_city_goty.foz by -1.09% ± 0.084% (n = 5, pooled s = 0.354471).

Reduces the size of a release build by 26k.

    text      data     bss      dec      hex      filename
    23163641  400720   231360   23795721 16b1809  before/lib64/dri/iris_dri.so
    23137264  400720   231360   23769344 16ab100  after/lib64/dri/iris_dri.so

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22299>
This commit is contained in:
@@ -212,6 +212,31 @@ reg_padding(const fs_reg &r)
|
|||||||
return (MAX2(1, stride) - 1) * type_sz(r.type);
|
return (MAX2(1, stride) - 1) * type_sz(r.type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Do not call this directly. Call regions_overlap() instead.
 *
 * Overlap test used by regions_overlap() once it has established that both
 * registers are in the same file and that file is MRF.
 *
 * \p r, \p s   the two registers; either register number may carry the
 *              BRW_MRF_COMPR4 bit.
 * \p dr, \p ds sizes in bytes of the regions starting at \p r and \p s.
 *
 * Returns true when the two byte ranges intersect.  A register whose number
 * has BRW_MRF_COMPR4 set is tested as two regions of half the size: one at
 * the register number with the bit cleared, and one 4 * REG_SIZE bytes
 * further on.
 */
|
||||||
|
static inline bool
|
||||||
|
regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
|
||||||
|
{
|
||||||
|
if (r.nr & BRW_MRF_COMPR4) {
|
||||||
|
fs_reg t = r;
|
||||||
|
t.nr &= ~BRW_MRF_COMPR4;
|
||||||
|
/* COMPR4 regions are translated by the hardware during decompression
|
||||||
|
* into two separate half-regions 4 MRFs apart from each other.
|
||||||
|
*
|
||||||
|
* Note: swapping s and t in this parameter list eliminates one possible
|
||||||
|
* level of recursion (since the s in the called versions of
|
||||||
|
* regions_overlap_MRF can't be COMPR4), and that makes the compiled
|
||||||
|
* code a lot smaller.
|
||||||
|
*/
|
||||||
|
return regions_overlap_MRF(s, ds, t, dr / 2) ||
|
||||||
|
regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
|
||||||
|
} else if (s.nr & BRW_MRF_COMPR4) {
|
||||||
|
/* Only s has the COMPR4 bit: swap the operands so the branch above
 * performs the split into half-regions.
 */
return regions_overlap_MRF(s, ds, r, dr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Neither register number has the COMPR4 bit here.  Each region is the
 * half-open byte range [nr * REG_SIZE + offset, nr * REG_SIZE + offset + size);
 * the ranges overlap unless one ends at or before the start of the other.
 */
return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
|
||||||
|
(s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return whether the register region starting at \p r and spanning \p dr
|
* Return whether the register region starting at \p r and spanning \p dr
|
||||||
* bytes could potentially overlap the register region starting at \p s and
|
* bytes could potentially overlap the register region starting at \p s and
|
||||||
@@ -220,22 +245,17 @@ reg_padding(const fs_reg &r)
|
|||||||
static inline bool
|
static inline bool
|
||||||
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
|
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
|
||||||
{
|
{
|
||||||
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
|
if (r.file != s.file)
|
||||||
fs_reg t = r;
|
return false;
|
||||||
t.nr &= ~BRW_MRF_COMPR4;
|
|
||||||
/* COMPR4 regions are translated by the hardware during decompression
|
|
||||||
* into two separate half-regions 4 MRFs apart from each other.
|
|
||||||
*/
|
|
||||||
return regions_overlap(t, dr / 2, s, ds) ||
|
|
||||||
regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);
|
|
||||||
|
|
||||||
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
|
if (r.file == VGRF) {
|
||||||
return regions_overlap(s, ds, r, dr);
|
return r.nr == s.nr &&
|
||||||
|
!(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
|
||||||
} else {
|
} else if (r.file != MRF) {
|
||||||
return reg_space(r) == reg_space(s) &&
|
return !(reg_offset(r) + dr <= reg_offset(s) ||
|
||||||
!(reg_offset(r) + dr <= reg_offset(s) ||
|
|
||||||
reg_offset(s) + ds <= reg_offset(r));
|
reg_offset(s) + ds <= reg_offset(r));
|
||||||
|
} else {
|
||||||
|
return regions_overlap_MRF(r, dr, s, ds);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user