intel/fs: sel.cond writes the flags on Gfx4 and Gfx5
On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
using a separte cmpn and sel instruction. This lowering occurs in
fs_vistor::lower_minmax which is called very, very late... a long, long
time after the first calls to opt_cmod_propagation. As a result,
conditional modifiers can be incorrectly propagated across sel.cond on
those platforms.
No tests were affected by this change, and I find that quite shocking.
After just changing flags_written(), all of the atan tests started
failing on ILK. That required the change in cmod_propagatin (and the
addition of the prop_across_into_sel_gfx5 unit test).
Shader-db results for ILK and GM45 are below. I looked at a couple
before and after shaders... and every case that I looked at had
experienced incorrect cmod propagation. This affected a LOT of apps!
Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2,
Gang Beasts, and on and on... :(
I discovered this bug while working on a couple new optimization
passes. One of the passes attempts to remove condition modifiers that
are never used. The pass made no progress except on ILK and GM45.
After investigating a couple of the affected shaders, I noticed that
the code in those shaders looked wrong... investigation led to this
cause.
v2: Trivial changes in the unit tests.
v3: Fix type in comment in unit tests. Noticed by Jason and Priit.
v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason.
Fixes: df1aec763e
("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Dave Airlie <airlied@redhat.com>
Iron Lake
total instructions in shared programs: 8180493 -> 8181781 (0.02%)
instructions in affected programs: 541796 -> 543084 (0.24%)
helped: 28
HURT: 1158
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50%
HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1
HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23%
95% mean confidence interval for instructions value: 1.06 1.11
95% mean confidence interval for instructions %-change: 0.31% 0.38%
Instructions are HURT.
total cycles in shared programs: 239420470 -> 239421690 (<.01%)
cycles in affected programs: 2925992 -> 2927212 (0.04%)
helped: 49
HURT: 157
helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70
helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96%
HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24
HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20%
95% mean confidence interval for cycles value: -0.80 12.64
95% mean confidence interval for cycles %-change: -0.31% <.01%
Inconclusive result (value mean confidence interval includes 0).
GM45
total instructions in shared programs: 4985517 -> 4986207 (0.01%)
instructions in affected programs: 306935 -> 307625 (0.22%)
helped: 14
HURT: 625
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49%
HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22%
95% mean confidence interval for instructions value: 1.04 1.12
95% mean confidence interval for instructions %-change: 0.29% 0.36%
Instructions are HURT.
total cycles in shared programs: 153827268 -> 153828052 (<.01%)
cycles in affected programs: 1669290 -> 1670074 (0.05%)
helped: 24
HURT: 84
helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67
helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94%
HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24
HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14%
95% mean confidence interval for cycles value: -1.94 16.46
95% mean confidence interval for cycles %-change: -0.29% 0.11%
Inconclusive result (value mean confidence interval includes 0).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
This commit is contained in:
@@ -55,7 +55,7 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
|
||||
fs_inst *inst)
|
||||
{
|
||||
bool read_flag = false;
|
||||
const unsigned flags_written = inst->flags_written();
|
||||
const unsigned flags_written = inst->flags_written(devinfo);
|
||||
|
||||
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
||||
if (scan_inst->opcode == BRW_OPCODE_ADD &&
|
||||
@@ -89,8 +89,8 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
|
||||
* Perhaps (scan_inst->flags_written() & flags_written) !=
|
||||
* flags_written?
|
||||
*/
|
||||
if (scan_inst->flags_written() != 0 &&
|
||||
scan_inst->flags_written() != flags_written)
|
||||
if (scan_inst->flags_written(devinfo) != 0 &&
|
||||
scan_inst->flags_written(devinfo) != flags_written)
|
||||
goto not_match;
|
||||
|
||||
/* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
|
||||
@@ -142,7 +142,7 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
|
||||
}
|
||||
|
||||
not_match:
|
||||
if ((scan_inst->flags_written() & flags_written) != 0)
|
||||
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
|
||||
break;
|
||||
|
||||
read_flag = read_flag ||
|
||||
@@ -171,7 +171,7 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
|
||||
{
|
||||
const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
|
||||
bool read_flag = false;
|
||||
const unsigned flags_written = inst->flags_written();
|
||||
const unsigned flags_written = inst->flags_written(devinfo);
|
||||
|
||||
if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
|
||||
return false;
|
||||
@@ -195,8 +195,8 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
|
||||
* Perhaps (scan_inst->flags_written() & flags_written) !=
|
||||
* flags_written?
|
||||
*/
|
||||
if (scan_inst->flags_written() != 0 &&
|
||||
scan_inst->flags_written() != flags_written)
|
||||
if (scan_inst->flags_written(devinfo) != 0 &&
|
||||
scan_inst->flags_written(devinfo) != flags_written)
|
||||
break;
|
||||
|
||||
if (scan_inst->can_do_cmod() &&
|
||||
@@ -209,7 +209,7 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
|
||||
break;
|
||||
}
|
||||
|
||||
if ((scan_inst->flags_written() & flags_written) != 0)
|
||||
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
|
||||
break;
|
||||
|
||||
read_flag = read_flag ||
|
||||
@@ -285,7 +285,7 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
|
||||
}
|
||||
|
||||
bool read_flag = false;
|
||||
const unsigned flags_written = inst->flags_written();
|
||||
const unsigned flags_written = inst->flags_written(devinfo);
|
||||
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
||||
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
||||
inst->src[0], inst->size_read(0))) {
|
||||
@@ -296,8 +296,8 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
|
||||
* Perhaps (scan_inst->flags_written() & flags_written) !=
|
||||
* flags_written?
|
||||
*/
|
||||
if (scan_inst->flags_written() != 0 &&
|
||||
scan_inst->flags_written() != flags_written)
|
||||
if (scan_inst->flags_written(devinfo) != 0 &&
|
||||
scan_inst->flags_written(devinfo) != flags_written)
|
||||
break;
|
||||
|
||||
if (scan_inst->is_partial_write() ||
|
||||
@@ -396,7 +396,7 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
|
||||
* between scan_inst and inst.
|
||||
*/
|
||||
if (!inst->src[0].negate &&
|
||||
scan_inst->flags_written()) {
|
||||
scan_inst->flags_written(devinfo)) {
|
||||
if (scan_inst->opcode == BRW_OPCODE_CMP) {
|
||||
if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
|
||||
(inst->conditional_mod == BRW_CONDITIONAL_G &&
|
||||
@@ -408,8 +408,18 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
|
||||
break;
|
||||
}
|
||||
} else if (scan_inst->conditional_mod == inst->conditional_mod) {
|
||||
inst->remove(block, true);
|
||||
progress = true;
|
||||
/* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
|
||||
* flags value is not based on the result stored in the
|
||||
* destination. On all other platforms sel.cond will not
|
||||
* write the flags, so execution will not get to this point.
|
||||
*/
|
||||
if (scan_inst->opcode == BRW_OPCODE_SEL) {
|
||||
assert(devinfo->ver <= 5);
|
||||
} else {
|
||||
inst->remove(block, true);
|
||||
progress = true;
|
||||
}
|
||||
|
||||
break;
|
||||
} else if (!read_flag) {
|
||||
scan_inst->conditional_mod = inst->conditional_mod;
|
||||
@@ -505,7 +515,7 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
|
||||
break;
|
||||
}
|
||||
|
||||
if ((scan_inst->flags_written() & flags_written) != 0)
|
||||
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
|
||||
break;
|
||||
|
||||
read_flag = read_flag ||
|
||||
|
Reference in New Issue
Block a user