intel/fs: reduce liveness of variables in lowering passes

When lowering a single instruction with a destination VGRF into 2 or
more instructions, the VGRF is now considered partially written by each
generated instruction, which increases its liveness, especially in
loops. This can increase the number of spills/fills due to register
allocation.

Putting an UNDEF instruction in front of the lowered instructions
allows the IR to limit the liveness of the VGRF, reducing register
pressure.

This has a pretty dramatic effect on spills/fills for RT shaders. Here
are the stats on Q2RTX shaders on DG2 (wiping out any spills/fills due to
register allocation):

Instructions in all programs: 26150 -> 24955 (-4.6%)
SENDs in all programs: 1148 -> 1148 (+0.0%)
Loops in all programs: 4 -> 4 (+0.0%)
Cycles in all programs: 392179 -> 332787 (-15.1%)
Spills in all programs: 132 -> 116 (-12.1%)
Fills in all programs: 262 -> 154 (-41.2%)

Shader-db results on TGL :

total instructions in shared programs: 21158140 -> 21158377 (<.01%)
instructions in affected programs: 76629 -> 76866 (0.31%)
helped: 18
HURT: 20
helped stats (abs) min: 1 max: 60 x̄: 18.89 x̃: 12
helped stats (rel) min: 0.21% max: 3.61% x̄: 1.02% x̃: 0.77%
HURT stats (abs)   min: 1 max: 79 x̄: 28.85 x̃: 18
HURT stats (rel)   min: 0.04% max: 2.81% x̄: 1.13% x̃: 0.79%
95% mean confidence interval for instructions value: -4.82 17.30
95% mean confidence interval for instructions %-change: -0.34% 0.57%
Inconclusive result (value mean confidence interval includes 0).

total loops in shared programs: 5753 -> 5753 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total cycles in shared programs: 798856834 -> 798870688 (<.01%)
cycles in affected programs: 6208395 -> 6222249 (0.22%)
helped: 22
HURT: 17
helped stats (abs) min: 2 max: 8794 x̄: 1438.18 x̃: 782
helped stats (rel) min: 0.05% max: 2.28% x̄: 0.63% x̃: 0.44%
HURT stats (abs)   min: 2 max: 19178 x̄: 2676.12 x̃: 1358
HURT stats (rel)   min: 0.04% max: 23.49% x̄: 2.25% x̃: 0.71%
95% mean confidence interval for cycles value: -952.19 1662.65
95% mean confidence interval for cycles %-change: -0.64% 1.90%
Inconclusive result (value mean confidence interval includes 0).

total spills in shared programs: 4078 -> 4066 (-0.29%)
spills in affected programs: 40 -> 28 (-30.00%)
helped: 2
HURT: 0

total fills in shared programs: 2856 -> 2832 (-0.84%)
fills in affected programs: 127 -> 103 (-18.90%)
helped: 2
HURT: 0

total sends in shared programs: 998554 -> 998554 (0.00%)
sends in affected programs: 0 -> 0
helped: 0
HURT: 0

LOST:   0
GAINED: 0

Total CPU time (seconds): 2346.06 -> 2304.80 (-1.76%)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18657>
This commit is contained in:
Lionel Landwerlin
2022-09-14 02:40:01 +03:00
committed by Marge Bot
parent dd6d40429b
commit e5dfff0946
3 changed files with 35 additions and 1 deletions

View File

@@ -2549,6 +2549,9 @@ fs_visitor::opt_algebraic()
assert(!inst->src[0].negate); assert(!inst->src[0].negate);
const brw::fs_builder ibld(this, block, inst); const brw::fs_builder ibld(this, block, inst);
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1), ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1),
subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1)); subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1));
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0), ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0),
@@ -2567,6 +2570,9 @@ fs_visitor::opt_algebraic()
assert(!inst->src[0].negate); assert(!inst->src[0].negate);
const brw::fs_builder ibld(this, block, inst); const brw::fs_builder ibld(this, block, inst);
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1)); subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
@@ -2697,6 +2703,9 @@ fs_visitor::opt_algebraic()
assert(!inst->src[1].abs && !inst->src[1].negate); assert(!inst->src[1].abs && !inst->src[1].negate);
const brw::fs_builder ibld(this, block, inst); const brw::fs_builder ibld(this, block, inst);
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
set_predicate(inst->predicate, set_predicate(inst->predicate,
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
@@ -4107,6 +4116,7 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
ibld.MOV(bd_low, acc); ibld.MOV(bd_low, acc);
ibld.UNDEF(bd);
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low); ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high); ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
} }
@@ -4123,6 +4133,8 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
if (devinfo->has_64bit_int) { if (devinfo->has_64bit_int) {
ibld.MOV(inst->dst, bd); ibld.MOV(inst->dst, bd);
} else { } else {
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
subscript(bd, BRW_REGISTER_TYPE_UD, 0)); subscript(bd, BRW_REGISTER_TYPE_UD, 0));
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
@@ -5564,6 +5576,10 @@ fs_visitor::lower_find_live_channel()
*/ */
fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
const fs_builder ibld(this, block, inst);
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
const fs_builder ubld = bld.at(block, inst).exec_all().group(1, 0); const fs_builder ubld = bld.at(block, inst).exec_all().group(1, 0);
/* ce0 doesn't consider the thread dispatch mask (DMask or VMask), /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),

View File

@@ -565,6 +565,17 @@ namespace brw {
} }
} }
/**
 * Emit a SHADER_OPCODE_UNDEF covering the destination of \p old_inst.
 *
 * The UNDEF targets old_inst's destination retyped to UD and its
 * size_written is copied from old_inst, so it covers exactly the bytes
 * the original instruction wrote.  Intended to be emitted in front of
 * the instructions that replace old_inst in a lowering pass, so that
 * their individual writes are not treated as partial writes extending
 * the liveness of the VGRF.
 *
 * The destination of \p old_inst must be a VGRF (asserted).
 *
 * \return the newly emitted UNDEF instruction.
 */
instruction *
emit_undef_for_dst(const instruction *old_inst) const
{
assert(old_inst->dst.file == VGRF);
instruction *inst = emit(SHADER_OPCODE_UNDEF,
retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
/* Match the size written by the instruction being replaced. */
inst->size_written = old_inst->size_written;
return inst;
}
/** /**
* Assorted arithmetic ops. * Assorted arithmetic ops.
* @{ * @{
@@ -785,7 +796,7 @@ namespace brw {
assert(dst.offset % REG_SIZE == 0); assert(dst.offset % REG_SIZE == 0);
instruction *inst = emit(SHADER_OPCODE_UNDEF, instruction *inst = emit(SHADER_OPCODE_UNDEF,
retype(dst, BRW_REGISTER_TYPE_UD)); retype(dst, BRW_REGISTER_TYPE_UD));
inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE; inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
return inst; return inst;
} }

View File

@@ -41,6 +41,13 @@ fs_visitor::lower_pack()
fs_reg dst = inst->dst; fs_reg dst = inst->dst;
const fs_builder ibld(this, block, inst); const fs_builder ibld(this, block, inst);
/* The lowering generates 2 instructions for what was previously 1. This
* can trick the IR to believe we're doing partial writes, but the
* register is actually fully written. Mark it as undef to help the IR
* reduce the liveness of the register.
*/
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
for (unsigned i = 0; i < inst->sources; i++) for (unsigned i = 0; i < inst->sources; i++)
ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]); ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);