intel/fs: reduce liveness of variables in lowering passes
When a single instruction with a destination VGRF is lowered into 2 or more instructions, the VGRF is considered partially written by each generated instruction, which increases its liveness, especially in loops, and thus potentially increases the number of spills/fills due to register allocation. Putting an UNDEF instruction in front of the lowered instructions allows the IR to limit the liveness of the VGRF, reducing register pressure. This has a pretty dramatic effect on spills/fills for RT shaders. Here are the stats on Q2RTX shaders on DG2 (wiping out any spills/fills due to register allocation): Instructions in all programs: 26150 -> 24955 (-4.6%) SENDs in all programs: 1148 -> 1148 (+0.0%) Loops in all programs: 4 -> 4 (+0.0%) Cycles in all programs: 392179 -> 332787 (-15.1%) Spills in all programs: 132 -> 116 (-12.1%) Fills in all programs: 262 -> 154 (-41.2%) Shader-db results on TGL: total instructions in shared programs: 21158140 -> 21158377 (<.01%) instructions in affected programs: 76629 -> 76866 (0.31%) helped: 18 HURT: 20 helped stats (abs) min: 1 max: 60 x̄: 18.89 x̃: 12 helped stats (rel) min: 0.21% max: 3.61% x̄: 1.02% x̃: 0.77% HURT stats (abs) min: 1 max: 79 x̄: 28.85 x̃: 18 HURT stats (rel) min: 0.04% max: 2.81% x̄: 1.13% x̃: 0.79% 95% mean confidence interval for instructions value: -4.82 17.30 95% mean confidence interval for instructions %-change: -0.34% 0.57% Inconclusive result (value mean confidence interval includes 0). 
total loops in shared programs: 5753 -> 5753 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 798856834 -> 798870688 (<.01%) cycles in affected programs: 6208395 -> 6222249 (0.22%) helped: 22 HURT: 17 helped stats (abs) min: 2 max: 8794 x̄: 1438.18 x̃: 782 helped stats (rel) min: 0.05% max: 2.28% x̄: 0.63% x̃: 0.44% HURT stats (abs) min: 2 max: 19178 x̄: 2676.12 x̃: 1358 HURT stats (rel) min: 0.04% max: 23.49% x̄: 2.25% x̃: 0.71% 95% mean confidence interval for cycles value: -952.19 1662.65 95% mean confidence interval for cycles %-change: -0.64% 1.90% Inconclusive result (value mean confidence interval includes 0). total spills in shared programs: 4078 -> 4066 (-0.29%) spills in affected programs: 40 -> 28 (-30.00%) helped: 2 HURT: 0 total fills in shared programs: 2856 -> 2832 (-0.84%) fills in affected programs: 127 -> 103 (-18.90%) helped: 2 HURT: 0 total sends in shared programs: 998554 -> 998554 (0.00%) sends in affected programs: 0 -> 0 helped: 0 HURT: 0 LOST: 0 GAINED: 0 Total CPU time (seconds): 2346.06 -> 2304.80 (-1.76%) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18657>
This commit is contained in:

committed by
Marge Bot

parent
dd6d40429b
commit
e5dfff0946
@@ -2549,6 +2549,9 @@ fs_visitor::opt_algebraic()
|
||||
assert(!inst->src[0].negate);
|
||||
const brw::fs_builder ibld(this, block, inst);
|
||||
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
|
||||
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1),
|
||||
subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1));
|
||||
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0),
|
||||
@@ -2567,6 +2570,9 @@ fs_visitor::opt_algebraic()
|
||||
assert(!inst->src[0].negate);
|
||||
const brw::fs_builder ibld(this, block, inst);
|
||||
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
|
||||
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
|
||||
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
|
||||
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
|
||||
@@ -2697,6 +2703,9 @@ fs_visitor::opt_algebraic()
|
||||
assert(!inst->src[1].abs && !inst->src[1].negate);
|
||||
const brw::fs_builder ibld(this, block, inst);
|
||||
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
|
||||
set_predicate(inst->predicate,
|
||||
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
|
||||
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
|
||||
@@ -4107,6 +4116,7 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
|
||||
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
|
||||
ibld.MOV(bd_low, acc);
|
||||
|
||||
ibld.UNDEF(bd);
|
||||
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
|
||||
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
|
||||
}
|
||||
@@ -4123,6 +4133,8 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
|
||||
if (devinfo->has_64bit_int) {
|
||||
ibld.MOV(inst->dst, bd);
|
||||
} else {
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
|
||||
subscript(bd, BRW_REGISTER_TYPE_UD, 0));
|
||||
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
|
||||
@@ -5564,6 +5576,10 @@ fs_visitor::lower_find_live_channel()
|
||||
*/
|
||||
fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
|
||||
|
||||
const fs_builder ibld(this, block, inst);
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
|
||||
const fs_builder ubld = bld.at(block, inst).exec_all().group(1, 0);
|
||||
|
||||
/* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
|
||||
|
@@ -565,6 +565,17 @@ namespace brw {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Emit a SHADER_OPCODE_UNDEF for the destination of \p old_inst.
 *
 * Intended to be placed in front of the instructions that replace
 * \p old_inst when it is lowered into several partial writes, so the
 * destination is not treated as live before the lowered sequence.
 * The destination of \p old_inst must be a VGRF.
 *
 * \param old_inst  Instruction being lowered; only its dst and
 *                  size_written are read.
 * \return The newly emitted UNDEF instruction.
 */
instruction *
|
||||
emit_undef_for_dst(const instruction *old_inst) const
|
||||
{
|
||||
/* Only VGRF destinations are accepted here. */
assert(old_inst->dst.file == VGRF);
|
||||
/* The UNDEF destination is the old destination retyped to UD. */
instruction *inst = emit(SHADER_OPCODE_UNDEF,
|
||||
retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
|
||||
/* Cover exactly the bytes the original instruction wrote. */
inst->size_written = old_inst->size_written;
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assorted arithmetic ops.
|
||||
* @{
|
||||
@@ -785,7 +796,7 @@ namespace brw {
|
||||
assert(dst.offset % REG_SIZE == 0);
|
||||
instruction *inst = emit(SHADER_OPCODE_UNDEF,
|
||||
retype(dst, BRW_REGISTER_TYPE_UD));
|
||||
inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;
|
||||
inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
@@ -41,6 +41,13 @@ fs_visitor::lower_pack()
|
||||
fs_reg dst = inst->dst;
|
||||
|
||||
const fs_builder ibld(this, block, inst);
|
||||
/* The lowering generates 2 instructions for what was previously 1. This
|
||||
* can trick the IR to believe we're doing partial writes, but the
|
||||
* register is actually fully written. Mark it as undef to help the IR
|
||||
* reduce the liveness of the register.
|
||||
*/
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
for (unsigned i = 0; i < inst->sources; i++)
|
||||
ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
|
||||
|
||||
|
Reference in New Issue
Block a user