intel/fs: Add partial support for copy-propagating FIXED_GRFs.

This will be useful for eliminating redundant copies from the FS
thread payload, particularly in SIMD32 programs.  For the moment we
only allow FIXED_GRFs with identity strides in order to avoid dealing
with composing the arbitrary bidimensional strides that FIXED_GRF
regions potentially have, which are rarely used at the IR level
anyway.
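
To illustrate what "identity strides" means here (this example is not
part of the change itself): a FIXED_GRF source qualifies when its
<vstride; width, hstride> region reads channels back to back, which is
what the is_contiguous() check used below relies on.  A minimal sketch
of the condition, written against hypothetical already-decoded region
parameters rather than their hardware encodings:

   /* Sketch only: true when channel i of the region reads element i,
    * i.e. the region behaves like a plain linear stride-1 view.
    */
   static bool
   region_has_identity_stride(unsigned vstride, unsigned width,
                              unsigned hstride)
   {
      /* e.g. g2<8;8,1>:F qualifies, g2<16;8,2>:F does not. */
      return hstride == 1 && vstride == width * hstride;
   }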

This enables the following commit, which allows block-propagation of
FIXED_GRF LOAD_PAYLOAD copies, and prevents the shader-db regressions
listed below (including SIMD32 programs) that would otherwise occur in
combination with the interpolation rework in this series.  On ICL:

   total instructions in shared programs: 20484665 -> 20529650 (0.22%)
   instructions in affected programs: 6031235 -> 6076220 (0.75%)
   helped: 5
   HURT: 42073

   total spills in shared programs: 8748 -> 8925 (2.02%)
   spills in affected programs: 186 -> 363 (95.16%)
   helped: 5
   HURT: 9

   total fills in shared programs: 8663 -> 8960 (3.43%)
   fills in affected programs: 647 -> 944 (45.90%)
   helped: 5
   HURT: 9

On SKL:

   total instructions in shared programs: 18937442 -> 19128162 (1.01%)
   instructions in affected programs: 8378187 -> 8568907 (2.28%)
   helped: 39
   HURT: 68176

   LOST:   1
   GAINED: 4

On SNB:

   total instructions in shared programs: 14094685 -> 14243499 (1.06%)
   instructions in affected programs: 7751062 -> 7899876 (1.92%)
   helped: 623
   HURT: 53586

   LOST:   7
   GAINED: 25

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
commit e328fbd9f8 (parent 5153d06d92)
Author: Francisco Jerez
Date:   2019-12-30 00:38:08 -08:00

@@ -452,7 +452,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    if (entry->src.file == IMM)
       return false;
    assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
-          entry->src.file == ATTR);
+          entry->src.file == ATTR || entry->src.file == FIXED_GRF);
 
    if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
        inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
@@ -469,6 +469,21 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    if (!region_contained_in(inst->src[arg], inst->size_read(arg),
                             entry->dst, entry->size_written))
       return false;
 
+   /* Avoid propagating a FIXED_GRF register into an EOT instruction in order
+    * for any register allocation restrictions to be applied.
+    */
+   if (entry->src.file == FIXED_GRF && inst->eot)
+      return false;
+
+   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
+    * of a LINTERP instruction on platforms where the PLN instruction has
+    * register alignment restrictions.
+    */
+   if (devinfo->has_pln && devinfo->gen <= 6 &&
+       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
+       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
+      return false;
+
    /* we can't generally copy-propagate UD negations because we
     * can end up accessing the resulting values as signed integers
     * instead.  See also resolve_ud_negate() and comment in
@@ -493,16 +508,30 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     * derivatives, assume that their operands are packed so we can't
     * generally propagate strided regions to them.
     */
-   if (instruction_requires_packed_data(inst) && entry->src.stride > 1)
+   const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
+                                  entry->src.stride);
+   if (instruction_requires_packed_data(inst) && entry_stride > 1)
       return false;
 
    /* Bail if the result of composing both strides would exceed the
    * hardware limit.
    */
-   if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+   if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride,
                         devinfo))
       return false;
 
+   /* Bail if the source FIXED_GRF region of the copy cannot be trivially
+    * composed with the source region of the instruction -- E.g. because the
+    * copy uses some extended stride greater than 4 not supported natively by
+    * the hardware as a horizontal stride, or because instruction compression
+    * could require us to use a vertical stride shorter than a GRF.
+    */
+   if (entry->src.file == FIXED_GRF &&
+       (inst->src[arg].stride > 4 ||
+        inst->dst.component_size(inst->exec_size) >
+        inst->src[arg].component_size(inst->exec_size)))
+      return false;
+
    /* Bail if the instruction type is larger than the execution type of the
    * copy, what implies that each channel is reading multiple channels of the
    * destination of the copy, and simply replacing the sources would give a
@@ -524,7 +553,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    *
    * Which would have different semantics.
    */
-   if (entry->src.stride != 1 &&
+   if (entry_stride != 1 &&
       (inst->src[arg].stride *
        type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
       return false;
@@ -562,13 +591,42 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
       }
    }
 
+   /* Save the offset of inst->src[arg] relative to entry->dst for it to be
+    * applied later.
+    */
+   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+   /* Fold the copy into the instruction consuming it. */
    inst->src[arg].file = entry->src.file;
    inst->src[arg].nr = entry->src.nr;
-   inst->src[arg].stride *= entry->src.stride;
-   inst->saturate = inst->saturate || entry->saturate;
+   inst->src[arg].subnr = entry->src.subnr;
+   inst->src[arg].offset = entry->src.offset;
 
-   /* Compute the offset of inst->src[arg] relative to entry->dst */
-   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+   /* Compose the strides of both regions. */
+   if (entry->src.file == FIXED_GRF) {
+      if (inst->src[arg].stride) {
+         const unsigned orig_width = 1 << entry->src.width;
+         const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
+                                                inst->src[arg].stride);
+         inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
+         inst->src[arg].hstride = cvt(inst->src[arg].stride);
+         inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
+      } else {
+         inst->src[arg].vstride = inst->src[arg].hstride =
+            inst->src[arg].width = 0;
+      }
+
+      inst->src[arg].stride = 1;
+
+      /* Hopefully no Align16 around here... */
+      assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
+      inst->src[arg].swizzle = entry->src.swizzle;
+   } else {
+      inst->src[arg].stride *= entry->src.stride;
+   }
+
+   /* Compose any saturate modifiers. */
+   inst->saturate = inst->saturate || entry->saturate;
 
    /* Compute the first component of the copy that the instruction is
    * reading, and the base byte offset within that component.
@@ -580,9 +638,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    /* Calculate the byte offset at the origin of the copy of the given
    * component and suboffset.
    */
-   inst->src[arg].offset = suboffset +
-      component * entry->src.stride * type_sz(entry->src.type) +
-      entry->src.offset;
+   inst->src[arg] = byte_offset(inst->src[arg],
+      component * entry_stride * type_sz(entry->src.type) + suboffset);
 
    if (has_source_modifiers) {
       if (entry->dst.type != inst->src[arg].type) {
@@ -834,7 +891,9 @@ can_propagate_from(fs_inst *inst)
                              inst->src[0], inst->size_read(0))) ||
            inst->src[0].file == ATTR ||
            inst->src[0].file == UNIFORM ||
-           inst->src[0].file == IMM) &&
+           inst->src[0].file == IMM ||
+           (inst->src[0].file == FIXED_GRF &&
+            inst->src[0].is_contiguous())) &&
           inst->src[0].type == inst->dst.type &&
           !inst->is_partial_write());
 }
@@ -863,7 +922,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
       }
 
       /* kill the destination from the ACP */
-      if (inst->dst.file == VGRF) {
+      if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
         foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
            if (regions_overlap(entry->dst, entry->size_written,
                                inst->dst, inst->size_written))
@@ -905,7 +964,9 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
            assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
            const unsigned size_written = effective_width *
                                          type_sz(inst->src[i].type);
-           if (inst->src[i].file == VGRF) {
+           if (inst->src[i].file == VGRF ||
+               (inst->src[i].file == FIXED_GRF &&
+                inst->src[i].is_contiguous())) {
              acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
              entry->dst = byte_offset(inst->dst, offset);
              entry->src = inst->src[i];
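
For reference, here is a worked example (not part of the patch) of what
the new FIXED_GRF branch in try_copy_propagate() computes, assuming
REG_SIZE == 32 and the cvt() helper from brw_reg.h (cvt(x) == log2(x) + 1
for the power-of-two region values).  Take a contiguous copy source
g10<8;8,1>:F (entry->src.width == 3, i.e. a width of 8) consumed by an
instruction with a 4-byte type and inst->src[arg].stride == 2:

   orig_width = 1 << entry->src.width             = 8
   reg_width  = REG_SIZE / (type_sz(F) * stride)  = 32 / (4 * 2) = 4
   width      = cvt(MIN2(8, 4)) - 1               = 2   /* width 4 */
   hstride    = cvt(2)                            = 2   /* hstride 2 */
   vstride    = hstride + width                   = 4   /* vstride 8 */

so the folded source reads g10<8;4,2>:F, i.e. exactly the stride-2 view of
the original contiguous payload, while a consuming stride of 0 would take
the else branch and collapse the region to the scalar <0;1,0> form.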