i965/fs: Merge CMP and SEL into CSEL on Gen8+
v2: Fix several problems handling inverted predicates. Add a much bigger comment around the BRW_CONDITIONAL_NZ case. v3: Allow uniforms and shader inputs as sources for the original SEL and CMP instructions. This enables a LOT more shaders to receive CSEL merging (5816 vs 8564 on SKL). v4: Report progress. Broadwell and Skylake had similar results. (Broadwell shown) helped: 8527 HURT: 0 helped stats (abs) min: 1 max: 27 x̄: 2.44 x̃: 1 helped stats (rel) min: 0.03% max: 17.80% x̄: 1.12% x̃: 0.70% 95% mean confidence interval for instructions value: -2.51 -2.36 95% mean confidence interval for instructions %-change: -1.15% -1.10% Instructions are helped. total cycles in shared programs: 559442317 -> 558288357 (-0.21%) cycles in affected programs: 372699860 -> 371545900 (-0.31%) helped: 6748 HURT: 1450 helped stats (abs) min: 1 max: 32000 x̄: 182.41 x̃: 12 helped stats (rel) min: <.01% max: 66.08% x̄: 3.42% x̃: 0.70% HURT stats (abs) min: 1 max: 2538 x̄: 53.08 x̃: 14 HURT stats (rel) min: <.01% max: 96.72% x̄: 3.32% x̃: 0.90% 95% mean confidence interval for cycles value: -179.01 -102.51 95% mean confidence interval for cycles %-change: -2.37% -2.08% Cycles are helped. LOST: 0 GAINED: 6 No changes on earlier platforms. Signed-off-by: Ian Romanick <ian.d.romanick@intel.com> Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com> [v1] Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> [v3] Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
@@ -2843,6 +2843,106 @@ mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
|
||||
return ((1 << n) - 1) << shift;
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::opt_peephole_csel()
|
||||
{
|
||||
if (devinfo->gen < 8)
|
||||
return false;
|
||||
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_reverse(block, cfg) {
|
||||
int ip = block->end_ip + 1;
|
||||
|
||||
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
|
||||
ip--;
|
||||
|
||||
if (inst->opcode != BRW_OPCODE_SEL ||
|
||||
inst->predicate != BRW_PREDICATE_NORMAL ||
|
||||
(inst->dst.type != BRW_REGISTER_TYPE_F &&
|
||||
inst->dst.type != BRW_REGISTER_TYPE_D &&
|
||||
inst->dst.type != BRW_REGISTER_TYPE_UD))
|
||||
continue;
|
||||
|
||||
/* Because it is a 3-src instruction, CSEL cannot have an immediate
|
||||
* value as a source, but we can sometimes handle zero.
|
||||
*/
|
||||
if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
|
||||
inst->src[0].file != UNIFORM) ||
|
||||
(inst->src[1].file != VGRF && inst->src[1].file != ATTR &&
|
||||
inst->src[1].file != UNIFORM && !inst->src[1].is_zero()))
|
||||
continue;
|
||||
|
||||
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
||||
if (!scan_inst->flags_written())
|
||||
continue;
|
||||
|
||||
if ((scan_inst->opcode != BRW_OPCODE_CMP &&
|
||||
scan_inst->opcode != BRW_OPCODE_MOV) ||
|
||||
scan_inst->predicate != BRW_PREDICATE_NONE ||
|
||||
(scan_inst->src[0].file != VGRF &&
|
||||
scan_inst->src[0].file != ATTR &&
|
||||
scan_inst->src[0].file != UNIFORM) ||
|
||||
scan_inst->src[0].type != BRW_REGISTER_TYPE_F)
|
||||
break;
|
||||
|
||||
if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero())
|
||||
break;
|
||||
|
||||
const brw::fs_builder ibld(this, block, inst);
|
||||
|
||||
const enum brw_conditional_mod cond =
|
||||
inst->predicate_inverse
|
||||
? brw_negate_cmod(scan_inst->conditional_mod)
|
||||
: scan_inst->conditional_mod;
|
||||
|
||||
fs_inst *csel_inst = NULL;
|
||||
|
||||
if (inst->src[1].file != IMM) {
|
||||
csel_inst = ibld.CSEL(inst->dst,
|
||||
inst->src[0],
|
||||
inst->src[1],
|
||||
scan_inst->src[0],
|
||||
cond);
|
||||
} else if (cond == BRW_CONDITIONAL_NZ) {
|
||||
/* Consider the sequence
|
||||
*
|
||||
* cmp.nz.f0 null<1>F g3<8,8,1>F 0F
|
||||
* (+f0) sel g124<1>UD g2<8,8,1>UD 0x00000000UD
|
||||
*
|
||||
* The sel will pick the immediate value 0 if r0 is ±0.0.
|
||||
* Therefore, this sequence is equivalent:
|
||||
*
|
||||
* cmp.nz.f0 null<1>F g3<8,8,1>F 0F
|
||||
* (+f0) sel g124<1>F g2<8,8,1>F (abs)g3<8,8,1>F
|
||||
*
|
||||
* The abs is ensures that the result is 0UD when g3 is -0.0F.
|
||||
* By normal cmp-sel merging, this is also equivalent:
|
||||
*
|
||||
* csel.nz g124<1>F g2<4,4,1>F (abs)g3<4,4,1>F g3<4,4,1>F
|
||||
*/
|
||||
csel_inst = ibld.CSEL(inst->dst,
|
||||
inst->src[0],
|
||||
scan_inst->src[0],
|
||||
scan_inst->src[0],
|
||||
cond);
|
||||
|
||||
csel_inst->src[1].abs = true;
|
||||
}
|
||||
|
||||
if (csel_inst != NULL) {
|
||||
progress = true;
|
||||
inst->remove(block);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::compute_to_mrf()
|
||||
{
|
||||
@@ -6078,6 +6178,12 @@ fs_visitor::optimize()
|
||||
OPT(compact_virtual_grfs);
|
||||
} while (progress);
|
||||
|
||||
/* Do this after cmod propagation has had every possible opportunity to
|
||||
* propagate results into SEL instructions.
|
||||
*/
|
||||
if (OPT(opt_peephole_csel))
|
||||
OPT(dead_code_eliminate);
|
||||
|
||||
progress = false;
|
||||
pass_num = 0;
|
||||
|
||||
|
@@ -191,6 +191,7 @@ public:
|
||||
fs_reg resolve_source_modifiers(const fs_reg &src);
|
||||
void emit_discard_jump();
|
||||
bool opt_peephole_sel();
|
||||
bool opt_peephole_csel();
|
||||
bool opt_peephole_predicated_break();
|
||||
bool opt_saturate_propagation();
|
||||
bool opt_cmod_propagation();
|
||||
|
Reference in New Issue
Block a user