From c6a8b382fd021987660eb74d2d55f3686c69aeaf Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 17 Jul 2024 11:36:24 -0700 Subject: [PATCH] intel/brw: Relax is_partial_write check in cmod propagation The is_partial_write check is too strict because it tests two separate things. It tests whether or not the instruction always writes a value (i.e., is it predicated), and it tests whether or not the instruction writes a complete register. This latter check is problematic as it perevents cmod propagation in SIMD1, and it prevents cmod propagation in SIMD8 when the destination size is 16 bits. This check is unnecessary. Cmod propagation already checks that the region written and region read overlap. It also already checks that the execution sizes of the instructions match. Further restriction based on the specific parts of the register written only generates false negatives. v2: Relax all of the calls to is_partial_write. Suggested by Caio. No shader-db changes on any Intel platform. fossil-db: Meteor Lake Totals: Instrs: 151505520 -> 151502923 (-0.00%); split: -0.00%, +0.00% Cycle count: 17201385104 -> 17194901423 (-0.04%); split: -0.06%, +0.02% Spill count: 80827 -> 80837 (+0.01%) Fill count: 152693 -> 152692 (-0.00%); split: -0.01%, +0.01% Totals from 346 (0.05% of 630198) affected shaders: Instrs: 1257205 -> 1254608 (-0.21%); split: -0.21%, +0.00% Cycle count: 5532845647 -> 5526361966 (-0.12%); split: -0.18%, +0.06% Spill count: 32903 -> 32913 (+0.03%) Fill count: 64338 -> 64337 (-0.00%); split: -0.03%, +0.03% DG2 Totals: Instrs: 151531440 -> 151528055 (-0.00%); split: -0.00%, +0.00% Cycle count: 17200238927 -> 17197996676 (-0.01%); split: -0.03%, +0.02% Spill count: 81003 -> 80971 (-0.04%); split: -0.04%, +0.00% Fill count: 152975 -> 152912 (-0.04%); split: -0.05%, +0.01% Totals from 346 (0.05% of 630198) affected shaders: Instrs: 1260363 -> 1256978 (-0.27%); split: -0.27%, +0.00% Cycle count: 5532019670 -> 5529777419 (-0.04%); split: -0.09%, +0.05% Spill count: 33046 -> 33014 (-0.10%); split: -0.11%, +0.01% Fill count: 64581 -> 64518 (-0.10%); split: -0.13%, +0.03% Tiger Lake and Ice Lake had similar results. (Tiger Lake shown) Totals: Instrs: 149972324 -> 149972289 (-0.00%) Cycle count: 15566495293 -> 15565151171 (-0.01%); split: -0.01%, +0.00% Totals from 16 (0.00% of 629912) affected shaders: Instrs: 351194 -> 351159 (-0.01%) Cycle count: 3922227030 -> 3920882908 (-0.03%); split: -0.04%, +0.00% Skylake Totals: Instrs: 140787999 -> 140787983 (-0.00%); split: -0.00%, +0.00% Cycle count: 14665614947 -> 14665515855 (-0.00%); split: -0.00%, +0.00% Spill count: 58500 -> 58501 (+0.00%) Fill count: 102097 -> 102100 (+0.00%) Totals from 16 (0.00% of 625685) affected shaders: Instrs: 343560 -> 343544 (-0.00%); split: -0.01%, +0.01% Cycle count: 3354997898 -> 3354898806 (-0.00%); split: -0.01%, +0.01% Spill count: 16864 -> 16865 (+0.01%) Fill count: 27479 -> 27482 (+0.01%) Reviewed-by: Kenneth Graunke Reviewed-by: Caio Oliveira Part-of: --- .../compiler/brw_fs_cmod_propagation.cpp | 9 ++++-- .../compiler/test_fs_cmod_propagation.cpp | 30 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp index 9ae1097cb98..17ef890dca4 100644 --- a/src/intel/compiler/brw_fs_cmod_propagation.cpp +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -59,7 +59,8 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block, foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { if (scan_inst->opcode == BRW_OPCODE_ADD && - !scan_inst->is_partial_write() && + !scan_inst->predicate && + scan_inst->dst.is_contiguous() && scan_inst->exec_size == inst->exec_size) { bool negate; @@ -184,7 +185,8 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block, scan_inst->opcode != BRW_OPCODE_AND) break; - if (scan_inst->is_partial_write() || + if (scan_inst->predicate || + !scan_inst->dst.is_contiguous() || scan_inst->dst.offset != inst->src[0].offset || scan_inst->exec_size != inst->exec_size) break; @@ -298,7 +300,8 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block) scan_inst->flags_written(devinfo) != flags_written) break; - if (scan_inst->is_partial_write() || + if (scan_inst->predicate || + !scan_inst->dst.is_contiguous() || scan_inst->dst.offset != inst->src[0].offset || scan_inst->exec_size != inst->exec_size) break; diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp index 2aa9070b2b3..6d5b495563e 100644 --- a/src/intel/compiler/test_fs_cmod_propagation.cpp +++ b/src/intel/compiler/test_fs_cmod_propagation.cpp @@ -3136,3 +3136,33 @@ TEST_F(cmod_propagation_test, prop_across_sel) EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); } +TEST_F(cmod_propagation_test, Boolean_size_conversion) +{ + brw_reg dest1 = bld.vgrf(BRW_TYPE_W); + brw_reg src0 = bld.vgrf(BRW_TYPE_W); + brw_reg zero(brw_imm_w(0)); + + bld.CMP(dest1, src0, zero, BRW_CONDITIONAL_NZ); + set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(bld.null_reg_d(), dest1)); + + /* = Before = + * 0: cmp.nz.f0 dest1:W src0:W 0W + * 1: mov.nz.f0 null:D dest1:W + * + * = After = + * 0: cmp.nz.f0 dest1:W src0:W 0W + */ + + brw_calculate_cfg(*v); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +}