From 572e00dd66117331cc52c1756e9f2812d72ff0ac Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 16 Jul 2024 16:04:38 -0700 Subject: [PATCH] intel/brw: Copy prop from raw integer moves with mismatched types The specific pattern from the unit test was observed in ray tracing trampoline shaders. v2: Refactor the is_raw_move tests out to a utility function. Suggested by Ken. v3: Fix a regression caused by being too picky about source modifiers. This was introduced somewhere between when I did initial shader-db runs an v2. v4: Fix typo in comment. Noticed by Caio. shader-db: All Intel platforms had similar results. (Meteor Lake shown) total instructions in shared programs: 19734086 -> 19733997 (<.01%) instructions in affected programs: 135388 -> 135299 (-0.07%) helped: 76 / HURT: 2 total cycles in shared programs: 916290451 -> 916264968 (<.01%) cycles in affected programs: 41046002 -> 41020519 (-0.06%) helped: 32 / HURT: 29 fossil-db: Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown) Totals: Instrs: 151531355 -> 151513669 (-0.01%); split: -0.01%, +0.00% Cycle count: 17209372399 -> 17208178205 (-0.01%); split: -0.01%, +0.00% Max live registers: 32016490 -> 32016493 (+0.00%) Totals from 17361 (2.75% of 630198) affected shaders: Instrs: 2642048 -> 2624362 (-0.67%); split: -0.67%, +0.00% Cycle count: 79803066 -> 78608872 (-1.50%); split: -1.75%, +0.25% Max live registers: 421668 -> 421671 (+0.00%) Tiger Lake and Ice Lake had similar results. (Tiger Lake shown) Totals: Instrs: 149995644 -> 149977326 (-0.01%); split: -0.01%, +0.00% Cycle count: 15567293770 -> 15566524840 (-0.00%); split: -0.02%, +0.01% Spill count: 61241 -> 61238 (-0.00%) Fill count: 107304 -> 107301 (-0.00%) Max live registers: 31993109 -> 31993112 (+0.00%) Totals from 17813 (2.83% of 629912) affected shaders: Instrs: 3738236 -> 3719918 (-0.49%); split: -0.49%, +0.00% Cycle count: 4251157049 -> 4250388119 (-0.02%); split: -0.06%, +0.04% Spill count: 28268 -> 28265 (-0.01%) Fill count: 50377 -> 50374 (-0.01%) Max live registers: 470648 -> 470651 (+0.00%) Reviewed-by: Kenneth Graunke Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_fs.cpp | 23 +++++ .../compiler/brw_fs_copy_propagation.cpp | 14 ++- src/intel/compiler/brw_ir_fs.h | 1 + .../compiler/test_fs_copy_propagation.cpp | 86 +++++++++++++++++++ 4 files changed, 121 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 5a446152e5c..7e5219654e5 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1001,6 +1001,29 @@ fs_inst::has_sampler_residency() const } } +/* \sa inst_is_raw_move in brw_eu_validate. */ +bool +fs_inst::is_raw_move() const +{ + if (opcode != BRW_OPCODE_MOV) + return false; + + if (src[0].file == IMM) { + if (brw_type_is_vector_imm(src[0].type)) + return false; + } else if (src[0].negate || src[0].abs) { + return false; + } + + if (saturate) + return false; + + return src[0].type == dst.type || + (brw_type_is_int(src[0].type) && + brw_type_is_int(dst.type) && + brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type)); +} + /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. * This brings in those uniform definitions */ diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 01f253dfcd4..c18c11f7597 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -1274,8 +1274,12 @@ can_propagate_from(fs_inst *inst) inst->src[0].file == IMM || (inst->src[0].file == FIXED_GRF && inst->src[0].is_contiguous())) && - inst->src[0].type == inst->dst.type && - !inst->saturate && + /* is_raw_move also rejects source modifiers, but copy propagation + * can handle that if the types are the same. + */ + ((inst->src[0].type == inst->dst.type && + !inst->saturate) || + inst->is_raw_move()) && /* Subset of !is_partial_write() conditions. */ !inst->predicate && inst->dst.is_contiguous()) || is_identity_payload(FIXED_GRF, inst); @@ -1752,7 +1756,11 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size) switch (def->opcode) { case BRW_OPCODE_MOV: - if (def->dst.type == def->src[0].type && def->src[0].stride <= 1) { + /* is_raw_move also rejects source modifiers, but copy propagation + * can handle that if the tyeps are the same. + */ + if ((def->dst.type == def->src[0].type || def->is_raw_move()) && + def->src[0].stride <= 1) { val = def->src[0]; unsigned rel_offset = src.offset - def->dst.offset; diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index ebb04ad5794..b0120178503 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -70,6 +70,7 @@ public: bool is_control_flow_end() const; bool is_control_flow() const; bool is_commutative() const; + bool is_raw_move() const; bool can_do_saturate() const; bool reads_accumulator_implicitly() const; bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const; diff --git a/src/intel/compiler/test_fs_copy_propagation.cpp b/src/intel/compiler/test_fs_copy_propagation.cpp index 6ef904a542b..6152d6b4c26 100644 --- a/src/intel/compiler/test_fs_copy_propagation.cpp +++ b/src/intel/compiler/test_fs_copy_propagation.cpp @@ -228,3 +228,89 @@ TEST_F(copy_propagation_test, maxmax_sat_imm) v->cfg = NULL; } } + +TEST_F(copy_propagation_test, mixed_integer_sign) +{ + brw_reg vgrf0 = bld.vgrf(BRW_TYPE_UD); + brw_reg vgrf1 = bld.vgrf(BRW_TYPE_D); + brw_reg vgrf2 = bld.vgrf(BRW_TYPE_UD); + brw_reg vgrf3 = bld.vgrf(BRW_TYPE_UD); + brw_reg vgrf4 = bld.vgrf(BRW_TYPE_UD); + + bld.MOV(vgrf1, vgrf0); + bld.BFE(vgrf2, vgrf3, vgrf4, retype(vgrf1, BRW_TYPE_UD)); + + /* = Before = + * + * 0: mov(8) vgrf1:D vgrf0:UD + * 1: bfe(8) vgrf2:UD vgrf3:UD vgrf4:UD vgrf1:UD + * + * = After = + * 0: mov(8) vgrf1:D vgrf0:UD + * 1: bfe(8) vgrf2:UD vgrf3:UD vgrf4:UD vgrf0:UD + */ + + brw_calculate_cfg(*v); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(copy_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + fs_inst *mov = instruction(block0, 0); + EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode); + EXPECT_TRUE(mov->dst.equals(vgrf1)); + EXPECT_TRUE(mov->src[0].equals(vgrf0)); + + fs_inst *bfe = instruction(block0, 1); + EXPECT_EQ(BRW_OPCODE_BFE, bfe->opcode); + EXPECT_TRUE(bfe->dst.equals(vgrf2)); + EXPECT_TRUE(bfe->src[0].equals(vgrf3)); + EXPECT_TRUE(bfe->src[1].equals(vgrf4)); + EXPECT_TRUE(bfe->src[2].equals(vgrf0)); +} + +TEST_F(copy_propagation_test, mixed_integer_sign_with_vector_imm) +{ + brw_reg vgrf0 = bld.vgrf(BRW_TYPE_W); + brw_reg vgrf1 = bld.vgrf(BRW_TYPE_UD); + brw_reg vgrf2 = bld.vgrf(BRW_TYPE_UD); + + bld.MOV(vgrf0, brw_imm_uv(0xffff)); + bld.ADD(vgrf1, vgrf2, retype(vgrf0, BRW_TYPE_UW)); + + /* = Before = + * + * 0: mov(8) vgrf0:W ...:UV + * 1: add(8) vgrf1:UD vgrf2:UD vgrf0:UW + * + * = After = + * No change + */ + + brw_calculate_cfg(*v); + bblock_t *block0 = v->cfg->blocks[0]; + + const brw_reg src1 = instruction(block0, 1)->src[1]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(copy_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + fs_inst *mov = instruction(block0, 0); + EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode); + EXPECT_TRUE(mov->dst.equals(vgrf0)); + EXPECT_TRUE(mov->src[0].file == IMM); + + fs_inst *add = instruction(block0, 1); + EXPECT_EQ(BRW_OPCODE_ADD, add->opcode); + EXPECT_TRUE(add->dst.equals(vgrf1)); + EXPECT_TRUE(add->src[0].equals(vgrf2)); + EXPECT_TRUE(add->src[1].equals(src1)); +}