intel/brw: Replace predicated break optimization with a simple peephole

We can achieve most of what brw_fs_opt_predicated_break() does with
simple peepholes at NIR -> BRW conversion time.

For predicated break and continue, we can simply look at an IF ... ENDIF
sequence after emitting it.  If there's a single instruction between the
two, and it's a BREAK or CONTINUE, then we can move the predicate from
the IF onto the jump, and delete the IF/ENDIF.  Because we haven't built
the CFG at this stage, we only need to remove them from the linked list
of instructions, which is trivial to do.

For the predicated while optimization, we can rely on the fact that we
already did the predicated break optimization, and simply look for a
predicated BREAK just before the WHILE.  If so, we move the predicate
onto the WHILE, invert it, and remove the BREAK.

There are a few cases where this approach does a worse job than the old
one: nir_convert_from_ssa may introduce load_reg and store_reg in blocks
containing break, and nir_trivialize_registers may decide it needs to
insert movs into those blocks.  So, at NIR -> BRW time, we'll actually
emit some MOVs there, which might have been possible to copy propagate
out after later optimizations.

However, the fossil-db results show that it's still pretty competitive.
For instructions, 1017 shaders were helped (average -1.87 instructions),
while only 62 were hurt (average +2.19 instructions).  In affected
shaders, it was -0.08% for instructions.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30498>
This commit is contained in:
Kenneth Graunke
2024-08-01 15:33:36 -07:00
parent fad63d6483
commit c19e5a0a75
6 changed files with 27 additions and 591 deletions

View File

@@ -643,7 +643,6 @@ bool brw_fs_opt_copy_propagation_defs(fs_visitor &s);
bool brw_fs_opt_cse_defs(fs_visitor &s);
bool brw_fs_opt_dead_code_eliminate(fs_visitor &s);
bool brw_fs_opt_eliminate_find_live_channel(fs_visitor &s);
bool brw_fs_opt_predicated_break(fs_visitor &s);
bool brw_fs_opt_register_coalesce(fs_visitor &s);
bool brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s);
bool brw_fs_opt_remove_redundant_halts(fs_visitor &s);

View File

@@ -450,7 +450,8 @@ fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
retype(cond_reg, BRW_TYPE_D));
inst->conditional_mod = BRW_CONDITIONAL_NZ;
bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
fs_inst *iff = bld.IF(BRW_PREDICATE_NORMAL);
iff->predicate_inverse = invert;
fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
@@ -459,7 +460,20 @@ fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
}
bld.emit(BRW_OPCODE_ENDIF);
fs_inst *endif = bld.emit(BRW_OPCODE_ENDIF);
/* Peephole: replace IF-JUMP-ENDIF with predicated jump */
if (endif->prev->prev == iff) {
fs_inst *jump = (fs_inst *) endif->prev;
if (jump->predicate == BRW_PREDICATE_NONE &&
(jump->opcode == BRW_OPCODE_BREAK ||
jump->opcode == BRW_OPCODE_CONTINUE)) {
jump->predicate = iff->predicate;
jump->predicate_inverse = iff->predicate_inverse;
iff->exec_node::remove();
endif->exec_node::remove();
}
}
}
static void
@@ -472,7 +486,17 @@ fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
fs_nir_emit_cf_list(ntb, &loop->body);
bld.emit(BRW_OPCODE_WHILE);
fs_inst *peep_while = bld.emit(BRW_OPCODE_WHILE);
/* Peephole: replace (+f0) break; while with (-f0) while */
fs_inst *peep_break = (fs_inst *) peep_while->prev;
if (peep_break->opcode == BRW_OPCODE_BREAK &&
peep_break->predicate != BRW_PREDICATE_NONE) {
peep_while->predicate = peep_break->predicate;
peep_while->predicate_inverse = !peep_break->predicate_inverse;
peep_break->exec_node::remove();
}
}
static void

View File

@@ -76,8 +76,6 @@ brw_fs_optimize(fs_visitor &s)
progress = false;
pass_num = 0;
OPT(brw_fs_opt_predicated_break);
if (OPT(brw_fs_lower_pack)) {
OPT(brw_fs_opt_register_coalesce);
OPT(brw_fs_opt_dead_code_eliminate);

View File

@@ -1,243 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
using namespace brw;
/** @file
*
* Loops are often structured as
*
* loop:
* CMP.f0
* (+f0) IF
* BREAK
* ENDIF
* ...
* WHILE loop
*
* This peephole pass removes the IF and ENDIF instructions and predicates the
* BREAK, dropping two instructions from the loop body.
*
* If the loop was a DO { ... } WHILE loop, it looks like
*
* loop:
* ...
* CMP.f0
* (+f0) IF
* BREAK
* ENDIF
* WHILE loop
*
* and we can remove the BREAK instruction and predicate the WHILE.
*/
#define MAX_NESTING 128
struct loop_continue_tracking {
BITSET_WORD has_continue[BITSET_WORDS(MAX_NESTING)];
unsigned depth;
};
static void
enter_loop(struct loop_continue_tracking *s)
{
s->depth++;
/* Any loops deeper than that maximum nesting will just re-use the last
* flag. This simplifies most of the code. MAX_NESTING is chosen to be
* large enough that it is unlikely to occur. Even if it does, the
* optimization that uses this tracking is unlikely to make much
* difference.
*/
if (s->depth < MAX_NESTING)
BITSET_CLEAR(s->has_continue, s->depth);
}
static void
exit_loop(struct loop_continue_tracking *s)
{
assert(s->depth > 0);
s->depth--;
}
static void
set_continue(struct loop_continue_tracking *s)
{
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
BITSET_SET(s->has_continue, i);
}
static bool
has_continue(const struct loop_continue_tracking *s)
{
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
return BITSET_TEST(s->has_continue, i);
}
bool
brw_fs_opt_predicated_break(fs_visitor &s)
{
bool progress = false;
struct loop_continue_tracking state = { {0, }, 0 };
foreach_block (block, s.cfg) {
/* DO instructions, by definition, can only be found at the beginning of
* basic blocks.
*/
fs_inst *const do_inst = block->start();
/* BREAK, CONTINUE, and WHILE instructions, by definition, can only be
* found at the ends of basic blocks.
*/
fs_inst *jump_inst = block->end();
if (do_inst->opcode == BRW_OPCODE_DO)
enter_loop(&state);
if (jump_inst->opcode == BRW_OPCODE_CONTINUE)
set_continue(&state);
else if (jump_inst->opcode == BRW_OPCODE_WHILE)
exit_loop(&state);
if (block->start_ip != block->end_ip)
continue;
if (jump_inst->opcode != BRW_OPCODE_BREAK &&
jump_inst->opcode != BRW_OPCODE_CONTINUE)
continue;
fs_inst *if_inst = block->prev()->end();
if (if_inst->opcode != BRW_OPCODE_IF)
continue;
fs_inst *endif_inst = block->next()->start();
if (endif_inst->opcode != BRW_OPCODE_ENDIF)
continue;
bblock_t *jump_block = block;
bblock_t *if_block = jump_block->prev();
bblock_t *endif_block = jump_block->next();
jump_inst->predicate = if_inst->predicate;
jump_inst->predicate_inverse = if_inst->predicate_inverse;
bblock_t *earlier_block = if_block;
if (if_block->start_ip == if_block->end_ip) {
earlier_block = if_block->prev();
}
if_inst->remove(if_block);
bblock_t *later_block = endif_block;
if (endif_block->start_ip == endif_block->end_ip) {
later_block = endif_block->next();
}
endif_inst->remove(endif_block);
if (!earlier_block->ends_with_control_flow()) {
/* FIXME: There is a potential problem here. If earlier_block starts
* with a DO instruction, this will delete the physical link to the
* WHILE block. It is unclear whether ENDIF has the same potential
* problem.
*/
assert(earlier_block->start() == NULL ||
earlier_block->start()->opcode != BRW_OPCODE_DO);
earlier_block->unlink_children();
earlier_block->add_successor(s.cfg->mem_ctx, jump_block,
bblock_link_logical);
}
if (!later_block->starts_with_control_flow()) {
later_block->unlink_parents();
}
/* If jump_block already has a link to later_block, don't create another
* one. Instead, promote the link to logical.
*/
bool need_to_link = true;
foreach_list_typed(bblock_link, link, link, &jump_block->children) {
if (link->block == later_block) {
assert(later_block->starts_with_control_flow());
/* Update the link from later_block back to jump_block. */
foreach_list_typed(bblock_link, parent_link, link, &later_block->parents) {
if (parent_link->block == jump_block) {
parent_link->kind = bblock_link_logical;
}
}
/* Update the link from jump_block to later_block. */
link->kind = bblock_link_logical;
need_to_link = false;
}
}
if (need_to_link) {
jump_block->add_successor(s.cfg->mem_ctx, later_block,
bblock_link_logical);
}
if (earlier_block->can_combine_with(jump_block)) {
earlier_block->combine_with(jump_block);
block = earlier_block;
}
/* Now look at the first instruction of the block following the BREAK. If
* it's a WHILE, we can delete the break, predicate the WHILE, and join
* the two basic blocks.
*
* This optimization can only be applied if the only instruction that
* can transfer control to the WHILE is the BREAK. If other paths can
* lead to the while, the flags may be in an unknown state, and the loop
* could terminate prematurely. This can occur if the loop contains a
* CONT instruction.
*/
bblock_t *while_block = earlier_block->next();
fs_inst *while_inst = while_block->start();
if (jump_inst->opcode == BRW_OPCODE_BREAK &&
while_inst->opcode == BRW_OPCODE_WHILE &&
while_inst->predicate == BRW_PREDICATE_NONE &&
!has_continue(&state)) {
jump_inst->remove(earlier_block);
while_inst->predicate = jump_inst->predicate;
while_inst->predicate_inverse = !jump_inst->predicate_inverse;
assert(earlier_block->can_combine_with(while_block));
earlier_block->combine_with(while_block);
}
progress = true;
}
if (progress)
s.invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);
return progress;
}

View File

@@ -98,7 +98,6 @@ libintel_compiler_brw_files = files(
'brw_nir_rt.c',
'brw_nir_rt_builder.h',
'brw_packed_float.c',
'brw_predicated_break.cpp',
'brw_print.cpp',
'brw_prim.h',
'brw_private.h',
@@ -194,7 +193,6 @@ if with_tests
executable(
'intel_compiler_brw_tests',
files(
'test_predicated_break.cpp',
'test_eu_compact.cpp',
'test_eu_validate.cpp',
'test_fs_cmod_propagation.cpp',

View File

@@ -1,340 +0,0 @@
/*
* Copyright (c) 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include <gtest/gtest.h>
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
using namespace brw;
class PredicatedBreakTest : public ::testing::Test {
virtual void SetUp();
virtual void TearDown();
public:
bool debug;
void *mem_ctx;
brw_compiler compiler;
brw_compile_params params;
intel_device_info devinfo;
struct brw_wm_prog_data prog_data;
struct gl_shader_program *shader_prog;
fs_visitor *shader_a;
fs_visitor *shader_b;
bool opt_predicated_break(fs_visitor *s);
};
void
PredicatedBreakTest::SetUp()
{
debug = getenv("TEST_DEBUG");
mem_ctx = ralloc_context(NULL);
devinfo = {};
devinfo.ver = 9;
devinfo.verx10 = 90;
compiler = {};
compiler.devinfo = &devinfo;
brw_init_isa_info(&compiler.isa, &devinfo);
params = {};
params.mem_ctx = mem_ctx;
prog_data = {};
nir_shader *nir =
nir_shader_create(mem_ctx, MESA_SHADER_FRAGMENT, NULL, NULL);
shader_a = new fs_visitor(&compiler, &params, NULL,
&prog_data.base, nir, 8, false, false);
shader_b = new fs_visitor(&compiler, &params, NULL,
&prog_data.base, nir, 8, false, false);
}
void
PredicatedBreakTest::TearDown()
{
delete shader_a;
delete shader_b;
ralloc_free(mem_ctx);
mem_ctx = NULL;
}
bool
PredicatedBreakTest::opt_predicated_break(fs_visitor *s)
{
const bool print = getenv("TEST_DEBUG");
if (print) {
fprintf(stderr, "= Before =\n");
s->cfg->dump();
}
bool ret = brw_fs_opt_predicated_break(*s);
if (print) {
fprintf(stderr, "\n= After =\n");
s->cfg->dump();
}
return ret;
}
static fs_builder
make_builder(fs_visitor *s)
{
return fs_builder(s, s->dispatch_width).at_end();
}
static testing::AssertionResult
shaders_match(const char *a_expr, const char *b_expr,
fs_visitor *a, fs_visitor *b)
{
/* Using the CFG string dump for this. Not ideal but it is
* convenient that covers some CFG information, helping to
* check if the optimization is keeping the CFG valid.
*/
char *a_str = NULL;
size_t a_size = 0;
FILE *a_file = open_memstream(&a_str, &a_size);
a->cfg->dump(a_file);
fclose(a_file);
char *b_str = NULL;
size_t b_size = 0;
FILE *b_file = open_memstream(&b_str, &b_size);
b->cfg->dump(b_file);
fclose(b_file);
if (a_size != b_size || strcmp(a_str, b_str) != 0) {
std::stringstream result;
result << "Shaders don't match.\n\n"
<< a_expr << " is:\n\n" << a_str
<< "\n---\n"
<< b_expr << " is:\n\n" << b_str
<< "\n";
free(a_str);
free(b_str);
return testing::AssertionFailure() << result.str();
}
free(a_str);
free(b_str);
return testing::AssertionSuccess();
}
#define ASSERT_SHADERS_MATCH(a, b) ASSERT_PRED_FORMAT2(shaders_match, a, b)
TEST_F(PredicatedBreakTest, TopBreakWithoutContinue)
{
fs_builder a = make_builder(shader_a);
fs_builder b = make_builder(shader_b);
brw_reg r1 = brw_vec8_grf(1, 0);
brw_reg r2 = brw_vec8_grf(2, 0);
brw_reg r3 = brw_vec8_grf(3, 0);
a.DO();
a.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
a.IF(BRW_PREDICATE_NORMAL);
a.BREAK();
a.ENDIF();
a.ADD(r1, r2, r3);
a.WHILE();
a.NOP(); /* There's always going to be something after a WHILE. */
brw_calculate_cfg(*shader_a);
/* The IF/ENDIF around the BREAK is expected to be removed. */
bool progress = opt_predicated_break(shader_a);
EXPECT_TRUE(progress);
b.DO();
b.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
b.BREAK()->predicate = BRW_PREDICATE_NORMAL;
b.ADD(r1, r2, r3);
b.WHILE();
b.NOP();
brw_calculate_cfg(*shader_b);
ASSERT_SHADERS_MATCH(shader_a, shader_b);
}
TEST_F(PredicatedBreakTest, TopBreakWithContinue)
{
fs_builder a = make_builder(shader_a);
fs_builder b = make_builder(shader_b);
brw_reg r1 = brw_vec8_grf(1, 0);
brw_reg r2 = brw_vec8_grf(2, 0);
brw_reg r3 = brw_vec8_grf(3, 0);
a.DO();
a.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
a.IF(BRW_PREDICATE_NORMAL);
a.BREAK();
a.ENDIF();
a.ADD(r1, r2, r3);
a.CMP(r1, r2, r3, BRW_CONDITIONAL_GE);
a.IF(BRW_PREDICATE_NORMAL);
a.CONTINUE();
a.ENDIF();
a.MUL(r1, r2, r3);
a.WHILE();
a.NOP(); /* There's always going to be something after a WHILE. */
brw_calculate_cfg(*shader_a);
/* The IF/ENDIF around the BREAK and the CONTINUE are expected to be
* removed.
*/
bool progress = opt_predicated_break(shader_a);
EXPECT_TRUE(progress);
b.DO();
b.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
b.BREAK()->predicate = BRW_PREDICATE_NORMAL;
b.ADD(r1, r2, r3);
b.CMP(r1, r2, r3, BRW_CONDITIONAL_GE);
b.CONTINUE()->predicate = BRW_PREDICATE_NORMAL;
b.MUL(r1, r2, r3);
b.WHILE();
b.NOP();
brw_calculate_cfg(*shader_b);
ASSERT_SHADERS_MATCH(shader_a, shader_b);
}
TEST_F(PredicatedBreakTest, DISABLED_BottomBreakWithoutContinue)
{
fs_builder a = make_builder(shader_a);
fs_builder b = make_builder(shader_b);
brw_reg r1 = brw_vec8_grf(1, 0);
brw_reg r2 = brw_vec8_grf(2, 0);
brw_reg r3 = brw_vec8_grf(3, 0);
a.DO();
a.ADD(r1, r2, r3);
a.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
a.IF(BRW_PREDICATE_NORMAL);
a.BREAK();
a.ENDIF();
a.WHILE();
a.NOP(); /* There's always going to be something after a WHILE. */
brw_calculate_cfg(*shader_a);
/* BREAK is the only way to exit the loop, so expect to remove the
* IF/BREAK/ENDIF and add a predicate to WHILE.
*/
bool progress = opt_predicated_break(shader_a);
EXPECT_TRUE(progress);
b.DO();
b.ADD(r1, r2, r3);
b.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
auto w = b.WHILE();
w->predicate = BRW_PREDICATE_NORMAL;
w->predicate_inverse = true;
b.NOP();
brw_calculate_cfg(*shader_b);
ASSERT_SHADERS_MATCH(shader_a, shader_b);
}
TEST_F(PredicatedBreakTest, BottomBreakWithContinue)
{
fs_builder a = make_builder(shader_a);
fs_builder b = make_builder(shader_b);
brw_reg r1 = brw_vec8_grf(1, 0);
brw_reg r2 = brw_vec8_grf(2, 0);
brw_reg r3 = brw_vec8_grf(3, 0);
a.DO();
a.ADD(r1, r2, r3);
a.CMP(r1, r2, r3, BRW_CONDITIONAL_GE);
a.IF(BRW_PREDICATE_NORMAL);
a.CONTINUE();
a.ENDIF();
a.MUL(r1, r2, r3);
a.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
a.IF(BRW_PREDICATE_NORMAL);
a.BREAK();
a.ENDIF();
a.WHILE();
a.NOP(); /* There's always going to be something after a WHILE. */
brw_calculate_cfg(*shader_a);
/* With a CONTINUE, the BREAK can't be removed, but still remove the
* IF/ENDIF around both of them.
*/
bool progress = opt_predicated_break(shader_a);
EXPECT_TRUE(progress);
b.DO();
b.ADD(r1, r2, r3);
b.CMP(r1, r2, r3, BRW_CONDITIONAL_GE);
b.CONTINUE()->predicate = BRW_PREDICATE_NORMAL;
b.MUL(r1, r2, r3);
b.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
b.BREAK()->predicate = BRW_PREDICATE_NORMAL;
b.WHILE();
b.NOP();
brw_calculate_cfg(*shader_b);
ASSERT_SHADERS_MATCH(shader_a, shader_b);
}
TEST_F(PredicatedBreakTest, TwoBreaks)
{
fs_builder a = make_builder(shader_a);
fs_builder b = make_builder(shader_b);
brw_reg r1 = brw_vec8_grf(1, 0);
brw_reg r2 = brw_vec8_grf(2, 0);
brw_reg r3 = brw_vec8_grf(3, 0);
a.DO();
a.ADD(r1, r2, r3);
a.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
a.IF(BRW_PREDICATE_NORMAL);
a.BREAK();
a.ENDIF();
a.MUL(r1, r2, r3);
a.CMP(r1, r2, r3, BRW_CONDITIONAL_GE);
a.IF(BRW_PREDICATE_NORMAL);
a.BREAK();
a.ENDIF();
a.AND(r1, r2, r3);
a.WHILE();
a.NOP(); /* There's always going to be something after a WHILE. */
brw_calculate_cfg(*shader_a);
/* The IF/ENDIF around the breaks are expected to be removed. */
bool progress = opt_predicated_break(shader_a);
EXPECT_TRUE(progress);
b.DO();
b.ADD(r1, r2, r3);
b.CMP(r1, r2, r3, BRW_CONDITIONAL_NZ);
b.BREAK()->predicate = BRW_PREDICATE_NORMAL;
b.MUL(r1, r2, r3);
b.CMP(r1, r2, r3, BRW_CONDITIONAL_GE);
b.BREAK()->predicate = BRW_PREDICATE_NORMAL;
b.AND(r1, r2, r3);
b.WHILE();
b.NOP(); /* There's always going to be something after a WHILE. */
brw_calculate_cfg(*shader_b);
ASSERT_SHADERS_MATCH(shader_a, shader_b);
}