diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1b582cda0cd..7ebbe47b887 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -31,6 +31,7 @@
 #include "main/macros.h"
 #include "brw_eu.h"
 #include "brw_fs.h"
+#include "brw_fs_live_variables.h"
 #include "brw_nir.h"
 #include "brw_vec4_gs_visitor.h"
 #include "brw_cfg.h"
@@ -7448,6 +7449,9 @@ fs_visitor::optimize()
 
    OPT(lower_logical_sends);
 
+   /* After logical SEND lowering. */
+   OPT(fixup_nomask_control_flow);
+
    if (progress) {
       OPT(opt_copy_propagation);
       /* Only run after logical send lowering because it's easier to implement
@@ -7576,6 +7580,151 @@ fs_visitor::fixup_3src_null_dest()
       invalidate_live_intervals();
 }
 
+/**
+ * Find the first instruction in the program that might start a region of
+ * divergent control flow due to a HALT jump.  There is no
+ * find_halt_control_flow_region_end(), the region of divergence extends until
+ * the only FS_OPCODE_PLACEHOLDER_HALT in the program.
+ */
+static const fs_inst *
+find_halt_control_flow_region_start(const fs_visitor *v)
+{
+   if (brw_wm_prog_data(v->prog_data)->uses_kill) {
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->opcode == FS_OPCODE_DISCARD_JUMP ||
+             inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+            return inst;
+      }
+   }
+
+   return NULL;
+}
+
+/**
+ * Work around the Gen12 hardware bug filed as GEN:BUG:1407528679.  EU fusion
+ * can cause a BB to be executed with all channels disabled, which will lead
+ * to the execution of any NoMask instructions in it, even though any
+ * execution-masked instructions will be correctly shot down.  This may break
+ * assumptions of some NoMask SEND messages whose descriptor depends on data
+ * generated by live invocations of the shader.
+ *
+ * This avoids the problem by predicating certain instructions on an ANY
+ * horizontal predicate that makes sure that their execution is omitted when
+ * all channels of the program are disabled.
+ */
+bool
+fs_visitor::fixup_nomask_control_flow()
+{
+   if (devinfo->gen != 12)
+      return false;
+
+   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
+                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
+                              BRW_PREDICATE_ALIGN1_ANY8H;
+   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
+   unsigned depth = 0;
+   bool progress = false;
+
+   calculate_live_intervals();
+
+   /* Scan the program backwards in order to be able to easily determine
+    * whether the flag register is live at any point.
+    */
+   foreach_block_reverse_safe(block, cfg) {
+      BITSET_WORD flag_liveout = live_intervals->block_data[block->num]
+                                               .flag_liveout[0];
+      STATIC_ASSERT(ARRAY_SIZE(live_intervals->block_data[0].flag_liveout) == 1);
+
+      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+         if (!inst->predicate && inst->exec_size >= 8)
+            flag_liveout &= ~inst->flags_written();
+
+         switch (inst->opcode) {
+         case BRW_OPCODE_DO:
+         case BRW_OPCODE_IF:
+            /* Note that this doesn't handle FS_OPCODE_DISCARD_JUMP since only
+             * the first one in the program closes the region of divergent
+             * control flow due to any HALT instructions -- Instead this is
+             * handled with the halt_start check below.
+             */
+            depth--;
+            break;
+
+         case BRW_OPCODE_WHILE:
+         case BRW_OPCODE_ENDIF:
+         case FS_OPCODE_PLACEHOLDER_HALT:
+            depth++;
+            break;
+
+         default:
+            /* Note that the vast majority of NoMask SEND instructions in the
+             * program are harmless while executed in a block with all
+             * channels disabled, since any instructions with side effects we
+             * could hit here should be execution-masked.
+             *
+             * The main concern is NoMask SEND instructions where the message
+             * descriptor or header depends on data generated by live
+             * invocations of the shader (RESINFO and
+             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
+             * computed surface index seem to be the only examples right now
+             * where this could easily lead to GPU hangs).  Unfortunately we
+             * have no straightforward way to detect that currently, so just
+             * predicate any NoMask SEND instructions we find under control
+             * flow.
+             *
+             * If this proves to have a measurable performance impact it can
+             * be easily extended with a whitelist of messages we know we can
+             * safely omit the predication for.
+             */
+            if (depth && inst->force_writemask_all &&
+                is_send(inst) && !inst->predicate) {
+               /* We need to load the execution mask into the flag register by
+                * using a builder with channel group matching the whole shader
+                * (rather than the default which is derived from the original
+                * instruction), in order to avoid getting a right-shifted
+                * value.
+                */
+               const fs_builder ubld = fs_builder(this, block, inst)
+                                       .exec_all().group(dispatch_width, 0);
+               const fs_reg flag = retype(brw_flag_reg(0, 0),
+                                          BRW_REGISTER_TYPE_UD);
+
+               /* Due to the lack of flag register allocation we need to save
+                * and restore the flag register if it's live.
+                */
+               const bool save_flag = flag_liveout &
+                                      flag_mask(flag, dispatch_width / 8);
+               const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
+
+               if (save_flag)
+                  ubld.group(1, 0).MOV(tmp, flag);
+
+               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
+
+               set_predicate(pred, inst);
+               inst->flag_subreg = 0;
+
+               if (save_flag)
+                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
+
+               progress = true;
+            }
+            break;
+         }
+
+         if (inst == halt_start)
+            depth--;
+
+         flag_liveout |= inst->flags_read(devinfo);
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
 {
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 5784da7535c..dfa1ff4d0e1 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -107,6 +107,7 @@ public:
    void setup_cs_payload();
    bool fixup_sends_duplicate_payload();
    void fixup_3src_null_dest();
+   bool fixup_nomask_control_flow();
    void assign_curb_setup();
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
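For reference, the core transformation of the new pass can be modelled outside the compiler. The following is a minimal standalone C++ sketch, not Mesa API: the Inst/Opcode types are hypothetical stand-ins for fs_inst, and the halt-region tracking, flag liveness/save-restore and the actual FS_OPCODE_LOAD_LIVE_CHANNELS emission from the patch are deliberately omitted. It only illustrates the backward scan that tracks control-flow depth and predicates unpredicated NoMask SENDs found under control flow.

// Toy model of the idea behind fixup_nomask_control_flow(): walk the
// program backwards, track how deeply nested in control flow we are, and
// predicate any unpredicated NoMask SEND found there so it is skipped when
// all channels are disabled.  Types and opcodes are hypothetical stand-ins.
#include <cstdio>
#include <vector>

enum Opcode { OP_ALU, OP_IF, OP_ENDIF, OP_DO, OP_WHILE, OP_SEND };

struct Inst {
   Opcode opcode;
   bool force_writemask_all = false;  /* NoMask */
   bool predicated = false;
};

static bool
fixup_nomask_control_flow_sketch(std::vector<Inst> &prog)
{
   unsigned depth = 0;    /* control-flow nesting, seen from the end */
   bool progress = false;

   for (auto it = prog.rbegin(); it != prog.rend(); ++it) {
      switch (it->opcode) {
      case OP_ENDIF:
      case OP_WHILE:
         depth++;          /* walking backwards, block ends open a region... */
         break;
      case OP_IF:
      case OP_DO:
         depth--;          /* ...and the matching block starts close it */
         break;
      default:
         if (depth && it->opcode == OP_SEND &&
             it->force_writemask_all && !it->predicated) {
            /* The real pass emits FS_OPCODE_LOAD_LIVE_CHANNELS here and sets
             * an ANY8H/ANY16H/ANY32H predicate chosen from dispatch_width. */
            it->predicated = true;
            progress = true;
         }
         break;
      }
   }

   return progress;
}

int main()
{
   std::vector<Inst> prog = {
      {OP_SEND, true},   /* NoMask SEND outside control flow: left alone */
      {OP_IF},
      {OP_SEND, true},   /* NoMask SEND under control flow: gets predicated */
      {OP_ENDIF},
   };

   fixup_nomask_control_flow_sketch(prog);

   for (const Inst &inst : prog)
      printf("opcode=%d predicated=%d\n", inst.opcode, inst.predicated);

   return 0;
}

The real pass scans backwards for a further reason the sketch leaves out: it lets flag_liveout be updated incrementally, so the flag register is only saved and restored around the emitted FS_OPCODE_LOAD_LIVE_CHANNELS when it is actually live at that point.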