aco/assembler: change prefetch mode on GFX10.3+ during loops if beneficial
Totals from 8864 (6.68% of 132726) affected shaders:
GFX11
CodeSize: 90776128 -> 90923760 (+0.16%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23748>
This commit is contained in:

committed by
Marge Bot

parent
b9c5b273b0
commit
c778803d67
@@ -268,8 +268,8 @@ is located at this offset.
|
||||
|
||||
### InstFwdPrefetchBug
|
||||
|
||||
According to LLVM, the `s_inst_prefetch` instruction can cause a hang.
|
||||
There are no further details.
|
||||
According to LLVM, the `s_inst_prefetch` instruction can cause a hang on GFX10.
|
||||
This seems to be resolved on GFX10.3+. There are no further details.
|
||||
|
||||
### LdsMisalignedBug
|
||||
|
||||
|
@@ -1227,16 +1227,36 @@ align_block(asm_context& ctx, std::vector<uint32_t>& code, Block& block)
|
||||
std::vector<uint32_t> nops;
|
||||
|
||||
const unsigned loop_num_cl = DIV_ROUND_UP(block.offset - loop_header->offset, 16);
|
||||
|
||||
/* On GFX10.3+, change the prefetch mode if the loop fits into 2 or 3 cache lines.
|
||||
* Don't use the s_inst_prefetch instruction on GFX10 as it might cause hangs.
|
||||
*/
|
||||
const bool change_prefetch =
|
||||
ctx.program->gfx_level >= GFX10_3 && loop_num_cl > 1 && loop_num_cl <= 3;
|
||||
|
||||
if (change_prefetch) {
|
||||
Builder bld(ctx.program);
|
||||
int16_t prefetch_mode = loop_num_cl == 3 ? 0x1 : 0x2;
|
||||
aco_ptr<Instruction> instr(bld.sopp(aco_opcode::s_inst_prefetch, -1, prefetch_mode));
|
||||
emit_instruction(ctx, nops, instr.get());
|
||||
insert_code(ctx, code, loop_header->offset, nops.size(), nops.data());
|
||||
|
||||
/* Change prefetch mode back to default (0x3). */
|
||||
instr->sopp().imm = 0x3;
|
||||
emit_instruction(ctx, code, instr.get());
|
||||
}
|
||||
|
||||
const unsigned loop_start_cl = loop_header->offset >> 4;
|
||||
const unsigned loop_end_cl = (block.offset - 1) >> 4;
|
||||
|
||||
/* Align the loop if it fits into a single cache line or if we can
|
||||
/* Align the loop if it fits into the fetched cache lines or if we can
|
||||
* reduce the number of cache lines with fewer than 8 NOPs.
|
||||
*/
|
||||
const bool align_loop = loop_end_cl - loop_start_cl >= loop_num_cl &&
|
||||
(loop_num_cl == 1 || loop_header->offset % 16 > 8);
|
||||
(loop_num_cl == 1 || change_prefetch || loop_header->offset % 16 > 8);
|
||||
|
||||
if (align_loop) {
|
||||
nops.clear();
|
||||
nops.resize(16 - (loop_header->offset % 16), 0xbf800000u);
|
||||
insert_code(ctx, code, loop_header->offset, nops.size(), nops.data());
|
||||
}
|
||||
|
Reference in New Issue
Block a user