aco/assembler: change prefetch mode on GFX10.3+ during loops if beneficial

Totals from 8864 (6.68% of 132726) affected shaders: GFX11

CodeSize: 90776128 -> 90923760 (+0.16%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23748>
This commit is contained in:
Daniel Schürmann
2023-06-21 14:08:06 +02:00
committed by Marge Bot
parent b9c5b273b0
commit c778803d67
2 changed files with 24 additions and 4 deletions

View File

@@ -268,8 +268,8 @@ is located at this offset.
### InstFwdPrefetchBug
According to LLVM, the `s_inst_prefetch` instruction can cause a hang.
There are no further details.
According to LLVM, the `s_inst_prefetch` instruction can cause a hang on GFX10.
This seems to be resolved on GFX10.3+. There are no further details.
### LdsMisalignedBug

View File

@@ -1227,16 +1227,36 @@ align_block(asm_context& ctx, std::vector<uint32_t>& code, Block& block)
std::vector<uint32_t> nops;
const unsigned loop_num_cl = DIV_ROUND_UP(block.offset - loop_header->offset, 16);
/* On GFX10.3+, change the prefetch mode if the loop fits into 2 or 3 cache lines.
* Don't use the s_inst_prefetch instruction on GFX10 as it might cause hangs.
*/
const bool change_prefetch =
ctx.program->gfx_level >= GFX10_3 && loop_num_cl > 1 && loop_num_cl <= 3;
if (change_prefetch) {
Builder bld(ctx.program);
int16_t prefetch_mode = loop_num_cl == 3 ? 0x1 : 0x2;
aco_ptr<Instruction> instr(bld.sopp(aco_opcode::s_inst_prefetch, -1, prefetch_mode));
emit_instruction(ctx, nops, instr.get());
insert_code(ctx, code, loop_header->offset, nops.size(), nops.data());
/* Change prefetch mode back to default (0x3). */
instr->sopp().imm = 0x3;
emit_instruction(ctx, code, instr.get());
}
const unsigned loop_start_cl = loop_header->offset >> 4;
const unsigned loop_end_cl = (block.offset - 1) >> 4;
/* Align the loop if it fits into a single cache line or if we can
/* Align the loop if it fits into the fetched cache lines or if we can
* reduce the number of cache lines with less than 8 NOPs.
*/
const bool align_loop = loop_end_cl - loop_start_cl >= loop_num_cl &&
(loop_num_cl == 1 || loop_header->offset % 16 > 8);
(loop_num_cl == 1 || change_prefetch || loop_header->offset % 16 > 8);
if (align_loop) {
nops.clear();
nops.resize(16 - (loop_header->offset % 16), 0xbf800000u);
insert_code(ctx, code, loop_header->offset, nops.size(), nops.data());
}