aco/gfx11: deallocate VGPRs at the end of the shader

fossil-db (gfx1100):
Totals from 65987 (40.81% of 161689) affected shaders:
Instrs: 57123207 -> 57199947 (+0.13%)
CodeSize: 308402500 -> 308709460 (+0.10%)
Latency: 680527139 -> 680527160 (+0.00%)
InvThroughput: 131620026 -> 131620045 (+0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17710>
This commit is contained in:
Rhys Perry
2022-07-21 15:54:26 +01:00
committed by Marge Bot
parent 6407d783ea
commit 2930317cea
3 changed files with 29 additions and 0 deletions

View File

@@ -943,4 +943,28 @@ should_form_clause(const Instruction* a, const Instruction* b)
return false;
}
/* Decides whether the shader should release its VGPRs before terminating and, when the last
 * block ends in s_endpgm, emits the deallocation message directly in front of it.
 *
 * Returns false on hardware older than GFX11, or when the program's VGPR demand does not exceed
 * the limit computed for the maximum suitable wave count (deallocating would not raise
 * occupancy). Returns true in every other case, including when the last block does not end in
 * s_endpgm and nothing was inserted here.
 */
bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* Releasing VGPRs is pointless if it cannot improve occupancy. */
   const uint16_t hw_wave_limit = program->dev.max_wave64_per_simd * (64 / program->wave_size);
   const uint16_t best_waves = max_suitable_waves(program, hw_wave_limit);
   if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, best_waves))
      return false;

   auto& instrs = program->blocks.back().instructions;

   /* A pending VMEM store or export is almost always present, so that is not checked for. */
   Builder bld(program);
   const bool ends_with_endpgm = !instrs.empty() && instrs.back()->opcode == aco_opcode::s_endpgm;
   if (ends_with_endpgm) {
      /* Position the builder on the trailing s_endpgm so the message lands just before it. */
      bld.reset(&instrs, instrs.end() - 1);
      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
   }

   return true;
}
} // namespace aco

View File

@@ -2289,6 +2289,7 @@ void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
/* Returns whether VGPRs should be deallocated at the end of the shader (always false before
 * GFX11, or when doing so would not increase occupancy). When it returns true and the last block
 * ends in s_endpgm, an s_sendmsg dealloc_vgprs has been inserted in front of that s_endpgm. */
bool dealloc_vgprs(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);

View File

@@ -2004,6 +2004,8 @@ lower_to_hw_instr(Program* program)
{
Block* discard_block = NULL;
bool should_dealloc_vgprs = dealloc_vgprs(program);
for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
Block* block = &program->blocks[block_idx];
lower_context ctx;
@@ -2126,6 +2128,8 @@ lower_to_hw_instr(Program* program)
block = &program->blocks[block_idx];
bld.reset(discard_block);
if (should_dealloc_vgprs)
bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL,
false, true, true);