aco: add ACO_DEBUG=perfinfo

This prints the program with each instruction's contribution to its
latency and various factors for the calculation of the Inverse Throughput
statistic.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8994>
This commit is contained in:
Rhys Perry
2021-02-03 13:40:54 +00:00
parent 5d6a1095bf
commit a0243f5c47
5 changed files with 29 additions and 2 deletions

View File

@@ -167,7 +167,7 @@ void aco_compile_shader(unsigned shader_count,
if (program->chip_class >= GFX10) if (program->chip_class >= GFX10)
aco::form_hard_clauses(program.get()); aco::form_hard_clauses(program.get());
if (program->collect_statistics) if (program->collect_statistics || (aco::debug_flags & aco::DEBUG_PERF_INFO))
aco::collect_preasm_stats(program.get()); aco::collect_preasm_stats(program.get());
/* Assembly */ /* Assembly */

View File

@@ -38,6 +38,7 @@ static const struct debug_control aco_debug_options[] = {
{"novn", DEBUG_NO_VN}, {"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT}, {"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED}, {"nosched", DEBUG_NO_SCHED},
{"perfinfo", DEBUG_PERF_INFO},
{NULL, 0} {NULL, 0}
}; };

View File

@@ -55,6 +55,7 @@ enum {
DEBUG_NO_VN = 0x10, DEBUG_NO_VN = 0x10,
DEBUG_NO_OPT = 0x20, DEBUG_NO_OPT = 0x20,
DEBUG_NO_SCHED = 0x40, DEBUG_NO_SCHED = 0x40,
DEBUG_PERF_INFO = 0x80,
}; };
/** /**
@@ -2012,6 +2013,7 @@ void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
enum print_flags { enum print_flags {
print_no_ssa = 0x1, print_no_ssa = 0x1,
print_perf_info = 0x2,
}; };
void aco_print_operand(const Operand *operand, FILE *output, unsigned flags=0); void aco_print_operand(const Operand *operand, FILE *output, unsigned flags=0);

View File

@@ -875,6 +875,9 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags)
fprintf(output, "*/\n"); fprintf(output, "*/\n");
for (auto const& instr : block->instructions) { for (auto const& instr : block->instructions) {
fprintf(output, "\t"); fprintf(output, "\t");
if (flags & print_perf_info)
fprintf(output, "(%3u clk) ", instr->pass_flags);
aco_print_instr(instr.get(), output, flags); aco_print_instr(instr.get(), output, flags);
fprintf(output, "\n"); fprintf(output, "\n");
} }

View File

@@ -488,8 +488,11 @@ void collect_preasm_stats(Program *program)
for (unsigned pred : block.linear_preds) for (unsigned pred : block.linear_preds)
block_est.join(blocks[pred]); block_est.join(blocks[pred]);
for (aco_ptr<Instruction>& instr : block.instructions) for (aco_ptr<Instruction>& instr : block.instructions) {
unsigned before = block_est.cur_cycle;
block_est.add(instr); block_est.add(instr);
instr->pass_flags = block_est.cur_cycle - before;
}
/* TODO: it would be nice to be able to consider estimated loop trip /* TODO: it would be nice to be able to consider estimated loop trip
* counts used for loop unrolling. * counts used for loop unrolling.
@@ -541,6 +544,24 @@ void collect_preasm_stats(Program *program)
program->statistics[statistic_latency] = round(latency); program->statistics[statistic_latency] = round(latency);
program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle); program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
if (debug_flags & DEBUG_PERF_INFO) {
aco_print_program(program, stderr, print_no_ssa | print_perf_info);
fprintf(stderr, "num_waves: %u\n", program->num_waves);
fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]);
fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
fprintf(stderr, "latency: %f\n", latency);
fprintf(stderr, "parallelism: %f\n", parallelism);
fprintf(stderr, "max_utilization: %f\n", max_utilization);
fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
fprintf(stderr, "\n");
}
} }
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code) void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)