aco: add ACO_DEBUG=perfinfo

This prints the program with each instruction's contribution to its latency and various factors for the calculation of the Inverse Throughput statistic. Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8994>
2021-02-03 13:40:54 +00:00
parent 5d6a1095bf
commit a0243f5c47
5 changed files with 29 additions and 2 deletions
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -167,7 +167,7 @@ void aco_compile_shader(unsigned shader_count,
   if (program->chip_class >= GFX10)
      aco::form_hard_clauses(program.get());
-   if (program->collect_statistics)
+   if (program->collect_statistics || (aco::debug_flags & aco::DEBUG_PERF_INFO))
      aco::collect_preasm_stats(program.get());
   /* Assembly */
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -38,6 +38,7 @@ static const struct debug_control aco_debug_options[] = {
   {"novn", DEBUG_NO_VN},
   {"noopt", DEBUG_NO_OPT},
   {"nosched", DEBUG_NO_SCHED},
   {"perfinfo", DEBUG_PERF_INFO},
   {NULL, 0}
 };
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -55,6 +55,7 @@ enum {
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
 };
 /**
@@ -2012,6 +2013,7 @@ void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
 enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
 };
 void aco_print_operand(const Operand *operand, FILE *output, unsigned flags=0);
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -875,6 +875,9 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags)
   fprintf(output, "*/\n");
   for (auto const& instr : block->instructions) {
      fprintf(output, "\t");
      if (flags & print_perf_info)
         fprintf(output, "(%3u clk)   ", instr->pass_flags);
      aco_print_instr(instr.get(), output, flags);
      fprintf(output, "\n");
   }
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -488,8 +488,11 @@ void collect_preasm_stats(Program *program)
      for (unsigned pred : block.linear_preds)
         block_est.join(blocks[pred]);
-      for (aco_ptr<Instruction>& instr : block.instructions)
+      for (aco_ptr<Instruction>& instr : block.instructions) {
         unsigned before = block_est.cur_cycle;
         block_est.add(instr);
         instr->pass_flags = block_est.cur_cycle - before;
      }
      /* TODO: it would be nice to be able to consider estimated loop trip
       * counts used for loop unrolling.
@@ -541,6 +544,24 @@ void collect_preasm_stats(Program *program)
   program->statistics[statistic_latency] = round(latency);
   program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
   if (debug_flags & DEBUG_PERF_INFO) {
      aco_print_program(program, stderr, print_no_ssa | print_perf_info);
      fprintf(stderr, "num_waves: %u\n", program->num_waves);
      fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
      fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]);
      fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
      fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
      fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
      fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
      fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
      fprintf(stderr, "latency: %f\n", latency);
      fprintf(stderr, "parallelism: %f\n", parallelism);
      fprintf(stderr, "max_utilization: %f\n", max_utilization);
      fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
      fprintf(stderr, "\n");
   }
 }
 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)