agx: clamp register file based on workgroup size

once we allow workgroup sizes larger than 384 threads, we need to restrict the register file to ensure we fit. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
2023-12-04 12:51:10 -04:00
parent bb37b072a5
commit a2328820f7
3 changed files with 44 additions and 1 deletions
--- a/src/asahi/compiler/agx_compile.h
+++ b/src/asahi/compiler/agx_compile.h
@@ -263,6 +263,7 @@ struct agx_occupancy {
 };

 struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs);
+unsigned agx_max_registers_for_occupancy(unsigned occupancy);

 static const nir_shader_compiler_options agx_nir_options = {
   .lower_fdiv = true,
--- a/src/asahi/compiler/agx_performance.c
+++ b/src/asahi/compiler/agx_performance.c
@@ -3,6 +3,7 @@
 * SPDX-License-Identifier: MIT
 */

+#include "agx_compile.h"
 #include "agx_compiler.h"

 /* Table describing the relationship between registers pressure and thread
@@ -29,3 +30,19 @@ agx_occupancy_for_register_count(unsigned halfregs)

   unreachable("Register count must be less than the maximum");
 }
+
+unsigned
+agx_max_registers_for_occupancy(unsigned occupancy)
+{
+   unsigned max_regs = 0;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
+      if (occupancy <= occupancies[i].max_threads)
+         max_regs = occupancies[i].max_registers;
+      else
+         break;
+   }
+
+   assert(max_regs > 0 && "Thread count must be less than the maximum");
+   return max_regs;
+}
--- a/src/asahi/compiler/agx_register_allocate.c
+++ b/src/asahi/compiler/agx_register_allocate.c
@@ -6,6 +6,7 @@
 #include "util/u_dynarray.h"
 #include "util/u_qsort.h"
 #include "agx_builder.h"
+#include "agx_compile.h"
 #include "agx_compiler.h"
 #include "agx_debug.h"
 #include "agx_opcodes.h"
@@ -993,8 +994,31 @@ agx_ra(agx_context *ctx)
   unsigned demand =
      ALIGN_POT(agx_calc_register_demand(ctx, ncomps), reg_file_alignment);

+   unsigned max_possible_regs = AGX_NUM_REGS;
+
+   /* Compute shaders need to have their entire workgroup together, so our
+    * register usage is bounded by the workgroup size.
+    */
+   if (gl_shader_stage_is_compute(ctx->stage)) {
+      unsigned threads_per_workgroup;
+
+      /* If we don't know the workgroup size, worst case it. TODO: Optimize
+       * this, since it'll decimate opencl perf.
+       */
+      if (ctx->nir->info.workgroup_size_variable) {
+         threads_per_workgroup = 1024;
+      } else {
+         threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
+                                 ctx->nir->info.workgroup_size[1] *
+                                 ctx->nir->info.workgroup_size[2];
+      }
+
+      max_possible_regs =
+         agx_max_registers_for_occupancy(threads_per_workgroup);
+   }
+
   /* TODO: Spilling. Abort so we don't smash the stack in release builds. */
-   if (demand > AGX_NUM_REGS) {
+   if (demand > max_possible_regs) {
      fprintf(stderr, "\n");
      fprintf(stderr, "------------------------------------------------\n");
      fprintf(stderr, "Asahi Linux shader compiler limitation!\n");
@@ -1019,6 +1043,7 @@ agx_ra(agx_context *ctx)
   /* ...but not too tightly */
   assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
   assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
+   assert(max_regs <= max_possible_regs);

   /* Assign registers in dominance-order. This coincides with source-order due
    * to a NIR invariant, so we do not need special handling for this.