agx: clamp register file based on workgroup size

once we allow workgroup sizes larger than 384 threads, we need to restrict the
register file to ensure we fit.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
This commit is contained in:
Alyssa Rosenzweig
2023-12-04 12:51:10 -04:00
committed by Marge Bot
parent bb37b072a5
commit a2328820f7
3 changed files with 44 additions and 1 deletions

View File

@@ -263,6 +263,7 @@ struct agx_occupancy {
};
struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs);
unsigned agx_max_registers_for_occupancy(unsigned occupancy);
static const nir_shader_compiler_options agx_nir_options = {
.lower_fdiv = true,

View File

@@ -3,6 +3,7 @@
* SPDX-License-Identifier: MIT
*/
#include "agx_compile.h"
#include "agx_compiler.h"
/* Table describing the relationship between registers pressure and thread
@@ -29,3 +30,19 @@ agx_occupancy_for_register_count(unsigned halfregs)
unreachable("Register count must be less than the maximum");
}
unsigned
agx_max_registers_for_occupancy(unsigned occupancy)
{
unsigned max_regs = 0;
for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
if (occupancy <= occupancies[i].max_threads)
max_regs = occupancies[i].max_registers;
else
break;
}
assert(max_regs > 0 && "Thread count must be less than the maximum");
return max_regs;
}

View File

@@ -6,6 +6,7 @@
#include "util/u_dynarray.h"
#include "util/u_qsort.h"
#include "agx_builder.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_debug.h"
#include "agx_opcodes.h"
@@ -993,8 +994,31 @@ agx_ra(agx_context *ctx)
unsigned demand =
ALIGN_POT(agx_calc_register_demand(ctx, ncomps), reg_file_alignment);
unsigned max_possible_regs = AGX_NUM_REGS;
/* Compute shaders need to have their entire workgroup together, so our
* register usage is bounded by the workgroup size.
*/
if (gl_shader_stage_is_compute(ctx->stage)) {
unsigned threads_per_workgroup;
/* If we don't know the workgroup size, worst case it. TODO: Optimize
* this, since it'll decimate opencl perf.
*/
if (ctx->nir->info.workgroup_size_variable) {
threads_per_workgroup = 1024;
} else {
threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
ctx->nir->info.workgroup_size[1] *
ctx->nir->info.workgroup_size[2];
}
max_possible_regs =
agx_max_registers_for_occupancy(threads_per_workgroup);
}
/* TODO: Spilling. Abort so we don't smash the stack in release builds. */
if (demand > AGX_NUM_REGS) {
if (demand > max_possible_regs) {
fprintf(stderr, "\n");
fprintf(stderr, "------------------------------------------------\n");
fprintf(stderr, "Asahi Linux shader compiler limitation!\n");
@@ -1019,6 +1043,7 @@ agx_ra(agx_context *ctx)
/* ...but not too tightly */
assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
assert(max_regs <= max_possible_regs);
/* Assign registers in dominance-order. This coincides with source-order due
* to a NIR invariant, so we do not need special handling for this.