ac/nir: Lower large indirect variables to scratch
results from radeonsi NIR:

Totals from affected shaders:
SGPRS: 704 -> 464 (-34.09 %)
VGPRS: 2056 -> 672 (-67.32 %)
Spilled SGPRs: 24 -> 0 (-100.00 %)
Spilled VGPRs: 28406 -> 0 (-100.00 %)
Private memory VGPRs: 0 -> 3182 (0.00 %)
Scratch size: 1064 -> 3228 (203.38 %) dwords per thread
Code Size: 935260 -> 40180 (-95.70 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 28 -> 70 (150.00 %)
Wait states: 0 -> 0 (0.00 %)

results from radv:

Totals from affected shaders:
SGPRS: 80 -> 48 (-40.00 %)
VGPRS: 204 -> 108 (-47.06 %)
Spilled SGPRs: 0 -> 0 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 256 (0.00 %) dwords per thread
Code Size: 15792 -> 9504 (-39.82 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 1 -> 2 (100.00 %)
Wait states: 0 -> 0 (0.00 %)

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
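In NIR terms, the pass targets function-temp variables that are both large and indirectly indexed. As a rough illustration (plain C, not shader or driver code; the array size and function name are invented for the sketch), this is the shape of variable that now goes to scratch:

/* Illustration only: a function-temp array over the pass's 256-byte
 * threshold, accessed with a dynamic index. Per the comment added in the
 * last hunk below, such accesses previously expanded into large if
 * ladders; the variable is now placed in scratch and accessed through
 * load_scratch/store_scratch intrinsics instead. */
float pick_weight(const float *in, int i)
{
	float temp[128];              /* 128 * 4 = 512 bytes > 256 */
	for (int j = 0; j < 128; j++)
		temp[j] = in[j] * 0.5f;
	return temp[i & 127];         /* indirect (dynamic) index */
}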
@@ -42,6 +42,8 @@ struct ac_nir_context {
 
 	LLVMValueRef *ssa_defs;
 
+	LLVMValueRef scratch;
+
 	struct hash_table *defs;
 	struct hash_table *phis;
 	struct hash_table *vars;
@@ -3573,6 +3575,36 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
 	case nir_intrinsic_mbcnt_amd:
 		result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0]));
 		break;
+	case nir_intrinsic_load_scratch: {
+		LLVMValueRef offset = get_src(ctx, instr->src[0]);
+		LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch,
+						 offset);
+		LLVMTypeRef comp_type =
+			LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
+		LLVMTypeRef vec_type =
+			instr->dest.ssa.num_components == 1 ? comp_type :
+			LLVMVectorType(comp_type, instr->dest.ssa.num_components);
+		unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
+		ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
+				       LLVMPointerType(vec_type, addr_space), "");
+		result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+		break;
+	}
+	case nir_intrinsic_store_scratch: {
+		LLVMValueRef offset = get_src(ctx, instr->src[1]);
+		LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch,
+						 offset);
+		LLVMTypeRef comp_type =
+			LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size);
+		LLVMTypeRef vec_type =
+			instr->src[0].ssa->num_components == 1 ? comp_type :
+			LLVMVectorType(comp_type, instr->src[0].ssa->num_components);
+		unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
+		ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
+				       LLVMPointerType(vec_type, addr_space), "");
+		LLVMBuildStore(ctx->ac.builder, get_src(ctx, instr->src[0]), ptr);
+		break;
+	}
 	default:
 		fprintf(stderr, "Unknown intrinsic: ");
 		nir_print_instr(&instr->instr, stderr);
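The two intrinsic handlers above share one addressing scheme: ctx->scratch is a plain i8 array, the intrinsics carry byte offsets, and each access GEPs to the byte and bitcasts the pointer to the accessed vector type. A minimal C analogy (names and the buffer size are invented; memcpy stands in for the bitcast plus typed load/store):

#include <stdint.h>
#include <string.h>

/* Stand-in for the per-invocation i8 alloca created by setup_scratch(). */
static uint8_t scratch[1024];

/* store_scratch analogy: index to the byte offset, then write the value
 * reinterpreted at its own type. */
static void store_scratch_f32x4(uint32_t byte_offset, const float val[4])
{
	memcpy(&scratch[byte_offset], val, 4 * sizeof(float));
}

/* load_scratch analogy: same byte indexing, read back at the access type. */
static void load_scratch_f32x4(uint32_t byte_offset, float out[4])
{
	memcpy(out, &scratch[byte_offset], 4 * sizeof(float));
}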
@@ -4473,6 +4505,18 @@ setup_locals(struct ac_nir_context *ctx,
 	}
 }
 
+static void
+setup_scratch(struct ac_nir_context *ctx,
+	      struct nir_shader *shader)
+{
+	if (shader->scratch_size == 0)
+		return;
+
+	ctx->scratch = ac_build_alloca_undef(&ctx->ac,
+					     LLVMArrayType(ctx->ac.i8, shader->scratch_size),
+					     "scratch");
+}
+
 static void
 setup_shared(struct ac_nir_context *ctx,
 	     struct nir_shader *nir)
@@ -4518,6 +4562,7 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 	ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef));
 
 	setup_locals(&ctx, func);
+	setup_scratch(&ctx, nir);
 
 	if (gl_shader_stage_is_compute(nir->info.stage))
 		setup_shared(&ctx, nir);
@@ -4539,6 +4584,15 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 void
 ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class)
 {
+	/* Lower large variables to scratch first so that we won't bloat the
+	 * shader by generating large if ladders for them. We later lower
+	 * scratch to alloca's, assuming LLVM won't generate VGPR indexing.
+	 */
+	NIR_PASS_V(nir, nir_lower_vars_to_scratch,
+		   nir_var_function_temp,
+		   256,
+		   glsl_get_natural_size_align_bytes);
+
 	/* While it would be nice not to have this flag, we are constrained
 	 * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
 	 */
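The "if ladders" the new comment refers to are what indirect-deref lowering produces for a dynamically indexed array when the variable stays in registers: code that grows with the array size, one element per step. A hedged sketch of that expansion for a 4-element array (illustrative C, written as a select chain; not the pass's literal output):

/* What an indirect read of a tiny array lowers to without the scratch
 * path. The chain grows linearly with element count, which is why arrays
 * past the 256-byte threshold bloated code size before this change. */
static float load_indirect(const float v[4], int i)
{
	return i == 0 ? v[0] :
	       i == 1 ? v[1] :
	       i == 2 ? v[2] : v[3];
}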