From 296fe4daa64024530d7dcf66e55ef43c75cf53eb Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 3 May 2021 10:14:12 +0200 Subject: [PATCH] broadcom/compiler: add a compiler strategy to disable loop unrolling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Loop unrolling can increase register pressure significantly, leading to lower thread counts and spilling. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 5 +++-- src/broadcom/compiler/v3d_compiler.h | 5 ++++- src/broadcom/compiler/vir.c | 16 ++++++++++------ src/gallium/drivers/v3d/v3d_program.c | 2 +- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 3cec6ba9bcd..43ce7a0ffbc 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1774,7 +1774,7 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, } void -v3d_optimize_nir(struct nir_shader *s) +v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) { bool progress; unsigned lower_flrp = @@ -1826,7 +1826,8 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_undef); NIR_PASS(progress, s, nir_lower_undef_to_zero); - if (s->options->max_unroll_iterations > 0) { + if (c && !c->disable_loop_unrolling && + s->options->max_unroll_iterations > 0) { NIR_PASS(progress, s, nir_opt_loop_unroll, nir_var_shader_in | nir_var_shader_out | diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index fe2f44d8134..9b87dd77dcf 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -660,6 +660,9 @@ struct v3d_compile { */ bool disable_ldunif_opt; + /* Disables loop unrolling to reduce register pressure. */ + bool disable_loop_unrolling; + /* Minimum number of threads we are willing to use to register allocate * a shader with the current compilation strategy. 
This only prevents * us from lowering the thread count to register allocate successfully, @@ -939,7 +942,7 @@ vir_has_uniform(struct qinst *inst) const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); void v3d_compiler_free(const struct v3d_compiler *compiler); -void v3d_optimize_nir(struct nir_shader *s); +void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 48eba571727..3a35df247f1 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler, void *debug_output_data, int program_id, int variant_id, uint32_t min_threads_for_reg_alloc, + bool disable_loop_unrolling, bool disable_constant_ubo_load_sorting, bool disable_tmu_pipelining, bool fallback_scheduler) @@ -545,6 +546,7 @@ vir_compile_init(const struct v3d_compiler *compiler, c->fallback_scheduler = fallback_scheduler; c->disable_tmu_pipelining = disable_tmu_pipelining; c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; + c->disable_loop_unrolling = disable_loop_unrolling; s = nir_shader_clone(c, s); c->s = s; @@ -867,7 +869,7 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) NIR_PASS_V(c->s, nir_remove_unused_io_vars, nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ NIR_PASS_V(c->s, nir_lower_global_vars_to_local); - v3d_optimize_nir(c->s); + v3d_optimize_nir(c, c->s); NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ @@ -901,7 +903,7 @@ v3d_nir_lower_gs_early(struct v3d_compile *c) NIR_PASS_V(c->s, nir_remove_unused_io_vars, nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ NIR_PASS_V(c->s, nir_lower_global_vars_to_local); - v3d_optimize_nir(c->s); + v3d_optimize_nir(c, c->s); NIR_PASS_V(c->s, 
nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ @@ -1417,7 +1419,7 @@ v3d_attempt_compile(struct v3d_compile *c) NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s); - v3d_optimize_nir(c->s); + v3d_optimize_nir(c, c->s); /* Do late algebraic optimization to turn add(a, neg(b)) back into * subs, then the mandatory cleanup after algebraic. Note that it may @@ -1537,6 +1539,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, uint32_t min_threads_for_reg_alloc; } static const strategies[] = { { "default", 4 }, + { "disable loop unrolling", 4 }, { "disable UBO load sorting", 1 }, { "disable TMU pipelining", 1 }, { "fallback scheduler", 1 } @@ -1547,9 +1550,10 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, debug_output, debug_output_data, program_id, variant_id, strategies[i].min_threads_for_reg_alloc, - i > 0, /* Disable UBO load sorting */ - i > 1, /* Disable TMU pipelining */ - i > 2 /* Fallback_scheduler */); + i > 0, /* Disable loop unrolling */ + i > 1, /* Disable UBO load sorting */ + i > 2, /* Disable TMU pipelining */ + i > 3 /* Fallback_scheduler */); v3d_attempt_compile(c); diff --git a/src/gallium/drivers/v3d/v3d_program.c b/src/gallium/drivers/v3d/v3d_program.c index 52ab2cf6d63..4050b933319 100644 --- a/src/gallium/drivers/v3d/v3d_program.c +++ b/src/gallium/drivers/v3d/v3d_program.c @@ -318,7 +318,7 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx, NIR_PASS_V(s, nir_lower_load_const_to_scalar); - v3d_optimize_nir(s); + v3d_optimize_nir(NULL, s); NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);