broadcom/compiler: add a compiler strategy to disable loop unrolling

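Loop unrolling can increase register pressure significantly, leading to lower thread counts and spilling. This adds a "disable loop unrolling" compile strategy that is tried right after the default one. Note that v3d_optimize_nir() now only unrolls when it is given a compile context whose disable_loop_unrolling flag is clear, so the call in v3d_uncompiled_shader_create(), which passes NULL, no longer unrolls loops at that stage.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10647>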
2021-05-03 10:14:12 +02:00
parent 4742300e6b
commit 296fe4daa6
4 changed files with 18 additions and 10 deletions
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1774,7 +1774,7 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
 }

 void
-v3d_optimize_nir(struct nir_shader *s)
+v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
 {
        bool progress;
        unsigned lower_flrp =
@@ -1826,7 +1826,8 @@ v3d_optimize_nir(struct nir_shader *s)
                NIR_PASS(progress, s, nir_opt_undef);
                NIR_PASS(progress, s, nir_lower_undef_to_zero);

-                if (s->options->max_unroll_iterations > 0) {
+                if (c && !c->disable_loop_unrolling &&
+                    s->options->max_unroll_iterations > 0) {
                        NIR_PASS(progress, s, nir_opt_loop_unroll,
                                 nir_var_shader_in |
                                 nir_var_shader_out |
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -660,6 +660,9 @@ struct v3d_compile {
         */
        bool disable_ldunif_opt;

+        /* Disables loop unrolling to reduce register pressure. */
+        bool disable_loop_unrolling;
+
        /* Minimum number of threads we are willing to use to register allocate
         * a shader with the current compilation strategy. This only prevents
         * us from lowering the thread count to register allocate successfully,
@@ -939,7 +942,7 @@ vir_has_uniform(struct qinst *inst)

 const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
 void v3d_compiler_free(const struct v3d_compiler *compiler);
-void v3d_optimize_nir(struct nir_shader *s);
+void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);

 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                 void *debug_output_data,
                 int program_id, int variant_id,
                 uint32_t min_threads_for_reg_alloc,
+                 bool disable_loop_unrolling,
                 bool disable_constant_ubo_load_sorting,
                 bool disable_tmu_pipelining,
                 bool fallback_scheduler)
@@ -545,6 +546,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
        c->fallback_scheduler = fallback_scheduler;
        c->disable_tmu_pipelining = disable_tmu_pipelining;
        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
+        c->disable_loop_unrolling = disable_loop_unrolling;

        s = nir_shader_clone(c, s);
        c->s = s;
@@ -867,7 +869,7 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
-        v3d_optimize_nir(c->s);
+        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
@@ -901,7 +903,7 @@ v3d_nir_lower_gs_early(struct v3d_compile *c)
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
-        v3d_optimize_nir(c->s);
+        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
@@ -1417,7 +1419,7 @@ v3d_attempt_compile(struct v3d_compile *c)

        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);

-        v3d_optimize_nir(c->s);
+        v3d_optimize_nir(c, c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic.  Note that it may
@@ -1537,6 +1539,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                uint32_t min_threads_for_reg_alloc;
        } static const strategies[] = {
                { "default",                  4 },
+                { "disable loop unrolling",   4 },
                { "disable UBO load sorting", 1 },
                { "disable TMU pipelining",   1 },
                { "fallback scheduler",       1 }
@@ -1547,9 +1550,10 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                     debug_output, debug_output_data,
                                     program_id, variant_id,
                                     strategies[i].min_threads_for_reg_alloc,
-                                     i > 0, /* Disable UBO load sorting */
-                                     i > 1, /* Disable TMU pipelining */
-                                     i > 2  /* Fallback_scheduler */);
+                                     i > 0, /* Disable loop unrolling */
+                                     i > 1, /* Disable UBO load sorting */
+                                     i > 2, /* Disable TMU pipelining */
+                                     i > 3  /* Fallback_scheduler */);

                v3d_attempt_compile(c);

--- a/src/gallium/drivers/v3d/v3d_program.c
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -318,7 +318,7 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx,

        NIR_PASS_V(s, nir_lower_load_const_to_scalar);

-        v3d_optimize_nir(s);
+        v3d_optimize_nir(NULL, s);

        NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);