broadcom/compiler: add more lowerings/optimizations on v3d_optimize_nir

Optimizations that we are already calling on the Vulkan driver. This is preparation for the Vulkan frontend to use v3d_optimize_nir too. We need to add a new parameter to v3d_optimize_nir in order to know if we can call nir_opt_find_array_copies. As we don't track whether nir_lower_var_copies has been called, we explicitly call it when we create the uncompiled shader. So instead of tracking, we assume that each driver (v3d/v3dv) will call it when the shader is created. So when v3d_optimize_nir is called as part of the process of compiling the shader in the compiler, we call it with allow_copies as false. We exclude nir_opt_gcm on purpose, as it is a case of an optimization that could help performance even if it hurts shaderdb stats. shaderdb stats: total instructions in shared programs: 11705923 -> 11705034 (<.01%) instructions in affected programs: 88350 -> 87461 (-1.01%) helped: 201 HURT: 80 Instructions are helped. total threads in shared programs: 375552 -> 375558 (<.01%) threads in affected programs: 6 -> 12 (100.00%) helped: 3 HURT: 0 total uniforms in shared programs: 3486108 -> 3485789 (<.01%) uniforms in affected programs: 7473 -> 7154 (-4.27%) helped: 90 HURT: 1 Uniforms are helped. total max-temps in shared programs: 2021860 -> 2021802 (<.01%) max-temps in affected programs: 800 -> 742 (-7.25%) helped: 21 HURT: 3 Max-temps are helped. total sfu-stalls in shared programs: 19299 -> 19296 (-0.02%) sfu-stalls in affected programs: 18 -> 15 (-16.67%) helped: 10 HURT: 7 Inconclusive result (value mean confidence interval includes 0). total inst-and-stalls in shared programs: 11725222 -> 11724330 (<.01%) inst-and-stalls in affected programs: 88402 -> 87510 (-1.01%) helped: 201 HURT: 80 Inst-and-stalls are helped. total nops in shared programs: 269674 -> 269386 (-0.11%) nops in affected programs: 3641 -> 3353 (-7.91%) helped: 103 HURT: 29 Nops are helped. 
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17185>
2022-06-13 12:43:12 +02:00
parent 9cbc3ab239
commit 0bf31b0710
4 changed files with 50 additions and 6 deletions
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2126,7 +2126,7 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
 }

 void
-v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
+v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s, bool allow_copies)
 {
        bool progress;
        unsigned lower_flrp =
@@ -2137,7 +2137,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
        do {
                progress = false;

+                NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
+                NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
+                NIR_PASS(progress, s, nir_opt_deref);
+
                NIR_PASS(progress, s, nir_lower_vars_to_ssa);
+                if (allow_copies) {
+                        /* Only run this pass if nir_lower_var_copies was not called
+                         * yet. That would lower away any copy_deref instructions and we
+                         * don't want to introduce any more.
+                         */
+                        NIR_PASS(progress, s, nir_opt_find_array_copies);
+                }
+
+                NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+                NIR_PASS(progress, s, nir_opt_dead_write_vars);
+                NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
+
+                NIR_PASS(progress, s, nir_remove_dead_variables,
+                         (nir_variable_mode)(nir_var_function_temp |
+                                             nir_var_shader_temp |
+                                             nir_var_mem_shared),
+                         NULL);
+
                NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
                NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
                NIR_PASS(progress, s, nir_copy_prop);
@@ -2145,10 +2167,27 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
                NIR_PASS(progress, s, nir_opt_dce);
                NIR_PASS(progress, s, nir_opt_dead_cf);
                NIR_PASS(progress, s, nir_opt_cse);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
                NIR_PASS(progress, s, nir_opt_algebraic);
                NIR_PASS(progress, s, nir_opt_constant_folding);

+                NIR_PASS(progress, s, nir_opt_intrinsics);
+                NIR_PASS(progress, s, nir_opt_idiv_const, 32);
+                NIR_PASS(progress, s, nir_lower_alu);
+
+                if (nir_opt_trivial_continues(s)) {
+                   progress = true;
+                   NIR_PASS(progress, s, nir_copy_prop);
+                   NIR_PASS(progress, s, nir_opt_dce);
+                }
+
+                NIR_PASS(progress, s, nir_opt_conditional_discard);
+
+                NIR_PASS(progress, s, nir_opt_remove_phis);
+                NIR_PASS(progress, s, nir_opt_if, false);
+                NIR_PASS(progress, s, nir_opt_undef);
+
                /* Note that vectorization may undo the load/store scalarization
                 * pass we run for non 32-bit TMU general load/store by
                 * converting, for example, 2 consecutive 16-bit loads into a
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -1071,7 +1071,7 @@ vir_has_uniform(struct qinst *inst)
 const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo,
                                             uint32_t max_inline_uniform_buffers);
 void v3d_compiler_free(const struct v3d_compiler *compiler);
-void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);
+void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s, bool allow_copies);

 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -930,7 +930,7 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
        NIR_PASS(_, c->s, nir_remove_unused_io_vars,
                 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
-        v3d_optimize_nir(c, c->s);
+        v3d_optimize_nir(c, c->s, false);
        NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
@@ -964,7 +964,7 @@ v3d_nir_lower_gs_early(struct v3d_compile *c)
        NIR_PASS(_, c->s, nir_remove_unused_io_vars,
                 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
-        v3d_optimize_nir(c, c->s);
+        v3d_optimize_nir(c, c->s, false);
        NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
@@ -1607,7 +1607,7 @@ v3d_attempt_compile(struct v3d_compile *c)

        NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);

-        v3d_optimize_nir(c, c->s);
+        v3d_optimize_nir(c, c->s, false);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic.  Note that it may
--- a/src/gallium/drivers/v3d/v3d_program.c
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -318,7 +318,12 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx,

        NIR_PASS(_, s, nir_lower_load_const_to_scalar);

-        v3d_optimize_nir(NULL, s);
+        v3d_optimize_nir(NULL, s, true);
+
+        NIR_PASS(_, s, nir_lower_var_copies);
+
+        /* Get rid of split copies */
+        v3d_optimize_nir(NULL, s, false);

        NIR_PASS(_, s, nir_remove_dead_variables, nir_var_function_temp, NULL);