nir/lower_vec_to_movs: don't vectorize unsupported ops

If the instruction being coalesced would be vectorized but the target
doesn't support vectorizing that op, skip coalescing.
Reuse the callbacks from alu_to_scalar to describe which ops should not
be vectorized.

Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Eric Anholt <eric@anholt.net>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6506>
Author:    Erico Nunes
Date:      2020-08-30 15:07:23 +02:00
Committed: Marge Bot
Parent:    b75d8052a7
Commit:    faaba0d6af
7 changed files with 57 additions and 12 deletions
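
For reference, a driver that wants to keep certain ALU ops scalar would hook into the
new pass signature roughly as below. This is a minimal sketch in the spirit of the lima
change further down; the callback name, the fsin/fcos example ops, and the call site
are illustrative and not part of this commit.

#include "nir.h"
#include "util/bitscan.h"

/* Hypothetical filter: a single-component write never vectorizes anything,
 * so always allow it; otherwise refuse to coalesce ops this backend can only
 * execute per-component (fsin/fcos are just placeholders here). Returning
 * false makes try_coalesce() bail, so the op stays scalar. */
static bool
my_vec_to_movs_filter_cb(const nir_instr *instr, unsigned writemask,
                         const void *data)
{
   if (util_bitcount(writemask) == 1)
      return true;

   if (instr->type != nir_instr_type_alu)
      return true;

   switch (nir_instr_as_alu(instr)->op) {
   case nir_op_fsin:
   case nir_op_fcos:
      return false;
   default:
      return true;
   }
}

/* Call site; drivers with no restrictions pass NULL, NULL as before. */
NIR_PASS_V(s, nir_lower_vec_to_movs, my_vec_to_movs_filter_cb, NULL);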

src/compiler/nir/nir.h

@@ -4088,6 +4088,14 @@ static inline bool should_print_nir(nir_shader *shader) { return false; }
  */
 typedef bool (*nir_instr_filter_cb)(const nir_instr *, const void *);
 
+/** An instruction filtering callback with writemask
+ *
+ * Returns true if the instruction should be processed with the associated
+ * writemask and false otherwise.
+ */
+typedef bool (*nir_instr_writemask_filter_cb)(const nir_instr *,
+                                              unsigned writemask, const void *);
+
 /** A simple instruction lowering callback
  *
  * Many instruction lowering passes can be written as a simple function which
@@ -4457,7 +4465,8 @@ bool nir_lower_variable_initializers(nir_shader *shader,
                                      nir_variable_mode modes);
 
 bool nir_move_vec_src_uses_to_dest(nir_shader *shader);
-bool nir_lower_vec_to_movs(nir_shader *shader);
+bool nir_lower_vec_to_movs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
+                           const void *_data);
 void nir_lower_alpha_test(nir_shader *shader, enum compare_func func,
                           bool alpha_to_one,
                           const gl_state_index16 *alpha_ref_state_tokens);

src/compiler/nir/nir_lower_vec_to_movs.c

@@ -28,6 +28,11 @@
 #include "nir.h"
 #include "nir_builder.h"
 
+struct vec_to_movs_data {
+   nir_instr_writemask_filter_cb cb;
+   const void *data;
+};
+
 /*
  * Implements a simple pass that lowers vecN instructions to a series of
  * moves with partial writes.
@@ -119,8 +124,10 @@ has_replicated_dest(nir_alu_instr *alu)
  * can then call insert_mov as normal.
  */
 static unsigned
-try_coalesce(nir_alu_instr *vec, unsigned start_idx)
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, void *_data)
 {
+   struct vec_to_movs_data *data = _data;
+
    assert(start_idx < nir_op_infos[vec->op].num_inputs);
 
    /* We will only even try if the source is SSA */
@@ -178,6 +185,7 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx)
       for (unsigned i = 0; i < 4; i++)
          swizzles[j][i] = src_alu->src[j].swizzle[i];
 
+   /* Generate the final write mask */
    unsigned write_mask = 0;
    for (unsigned i = start_idx; i < 4; i++) {
       if (!(vec->dest.write_mask & (1 << i)))
@@ -187,10 +195,21 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx)
           vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
          continue;
 
-      /* At this point, the give vec source matchese up with the ALU
+      write_mask |= 1 << i;
+   }
+
+   /* If the instruction would be vectorized but the backend
+    * doesn't support vectorizing this op, abort. */
+   if (data->cb && !data->cb(&src_alu->instr, write_mask, data->data))
+      return 0;
+
+   for (unsigned i = start_idx; i < 4; i++) {
+      if (!(write_mask & (1 << i)))
+         continue;
+
+      /* At this point, the given vec source matches up with the ALU
        * instruction so we can re-swizzle that component to match.
        */
-      write_mask |= 1 << i;
       if (has_replicated_dest(src_alu)) {
          /* Since the destination is a single replicated value, we don't need
           * to do any reswizzling
@@ -266,7 +285,7 @@ nir_lower_vec_to_movs_instr(nir_builder *b, nir_instr *instr, void *data)
        * vecN had an SSA destination.
        */
       if (vec_had_ssa_dest && !(finished_write_mask & (1 << i)))
-         finished_write_mask |= try_coalesce(vec, i);
+         finished_write_mask |= try_coalesce(vec, i, data);
 
       if (!(finished_write_mask & (1 << i)))
         finished_write_mask |= insert_mov(vec, i, b->shader);
@@ -279,11 +298,17 @@ nir_lower_vec_to_movs_instr(nir_builder *b, nir_instr *instr, void *data)
 }
 
 bool
-nir_lower_vec_to_movs(nir_shader *shader)
+nir_lower_vec_to_movs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
+                      const void *_data)
 {
+   struct vec_to_movs_data data = {
+      .cb = cb,
+      .data = _data,
+   };
+
    return nir_shader_instructions_pass(shader,
                                        nir_lower_vec_to_movs_instr,
                                        nir_metadata_block_index |
                                        nir_metadata_dominance,
-                                       NULL);
+                                       &data);
 }

src/gallium/auxiliary/nir/nir_to_tgsi.c

@@ -2622,7 +2622,7 @@ nir_to_tgsi(struct nir_shader *s,
              nir_lower_float_source_mods |
              nir_lower_int_source_mods); /* no doubles */
    NIR_PASS_V(s, nir_convert_from_ssa, true);
-   NIR_PASS_V(s, nir_lower_vec_to_movs);
+   NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);
 
    /* locals_to_regs will leave dead derefs that are good to clean up. */
    NIR_PASS_V(s, nir_lower_locals_to_regs);

src/gallium/drivers/freedreno/a2xx/ir2_nir.c

@@ -1111,7 +1111,7 @@ ir2_nir_compile(struct ir2_context *ctx, bool binning)
 
    OPT_V(ctx->nir, nir_convert_from_ssa, true);
    OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
-   OPT_V(ctx->nir, nir_lower_vec_to_movs);
+   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);
 
    OPT_V(ctx->nir, nir_opt_dce);

src/gallium/drivers/lima/lima_program.c

@@ -191,6 +191,17 @@ lima_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
    return false;
 }
 
+static bool
+lima_vec_to_movs_filter_cb(const nir_instr *instr, unsigned writemask,
+                           const void *data)
+{
+   assert(writemask > 0);
+
+   if (util_bitcount(writemask) == 1)
+      return true;
+
+   return !lima_alu_to_scalar_filter_cb(instr, data);
+}
+
 void
 lima_program_optimize_fs_nir(struct nir_shader *s,
                              struct nir_lower_tex_options *tex_options)
@@ -252,7 +263,7 @@ lima_program_optimize_fs_nir(struct nir_shader *s,
    NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
 
    NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
-   NIR_PASS_V(s, nir_lower_vec_to_movs);
+   NIR_PASS_V(s, nir_lower_vec_to_movs, lima_vec_to_movs_filter_cb, NULL);
 
    NIR_PASS_V(s, lima_nir_duplicate_load_uniforms);
    NIR_PASS_V(s, lima_nir_duplicate_load_inputs);

src/intel/compiler/brw_nir.c

@@ -1183,7 +1183,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 
    if (!is_scalar) {
       OPT(nir_move_vec_src_uses_to_dest);
-      OPT(nir_lower_vec_to_movs);
+      OPT(nir_lower_vec_to_movs, NULL, NULL);
    }
 
    OPT(nir_opt_dce);

src/panfrost/midgard/midgard_compile.c

@@ -348,7 +348,7 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
 
    /* We are a vector architecture; write combine where possible */
    NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
-   NIR_PASS(progress, nir, nir_lower_vec_to_movs);
+   NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);
 
    NIR_PASS(progress, nir, nir_opt_dce);
 }