agx: Add forward optimizing pass for fmov

Explain the ideas behind our SSA-based optimizer (inspired by ACO's,
thank you to Daniel Schuermann for discussing this with me in the
context of Bifrost), and implement the subset needed to propagate
abs/neg through.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Acked-by: Jason Ekstrand <jason@jlekstrand.net>
Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10582>
Authored by Alyssa Rosenzweig on 2021-04-17 10:29:27 -04:00
Committed by Alyssa Rosenzweig
parent e50bae00f4, commit 28801b4849
4 changed files with 139 additions and 0 deletions

agx_compile.c

@@ -710,6 +710,11 @@ agx_compile_shader_nir(nir_shader *nir,
   agx_foreach_block(ctx, block)
      block->name = block_source_count++;

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_optimizer(ctx);

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

agx_compiler.h

@@ -547,6 +547,7 @@ agx_builder_insert(agx_cursor *cursor, agx_instr *I)
void agx_print_instr(agx_instr *I, FILE *fp);
void agx_print_block(agx_block *block, FILE *fp);
void agx_print_shader(agx_context *ctx, FILE *fp);
void agx_optimizer(agx_context *ctx);
void agx_ra(agx_context *ctx);
void agx_pack(agx_context *ctx, struct util_dynarray *emission);

agx_optimizer.c (new file)

@@ -0,0 +1,132 @@
/*
* Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "agx_compiler.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
* a forward direction and a backward direction, in each case traversing in
* source order. SSA means the forward pass satisfies the invariant:
*
* Every def is visited before any of its uses.
*
* Dually, the backward pass satisfies the invariant:
*
* Every use of a def is visited before the def.
*
* This means the forward pass can propagate modifiers forward, whereas the
* backwards pass propagates modifiers backward. Consider an example:
*
* 1 = fabs 0
* 2 = fround 1
* 3 = fsat 2
*
* The forwards pass would propagate the fabs to the fround (since we can
* look up the fabs from the fround source and do the replacement). By contrast,
* the backwards pass would propagate the fsat back to the fround (since when
* we see the fround we know it has only a single user, fsat). Propagatable
* instructions have natural directions (like pushforwards and pullbacks).
*
* We are careful to update the tracked state whenever we modify an instruction
* to ensure the passes are linear-time and converge in a single iteration.
*
* Size conversions are worth special discussion. Consider the snippet:
*
* 2 = fadd 0, 1
* 3 = f2f16 2
* 4 = fround 3
*
* A priori, we can move the f2f16 in either direction. But it's not equal --
* if we move it up to the fadd, we get FP16 for two instructions, whereas if
* we push it into the fround, we effectively get FP32 for two instructions. So
* f2f16 is backwards. Likewise, consider
*
* 2 = fadd 0, 1
* 3 = f2f32 2
* 4 = fround 3
*
* This time if we move f2f32 up to the fadd, we get FP32 for two, but if we
* move it down to the fround, we get FP16 for two. So f2f32 is forwards.
*/

/* An fmov is encoded as an fadd whose second source is negative zero */
static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD)
      && agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */
static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
   if (to.abs)
      from.neg = false;

   from.abs |= to.abs;
   from.neg |= to.neg;

   return from;
}

static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins, unsigned srcs)
{
   for (unsigned s = 0; s < srcs; ++s) {
      agx_index src = ins->src[s];
      if (src.type != AGX_INDEX_NORMAL) continue;

      agx_instr *def = defs[src.value];
      if (!agx_is_fmov(def)) continue;
      if (def->saturate) continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

static void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));

   agx_foreach_instr_global(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      /* Record the instruction defining each SSA value */
      for (unsigned d = 0; d < info.nr_dests; ++d) {
         assert(I->dest[d].type == AGX_INDEX_NORMAL);
         defs[I->dest[d].value] = I;
      }

      /* Propagate fmov down */
      if (info.is_float)
         agx_optimizer_fmov(defs, I, info.nr_srcs);
   }

   free(defs);
}

void
agx_optimizer(agx_context *ctx)
{
   agx_optimizer_forward(ctx);
}

meson.build

@@ -23,6 +23,7 @@ libasahi_agx_files = files(
  'agx_compile.c',
  'agx_pack.c',
  'agx_print.c',
  'agx_optimizer.c',
  'agx_register_allocate.c',
)