radv: Add new linking step and use nir_opt_varyings.

The nir_opt_varyings pass is the new NIR solution for shader linking, with new features including better I/O compaction, packing 16-bit I/O, inter-stage code motion and more. Fossil DB stats on Rembrandt: Totals from 34585 (43.56% of 79395) affected shaders: MaxWaves: 873362 -> 873260 (-0.01%); split: +0.11%, -0.12% Instrs: 21543639 -> 21526956 (-0.08%); split: -0.27%, +0.19% CodeSize: 115077568 -> 115015536 (-0.05%); split: -0.25%, +0.20% VGPRs: 1465152 -> 1464192 (-0.07%); split: -0.29%, +0.22% Inputs: 161776 -> 158711 (-1.89%); split: -1.90%, +0.00% Outputs: 46532551993 -> 46532548680 (-0.00%); split: -0.00%, +0.00% LDS: 70597120 -> 70794752 (+0.28%); split: -0.04%, +0.32% Latency: 162963576 -> 162785055 (-0.11%); split: -0.25%, +0.14% InvThroughput: 37356298 -> 37261700 (-0.25%); split: -0.37%, +0.12% VClause: 427827 -> 427105 (-0.17%); split: -0.35%, +0.18% SClause: 669989 -> 668623 (-0.20%); split: -0.36%, +0.15% Copies: 1582166 -> 1582592 (+0.03%); split: -0.36%, +0.39% Branches: 523203 -> 523789 (+0.11%); split: -0.04%, +0.15% PreSGPRs: 1272992 -> 1273228 (+0.02%); split: -0.05%, +0.07% PreVGPRs: 1164295 -> 1161623 (-0.23%); split: -0.43%, +0.20% VALU: 13733432 -> 13714109 (-0.14%); split: -0.35%, +0.21% SALU: 2828974 -> 2831841 (+0.10%); split: -0.12%, +0.22% VMEM: 748396 -> 748500 (+0.01%); split: -0.16%, +0.18% SMEM: 1263487 -> 1263329 (-0.01%); split: -0.03%, +0.02% Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28676>
2024-04-17 00:19:02 +02:00
parent fcb2c62b63
commit 17f6ab28cc
2 changed files with 142 additions and 0 deletions
--- a/src/amd/vulkan/radv_pipeline_graphics.c
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
@@ -12,6 +12,7 @@
 #include "nir/nir.h"
 #include "nir/nir_builder.h"
 #include "nir/nir_serialize.h"
+#include "nir/nir_xfb_info.h"
 #include "nir/radv_nir.h"
 #include "spirv/nir_spirv.h"
 #include "util/disk_cache.h"
@@ -22,6 +23,7 @@
 #include "radv_debug.h"
 #include "radv_entrypoints.h"
 #include "radv_formats.h"
+#include "radv_physical_device.h"
 #include "radv_pipeline_cache.h"
 #include "radv_rmv.h"
 #include "radv_shader.h"
@@ -1519,6 +1521,141 @@ radv_graphics_shaders_link(const struct radv_device *device, const struct radv_g
   }
 }

+/**
+ * First pass of varying optimization.
+ * This function is called for each producer/consumer shader pair, from first to last.
+ *
+ * 1. Run some NIR passes in preparation.
+ * 2. Optimize varyings.
+ * 3. If either shader changed, run algebraic optimizations on that shader.
+ */
+static void
+radv_graphics_shaders_link_varyings_first(struct radv_shader_stage *producer_stage,
+                                          struct radv_shader_stage *consumer_stage)
+{
+   nir_shader *producer = producer_stage->nir;
+   nir_shader *consumer = consumer_stage->nir;
+
+   /* It is expected by nir_opt_varyings that no undefined stores are present in the shader. */
+   NIR_PASS(_, producer, nir_opt_undef);
+
+   /* Update load/store alignments because inter-stage code motion may move instructions used to deduce this info. */
+   NIR_PASS(_, consumer, nir_opt_load_store_update_alignments);
+
+   /* Scalarize all I/O, because nir_opt_varyings and nir_opt_vectorize_io expect all I/O to be scalarized. */
+   NIR_PASS(_, producer, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
+   NIR_PASS(_, consumer, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
+
+   /* Eliminate useless vec->mov copies resulting from scalarization. */
+   NIR_PASS(_, producer, nir_copy_prop);
+
+   const nir_opt_varyings_progress p = nir_opt_varyings(producer, consumer, true, 0, 0);
+
+   /* Run algebraic optimizations only on the shaders for which nir_opt_varyings reported progress. */
+   if (p & nir_progress_producer) {
+      radv_optimize_nir_algebraic(producer, false, false);
+   }
+   if (p & nir_progress_consumer) {
+      radv_optimize_nir_algebraic(consumer, false, false);
+   }
+}
+
+/**
+ * Second pass of varying optimization.
+ * This function is called for each producer/consumer shader pair, from last to first,
+ * after the first pass has already been called for each pair.
+ * This is done because the previous pass might have enabled additional
+ * opportunities for optimization.
+ *
+ * 1. Optimize varyings again.
+ * 2. If either shader changed, run algebraic optimizations on that shader.
+ * 3. Run some NIR passes to clean up the shaders.
+ */
+static void
+radv_graphics_shaders_link_varyings_second(struct radv_shader_stage *producer_stage,
+                                           struct radv_shader_stage *consumer_stage)
+{
+   nir_shader *producer = producer_stage->nir;
+   nir_shader *consumer = consumer_stage->nir;
+
+   const nir_opt_varyings_progress p = nir_opt_varyings(producer, consumer, true, 0, 0);
+
+   /* Run algebraic optimizations only on the shaders for which nir_opt_varyings reported progress. */
+   if (p & nir_progress_producer) {
+      radv_optimize_nir_algebraic(producer, true, false);
+   }
+   if (p & nir_progress_consumer) {
+      radv_optimize_nir_algebraic(consumer, true, false);
+   }
+
+   /* Re-vectorize I/O for stages that output to memory (LDS or VRAM).
+    * Don't vectorize FS inputs, doing so just regresses shader stats without any benefit.
+    * There is also no benefit from re-vectorizing the outputs of the last pre-rasterization
+    * stage here, because ac_nir_lower_ngg/legacy already takes care of that.
+    */
+   if (consumer->info.stage != MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, producer, nir_opt_vectorize_io, nir_var_shader_out);
+      NIR_PASS(_, consumer, nir_opt_vectorize_io, nir_var_shader_in);
+   }
+
+   /* Recompute driver locations of PS inputs
+    * because the backend compiler relies on their driver locations.
+    */
+   if (consumer->info.stage == MESA_SHADER_FRAGMENT)
+      nir_recompute_io_bases(consumer, nir_var_shader_in);
+
+   /* Gather shader info unconditionally; at least the I/O info likely changed,
+    * and changes to only the I/O info are not reflected in nir_opt_varyings_progress.
+    */
+   nir_shader_gather_info(producer, nir_shader_get_entrypoint(producer));
+   nir_shader_gather_info(consumer, nir_shader_get_entrypoint(consumer));
+
+   /* Recreate XFB info from intrinsics (nir_opt_varyings may have changed it). */
+   if (producer->xfb_info) {
+      nir_gather_xfb_info_from_intrinsics(producer);
+   }
+}
+
+/**
+ * Varying optimizations performed on lowered shader I/O.
+ *
+ * We do this after lowering shader I/O because this is more effective
+ * than running the same optimizations on I/O derefs.
+ */
+static void
+radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages)
+{
+   /* First pass: optimize varyings from first to last stage, skipping stages that have no NIR. */
+   gl_shader_stage prev = MESA_SHADER_NONE;
+   for (int i = 0; i < ARRAY_SIZE(graphics_shader_order); ++i) {
+      gl_shader_stage s = graphics_shader_order[i];
+      if (!stages[s].nir)
+         continue;
+
+      if (prev != MESA_SHADER_NONE) {
+         if (!stages[prev].key.optimisations_disabled && !stages[s].key.optimisations_disabled)
+            radv_graphics_shaders_link_varyings_first(&stages[prev], &stages[s]);
+      }
+
+      prev = s;
+   }
+
+   /* Second pass: optimize varyings from last to first stage, skipping stages that have no NIR. */
+   gl_shader_stage next = MESA_SHADER_NONE;
+   for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; --i) {
+      gl_shader_stage s = graphics_shader_order[i];
+      if (!stages[s].nir)
+         continue;
+
+      if (next != MESA_SHADER_NONE) {
+         if (!stages[s].key.optimisations_disabled && !stages[next].key.optimisations_disabled)
+            radv_graphics_shaders_link_varyings_second(&stages[s], &stages[next]);
+      }
+
+      next = s;
+   }
+}
+
 struct radv_ps_epilog_key
 radv_generate_ps_epilog_key(const struct radv_device *device, const struct radv_ps_epilog_state *state)
 {
@@ -2518,6 +2655,9 @@ radv_graphics_shaders_compile(struct radv_device *device, struct vk_pipeline_cac
         radv_nir_remap_color_attachment(stages[MESA_SHADER_FRAGMENT].nir, gfx_state);
   }

+   /* Optimize varyings on lowered shader I/O (more efficient than optimizing I/O derefs). */
+   radv_graphics_shaders_link_varyings(stages);
+
   radv_fill_shader_info(device, RADV_PIPELINE_GRAPHICS, gfx_state, stages, active_nir_stages);

   radv_declare_pipeline_args(device, stages, gfx_state, active_nir_stages);
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -64,6 +64,8 @@ get_nir_options_for_stage(struct radv_physical_device *pdev, gl_shader_stage sta
   options->max_unroll_iterations_aggressive = 128;
   options->lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv;
   options->io_options |= nir_io_mediump_is_32bit;
+   options->varying_estimate_instr_cost = ac_nir_varying_estimate_instr_cost;
+   options->varying_expression_max_cost = ac_nir_varying_expression_max_cost;
 }

 void