radv: Add new linking step and use nir_opt_varyings.

The nir_opt_varyings pass is the new NIR solution for shader
linking, with new features including better I/O compaction,
packing 16-bit I/O, inter-stage code motion and more.

Fossil DB stats on Rembrandt:

Totals from 34585 (43.56% of 79395) affected shaders:
MaxWaves: 873362 -> 873260 (-0.01%); split: +0.11%, -0.12%
Instrs: 21543639 -> 21526956 (-0.08%); split: -0.27%, +0.19%
CodeSize: 115077568 -> 115015536 (-0.05%); split: -0.25%, +0.20%
VGPRs: 1465152 -> 1464192 (-0.07%); split: -0.29%, +0.22%
Inputs: 161776 -> 158711 (-1.89%); split: -1.90%, +0.00%
Outputs: 46532551993 -> 46532548680 (-0.00%); split: -0.00%, +0.00%
LDS: 70597120 -> 70794752 (+0.28%); split: -0.04%, +0.32%
Latency: 162963576 -> 162785055 (-0.11%); split: -0.25%, +0.14%
InvThroughput: 37356298 -> 37261700 (-0.25%); split: -0.37%, +0.12%
VClause: 427827 -> 427105 (-0.17%); split: -0.35%, +0.18%
SClause: 669989 -> 668623 (-0.20%); split: -0.36%, +0.15%
Copies: 1582166 -> 1582592 (+0.03%); split: -0.36%, +0.39%
Branches: 523203 -> 523789 (+0.11%); split: -0.04%, +0.15%
PreSGPRs: 1272992 -> 1273228 (+0.02%); split: -0.05%, +0.07%
PreVGPRs: 1164295 -> 1161623 (-0.23%); split: -0.43%, +0.20%
VALU: 13733432 -> 13714109 (-0.14%); split: -0.35%, +0.21%
SALU: 2828974 -> 2831841 (+0.10%); split: -0.12%, +0.22%
VMEM: 748396 -> 748500 (+0.01%); split: -0.16%, +0.18%
SMEM: 1263487 -> 1263329 (-0.01%); split: -0.03%, +0.02%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28676>
This commit is contained in:
Timur Kristóf
2024-04-17 00:19:02 +02:00
committed by Marge Bot
parent fcb2c62b63
commit 17f6ab28cc
2 changed files with 142 additions and 0 deletions

View File

@@ -12,6 +12,7 @@
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "nir/nir_serialize.h"
#include "nir/nir_xfb_info.h"
#include "nir/radv_nir.h"
#include "spirv/nir_spirv.h"
#include "util/disk_cache.h"
@@ -22,6 +23,7 @@
#include "radv_debug.h"
#include "radv_entrypoints.h"
#include "radv_formats.h"
#include "radv_physical_device.h"
#include "radv_pipeline_cache.h"
#include "radv_rmv.h"
#include "radv_shader.h"
@@ -1519,6 +1521,141 @@ radv_graphics_shaders_link(const struct radv_device *device, const struct radv_g
}
}
/**
* Fist pass of varying optimization.
* This function is called for each shader pair from first to last.
*
* 1. Run some NIR passes in preparation.
* 2. Optimize varyings.
* 3. If either shader changed, run algebraic optimizations.
*/
static void
radv_graphics_shaders_link_varyings_first(struct radv_shader_stage *producer_stage,
struct radv_shader_stage *consumer_stage)
{
nir_shader *producer = producer_stage->nir;
nir_shader *consumer = consumer_stage->nir;
/* It is expected by nir_opt_varyings that no undefined stores are present in the shader. */
NIR_PASS(_, producer, nir_opt_undef);
/* Update load/store alignments because inter-stage code motion may move instructions used to deduce this info. */
NIR_PASS(_, consumer, nir_opt_load_store_update_alignments);
/* Scalarize all I/O, because nir_opt_varyings and nir_opt_vectorize_io expect all I/O to be scalarized. */
NIR_PASS(_, producer, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
NIR_PASS(_, consumer, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
/* Eliminate useless vec->mov copies resulting from scalarization. */
NIR_PASS(_, producer, nir_copy_prop);
const nir_opt_varyings_progress p = nir_opt_varyings(producer, consumer, true, 0, 0);
/* Run algebraic optimizations on shaders that changed. */
if (p & nir_progress_producer) {
radv_optimize_nir_algebraic(producer, false, false);
}
if (p & nir_progress_consumer) {
radv_optimize_nir_algebraic(consumer, false, false);
}
}
/**
* Second pass of varying optimization.
* This function is called for each shader pair from last to fist,
* after the first pass had already been called for each pair.
* Done because the previous pass might have enabled additional
* opportunities for optimization.
*
* 1. Optimize varyings again.
* 2. If either shader changed, run algebraic optimizations.
* 3. Run some NIR passes to clean up the shaders.
*/
static void
radv_graphics_shaders_link_varyings_second(struct radv_shader_stage *producer_stage,
struct radv_shader_stage *consumer_stage)
{
nir_shader *producer = producer_stage->nir;
nir_shader *consumer = consumer_stage->nir;
const nir_opt_varyings_progress p = nir_opt_varyings(producer, consumer, true, 0, 0);
/* Run algebraic optimizations on shaders that changed. */
if (p & nir_progress_producer) {
radv_optimize_nir_algebraic(producer, true, false);
}
if (p & nir_progress_consumer) {
radv_optimize_nir_algebraic(consumer, true, false);
}
/* Re-vectorize I/O for stages that output to memory (LDS or VRAM).
* Don't vectorize FS inputs, doing so just regresses shader stats without any benefit.
* There is also no benefit from re-vectorizing the outputs of the last pre-rasterization
* stage here, because ac_nir_lower_ngg/legacy already takes care of that.
*/
if (consumer->info.stage != MESA_SHADER_FRAGMENT) {
NIR_PASS(_, producer, nir_opt_vectorize_io, nir_var_shader_out);
NIR_PASS(_, consumer, nir_opt_vectorize_io, nir_var_shader_in);
}
/* Recompute driver locations of PS inputs
* because the backend compiler relies on their driver locations.
*/
if (consumer->info.stage == MESA_SHADER_FRAGMENT)
nir_recompute_io_bases(consumer, nir_var_shader_in);
/* Gather shader info; at least the I/O info likely changed
* and changes to only the I/O info are not reflected in nir_opt_varyings_progress.
*/
nir_shader_gather_info(producer, nir_shader_get_entrypoint(producer));
nir_shader_gather_info(consumer, nir_shader_get_entrypoint(consumer));
/* Recreate XFB info from intrinsics (nir_opt_varyings may have changed it). */
if (producer->xfb_info) {
nir_gather_xfb_info_from_intrinsics(producer);
}
}
/**
* Varying optimizations performed on lowered shader I/O.
*
* We do this after lowering shader I/O because this is more effective
* than running the same optimizations on I/O derefs.
*/
static void
radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages)
{
/* Optimize varyings from first to last stage. */
gl_shader_stage prev = MESA_SHADER_NONE;
for (int i = 0; i < ARRAY_SIZE(graphics_shader_order); ++i) {
gl_shader_stage s = graphics_shader_order[i];
if (!stages[s].nir)
continue;
if (prev != MESA_SHADER_NONE) {
if (!stages[prev].key.optimisations_disabled && !stages[s].key.optimisations_disabled)
radv_graphics_shaders_link_varyings_first(&stages[prev], &stages[s]);
}
prev = s;
}
/* Optimize varyings from last to first stage. */
gl_shader_stage next = MESA_SHADER_NONE;
for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; --i) {
gl_shader_stage s = graphics_shader_order[i];
if (!stages[s].nir)
continue;
if (next != MESA_SHADER_NONE) {
if (!stages[s].key.optimisations_disabled && !stages[next].key.optimisations_disabled)
radv_graphics_shaders_link_varyings_second(&stages[s], &stages[next]);
}
next = s;
}
}
struct radv_ps_epilog_key
radv_generate_ps_epilog_key(const struct radv_device *device, const struct radv_ps_epilog_state *state)
{
@@ -2518,6 +2655,9 @@ radv_graphics_shaders_compile(struct radv_device *device, struct vk_pipeline_cac
radv_nir_remap_color_attachment(stages[MESA_SHADER_FRAGMENT].nir, gfx_state);
}
/* Optimize varyings on lowered shader I/O (more efficient than optimizing I/O derefs). */
radv_graphics_shaders_link_varyings(stages);
radv_fill_shader_info(device, RADV_PIPELINE_GRAPHICS, gfx_state, stages, active_nir_stages);
radv_declare_pipeline_args(device, stages, gfx_state, active_nir_stages);

View File

@@ -64,6 +64,8 @@ get_nir_options_for_stage(struct radv_physical_device *pdev, gl_shader_stage sta
options->max_unroll_iterations_aggressive = 128;
options->lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv;
options->io_options |= nir_io_mediump_is_32bit;
options->varying_estimate_instr_cost = ac_nir_varying_estimate_instr_cost;
options->varying_expression_max_cost = ac_nir_varying_expression_max_cost;
}
void