From 1becc6953cbd656f1e6172fc425c37fadb3cf41f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 22 Apr 2024 03:39:26 -0400 Subject: [PATCH] ac/nir: import the MSAA resolving pixel shader from radeonsi It has a lot of options for efficiency. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_nir.c | 48 ++++ src/amd/common/ac_nir_helpers.h | 8 + src/amd/common/ac_nir_meta.h | 41 +++ src/amd/common/ac_nir_meta_ps_resolve.c | 166 ++++++++++++ src/amd/common/meson.build | 2 + src/gallium/drivers/radeonsi/si_blit.c | 58 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 17 +- .../drivers/radeonsi/si_shaderlib_nir.c | 251 ++---------------- 8 files changed, 326 insertions(+), 265 deletions(-) create mode 100644 src/amd/common/ac_nir_meta.h create mode 100644 src/amd/common/ac_nir_meta_ps_resolve.c diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index 8e5603029e8..aa17c2f8224 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "ac_gpu_info.h" #include "ac_nir.h" #include "ac_nir_helpers.h" #include "sid.h" @@ -1509,3 +1510,50 @@ ac_nir_opt_pack_half(nir_shader *shader, enum amd_gfx_level gfx_level) } return progress; } + +nir_def * +ac_average_samples(nir_builder *b, nir_def **samples, unsigned num_samples) +{ + /* This works like add-reduce by computing the sum of each pair independently, and then + * computing the sum of each pair of sums, and so on, to get better instruction-level + * parallelism. + */ + if (num_samples == 16) { + for (unsigned i = 0; i < 8; i++) + samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); + } + if (num_samples >= 8) { + for (unsigned i = 0; i < 4; i++) + samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); + } + if (num_samples >= 4) { + for (unsigned i = 0; i < 2; i++) + samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); + } + if (num_samples >= 2) + samples[0] = nir_fadd(b, samples[0], samples[1]); + + return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */ +} + +void +ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder *b, + nir_def **array, unsigned num_elements, + unsigned num_components) +{ + /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its + * instruction scheduling options. + * + * VMEM clauses are supported since GFX10. It's not recommended to use the optimization + * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization + * barriers would unnecessarily increase VGPR usage for MSAA resources. 
+ */ + if (!b->shader->info.use_aco_amd && info->gfx_level >= GFX10) { + for (unsigned i = 0; i < num_elements; i++) { + unsigned prev_num = array[i]->num_components; + array[i] = nir_trim_vector(b, array[i], num_components); + array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]); + array[i] = nir_pad_vector(b, array[i], prev_num); + } + } +} diff --git a/src/amd/common/ac_nir_helpers.h b/src/amd/common/ac_nir_helpers.h index e7e1537b822..53ce943e09a 100644 --- a/src/amd/common/ac_nir_helpers.h +++ b/src/amd/common/ac_nir_helpers.h @@ -136,6 +136,14 @@ ac_nir_cull_primitive(nir_builder *b, void ac_nir_sleep(nir_builder *b, unsigned num_cycles); +nir_def * +ac_average_samples(nir_builder *b, nir_def **samples, unsigned num_samples); + +void +ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder *b, + nir_def **array, unsigned num_elements, + unsigned num_components); + #ifdef __cplusplus } #endif diff --git a/src/amd/common/ac_nir_meta.h b/src/amd/common/ac_nir_meta.h new file mode 100644 index 00000000000..be231f24754 --- /dev/null +++ b/src/amd/common/ac_nir_meta.h @@ -0,0 +1,41 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef AC_NIR_META_H +#define AC_NIR_META_H + +#include "ac_gpu_info.h" +#include "nir.h" + +union ac_ps_resolve_key { + struct { + bool use_aco:1; + bool src_is_array:1; + uint8_t log_samples:2; + uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ + uint8_t last_dst_channel:2; + bool x_clamp_to_edge:1; + bool y_clamp_to_edge:1; + bool a16:1; + bool d16:1; + }; + uint64_t key; /* use with hash_table_u64 */ +}; + +/* Only immutable settings. */ +struct ac_ps_resolve_options { + const nir_shader_compiler_options *nir_options; + const struct radeon_info *info; + bool use_aco; /* global driver setting */ + bool no_fmask; /* FMASK disabled by a debug option, ignored on GFX11+ */ + bool print_key; /* print ac_ps_resolve_key into stderr */ +}; + +nir_shader * +ac_create_resolve_ps(const struct ac_ps_resolve_options *options, + const union ac_ps_resolve_key *key); + +#endif diff --git a/src/amd/common/ac_nir_meta_ps_resolve.c b/src/amd/common/ac_nir_meta_ps_resolve.c new file mode 100644 index 00000000000..bccb3bf50fb --- /dev/null +++ b/src/amd/common/ac_nir_meta_ps_resolve.c @@ -0,0 +1,166 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include "ac_nir_meta.h" +#include "ac_nir_helpers.h" +#include "nir_builder.h" +#include "compiler/aco_interface.h" + +static nir_def * +build_tex_load_ms(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_deref_instr *tex_deref, nir_def *coord, nir_def *sample_index) +{ + nir_tex_src srcs[] = { + nir_tex_src_for_ssa(nir_tex_src_coord, coord), + nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_index), + }; + nir_def *result = nir_build_tex_deref_instr(b, nir_texop_txf_ms, tex_deref, tex_deref, + ARRAY_SIZE(srcs), srcs); + + nir_tex_instr *tex = nir_instr_as_tex(result->parent_instr); + + assert(bit_size == 32 || bit_size == 16); + if (bit_size == 16) { + tex->dest_type = nir_type_float16; + tex->def.bit_size = 16; + } + return nir_trim_vector(b, result, num_components); +} + +nir_shader * +ac_create_resolve_ps(const struct ac_ps_resolve_options *options, + const union ac_ps_resolve_key *key) +{ + if (options->print_key) { + fprintf(stderr, "Internal shader: resolve_ps\n"); + fprintf(stderr, " key.use_aco = %u\n", key->use_aco); + fprintf(stderr, " key.src_is_array = %u\n", key->src_is_array); + fprintf(stderr, " key.log_samples = %u\n", key->log_samples); + fprintf(stderr, " key.last_src_channel = %u\n", key->last_src_channel); + fprintf(stderr, " key.x_clamp_to_edge = %u\n", key->x_clamp_to_edge); + fprintf(stderr, " key.y_clamp_to_edge = %u\n", key->y_clamp_to_edge); + fprintf(stderr, " key.d16 = %u\n", key->d16); + fprintf(stderr, " key.a16 = %u\n", key->a16); + fprintf(stderr, "\n"); + } + + nir_builder b = + nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options->nir_options, "ac_resolve_ps"); + b.shader->info.use_aco_amd = options->use_aco || + (key->use_aco && aco_is_gpu_supported(options->info)); + BITSET_SET(b.shader->info.textures_used, 1); + + const struct glsl_type *sampler_type = + glsl_sampler_type(GLSL_SAMPLER_DIM_MS, /*shadow*/ false, /*is_array*/ key->src_is_array, + GLSL_TYPE_FLOAT); + nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, sampler_type, "samp0"); + sampler->data.binding = 0; + + nir_deref_instr *deref = nir_build_deref_var(&b, sampler); + nir_def *zero = nir_imm_int(&b, 0); + nir_def *baryc = nir_load_barycentric_pixel(&b, 32, .interp_mode = INTERP_MODE_SMOOTH); + nir_def *coord = nir_load_interpolated_input(&b, 2 + key->src_is_array, 32, baryc, zero, + .dest_type = nir_type_float32, + .io_semantics = (nir_io_semantics){ + .location = VARYING_SLOT_VAR0, + .num_slots = 1}); + + /* Nearest filtering floors and then converts to integer, and then + * applies clamp to edge as clamp(coord, 0, dim - 1). + */ + coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 0)), 0); + coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 1)), 1); + coord = nir_f2iN(&b, coord, key->a16 ? 16 : 32); + + /* Clamp to edge only for X and Y because Z can't be out of bounds. */ + nir_def *resinfo = NULL; + for (unsigned chan = 0; chan < 2; chan++) { + if (chan ? 
key->y_clamp_to_edge : key->x_clamp_to_edge) { + if (!resinfo) { + resinfo = nir_build_tex_deref_instr(&b, nir_texop_txs, deref, deref, 0, NULL); + + if (key->a16) { + resinfo = nir_umin_imm(&b, resinfo, INT16_MAX); + resinfo = nir_i2i16(&b, resinfo); + } + } + + nir_def *tmp = nir_channel(&b, coord, chan); + tmp = nir_imax_imm(&b, tmp, 0); + tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, resinfo, chan), -1)); + coord = nir_vector_insert_imm(&b, coord, tmp, chan); + } + } + + /* Use samples_identical if it's supported. */ + bool uses_samples_identical = options->info->gfx_level < GFX11 && !options->no_fmask; + nir_def *sample0 = NULL; + nir_if *if_identical = NULL; + + assert(key->last_src_channel <= key->last_dst_channel); + + if (uses_samples_identical) { + nir_tex_src iden_srcs[] = { + nir_tex_src_for_ssa(nir_tex_src_coord, coord), + }; + nir_def *samples_identical = + nir_build_tex_deref_instr(&b, nir_texop_samples_identical, deref, deref, + ARRAY_SIZE(iden_srcs), iden_srcs); + + /* If all samples are identical, load only sample 0. */ + if_identical = nir_push_if(&b, samples_identical); + { + sample0 = build_tex_load_ms(&b, key->last_src_channel + 1, key->d16 ? 16 : 32, + deref, coord, nir_imm_intN_t(&b, 0, coord->bit_size)); + } + nir_push_else(&b, if_identical); + } + + /* Insert the sample index into the coordinates. */ + unsigned num_src_coords = 2 + key->src_is_array + 1; + unsigned num_samples = 1 << key->log_samples; + nir_def *coord_src[16] = {0}; + + for (unsigned i = 0; i < num_samples; i++) { + coord_src[i] = nir_pad_vector(&b, coord, num_src_coords); + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], + nir_imm_intN_t(&b, i, coord->bit_size), + num_src_coords - 1); + } + + /* We need this because LLVM interleaves coordinate computations with image loads, which breaks + * VMEM clauses. + */ + ac_optimization_barrier_vgpr_array(options->info, &b, coord_src, num_samples, num_src_coords); + + nir_def *samples[16] = {0}; + for (unsigned i = 0; i < num_samples; i++) { + samples[i] = build_tex_load_ms(&b, key->last_src_channel + 1, key->d16 ? 16 : 32, + deref, nir_trim_vector(&b, coord_src[i], num_src_coords - 1), + nir_channel(&b, coord_src[i], num_src_coords - 1)); + } + nir_def *result = ac_average_samples(&b, samples, num_samples); + + if (uses_samples_identical) { + nir_pop_if(&b, if_identical); + result = nir_if_phi(&b, sample0, result); + } + + result = nir_pad_vector(&b, result, key->last_dst_channel + 1); + for (unsigned i = key->last_src_channel + 1; i <= key->last_dst_channel; i++) { + result = nir_vector_insert_imm(&b, result, + nir_imm_floatN_t(&b, i == 3 ? 1 : 0, result->bit_size), i); + } + + nir_store_output(&b, result, zero, + .write_mask = BITFIELD_MASK(key->last_dst_channel + 1), + .src_type = key->d16 ? 
nir_type_float16 : nir_type_float32, + .io_semantics = (nir_io_semantics){ + .location = FRAG_RESULT_DATA0, + .num_slots = 1}); + + return b.shader; +} diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 72a5660b840..96ed5b3d24a 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -113,6 +113,8 @@ amd_common_files = files( 'ac_nir_lower_tex.c', 'ac_nir_lower_ngg.c', 'ac_nir_lower_ps.c', + 'ac_nir_meta.h', + 'ac_nir_meta_ps_resolve.c', 'amd_family.c', 'ac_parse_ib.c', 'ac_perfcounter.c', diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 5039bc05d73..b19337750a6 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -10,6 +10,7 @@ #include "util/u_log.h" #include "util/u_surface.h" #include "util/hash_table.h" +#include "ac_nir_meta.h" enum { @@ -1303,29 +1304,29 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) /* No scaling */ (info->dst.box.width == abs(info->src.box.width) && info->dst.box.height == abs(info->src.box.height)))) { - union si_resolve_ps_key options; - options.key = 0; + union ac_ps_resolve_key key; + key.key = 0; /* LLVM is slower on GFX10.3 and older because it doesn't form VMEM clauses and it's more * difficult to force them with optimization barriers when FMASK is used. */ - options.use_aco = true; - options.src_is_array = info->src.resource->target == PIPE_TEXTURE_1D_ARRAY || - info->src.resource->target == PIPE_TEXTURE_2D_ARRAY || - info->src.resource->target == PIPE_TEXTURE_CUBE || - info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY; - options.log_samples = util_logbase2(info->src.resource->nr_samples); - options.last_dst_channel = util_format_get_last_component(info->dst.format); - options.last_src_channel = util_format_get_last_component(info->src.format); - options.last_src_channel = MIN2(options.last_src_channel, options.last_dst_channel); - options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); - options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); - options.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) && - util_is_box_sint16(&info->src.box); + key.use_aco = true; + key.src_is_array = info->src.resource->target == PIPE_TEXTURE_1D_ARRAY || + info->src.resource->target == PIPE_TEXTURE_2D_ARRAY || + info->src.resource->target == PIPE_TEXTURE_CUBE || + info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY; + key.log_samples = util_logbase2(info->src.resource->nr_samples); + key.last_dst_channel = util_format_get_last_component(info->dst.format); + key.last_src_channel = util_format_get_last_component(info->src.format); + key.last_src_channel = MIN2(key.last_src_channel, key.last_dst_channel); + key.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); + key.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); + key.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) && + util_is_box_sint16(&info->src.box); unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format); unsigned max_src_chan_size = util_format_get_max_channel_size(info->src.format); - if (options.use_aco && util_format_is_float(info->dst.format) && max_dst_chan_size == 32) { + if (key.use_aco && util_format_is_float(info->dst.format) && max_dst_chan_size == 32) { /* TODO: ACO doesn't meet precision expectations of this test when the destination format * is R32G32B32A32_FLOAT, the source 
format is R8G8B8A8_UNORM, and the resolving math uses * FP16. It's theoretically arguable whether FP16 is legal in this case. LLVM passes @@ -1333,19 +1334,28 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) * * piglit/bin/copyteximage CUBE -samples=2 -auto */ - options.d16 = 0; + key.d16 = 0; } else { /* Resolving has precision issues all the way down to R11G11B10_FLOAT. */ - options.d16 = ((!options.use_aco && !sctx->screen->use_aco && sctx->gfx_level >= GFX8) || - /* ACO doesn't support D16 on GFX8 */ - ((options.use_aco || sctx->screen->use_aco) && sctx->gfx_level >= GFX9)) && - MIN2(max_dst_chan_size, max_src_chan_size) <= 10; + key.d16 = ((!key.use_aco && !sctx->screen->use_aco && sctx->gfx_level >= GFX8) || + /* ACO doesn't support D16 on GFX8 */ + ((key.use_aco || sctx->screen->use_aco) && sctx->gfx_level >= GFX9)) && + MIN2(max_dst_chan_size, max_src_chan_size) <= 10; } - fs = _mesa_hash_table_u64_search(sctx->ps_resolve_shaders, options.key); + fs = _mesa_hash_table_u64_search(sctx->ps_resolve_shaders, key.key); if (!fs) { - fs = si_create_resolve_ps(sctx, &options); - _mesa_hash_table_u64_insert(sctx->ps_resolve_shaders, options.key, fs); + struct ac_ps_resolve_options options = { + .nir_options = sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, + PIPE_SHADER_FRAGMENT), + .info = &sctx->screen->info, + .use_aco = sctx->screen->use_aco, + .no_fmask = sctx->screen->debug_flags & DBG(NO_FMASK), + .print_key = si_can_dump_shader(sctx->screen, MESA_SHADER_FRAGMENT, SI_DUMP_SHADER_KEY), + }; + + fs = si_create_shader_state(sctx, ac_create_resolve_ps(&options, &key)); + _mesa_hash_table_u64_insert(sctx->ps_resolve_shaders, key.key, fs); } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 742763d0b5c..9ff7cedc006 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1634,6 +1634,7 @@ void si_suspend_queries(struct si_context *sctx); void si_resume_queries(struct si_context *sctx); /* si_shaderlib_nir.c */ +void *si_create_shader_state(struct si_context *sctx, struct nir_shader *nir); void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf); void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex); void *si_create_passthrough_tcs(struct si_context *sctx); @@ -1680,23 +1681,7 @@ union si_compute_blit_shader_key { uint64_t key; }; -union si_resolve_ps_key { - struct { - bool use_aco:1; - bool src_is_array:1; - uint8_t log_samples:2; - uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ - uint8_t last_dst_channel:2; - bool x_clamp_to_edge:1; - bool y_clamp_to_edge:1; - bool a16:1; - bool d16:1; - }; - uint64_t key; -}; - void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options); -void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options); void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 1cdfdd0f23c..820bf569f8e 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -11,8 +11,9 @@ #include "si_query.h" #include "aco_interface.h" #include "nir_format_convert.h" +#include 
"ac_nir_helpers.h" -static void *create_shader_state(struct si_context *sctx, nir_shader *nir) +void *si_create_shader_state(struct si_context *sctx, nir_shader *nir) { sctx->b.screen->finalize_nir(sctx->b.screen, (void*)nir); return pipe_shader_from_nir(&sctx->b, nir); @@ -106,7 +107,7 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf) zero, zero, zero); /* z, sample, pipe_xor */ nir_store_ssbo(&b, value, zero, dst_offset, .write_mask=0x1, .align_mul=1); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex) @@ -150,7 +151,7 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture * */ nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ @@ -183,7 +184,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx) nir_store_ssbo(&b, data, zero, address, .align_mul = 4); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* This is used when TCS is NULL in the VS->TCS->TES chain. In this case, @@ -202,7 +203,7 @@ void *si_create_passthrough_tcs(struct si_context *sctx) nir_shader *tcs = nir_create_passthrough_tcs_impl(sctx->screen->nir_options, locations, info->num_outputs, sctx->patch_vertices); - return create_shader_state(sctx, tcs); + return si_create_shader_state(sctx, tcs); } static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input) @@ -218,30 +219,6 @@ static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input) return input; } -static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_samples) -{ - /* This works like add-reduce by computing the sum of each pair independently, and then - * computing the sum of each pair of sums, and so on, to get better instruction-level - * parallelism. - */ - if (num_samples == 16) { - for (unsigned i = 0; i < 8; i++) - samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); - } - if (num_samples >= 8) { - for (unsigned i = 0; i < 4; i++) - samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); - } - if (num_samples >= 4) { - for (unsigned i = 0; i < 2; i++) - samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); - } - if (num_samples >= 2) - samples[0] = nir_fadd(b, samples[0], samples[1]); - - return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */ -} - static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, const union si_compute_blit_shader_key *options) { @@ -290,27 +267,6 @@ static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, return color; } -static void optimization_barrier_vgpr_array(struct si_context *sctx, nir_builder *b, - nir_def **array, unsigned num_elements, - unsigned num_components) -{ - /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its - * instruction scheduling options. - * - * VMEM clauses are supported since GFX10. It's not recommended to use the optimization - * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization - * barriers would unnecessarily increase VGPR usage for MSAA resources. 
- */ - if (!b->shader->info.use_aco_amd && sctx->gfx_level >= GFX10) { - for (unsigned i = 0; i < num_elements; i++) { - unsigned prev_num = array[i]->num_components; - array[i] = nir_trim_vector(b, array[i], num_components); - array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]); - array[i] = nir_pad_vector(b, array[i], prev_num); - } - } -} - /* The compute blit shader. * * Implementation details: @@ -541,8 +497,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha /* We don't want the computation of src coordinates to be interleaved with loads. */ if (lane_size > 1 || src_samples > 1) { - optimization_barrier_vgpr_array(sctx, &b, coord_src, lane_size * src_samples, - num_src_coords); + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, coord_src, + lane_size * src_samples, num_src_coords); } /* Use "samples_identical" for MSAA resolving if it's supported. */ @@ -588,14 +544,14 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha /* Resolve MSAA if necessary. */ if (is_resolve) { /* We don't want the averaging of samples to be interleaved with image loads. */ - optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples, - options->last_src_channel + 1); + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, color, lane_size * src_samples, + options->last_src_channel + 1); /* This reduces the "color" array from "src_samples * lane_size" elements to only * "lane_size" elements. */ foreach_pixel_in_lane(1, sample, x, y, z, i) { - color[i] = average_samples(&b, &color[i * src_samples], src_samples); + color[i] = ac_average_samples(&b, &color[i * src_samples], src_samples); } src_samples = 1; } @@ -634,13 +590,15 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha } /* We don't want the computation of dst coordinates to be interleaved with stores. */ - if (lane_size > 1 || dst_samples > 1) - optimization_barrier_vgpr_array(sctx, &b, coord_dst, lane_size * dst_samples, num_dst_coords); + if (lane_size > 1 || dst_samples > 1) { + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, coord_dst, lane_size * dst_samples, + num_dst_coords); + } /* We don't want the application of blit output modifiers to be interleaved with stores. */ if (!options->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) { - optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples, - options->last_dst_channel + 1); + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, color, lane_size * src_samples, + options->last_dst_channel + 1); } /* Store the pixels, one per sample. */ @@ -655,7 +613,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha if (options->has_start_xyz) nir_pop_if(&b, if_positive); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Store the clear color at the beginning of every 256B block. 
This is required when we clear DCC @@ -694,7 +652,7 @@ void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, un nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0), clear_color, nir_imm_int(&b, 0)); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) @@ -714,7 +672,7 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0), store_address, .access = ACCESS_RESTRICT); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ @@ -745,7 +703,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_ nir_store_ssbo(&b, value, nir_imm_int(&b, !is_clear), offset, .access = ACCESS_RESTRICT); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Load samples from the image, and copy them to the same image. This looks like @@ -764,7 +722,7 @@ void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, b /* Return an empty compute shader */ if (num_samples == 0) - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); b.shader->info.num_images = 1; @@ -803,7 +761,7 @@ void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, b .image_array = is_array); } - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* This is just a pass-through shader with 1-3 MOV instructions. */ @@ -873,7 +831,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, SYSTEM_VALUE_INSTANCE_ID, glsl_int_type())); } - *vs = create_shader_state(sctx, b.shader); + *vs = si_create_shader_state(sctx, b.shader); return *vs; } @@ -1251,7 +1209,7 @@ void *si_create_query_result_cs(struct si_context *sctx) } nir_pop_if(&b, if_acc_chaining); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Create the compute shader that is used to collect the results of gfx10+ @@ -1499,162 +1457,5 @@ void *gfx11_create_sh_query_result_cs(struct si_context *sctx) } nir_pop_if(&b, if_write_summary_buffer); - return create_shader_state(sctx, b.shader); -} - -static nir_def *build_tex_load_ms(nir_builder *b, unsigned num_components, unsigned bit_size, - nir_deref_instr *tex_deref, nir_def *coord, nir_def *sample_index) -{ - nir_tex_src srcs[] = { - nir_tex_src_for_ssa(nir_tex_src_coord, coord), - nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_index), - }; - nir_def *result = nir_build_tex_deref_instr(b, nir_texop_txf_ms, tex_deref, tex_deref, - ARRAY_SIZE(srcs), srcs); - - nir_tex_instr *tex = nir_instr_as_tex(result->parent_instr); - - assert(bit_size == 32 || bit_size == 16); - if (bit_size == 16) { - tex->dest_type = nir_type_float16; - tex->def.bit_size = 16; - } - return nir_trim_vector(b, result, num_components); -} - -void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options) -{ - if (si_can_dump_shader(sctx->screen, MESA_SHADER_FRAGMENT, SI_DUMP_SHADER_KEY)) { - fprintf(stderr, "Internal shader: resolve_ps\n"); - fprintf(stderr, " options.use_aco = %u\n", options->use_aco); - fprintf(stderr, " options.src_is_array = %u\n", options->src_is_array); - fprintf(stderr, " 
options.log_samples = %u\n", options->log_samples); - fprintf(stderr, " options.last_src_channel = %u\n", options->last_src_channel); - fprintf(stderr, " options.x_clamp_to_edge = %u\n", options->x_clamp_to_edge); - fprintf(stderr, " options.y_clamp_to_edge = %u\n", options->y_clamp_to_edge); - fprintf(stderr, " options.d16 = %u\n", options->d16); - fprintf(stderr, " options.a16 = %u\n", options->a16); - fprintf(stderr, "\n"); - } - - const nir_shader_compiler_options *nir_options = - sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_FRAGMENT); - nir_builder b = - nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options, "si_resolve_ps"); - - b.shader->info.use_aco_amd = sctx->screen->use_aco || - (options->use_aco && aco_is_gpu_supported(&sctx->screen->info)); - BITSET_SET(b.shader->info.textures_used, 1); - - const struct glsl_type *sampler_type = - glsl_sampler_type(GLSL_SAMPLER_DIM_MS, /*shadow*/ false, /*is_array*/ options->src_is_array, - GLSL_TYPE_FLOAT); - nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, sampler_type, "samp0"); - sampler->data.binding = 0; - - nir_deref_instr *deref = nir_build_deref_var(&b, sampler); - nir_def *zero = nir_imm_int(&b, 0); - nir_def *baryc = nir_load_barycentric_pixel(&b, 32, .interp_mode = INTERP_MODE_SMOOTH); - nir_def *coord = nir_load_interpolated_input(&b, 2 + options->src_is_array, 32, baryc, zero, - .dest_type = nir_type_float32, - .io_semantics = (nir_io_semantics){ - .location = VARYING_SLOT_VAR0, - .num_slots = 1}); - - /* Nearest filtering floors and then converts to integer, and then - * applies clamp to edge as clamp(coord, 0, dim - 1). - */ - coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 0)), 0); - coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 1)), 1); - coord = nir_f2iN(&b, coord, options->a16 ? 16 : 32); - - /* Clamp to edge only for X and Y because Z can't be out of bounds. */ - nir_def *resinfo = NULL; - for (unsigned chan = 0; chan < 2; chan++) { - if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) { - if (!resinfo) { - resinfo = nir_build_tex_deref_instr(&b, nir_texop_txs, deref, deref, 0, NULL); - - if (options->a16) { - resinfo = nir_umin_imm(&b, resinfo, INT16_MAX); - resinfo = nir_i2i16(&b, resinfo); - } - } - - nir_def *tmp = nir_channel(&b, coord, chan); - tmp = nir_imax_imm(&b, tmp, 0); - tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, resinfo, chan), -1)); - coord = nir_vector_insert_imm(&b, coord, tmp, chan); - } - } - - /* Use samples_identical if it's supported. */ - bool uses_samples_identical = sctx->gfx_level < GFX11 && - !(sctx->screen->debug_flags & DBG(NO_FMASK)); - nir_def *sample0 = NULL; - nir_if *if_identical = NULL; - - assert(options->last_src_channel <= options->last_dst_channel); - - if (uses_samples_identical) { - nir_tex_src iden_srcs[] = { - nir_tex_src_for_ssa(nir_tex_src_coord, coord), - }; - nir_def *samples_identical = - nir_build_tex_deref_instr(&b, nir_texop_samples_identical, deref, deref, - ARRAY_SIZE(iden_srcs), iden_srcs); - - /* If all samples are identical, load only sample 0. */ - if_identical = nir_push_if(&b, samples_identical); - { - sample0 = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32, - deref, coord, nir_imm_intN_t(&b, 0, coord->bit_size)); - } - nir_push_else(&b, if_identical); - } - - /* Insert the sample index into the coordinates. 
*/ - unsigned num_src_coords = 2 + options->src_is_array + 1; - unsigned num_samples = 1 << options->log_samples; - nir_def *coord_src[16] = {0}; - - for (unsigned i = 0; i < num_samples; i++) { - coord_src[i] = nir_pad_vector(&b, coord, num_src_coords); - coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], - nir_imm_intN_t(&b, i, coord->bit_size), - num_src_coords - 1); - } - - /* We need this because LLVM interleaves coordinate computations with image loads, which breaks - * VMEM clauses. - */ - optimization_barrier_vgpr_array(sctx, &b, coord_src, num_samples, num_src_coords); - - nir_def *samples[16] = {0}; - for (unsigned i = 0; i < num_samples; i++) { - samples[i] = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32, - deref, nir_trim_vector(&b, coord_src[i], num_src_coords - 1), - nir_channel(&b, coord_src[i], num_src_coords - 1)); - } - nir_def *result = average_samples(&b, samples, num_samples); - - if (uses_samples_identical) { - nir_pop_if(&b, if_identical); - result = nir_if_phi(&b, sample0, result); - } - - result = nir_pad_vector(&b, result, options->last_dst_channel + 1); - for (unsigned i = options->last_src_channel + 1; i <= options->last_dst_channel; i++) { - result = nir_vector_insert_imm(&b, result, - nir_imm_floatN_t(&b, i == 3 ? 1 : 0, result->bit_size), i); - } - - nir_store_output(&b, result, zero, - .write_mask = BITFIELD_MASK(options->last_dst_channel + 1), - .src_type = options->d16 ? nir_type_float16 : nir_type_float32, - .io_semantics = (nir_io_semantics){ - .location = FRAG_RESULT_DATA0, - .num_slots = 1}); - - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); }
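
The driver-facing flow, as the si_blit.c hunk above shows, is: fill a per-variant ac_ps_resolve_key, describe the immutable compiler and GPU settings once in ac_ps_resolve_options, and cache the compiled shader by the raw 64-bit key. Below is a minimal sketch of that sequence in plain C. The names resolve_ps_cache, gpu_info, nir_opts and lookup_or_build_resolve_ps are hypothetical stand-ins for what radeonsi pulls from its screen/context, and the sketch caches the raw NIR for brevity where radeonsi actually caches the CSO returned by si_create_shader_state().

#include "util/hash_table.h"   /* _mesa_hash_table_u64_search/insert */
#include "util/u_math.h"       /* util_logbase2(), MIN2() */
#include "ac_nir_meta.h"       /* ac_ps_resolve_{key,options}, ac_create_resolve_ps() */

/* Hypothetical driver-side state; radeonsi gets the equivalents from
 * si_screen/si_context (see the si_blit.c hunk above). */
extern struct hash_table_u64 *resolve_ps_cache;
extern const struct radeon_info *gpu_info;
extern const nir_shader_compiler_options *nir_opts;

static nir_shader *
lookup_or_build_resolve_ps(unsigned nr_samples, bool src_is_array,
                           unsigned last_src_channel, unsigned last_dst_channel)
{
   union ac_ps_resolve_key key;
   key.key = 0; /* zero the whole u64 so unused bits hash consistently */
   key.use_aco = true;
   key.src_is_array = src_is_array;
   key.log_samples = util_logbase2(nr_samples);
   key.last_dst_channel = last_dst_channel;
   /* The key invariant: last_src_channel must not exceed last_dst_channel. */
   key.last_src_channel = MIN2(last_src_channel, last_dst_channel);

   nir_shader *ps = _mesa_hash_table_u64_search(resolve_ps_cache, key.key);
   if (!ps) {
      const struct ac_ps_resolve_options options = {
         .nir_options = nir_opts,
         .info = gpu_info,
         .use_aco = false,    /* global driver setting */
         .no_fmask = false,   /* set when FMASK is disabled by a debug option */
         .print_key = false,
      };
      ps = ac_create_resolve_ps(&options, &key);
      _mesa_hash_table_u64_insert(resolve_ps_cache, key.key, ps);
   }
   return ps;
}

The union-with-uint64_t layout is what makes this cheap: hash_table_u64 hashes and compares the whole key as a single integer, so adding a key bit never touches the caching code.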
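
One detail worth spelling out from the imported ac_average_samples() helper: a serial sum would create a dependency chain of num_samples - 1 fadds, while the pairwise scheme sums disjoint pairs, then pairs of those sums, so the adds form a log2(num_samples)-deep tree of independent instructions. A self-contained scalar model of the same reduction, assuming plain floats where the real helper operates on nir_def values (the rolled loop computes exactly what the unrolled Mesa version does):

#include <stdio.h>

/* Scalar model of ac_average_samples(): sum adjacent pairs in place, then
 * pairs of those sums, halving the live count each round; num_samples is a
 * power of two (1, 2, 4, 8 or 16), as MSAA sample counts always are. */
static float
average_samples(float *s, unsigned num_samples)
{
   for (unsigned n = num_samples; n > 1; n /= 2) {
      for (unsigned i = 0; i < n / 2; i++)
         s[i] = s[i * 2] + s[i * 2 + 1];
   }
   return s[0] * (1.0f / num_samples); /* average the sum */
}

int main(void)
{
   float samples[8] = {1, 2, 3, 4, 5, 6, 7, 8};
   printf("%g\n", average_samples(samples, 8)); /* prints 4.5 */
   return 0;
}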