diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index 615644c1dd4..88af6084208 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -2344,6 +2344,228 @@ ac_nir_lower_bit_size_callback(const nir_instr *instr, void *data) return 0; } +static unsigned +align_load_store_size(enum amd_gfx_level gfx_level, unsigned size, bool uses_smem, bool is_shared) +{ + /* LDS can't overfetch because accesses that are partially out of range would be dropped + * entirely, so all unaligned LDS accesses are always split. + */ + if (is_shared) + return size; + + /* Align the size to what the hw supports. Out of range access due to alignment is OK because + * range checking is per dword for untyped instructions. This assumes that the compiler backend + * overfetches due to load size alignment instead of splitting the load. + * + * GFX6-11 don't have 96-bit SMEM loads. + * GFX6 doesn't have 96-bit untyped VMEM loads. + */ + if (gfx_level >= (uses_smem ? GFX12 : GFX7) && size == 96) + return size; + else + return util_next_power_of_two(size); +} + +bool +ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, + unsigned num_components, int64_t hole_size, nir_intrinsic_instr *low, + nir_intrinsic_instr *high, void *data) +{ + struct ac_nir_config *config = (struct ac_nir_config *)data; + bool uses_smem = (nir_intrinsic_has_access(low) && + nir_intrinsic_access(low) & ACCESS_SMEM_AMD) || + /* These don't have the "access" field. */ + low->intrinsic == nir_intrinsic_load_smem_amd || + low->intrinsic == nir_intrinsic_load_push_constant; + bool is_store = !nir_intrinsic_infos[low->intrinsic].has_dest; + bool is_scratch = low->intrinsic == nir_intrinsic_load_stack || + low->intrinsic == nir_intrinsic_store_stack || + low->intrinsic == nir_intrinsic_load_scratch || + low->intrinsic == nir_intrinsic_store_scratch; + bool is_shared = low->intrinsic == nir_intrinsic_load_shared || + low->intrinsic == nir_intrinsic_store_shared || + low->intrinsic == nir_intrinsic_load_deref || + low->intrinsic == nir_intrinsic_store_deref; + + assert(!is_store || hole_size <= 0); + + /* If we get derefs here, only shared memory derefs are expected. */ + assert((low->intrinsic != nir_intrinsic_load_deref && + low->intrinsic != nir_intrinsic_store_deref) || + nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared)); + + /* Don't vectorize descriptor loads for LLVM due to excessive SGPR and VGPR spilling. */ + if (!config->uses_aco && low->intrinsic == nir_intrinsic_load_smem_amd) + return false; + + /* Reject opcodes we don't vectorize. */ + switch (low->intrinsic) { + case nir_intrinsic_load_smem_amd: + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_stack: + case nir_intrinsic_store_stack: + case nir_intrinsic_load_scratch: + case nir_intrinsic_store_scratch: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_global: + case nir_intrinsic_store_global: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + case nir_intrinsic_load_shared: + case nir_intrinsic_store_shared: + break; + default: + return false; + } + + /* Align the size to what the hw supports. 
*/ + unsigned unaligned_new_size = num_components * bit_size; + unsigned aligned_new_size = align_load_store_size(config->gfx_level, unaligned_new_size, + uses_smem, is_shared); + + if (uses_smem) { + /* Maximize SMEM vectorization except for LLVM, which suffers from SGPR and VGPR spilling. + * GFX6-7 have fewer hw SGPRs, so merge only up to 128 bits to limit SGPR usage. + */ + if (aligned_new_size > (config->gfx_level >= GFX8 ? (config->uses_aco ? 512 : 256) : 128)) + return false; + } else { + if (aligned_new_size > 128) + return false; + + /* GFX6-8 only support 32-bit scratch loads/stores. */ + if (config->gfx_level <= GFX8 && is_scratch && aligned_new_size > 32) + return false; + } + + if (!is_store) { + /* Non-descriptor loads. */ + if (low->intrinsic != nir_intrinsic_load_ubo && + low->intrinsic != nir_intrinsic_load_ssbo) { + /* Only increase the size of loads if doing so doesn't extend into a new page. + * Here we set alignment to MAX because we don't know the alignment of global + * pointers before adding the offset. + */ + uint32_t resource_align = low->intrinsic == nir_intrinsic_load_global_constant || + low->intrinsic == nir_intrinsic_load_global ? NIR_ALIGN_MUL_MAX : 4; + uint32_t page_size = 4096; + uint32_t mul = MIN3(align_mul, page_size, resource_align); + unsigned end = (align_offset + unaligned_new_size / 8u) & (mul - 1); + if ((aligned_new_size - unaligned_new_size) / 8u > (mul - end)) + return false; + } + + /* Only allow SMEM loads to overfetch by 32 bits: + * + * Examples (the hole is indicated by parentheses, the numbers are in bytes, the maximum + * overfetch size is 4): + * 4 | (4) | 4 -> hw loads 12 : ALLOWED (4 over) + * 4 | (4) | 4 -> hw loads 16 : DISALLOWED (8 over) + * 4 | 4 | 4 -> hw loads 16 : ALLOWED (4 over) + * 4 | (4) | 8 -> hw loads 16 : ALLOWED (4 over) + * 16 | 4 -> hw loads 32 : DISALLOWED (12 over) + * 16 | 8 -> hw loads 32 : DISALLOWED (8 over) + * 16 | 12 -> hw loads 32 : ALLOWED (4 over) + * 16 | (4) | 12 -> hw loads 32 : ALLOWED (4 over) + * 32 | 16 -> hw loads 64 : DISALLOWED (16 over) + * 32 | 28 -> hw loads 64 : ALLOWED (4 over) + * 32 | (4) | 28 -> hw loads 64 : ALLOWED (4 over) + * + * Note that we can overfetch by more than 4 bytes if we merge more than 2 loads, e.g.: + * 4 | (4) | 8 | (4) | 12 -> hw loads 32 : ALLOWED (4 + 4 over) + * + * That's because this callback is called twice in that case, each time allowing only 4 over. + * + * This is only enabled for ACO. LLVM spills SGPRs and VGPRs too much. + */ + unsigned overfetch_size = 0; + + if (config->uses_aco && uses_smem && aligned_new_size >= 128) + overfetch_size = 32; + + int64_t aligned_unvectorized_size = + align_load_store_size(config->gfx_level, low->num_components * low->def.bit_size, + uses_smem, is_shared) + + align_load_store_size(config->gfx_level, high->num_components * high->def.bit_size, + uses_smem, is_shared); + + if (aligned_new_size > aligned_unvectorized_size + overfetch_size) + return false; + } + + uint32_t align; + if (align_offset) + align = 1 << (ffs(align_offset) - 1); + else + align = align_mul; + + /* Validate the alignment and number of components. 
*/ + if (!is_shared) { + unsigned max_components; + if (align % 4 == 0) + max_components = NIR_MAX_VEC_COMPONENTS; + else if (align % 2 == 0) + max_components = 16u / bit_size; + else + max_components = 8u / bit_size; + return (align % (bit_size / 8u)) == 0 && num_components <= max_components; + } else { + if (bit_size * num_components == 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */ + return align % 16 == 0; + } else if (bit_size == 16 && (align % 4)) { + /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU + * vectorization, because our vectorizer requires the scalar IR to already contain vectors. + */ + return (align % 2 == 0) && num_components <= 2; + } else { + if (num_components == 3) { + /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */ + return false; + } + unsigned req = bit_size * num_components; + if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */ + req /= 2u; + return align % (req / 8u) == 0; + } + } + return false; +} + +bool ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data) +{ + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + /* Reject opcodes we don't scalarize. */ + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_shared: + break; + default: + return false; + } + + bool uses_smem = nir_intrinsic_has_access(intr) && + nir_intrinsic_access(intr) & ACCESS_SMEM_AMD; + bool is_shared = intr->intrinsic == nir_intrinsic_load_shared; + + enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data; + unsigned comp_size = intr->def.bit_size / 8; + unsigned load_size = intr->def.num_components * comp_size; + unsigned used_load_size = util_bitcount(nir_def_components_read(&intr->def)) * comp_size; + + /* Scalarize if the load overfetches. That includes loads that overfetch due to load size + * alignment, e.g. when only a power-of-two load is available. The scalarized loads are expected + * to be later vectorized to optimal sizes. + */ + return used_load_size < align_load_store_size(gfx_level, load_size, uses_smem, is_shared); +} + /* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC) * from a NIR memory intrinsic. 
*/ diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 3602e5761a8..bd41b3c8b67 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -372,6 +372,14 @@ ac_nir_optimize_uniform_atomics(nir_shader *nir); unsigned ac_nir_lower_bit_size_callback(const nir_instr *instr, void *data); +bool +ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, + unsigned num_components, int64_t hole_size, + nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data); + +bool +ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data); + enum gl_access_qualifier ac_nir_get_mem_access_flags(const nir_intrinsic_instr *instr); diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 532246d574f..91810359008 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -114,228 +114,6 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm, BITFIELD_BIT(nir_lower_packing_op_unpack_32_4x8); } -static unsigned -align_load_store_size(enum amd_gfx_level gfx_level, unsigned size, bool uses_smem, bool is_shared) -{ - /* LDS can't overfetch because accesses that are partially out of range would be dropped - * entirely, so all unaligned LDS accesses are always split. - */ - if (is_shared) - return size; - - /* Align the size to what the hw supports. Out of range access due to alignment is OK because - * range checking is per dword for untyped instructions. This assumes that the compiler backend - * overfetches due to load size alignment instead of splitting the load. - * - * GFX6-11 don't have 96-bit SMEM loads. - * GFX6 doesn't have 96-bit untyped VMEM loads. - */ - if (gfx_level >= (uses_smem ? GFX12 : GFX7) && size == 96) - return size; - else - return util_next_power_of_two(size); -} - -bool -ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, - unsigned num_components, int64_t hole_size, nir_intrinsic_instr *low, - nir_intrinsic_instr *high, void *data) -{ - struct ac_nir_config *config = (struct ac_nir_config *)data; - bool uses_smem = (nir_intrinsic_has_access(low) && - nir_intrinsic_access(low) & ACCESS_SMEM_AMD) || - /* These don't have the "access" field. */ - low->intrinsic == nir_intrinsic_load_smem_amd || - low->intrinsic == nir_intrinsic_load_push_constant; - bool is_store = !nir_intrinsic_infos[low->intrinsic].has_dest; - bool is_scratch = low->intrinsic == nir_intrinsic_load_stack || - low->intrinsic == nir_intrinsic_store_stack || - low->intrinsic == nir_intrinsic_load_scratch || - low->intrinsic == nir_intrinsic_store_scratch; - bool is_shared = low->intrinsic == nir_intrinsic_load_shared || - low->intrinsic == nir_intrinsic_store_shared || - low->intrinsic == nir_intrinsic_load_deref || - low->intrinsic == nir_intrinsic_store_deref; - - assert(!is_store || hole_size <= 0); - - /* If we get derefs here, only shared memory derefs are expected. */ - assert((low->intrinsic != nir_intrinsic_load_deref && - low->intrinsic != nir_intrinsic_store_deref) || - nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared)); - - /* Don't vectorize descriptor loads for LLVM due to excessive SGPR and VGPR spilling. */ - if (!config->uses_aco && low->intrinsic == nir_intrinsic_load_smem_amd) - return false; - - /* Reject opcodes we don't vectorize. 
*/ - switch (low->intrinsic) { - case nir_intrinsic_load_smem_amd: - case nir_intrinsic_load_push_constant: - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_stack: - case nir_intrinsic_store_stack: - case nir_intrinsic_load_scratch: - case nir_intrinsic_store_scratch: - case nir_intrinsic_load_global_constant: - case nir_intrinsic_load_global: - case nir_intrinsic_store_global: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_store_ssbo: - case nir_intrinsic_load_deref: - case nir_intrinsic_store_deref: - case nir_intrinsic_load_shared: - case nir_intrinsic_store_shared: - break; - default: - return false; - } - - /* Align the size to what the hw supports. */ - unsigned unaligned_new_size = num_components * bit_size; - unsigned aligned_new_size = align_load_store_size(config->gfx_level, unaligned_new_size, - uses_smem, is_shared); - - if (uses_smem) { - /* Maximize SMEM vectorization except for LLVM, which suffers from SGPR and VGPR spilling. - * GFX6-7 have fewer hw SGPRs, so merge only up to 128 bits to limit SGPR usage. - */ - if (aligned_new_size > (config->gfx_level >= GFX8 ? (config->uses_aco ? 512 : 256) : 128)) - return false; - } else { - if (aligned_new_size > 128) - return false; - - /* GFX6-8 only support 32-bit scratch loads/stores. */ - if (config->gfx_level <= GFX8 && is_scratch && aligned_new_size > 32) - return false; - } - - if (!is_store) { - /* Non-descriptor loads. */ - if (low->intrinsic != nir_intrinsic_load_ubo && - low->intrinsic != nir_intrinsic_load_ssbo) { - /* Only increase the size of loads if doing so doesn't extend into a new page. - * Here we set alignment to MAX because we don't know the alignment of global - * pointers before adding the offset. - */ - uint32_t resource_align = low->intrinsic == nir_intrinsic_load_global_constant || - low->intrinsic == nir_intrinsic_load_global ? NIR_ALIGN_MUL_MAX : 4; - uint32_t page_size = 4096; - uint32_t mul = MIN3(align_mul, page_size, resource_align); - unsigned end = (align_offset + unaligned_new_size / 8u) & (mul - 1); - if ((aligned_new_size - unaligned_new_size) / 8u > (mul - end)) - return false; - } - - /* Only allow SMEM loads to overfetch by 32 bits: - * - * Examples (the hole is indicated by parentheses, the numbers are in bytes, the maximum - * overfetch size is 4): - * 4 | (4) | 4 -> hw loads 12 : ALLOWED (4 over) - * 4 | (4) | 4 -> hw loads 16 : DISALLOWED (8 over) - * 4 | 4 | 4 -> hw loads 16 : ALLOWED (4 over) - * 4 | (4) | 8 -> hw loads 16 : ALLOWED (4 over) - * 16 | 4 -> hw loads 32 : DISALLOWED (12 over) - * 16 | 8 -> hw loads 32 : DISALLOWED (8 over) - * 16 | 12 -> hw loads 32 : ALLOWED (4 over) - * 16 | (4) | 12 -> hw loads 32 : ALLOWED (4 over) - * 32 | 16 -> hw loads 64 : DISALLOWED (16 over) - * 32 | 28 -> hw loads 64 : ALLOWED (4 over) - * 32 | (4) | 28 -> hw loads 64 : ALLOWED (4 over) - * - * Note that we can overfetch by more than 4 bytes if we merge more than 2 loads, e.g.: - * 4 | (4) | 8 | (4) | 12 -> hw loads 32 : ALLOWED (4 + 4 over) - * - * That's because this callback is called twice in that case, each time allowing only 4 over. - * - * This is only enabled for ACO. LLVM spills SGPRs and VGPRs too much. 
- */ - unsigned overfetch_size = 0; - - if (config->uses_aco && uses_smem && aligned_new_size >= 128) - overfetch_size = 32; - - int64_t aligned_unvectorized_size = - align_load_store_size(config->gfx_level, low->num_components * low->def.bit_size, - uses_smem, is_shared) + - align_load_store_size(config->gfx_level, high->num_components * high->def.bit_size, - uses_smem, is_shared); - - if (aligned_new_size > aligned_unvectorized_size + overfetch_size) - return false; - } - - uint32_t align; - if (align_offset) - align = 1 << (ffs(align_offset) - 1); - else - align = align_mul; - - /* Validate the alignment and number of components. */ - if (!is_shared) { - unsigned max_components; - if (align % 4 == 0) - max_components = NIR_MAX_VEC_COMPONENTS; - else if (align % 2 == 0) - max_components = 16u / bit_size; - else - max_components = 8u / bit_size; - return (align % (bit_size / 8u)) == 0 && num_components <= max_components; - } else { - if (bit_size * num_components == 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */ - return align % 16 == 0; - } else if (bit_size == 16 && (align % 4)) { - /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU - * vectorization, because our vectorizer requires the scalar IR to already contain vectors. - */ - return (align % 2 == 0) && num_components <= 2; - } else { - if (num_components == 3) { - /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */ - return false; - } - unsigned req = bit_size * num_components; - if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */ - req /= 2u; - return align % (req / 8u) == 0; - } - } - return false; -} - -bool ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data) -{ - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - /* Reject opcodes we don't scalarize. */ - switch (intr->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - case nir_intrinsic_load_shared: - break; - default: - return false; - } - - bool uses_smem = nir_intrinsic_has_access(intr) && - nir_intrinsic_access(intr) & ACCESS_SMEM_AMD; - bool is_shared = intr->intrinsic == nir_intrinsic_load_shared; - - enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data; - unsigned comp_size = intr->def.bit_size / 8; - unsigned load_size = intr->def.num_components * comp_size; - unsigned used_load_size = util_bitcount(nir_def_components_read(&intr->def)) * comp_size; - - /* Scalarize if the load overfetches. That includes loads that overfetch due to load size - * alignment, e.g. when only a power-of-two load is available. The scalarized loads are expected - * to be later vectorized to optimal sizes. 
- */ - return used_load_size < align_load_store_size(gfx_level, load_size, uses_smem, is_shared); -} - unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask, bool writes_mrt0_alpha) { diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 5008f947a80..ceb14e76ca8 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -249,12 +249,6 @@ struct ac_nir_config { void ac_set_nir_options(struct radeon_info *info, bool use_llvm, nir_shader_compiler_options *options); -bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, - unsigned num_components, int64_t hole_size, - nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data); - -bool ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data); - unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask, bool writes_mrt0_alpha); diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c index 07925c58932..f6a41a10403 100644 --- a/src/amd/vulkan/radv_pipeline_rt.c +++ b/src/amd/vulkan/radv_pipeline_rt.c @@ -19,6 +19,7 @@ #include "radv_pipeline_rt.h" #include "radv_rmv.h" #include "radv_shader.h" +#include "ac_nir.h" struct rt_handle_hash_entry { uint32_t key;
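For context, the two callbacks being moved are meant to be plugged into NIR's load/store vectorizer and, for the overfetch case, into a scalarizing pass that runs before it. Below is a minimal sketch of how a driver might wire them up; it is not part of the patch above. The variable-mode masks, the use of the filter-callback variant of nir_lower_io_to_scalar, and the helper name example_vectorize_mem are illustrative assumptions based on typical Mesa usage, while the ac_nir_mem_vectorize_callback / ac_nir_scalarize_overfetching_loads_callback signatures and their cb_data types (struct ac_nir_config * and enum amd_gfx_level *) follow directly from the code in this patch.

/* Illustrative sketch only -- not part of the patch above.
 * Assumes a struct ac_nir_config (defined in ac_shader_util.h) with at least
 * gfx_level and uses_aco filled in, as read by ac_nir_mem_vectorize_callback.
 */
#include "nir.h"
#include "ac_nir.h"
#include "ac_shader_util.h"

static bool
example_vectorize_mem(nir_shader *nir, struct ac_nir_config *config)
{
   bool progress = false;

   /* Scalarize loads that overfetch (including loads padded up to a power of two),
    * so the vectorizer below can rebuild them at optimal sizes. Assumes the
    * nir_lower_io_to_scalar variant that takes an instruction filter callback;
    * the callback's data is the gfx_level, per its implementation.
    */
   NIR_PASS(progress, nir, nir_lower_io_to_scalar,
            nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_shared,
            ac_nir_scalarize_overfetching_loads_callback, &config->gfx_level);

   /* Re-vectorize memory access using the chip-aware callback. The mode mask here
    * is an example; drivers pick the modes they actually lower this way.
    */
   const nir_load_store_vectorize_options vectorize_opts = {
      .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_push_const |
               nir_var_mem_global | nir_var_mem_shared | nir_var_function_temp,
      .callback = ac_nir_mem_vectorize_callback,
      .cb_data = config,
   };
   NIR_PASS(progress, nir, nir_opt_load_store_vectorize, &vectorize_opts);

   return progress;
}

With the callbacks exported from ac_nir.h instead of ac_shader_util.h, callers such as radv_pipeline_rt.c only need the ac_nir.h include added at the end of this patch.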