ac,radeonsi: import PM4 state from RadeonSI

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29452>
This commit is contained in:
Samuel Pitoiset
2024-05-28 15:56:52 +02:00
committed by Marge Bot
parent 62c52fb59d
commit 428601095c
10 changed files with 829 additions and 710 deletions

371
src/amd/common/ac_pm4.c Normal file
View File

@@ -0,0 +1,371 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include "ac_debug.h"
#include "ac_gpu_info.h"
#include "ac_pm4.h"
#include "sid.h"

#include <stdio.h>  /* fprintf() in ac_pm4_set_reg; don't rely on a transitive include */
#include <stdlib.h>
#include <string.h>
static bool
opcode_is_pairs(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
opcode == PKT3_SET_SH_REG_PAIRS ||
opcode == PKT3_SET_UCONFIG_REG_PAIRS;
}
static bool
opcode_is_pairs_packed(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
}
/* Map a packed SET_*_REG_PAIRS_PACKED opcode back to the plain SET_*_REG opcode. */
static unsigned
pairs_packed_opcode_to_regular(unsigned opcode)
{
   if (opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED)
      return PKT3_SET_CONTEXT_REG;
   if (opcode == PKT3_SET_SH_REG_PAIRS_PACKED)
      return PKT3_SET_SH_REG;

   unreachable("invalid packed opcode");
}
/* Pick the fastest SET_*_REG packet variant the chip supports for this opcode:
 * packed pairs when available, then unpacked pairs, else the plain opcode.
 */
static unsigned
regular_opcode_to_pairs(struct ac_pm4_state *state, unsigned opcode)
{
   const struct radeon_info *info = state->info;

   switch (opcode) {
   case PKT3_SET_CONTEXT_REG:
      if (info->has_set_context_pairs_packed)
         return PKT3_SET_CONTEXT_REG_PAIRS_PACKED;
      if (info->has_set_context_pairs)
         return PKT3_SET_CONTEXT_REG_PAIRS;
      break;
   case PKT3_SET_SH_REG:
      if (info->has_set_sh_pairs_packed)
         return PKT3_SET_SH_REG_PAIRS_PACKED;
      if (info->has_set_sh_pairs)
         return PKT3_SET_SH_REG_PAIRS;
      break;
   case PKT3_SET_UCONFIG_REG:
      if (info->has_set_uconfig_pairs)
         return PKT3_SET_UCONFIG_REG_PAIRS;
      break;
   }

   return opcode;
}
/* True if the next dword appended to the open packed packet is the dword that
 * holds a pair of register offsets (packed packets repeat 3-dword groups).
 */
static bool
packed_next_is_reg_offset_pair(struct ac_pm4_state *state)
{
   const unsigned body_dw = state->ndw - state->last_pm4;
   return body_dw % 3 == 2;
}
/* True if the next dword appended to the open packed packet is the second
 * register value of the current 3-dword group.
 */
static bool
packed_next_is_reg_value1(struct ac_pm4_state *state)
{
   const unsigned body_dw = state->ndw - state->last_pm4;
   return body_dw % 3 == 1;
}
static bool
packed_prev_is_reg_value0(struct ac_pm4_state *state)
{
return packed_next_is_reg_value1(state);
}
/* Return the dword register offset of the index-th register of the open packed
 * packet. Offsets are stored two per dword (low/high 16 bits), one offsets
 * dword per 3-dword group, after the 2-dword packet header.
 */
static unsigned
get_packed_reg_dw_offsetN(struct ac_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
assert(i < state->ndw);
return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
}
/* Return the pm4[] index holding the value of the index-th register of the
 * open packed packet: each 3-dword group is { offsets, value0, value1 }.
 */
static unsigned
get_packed_reg_valueN_idx(struct ac_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
assert(i < state->ndw);
return i;
}
/* Return the value of the index-th register of the open packed packet. */
static unsigned
get_packed_reg_valueN(struct ac_pm4_state *state, unsigned index)
{
   const unsigned dw_idx = get_packed_reg_valueN_idx(state, index);

   return state->pm4[dw_idx];
}
/* Return the number of register slots in the open packed packet: 2 per
 * 3-dword group, including a possible padding slot (see packed_is_padded).
 */
static unsigned
get_packed_reg_count(struct ac_pm4_state *state)
{
int body_size = state->ndw - state->last_pm4 - 2;
assert(body_size > 0 && body_size % 3 == 0);
return (body_size / 3) * 2;
}
/* Post-process the last packet recorded in the state.
 *
 * For packed SET_*_REG_PAIRS_PACKED packets this either rewrites the packet
 * as a shorter regular SET_*_REG packet (when all registers turn out to be
 * consecutive), or switches SET_SH_REG_PAIRS_PACKED to the *_N variant when
 * the register count is small enough. When debug_sqtt is set, it also records
 * spi_shader_pgm_lo_reg, the offset of the last SPI_SHADER_PGM_LO_* write.
 */
void
ac_pm4_finalize(struct ac_pm4_state *state)
{
if (opcode_is_pairs_packed(state->last_opcode)) {
unsigned reg_count = get_packed_reg_count(state);
unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);
/* A padded packet repeats the first register at the end; ignore that copy. */
if (state->packed_is_padded)
reg_count--;
bool all_consecutive = true;
/* If the whole packed SET packet only sets consecutive registers, rewrite the packet
 * to be unpacked to make it shorter.
 *
 * This also eliminates the invalid scenario when the packed SET packet sets only
 * 2 registers and the register offsets are equal due to padding.
 */
for (unsigned i = 1; i < reg_count; i++) {
if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
all_consecutive = false;
break;
}
}
if (all_consecutive) {
assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
reg_count, 0);
state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
for (unsigned i = 0; i < reg_count; i++)
state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
state->ndw = state->last_pm4 + 2 + reg_count;
/* NOTE(review): always PKT3_SET_SH_REG even when the rewritten packet is a
 * SET_CONTEXT_REG — presumably only used as a "no longer packed" marker for
 * the SQTT check below; confirm.
 */
state->last_opcode = PKT3_SET_SH_REG;
} else {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
if (state->debug_sqtt &&
(state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
if (state->packed_is_padded)
reg_count++; /* Add this back because we only need to record the last write. */
/* Scan backwards so the last SPI_SHADER_PGM_LO_* write wins. */
for (int i = reg_count - 1; i >= 0; i--) {
unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;
if (strstr(ac_get_register_name(state->info->gfx_level,
state->info->family, reg_offset),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_offset;
break;
}
}
}
/* If it's a packed SET_SH packet, use the *_N variant when possible. */
if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
}
}
}
if (state->debug_sqtt && state->last_opcode == PKT3_SET_SH_REG) {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;
for (unsigned i = 0; i < reg_count; i++) {
if (strstr(ac_get_register_name(state->info->gfx_level,
state->info->family, reg_base_offset + i * 4),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;
break;
}
}
}
}
/* Start a new PKT3 packet: finalize the previous one, then reserve one dword
 * for the packet header, which ac_pm4_cmd_end fills in with the final count.
 */
void
ac_pm4_cmd_begin(struct ac_pm4_state *state, unsigned opcode)
{
ac_pm4_finalize(state);
assert(state->max_dw);
assert(state->ndw < state->max_dw);
assert(opcode <= 254); /* 255 is reserved as the "invalid opcode" marker */
state->last_opcode = opcode;
state->last_pm4 = state->ndw++;
state->packed_is_padded = false;
}
/* Append one raw dword to the command buffer. Invalidates last_opcode so the
 * next register write starts a fresh packet instead of extending this data.
 */
void
ac_pm4_cmd_add(struct ac_pm4_state *state, uint32_t dw)
{
assert(state->max_dw);
assert(state->ndw < state->max_dw);
state->pm4[state->ndw++] = dw;
state->last_opcode = 255; /* invalid opcode */
}
/* Close the current packet: write the PKT3 header with the final dword count,
 * and, for packed register packets, pad to an even register count and store
 * the register count in the dword after the header.
 */
void
ac_pm4_cmd_end(struct ac_pm4_state *state, bool predicate)
{
unsigned count;
count = state->ndw - state->last_pm4 - 2;
/* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
bool reset_filter_cam = !state->is_compute_queue &&
(opcode_is_pairs(state->last_opcode) ||
opcode_is_pairs_packed(state->last_opcode));
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
PKT3_RESET_FILTER_CAM_S(reset_filter_cam);
if (opcode_is_pairs_packed(state->last_opcode)) {
if (packed_prev_is_reg_value0(state)) {
/* Duplicate the first register at the end to make the number of registers aligned to 2. */
/* Note: this recurses back through ac_pm4_set_reg_custom -> ac_pm4_cmd_end once. */
ac_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
get_packed_reg_valueN(state, 0),
state->last_opcode, 0);
state->packed_is_padded = true;
}
state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
}
}
/* Emit a register write with an explicit opcode and index field.
 *
 * "reg" is a byte offset relative to the opcode's register range (already
 * rebased by the caller). Consecutive writes with the same opcode are merged
 * into the currently open packet; otherwise a new packet is begun.
 */
void
ac_pm4_set_reg_custom(struct ac_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx)
{
bool is_packed = opcode_is_pairs_packed(opcode);
reg >>= 2; /* byte offset -> dword offset */
assert(state->max_dw);
assert(state->ndw + 2 <= state->max_dw);
if (is_packed) {
assert(idx == 0);
if (opcode != state->last_opcode) {
ac_pm4_cmd_begin(state, opcode); /* reserve space for the header */
state->ndw++; /* reserve space for the register count, it will be set at the end */
}
} else if (opcode_is_pairs(opcode)) {
assert(idx == 0);
if (opcode != state->last_opcode)
ac_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg;
} else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
idx != state->last_idx) {
/* Regular SET packet: start a new one unless this write extends a run of
 * consecutive registers with the same opcode and index. */
ac_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg | (idx << 28);
}
assert(reg <= UINT16_MAX);
state->last_reg = reg;
state->last_idx = idx;
if (is_packed) {
if (state->packed_is_padded) {
/* The packet is padded, which means the first register is written redundantly again
 * at the end. Remove it, so that we can replace it with this register.
 */
state->packed_is_padded = false;
state->ndw--;
}
if (packed_next_is_reg_offset_pair(state)) {
state->pm4[state->ndw++] = reg;
} else if (packed_next_is_reg_value1(state)) {
/* Set the second register offset in the high 16 bits. */
state->pm4[state->ndw - 2] &= 0x0000ffff;
state->pm4[state->ndw - 2] |= reg << 16;
}
}
state->pm4[state->ndw++] = val;
ac_pm4_cmd_end(state, false);
}
/* Set a register by absolute byte offset. The SET_* opcode is chosen from the
 * register range (config/SH/context/uconfig), then upgraded to the fastest
 * pairs/packed variant the chip supports. Offsets outside all known ranges
 * are reported to stderr and the write is dropped.
 */
void ac_pm4_set_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
unsigned opcode;
if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
opcode = PKT3_SET_CONFIG_REG;
reg -= SI_CONFIG_REG_OFFSET;
} else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
opcode = PKT3_SET_SH_REG;
reg -= SI_SH_REG_OFFSET;
} else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
opcode = PKT3_SET_CONTEXT_REG;
reg -= SI_CONTEXT_REG_OFFSET;
} else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
opcode = PKT3_SET_UCONFIG_REG;
reg -= CIK_UCONFIG_REG_OFFSET;
} else {
fprintf(stderr, "mesa: Invalid register offset %08x!\n", reg);
return;
}
opcode = regular_opcode_to_pairs(state, opcode);
ac_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
/* Set an SH register with index field 3. When the kernel manages the CU mask
 * (uses_kernel_cu_mask, GFX10+), this must go through SET_SH_REG_INDEX;
 * otherwise a regular register write is emitted.
 */
void
ac_pm4_set_reg_idx3(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
if (state->info->uses_kernel_cu_mask) {
assert(state->info->gfx_level >= GFX10);
ac_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
} else {
ac_pm4_set_reg(state, reg, val);
}
}
/* (Re)initialize a PM4 state for recording. A caller-provided max_dw (states
 * allocated with a larger trailing array, see ac_pm4_create_sized) is kept;
 * otherwise the capacity defaults to the inline pm4[] array size.
 */
void
ac_pm4_clear_state(struct ac_pm4_state *state, const struct radeon_info *info,
bool debug_sqtt, bool is_compute_queue)
{
state->info = info;
state->debug_sqtt = debug_sqtt;
state->ndw = 0;
state->is_compute_queue = is_compute_queue;
if (!state->max_dw)
state->max_dw = ARRAY_SIZE(state->pm4);
}
/* Allocate a zero-initialized PM4 state whose pm4[] array holds max_dw dwords;
 * the array extends past the end of the struct. Returns NULL on allocation
 * failure. Free with ac_pm4_free_state.
 *
 * NOTE(review): assumes max_dw >= ARRAY_SIZE(pm4->pm4); a smaller value makes
 * the unsigned size computation underflow — confirm all callers guarantee this.
 */
struct ac_pm4_state *
ac_pm4_create_sized(const struct radeon_info *info, bool debug_sqtt,
unsigned max_dw, bool is_compute_queue)
{
struct ac_pm4_state *pm4;
unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));
pm4 = (struct ac_pm4_state *)calloc(1, size);
if (pm4) {
pm4->max_dw = max_dw;
ac_pm4_clear_state(pm4, info, debug_sqtt, is_compute_queue);
}
return pm4;
}
/* Free a state allocated with ac_pm4_create_sized. Accepts NULL. */
void
ac_pm4_free_state(struct ac_pm4_state *state)
{
   /* free(NULL) is a no-op, so no guard is needed. */
   free(state);
}

76
src/amd/common/ac_pm4.h Normal file
View File

@@ -0,0 +1,76 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#ifndef AC_PM4_H
#define AC_PM4_H
#include "ac_gpu_info.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Recording state for building PM4 command packets. The trailing pm4[] array
 * may extend past the struct when allocated via ac_pm4_create_sized.
 */
struct ac_pm4_state {
const struct radeon_info *info;
/* PKT3_SET_*_REG handling */
uint16_t last_reg; /* register offset in dwords */
uint16_t last_pm4; /* index of the last packet header in pm4[] */
uint16_t ndw; /* number of dwords in pm4 */
uint8_t last_opcode; /* 255 = invalid */
uint8_t last_idx; /* index field of the last register write */
bool is_compute_queue;
bool packed_is_padded; /* whether SET_*_REG_PAIRS_PACKED is padded to an even number of regs */
/* commands for the DE */
uint16_t max_dw; /* capacity of pm4[] in dwords */
/* Used by SQTT to override the shader address */
bool debug_sqtt;
uint32_t spi_shader_pgm_lo_reg;
/* This must be the last field because the array can continue after the structure. */
uint32_t pm4[64];
};
void
ac_pm4_set_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val);
void
ac_pm4_set_reg_custom(struct ac_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx);
void
ac_pm4_set_reg_idx3(struct ac_pm4_state *state, unsigned reg, uint32_t val);
void
ac_pm4_clear_state(struct ac_pm4_state *state, const struct radeon_info *info,
bool debug_sqtt, bool is_compute_queue);
void
ac_pm4_cmd_begin(struct ac_pm4_state *state, unsigned opcode);
void
ac_pm4_cmd_add(struct ac_pm4_state *state, uint32_t dw);
void
ac_pm4_cmd_end(struct ac_pm4_state *state, bool predicate);
void
ac_pm4_finalize(struct ac_pm4_state *state);
struct ac_pm4_state *
ac_pm4_create_sized(const struct radeon_info *info, bool debug_sqtt,
unsigned max_dw, bool is_compute_queue);
void
ac_pm4_free_state(struct ac_pm4_state *state);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -115,6 +115,8 @@ amd_common_files = files(
'ac_parse_ib.c',
'ac_perfcounter.c',
'ac_perfcounter.h',
'ac_pm4.c',
'ac_pm4.h',
'ac_vcn_av1_default.h',
)

View File

@@ -65,7 +65,7 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
struct si_pm4_state *shadowing_preamble = si_pm4_create_sized(sctx->screen, 256, false);
ac_create_shadowing_ib_preamble(&sctx->screen->info,
(pm4_cmd_add_fn)si_pm4_cmd_add, shadowing_preamble,
(pm4_cmd_add_fn)ac_pm4_cmd_add, shadowing_preamble,
sctx->shadowing.registers->gpu_address, sctx->screen->dpbb_allowed);
/* Initialize shadowed registers as follows. */
@@ -95,8 +95,8 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
/* Setup preemption. The shadowing preamble will be executed as a preamble IB,
* which will load register values from memory on a context switch.
*/
sctx->ws->cs_setup_preemption(&sctx->gfx_cs, shadowing_preamble->pm4,
shadowing_preamble->ndw);
sctx->ws->cs_setup_preemption(&sctx->gfx_cs, shadowing_preamble->base.pm4,
shadowing_preamble->base.ndw);
si_pm4_free_state(sctx, shadowing_preamble, ~0);
}
}

View File

@@ -511,7 +511,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz :
ctx->cs_preamble_state;
radeon_begin(&ctx->gfx_cs);
radeon_emit_array(preamble->pm4, preamble->ndw);
radeon_emit_array(preamble->base.pm4, preamble->base.ndw);
radeon_end();
}

View File

@@ -11,321 +11,12 @@
#include "util/u_memory.h"
#include "ac_debug.h"
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx);
static bool opcode_is_pairs(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
opcode == PKT3_SET_SH_REG_PAIRS ||
opcode == PKT3_SET_UCONFIG_REG_PAIRS;
}
static bool opcode_is_pairs_packed(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
}
static unsigned pairs_packed_opcode_to_regular(unsigned opcode)
{
switch (opcode) {
case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
return PKT3_SET_CONTEXT_REG;
case PKT3_SET_SH_REG_PAIRS_PACKED:
return PKT3_SET_SH_REG;
default:
unreachable("invalid packed opcode");
}
}
static unsigned regular_opcode_to_pairs(struct si_pm4_state *state, unsigned opcode)
{
const struct radeon_info *info = &state->screen->info;
switch (opcode) {
case PKT3_SET_CONTEXT_REG:
return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED :
info->has_set_context_pairs ? PKT3_SET_CONTEXT_REG_PAIRS : opcode;
case PKT3_SET_SH_REG:
return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED :
info->has_set_sh_pairs ? PKT3_SET_SH_REG_PAIRS : opcode;
case PKT3_SET_UCONFIG_REG:
return info->has_set_uconfig_pairs ? PKT3_SET_UCONFIG_REG_PAIRS : opcode;
}
return opcode;
}
static bool packed_next_is_reg_offset_pair(struct si_pm4_state *state)
{
return (state->ndw - state->last_pm4) % 3 == 2;
}
static bool packed_next_is_reg_value1(struct si_pm4_state *state)
{
return (state->ndw - state->last_pm4) % 3 == 1;
}
static bool packed_prev_is_reg_value0(struct si_pm4_state *state)
{
return packed_next_is_reg_value1(state);
}
static unsigned get_packed_reg_dw_offsetN(struct si_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
assert(i < state->ndw);
return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
}
static unsigned get_packed_reg_valueN_idx(struct si_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
assert(i < state->ndw);
return i;
}
static unsigned get_packed_reg_valueN(struct si_pm4_state *state, unsigned index)
{
return state->pm4[get_packed_reg_valueN_idx(state, index)];
}
static unsigned get_packed_reg_count(struct si_pm4_state *state)
{
int body_size = state->ndw - state->last_pm4 - 2;
assert(body_size > 0 && body_size % 3 == 0);
return (body_size / 3) * 2;
}
void si_pm4_finalize(struct si_pm4_state *state)
{
if (opcode_is_pairs_packed(state->last_opcode)) {
unsigned reg_count = get_packed_reg_count(state);
unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);
if (state->packed_is_padded)
reg_count--;
bool all_consecutive = true;
/* If the whole packed SET packet only sets consecutive registers, rewrite the packet
* to be unpacked to make it shorter.
*
* This also eliminates the invalid scenario when the packed SET packet sets only
* 2 registers and the register offsets are equal due to padding.
*/
for (unsigned i = 1; i < reg_count; i++) {
if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
all_consecutive = false;
break;
}
}
if (all_consecutive) {
assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
reg_count, 0);
state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
for (unsigned i = 0; i < reg_count; i++)
state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
state->ndw = state->last_pm4 + 2 + reg_count;
state->last_opcode = PKT3_SET_SH_REG;
} else {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
if (state->screen->debug_flags & DBG(SQTT) &&
(state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
if (state->packed_is_padded)
reg_count++; /* Add this back because we only need to record the last write. */
for (int i = reg_count - 1; i >= 0; i--) {
unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;
if (strstr(ac_get_register_name(state->screen->info.gfx_level,
state->screen->info.family, reg_offset),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_offset;
break;
}
}
}
/* If it's a packed SET_SH packet, use the *_N variant when possible. */
if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
}
}
}
if (state->screen->debug_flags & DBG(SQTT) && state->last_opcode == PKT3_SET_SH_REG) {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;
for (unsigned i = 0; i < reg_count; i++) {
if (strstr(ac_get_register_name(state->screen->info.gfx_level,
state->screen->info.family, reg_base_offset + i * 4),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;
break;
}
}
}
}
static void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
{
si_pm4_finalize(state);
assert(state->max_dw);
assert(state->ndw < state->max_dw);
assert(opcode <= 254);
state->last_opcode = opcode;
state->last_pm4 = state->ndw++;
state->packed_is_padded = false;
}
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
{
assert(state->max_dw);
assert(state->ndw < state->max_dw);
state->pm4[state->ndw++] = dw;
state->last_opcode = 255; /* invalid opcode */
}
static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
unsigned count;
count = state->ndw - state->last_pm4 - 2;
/* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
bool reset_filter_cam = !state->is_compute_queue &&
(opcode_is_pairs(state->last_opcode) ||
opcode_is_pairs_packed(state->last_opcode));
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
PKT3_RESET_FILTER_CAM_S(reset_filter_cam);
if (opcode_is_pairs_packed(state->last_opcode)) {
if (packed_prev_is_reg_value0(state)) {
/* Duplicate the first register at the end to make the number of registers aligned to 2. */
si_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
get_packed_reg_valueN(state, 0),
state->last_opcode, 0);
state->packed_is_padded = true;
}
state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
}
}
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx)
{
bool is_packed = opcode_is_pairs_packed(opcode);
reg >>= 2;
assert(state->max_dw);
assert(state->ndw + 2 <= state->max_dw);
if (is_packed) {
assert(idx == 0);
if (opcode != state->last_opcode) {
si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
state->ndw++; /* reserve space for the register count, it will be set at the end */
}
} else if (opcode_is_pairs(opcode)) {
assert(idx == 0);
if (opcode != state->last_opcode)
si_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg;
} else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
idx != state->last_idx) {
si_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg | (idx << 28);
}
assert(reg <= UINT16_MAX);
state->last_reg = reg;
state->last_idx = idx;
if (is_packed) {
if (state->packed_is_padded) {
/* The packet is padded, which means the first register is written redundantly again
* at the end. Remove it, so that we can replace it with this register.
*/
state->packed_is_padded = false;
state->ndw--;
}
if (packed_next_is_reg_offset_pair(state)) {
state->pm4[state->ndw++] = reg;
} else if (packed_next_is_reg_value1(state)) {
/* Set the second register offset in the high 16 bits. */
state->pm4[state->ndw - 2] &= 0x0000ffff;
state->pm4[state->ndw - 2] |= reg << 16;
}
}
state->pm4[state->ndw++] = val;
si_pm4_cmd_end(state, false);
}
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
unsigned opcode;
if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
opcode = PKT3_SET_CONFIG_REG;
reg -= SI_CONFIG_REG_OFFSET;
} else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
opcode = PKT3_SET_SH_REG;
reg -= SI_SH_REG_OFFSET;
} else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
opcode = PKT3_SET_CONTEXT_REG;
reg -= SI_CONTEXT_REG_OFFSET;
} else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
opcode = PKT3_SET_UCONFIG_REG;
reg -= CIK_UCONFIG_REG_OFFSET;
} else {
PRINT_ERR("Invalid register offset %08x!\n", reg);
return;
}
opcode = regular_opcode_to_pairs(state, opcode);
si_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
if (state->screen->info.uses_kernel_cu_mask) {
assert(state->screen->info.gfx_level >= GFX10);
si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
} else {
si_pm4_set_reg(state, reg, val);
}
}
void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
bool is_compute_queue)
{
state->screen = sscreen;
state->ndw = 0;
state->is_compute_queue = is_compute_queue;
const bool debug_sqtt = !!(sscreen->debug_flags & DBG(SQTT));
if (!state->max_dw)
state->max_dw = ARRAY_SIZE(state->pm4);
ac_pm4_clear_state(&state->base, &sscreen->info, debug_sqtt, is_compute_queue);
}
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
@@ -351,7 +42,7 @@ void si_pm4_emit_commands(struct si_context *sctx, struct si_pm4_state *state)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
radeon_emit_array(state->pm4, state->ndw);
radeon_emit_array(state->base.pm4, state->base.ndw);
radeon_end();
}
@@ -364,7 +55,7 @@ void si_pm4_emit_state(struct si_context *sctx, unsigned index)
assert(state && state != sctx->emitted.array[index]);
radeon_begin(cs);
radeon_emit_array(state->pm4, state->ndw);
radeon_emit_array(state->base.pm4, state->base.ndw);
radeon_end();
sctx->emitted.array[index] = state;
@@ -396,21 +87,21 @@ struct si_pm4_state *si_pm4_create_sized(struct si_screen *sscreen, unsigned max
bool is_compute_queue)
{
struct si_pm4_state *pm4;
unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));
unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->base.pm4));
pm4 = (struct si_pm4_state *)calloc(1, size);
if (pm4) {
pm4->max_dw = max_dw;
pm4->base.max_dw = max_dw;
si_pm4_clear_state(pm4, sscreen, is_compute_queue);
}
return pm4;
}
struct si_pm4_state *si_pm4_clone(struct si_pm4_state *orig)
struct si_pm4_state *si_pm4_clone(struct si_screen *sscreen, struct si_pm4_state *orig)
{
struct si_pm4_state *pm4 = si_pm4_create_sized(orig->screen, orig->max_dw,
orig->is_compute_queue);
struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, orig->base.max_dw,
orig->base.is_compute_queue);
if (pm4)
memcpy(pm4, orig, sizeof(*pm4) + 4 * (pm4->max_dw - ARRAY_SIZE(pm4->pm4)));
memcpy(pm4, orig, sizeof(*pm4) + 4 * (pm4->base.max_dw - ARRAY_SIZE(pm4->base.pm4)));
return pm4;
}

View File

@@ -10,6 +10,8 @@
#include <stdint.h>
#include <stdbool.h>
#include "ac_pm4.h"
#ifdef __cplusplus
extern "C" {
#endif
@@ -27,35 +29,12 @@ struct si_atom {
};
struct si_pm4_state {
struct si_screen *screen;
/* PKT3_SET_*_REG handling */
uint16_t last_reg; /* register offset in dwords */
uint16_t last_pm4;
uint16_t ndw; /* number of dwords in pm4 */
uint8_t last_opcode;
uint8_t last_idx;
bool is_compute_queue;
bool packed_is_padded; /* whether SET_*_REG_PAIRS_PACKED is padded to an even number of regs */
/* For shader states only */
struct si_atom atom;
/* commands for the DE */
uint16_t max_dw;
/* Used by SQTT to override the shader address */
uint32_t spi_shader_pgm_lo_reg;
/* This must be the last field because the array can continue after the structure. */
uint32_t pm4[64];
struct ac_pm4_state base;
};
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_finalize(struct si_pm4_state *state);
void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
bool is_compute_queue);
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);
@@ -66,7 +45,7 @@ void si_pm4_emit_shader(struct si_context *sctx, unsigned index);
void si_pm4_reset_emitted(struct si_context *sctx);
struct si_pm4_state *si_pm4_create_sized(struct si_screen *sscreen, unsigned max_dw,
bool is_compute_queue);
struct si_pm4_state *si_pm4_clone(struct si_pm4_state *orig);
struct si_pm4_state *si_pm4_clone(struct si_screen *sscreen, struct si_pm4_state *orig);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load Diff

View File

@@ -376,11 +376,11 @@ static bool si_update_shaders(struct si_context *sctx)
struct si_pm4_state *pm4 = &shader->pm4;
uint64_t va_low = shader->gpu_address >> 8;
uint32_t reg = pm4->spi_shader_pgm_lo_reg;
si_pm4_set_reg(&pipeline->pm4, reg, va_low);
uint32_t reg = pm4->base.spi_shader_pgm_lo_reg;
ac_pm4_set_reg(&pipeline->pm4.base, reg, va_low);
}
}
si_pm4_finalize(&pipeline->pm4);
ac_pm4_finalize(&pipeline->pm4.base);
sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
_mesa_hash_table_u64_insert(sctx->sqtt->pipeline_bos,

View File

@@ -685,7 +685,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
return;
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
shader->config.rsrc1 = S_00B528_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B528_SGPRS(si_shader_encode_sgprs(shader)) |
@@ -694,7 +694,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
S_00B528_FLOAT_MODE(shader->config.float_mode);
shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
@@ -709,30 +709,30 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
GFX6_TCS_NUM_USER_SGPR;
if (sscreen->info.gfx_level >= GFX12) {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_RSRC4_HS,
ac_pm4_set_reg(&pm4->base, R_00B420_SPI_SHADER_PGM_RSRC4_HS,
S_00B420_WAVE_LIMIT(0x3ff) |
S_00B420_GLG_FORCE_DISABLE(1) |
S_00B420_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B424_SPI_SHADER_PGM_LO_LS, va >> 8);
} else if (sscreen->info.gfx_level >= GFX11) {
si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
ac_pm4_set_reg_idx3(&pm4->base, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info));
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
} else if (sscreen->info.gfx_level >= GFX10) {
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
} else if (sscreen->info.gfx_level >= GFX9) {
si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
} else {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
ac_pm4_set_reg(&pm4->base, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B424_SPI_SHADER_PGM_HI_HS,
S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
}
si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
ac_pm4_set_reg(&pm4->base, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
S_00B428_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B428_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B428_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
@@ -752,9 +752,9 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
shader->config.rsrc2 |= S_00B42C_OC_LDS_EN(1);
if (sscreen->info.gfx_level <= GFX8)
si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
ac_pm4_set_reg(&pm4->base, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_emit_shader_es(struct si_context *sctx, unsigned index)
@@ -804,16 +804,16 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B324_SPI_SHADER_PGM_HI_ES,
S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
ac_pm4_set_reg(&pm4->base, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
S_00B328_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B328_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B328_DX10_CLAMP(1) |
S_00B328_FLOAT_MODE(shader->config.float_mode));
si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
ac_pm4_set_reg(&pm4->base, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) |
S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
@@ -821,7 +821,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
si_set_tesseval_regs(sscreen, shader->selector, shader);
polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
@@ -1094,9 +1094,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
if (sscreen->info.gfx_level >= GFX10) {
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
} else {
si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
}
uint32_t rsrc1 = S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
@@ -1117,8 +1117,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
}
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
ac_pm4_set_reg(&pm4->base, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
ac_pm4_set_reg(&pm4->base, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
shader->gs.spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
@@ -1147,20 +1147,20 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &sscreen->info);
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
ac_pm4_set_reg(&pm4->base, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B224_SPI_SHADER_PGM_HI_GS,
S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
ac_pm4_set_reg(&pm4->base, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B228_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B228_DX10_CLAMP(1) |
S_00B228_FLOAT_MODE(shader->config.float_mode));
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
ac_pm4_set_reg(&pm4->base, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
}
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
bool gfx10_is_ngg_passthrough(struct si_shader *shader)
@@ -1488,18 +1488,18 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
}
if (sscreen->info.gfx_level >= GFX12) {
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B224_SPI_SHADER_PGM_LO_ES, va >> 8);
} else {
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
}
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
ac_pm4_set_reg(&pm4->base, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B228_FLOAT_MODE(shader->config.float_mode) |
S_00B228_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
ac_pm4_set_reg(&pm4->base, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
S_00B22C_USER_SGPR(num_user_sgprs) |
S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
@@ -1672,7 +1672,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_028B54_MAX_PRIMGRP_IN_WAVE(2);
}
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_emit_shader_vs(struct si_context *sctx, unsigned index)
@@ -1829,15 +1829,15 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
if (sscreen->info.gfx_level >= GFX7) {
si_pm4_set_reg_idx3(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
ac_pm4_set_reg_idx3(&pm4->base, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &sscreen->info));
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
ac_pm4_set_reg(&pm4->base, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
}
si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
ac_pm4_set_reg(&pm4->base, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B124_SPI_SHADER_PGM_HI_VS,
S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
uint32_t rsrc1 =
@@ -1863,8 +1863,8 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
S_00B12C_SO_EN(1);
}
si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
ac_pm4_set_reg(&pm4->base, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
ac_pm4_set_reg(&pm4->base, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
if (window_space)
shader->vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
@@ -1878,7 +1878,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
si_set_tesseval_regs(sscreen, shader->selector, shader);
polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
@@ -2173,40 +2173,40 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
if (sscreen->dpbb_allowed &&
(sscreen->pbb_context_states_per_bin > 1 ||
sscreen->pbb_persistent_states_per_bin > 1)) {
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_EVENT_WRITE, 0, 0));
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}
if (sscreen->info.gfx_level >= GFX12) {
si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC4_PS,
ac_pm4_set_reg(&pm4->base, R_00B01C_SPI_SHADER_PGM_RSRC4_PS,
S_00B01C_WAVE_LIMIT_GFX12(0x3FF) |
S_00B01C_LDS_GROUP_SIZE_GFX12(1) |
S_00B01C_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
} else if (sscreen->info.gfx_level >= GFX11) {
unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
ac_pm4_set_reg_idx3(&pm4->base, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16) |
S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
C_00B004_CU_EN, 16, &sscreen->info));
}
uint64_t va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
ac_pm4_set_reg(&pm4->base, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B024_SPI_SHADER_PGM_HI_PS,
S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
ac_pm4_set_reg(&pm4->base, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
S_00B028_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B028_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B028_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
S_00B028_FLOAT_MODE(shader->config.float_mode));
si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
ac_pm4_set_reg(&pm4->base, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader)
@@ -2251,7 +2251,7 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
assert(0);
}
assert(!(sscreen->debug_flags & DBG(SQTT)) || shader->pm4.spi_shader_pgm_lo_reg != 0);
assert(!(sscreen->debug_flags & DBG(SQTT)) || shader->pm4.base.spi_shader_pgm_lo_reg != 0);
}
static void si_clear_vs_key_inputs(union si_shader_key *key)
@@ -4052,13 +4052,13 @@ static void si_cs_preamble_add_vgt_flush(struct si_context *sctx, bool tmz)
return;
/* Done by Vulkan before VGT_FLUSH. */
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_EVENT_WRITE, 0, 0));
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
si_pm4_finalize(pm4);
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_EVENT_WRITE, 0, 0));
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
ac_pm4_finalize(&pm4->base);
*has_vgt_flush = true;
}
@@ -4199,32 +4199,32 @@ bool si_update_gs_ring_buffers(struct si_context *sctx)
if (!*gs_ring_state_dw_offset) {
/* We are here for the first time. The packets will be added. */
*gs_ring_state_dw_offset = pm4->ndw;
*gs_ring_state_dw_offset = pm4->base.ndw;
} else {
/* We have been here before. Overwrite the previous packets. */
old_ndw = pm4->ndw;
pm4->ndw = *gs_ring_state_dw_offset;
old_ndw = pm4->base.ndw;
pm4->base.ndw = *gs_ring_state_dw_offset;
}
/* Unallocated rings are written to reserve the space in the pm4
* (to be able to overwrite them later). */
if (sctx->gfx_level >= GFX7) {
if (sctx->gfx_level <= GFX8)
si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_030900_VGT_ESGS_RING_SIZE,
sctx->esgs_ring ? sctx->esgs_ring->width0 / 256 : 0);
si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_030904_VGT_GSVS_RING_SIZE,
sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
} else {
si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_0088C8_VGT_ESGS_RING_SIZE,
sctx->esgs_ring ? sctx->esgs_ring->width0 / 256 : 0);
si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_0088CC_VGT_GSVS_RING_SIZE,
sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
}
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
if (old_ndw) {
pm4->ndw = old_ndw;
pm4->last_opcode = 255; /* invalid opcode (we don't save the last opcode) */
pm4->base.ndw = old_ndw;
pm4->base.last_opcode = 255; /* invalid opcode (we don't save the last opcode) */
}
}