turnip: add tu_cmd_buffer.h
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17811>
@@ -25,6 +25,7 @@
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_cmd_buffer.h"
#include "tu_private.h"

#include "vk_render_pass.h"

src/freedreno/vulkan/tu_cmd_buffer.h (new file, 670 lines)
@@ -0,0 +1,670 @@
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_PROGRAM,
   TU_DRAW_STATE_PROGRAM_BINNING,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_VI,
   TU_DRAW_STATE_VI_BINNING,
   TU_DRAW_STATE_RAST,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
   TU_CMD_DIRTY_LRZ = BIT(8),
   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
   TU_CMD_DIRTY_VIEWPORTS = BIT(11),
   TU_CMD_DIRTY_BLEND = BIT(12),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
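
/* For example (illustrative; not code from this commit), binding a new
 * vertex buffer would mark the cached vertex-buffer draw state stale with:
 *
 *    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 */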

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* Starting with a6xx, the pipeline is split into several "clusters" (really
 * pipeline stages). Each stage has its own pair of register banks and can
 * switch them independently, so that earlier stages can run ahead of later
 * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
 * the same time.
 *
 * As a result of this, we need to insert a WFI when an earlier stage depends
 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
 * pending WFIs to complete before starting, and usually even before reading
 * indirect params, so a WFI also acts as a full "pipeline stall".
 *
 * Note, the names of the stages come from CLUSTER_* in devcoredump. We
 * include all the stages for completeness, even ones which do not read/write
 * anything.
 */

enum tu_stage {
   /* This doesn't correspond to a cluster, but we need it for tracking
    * indirect draw parameter reads etc.
    */
   TU_STAGE_CP,

   /* - Fetch index buffer
    * - Fetch vertex attributes, dispatch VS
    */
   TU_STAGE_FE,

   /* Execute all geometry stages (VS thru GS) */
   TU_STAGE_SP_VS,

   /* Write to VPC, do primitive assembly. */
   TU_STAGE_PC_VS,

   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
    * to devcoredump, so presumably this stage stalls for TU_STAGE_PS when
    * early depth testing is enabled before dispatching fragments? However
    * GRAS reads and writes LRZ directly.
    */
   TU_STAGE_GRAS,

   /* Execute FS */
   TU_STAGE_SP_PS,

   /* - Fragment tests
    * - Write color/depth
    * - Streamout writes (???)
    * - Varying interpolation (???)
    */
   TU_STAGE_PS,
};
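
/* A minimal sketch (hypothetical helper, not part of this header): because
 * the stages above are declared in pipeline order, a dependency needs a WFI
 * exactly when the consuming stage comes earlier than the producing one:
 *
 *    static inline bool
 *    tu_stage_dep_needs_wfi(enum tu_stage producer, enum tu_stage consumer)
 *    {
 *       return consumer < producer;
 *    }
 */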

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};
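
/* A minimal sketch (hypothetical, not the driver's actual barrier logic) of
 * how an access-mask transition could map onto these flush bits: a write in
 * one domain followed by a use in another flushes the writer's cache and
 * invalidates the reader's, e.g.:
 *
 *    enum tu_cmd_flush_bits flush = 0;
 *    if ((src_mask & TU_ACCESS_CCU_COLOR_WRITE) &&
 *        (dst_mask & ~(TU_ACCESS_CCU_COLOR_READ | TU_ACCESS_CCU_COLOR_WRITE)))
 *       flush |= TU_CMD_FLAG_CCU_FLUSH_COLOR;
 *    if ((src_mask & TU_ACCESS_CP_WRITE) && (dst_mask & TU_ACCESS_READ))
 *       flush |= TU_CMD_FLAG_WAIT_MEM_WRITES;
 *
 * where src_mask/dst_mask are assumed tu_cmd_access_mask values.
 */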

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};
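
/* For example (assumed usage; tu_emit_cache_flush_ccu() is declared further
 * down in this header), a blit recorded outside a render pass would first
 * make sure the CCU is in sysmem mode:
 *
 *    tu_emit_cache_flush_ccu(cmd, &cmd->cs, TU_CMD_CCU_SYSMEM);
 */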

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   enum tu_cmd_flush_bits pending_flush_bits;
   /* Pending flushes */
   enum tu_cmd_flush_bits flush_bits;
};

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed in
    * draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according to:
    *
    *    foreach_draw (...) {
    *       sum += pipeline->color_bandwidth_per_sample;
    *       if (depth_test_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (depth_write_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (stencil_write_enabled)
    *          sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * This allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * It does ignore depth buffer traffic for samples which do not pass due
    * to depth-test fail, and some other details, but it is just intended to
    * be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
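
/* Worked example (hypothetical numbers): two draws, each with
 * color_bandwidth_per_sample = 4 and depth test + write enabled with
 * depth_cpp_per_sample = 4, accumulate sum = 2 * (4 + 4 + 4) = 24, giving
 * drawcall_bandwidth_per_sample = 24 / 2 = 12.
 */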

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_pipeline *pipeline;
   struct tu_pipeline *compute_pipeline;

   struct tu_render_pass_state rp;

   /* Vertex buffers, viewports, and scissors. The state for these can be
    * updated partially, so we need to save it to be able to emit a complete
    * draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
      uint32_t stride;
   } vb[MAX_VBS];
   VkViewport viewport[MAX_VIEWPORTS];
   VkRect2D scissor[MAX_SCISSORS];
   uint32_t max_viewport, max_scissor;

   /* for dynamic states that can't be emitted directly */
   uint32_t dynamic_stencil_mask;
   uint32_t dynamic_stencil_wrmask;
   uint32_t dynamic_stencil_ref;

   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
   uint32_t pc_raster_cntl, vpc_unknown_9107;
   uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
   uint32_t rb_mrt_control_rop;
   uint32_t rb_blend_cntl, sp_blend_cntl;
   uint32_t pipeline_color_write_enable, pipeline_blend_enable;
   uint32_t color_write_enable;
   bool logic_op_enabled;
   bool rop_reads_dst;
   enum pc_di_primtype primtype;
   bool primitive_restart_enable;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;

   struct tu_draw_state vs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an
    * extra offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
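   /* e.g. (hypothetical values) a streamout buffer bound at VA 0x1008 would
    * be programmed with base 0x1000 and streamout_offset = 8.
    */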

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   enum a5xx_line_mode line_mode;
   bool z_negative_one_to_one;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   /* These are the states of the suspend/resume state machine. In addition to
    * tracking whether we're in the middle of a chain of suspending and
    * resuming passes that will be merged, we need to track whether the
    * command buffer begins in the middle of such a chain, for when it gets
    * merged with other command buffers. We call such a chain that begins
    * before the command buffer starts a "pre-chain".
    *
    * Note that when this command buffer is finished, this state is untouched
    * but it gains a different meaning. For example, if we finish in state
    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
    * there's a suspend/resume chain that extends past the end of the command
    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
    * means that there's a suspend/resume chain that extends before the
    * beginning.
    */
   enum {
      /* Either there are no suspend/resume chains, or they are entirely
       * contained in the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *   ...
       *   // we are here
       */
      SR_NONE = 0,

      /* We are in the middle of a suspend/resume chain that starts before the
       * current command buffer. This happens when the command buffer begins
       * with a resuming render pass and all of the passes up to the current
       * one are suspending. In this state, our part of the chain is not saved
       * and is in the current draw_cs/state.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_PRE_CHAIN,

      /* We are currently outside of any suspend/resume chains, but there is a
       * chain starting before the current command buffer. It is saved in
       * pre_chain.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   // This part is stashed in pre_chain
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   BeginRendering(resuming) ... EndRendering() // end of chain
       *   ...
       *   // we are here
       */
      SR_AFTER_PRE_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is no chain
       * starting before the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *   ...
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is another,
       * separate, chain starting before the current command buffer.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   // This part is stashed in pre_chain
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   BeginRendering(resuming) ... EndRendering() // end of chain
       *   ...
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_CHAIN_AFTER_PRE_CHAIN,
   } suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
};

struct tu_cmd_pool
{
   struct vk_command_pool vk;

   struct list_head cmd_buffers;
   struct list_head free_cmd_buffers;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)

enum tu_cmd_buffer_status
{
   TU_CMD_BUFFER_STATUS_INVALID,
   TU_CMD_BUFFER_STATUS_INITIAL,
   TU_CMD_BUFFER_STATUS_RECORDING,
   TU_CMD_BUFFER_STATUS_EXECUTABLE,
   TU_CMD_BUFFER_STATUS_PENDING,
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct tu_cmd_pool *pool;
   struct list_head pool_link;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer *autotune_buffer;

   VkCommandBufferUsageFlags usage_flags;
   enum tu_cmd_buffer_status status;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   VkResult record_result;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout];
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout];
}
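
/* Example (assumed usage; gmem_base is a hypothetical per-tile base address,
 * not a name from this header):
 *
 *    uint32_t addr = gmem_base + tu_attachment_gmem_offset(cmd, att);
 */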

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             VkCommandBufferUsageFlags usage_flags);

void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                                    struct tu_cs *cs);

void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

void tu_cmd_render(struct tu_cmd_buffer *cmd);

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   enum a5xx_line_mode line_mode);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1,
                             uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_CMD_BUFFER_H */

@@ -116,6 +116,7 @@
struct tu_buffer;
struct tu_buffer_view;
struct tu_cmd_buffer;
struct tu_cmd_pool;
struct tu_descriptor_pool;
struct tu_descriptor_set;
struct tu_descriptor_set_layout;

@@ -31,6 +31,7 @@
#include "tu_common.h"
#include "tu_autotune.h"
#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"

@@ -91,651 +92,11 @@ __tu_finishme(const char *file, int line, const char *format, ...)
      tu_finishme("stub %s", __func__); \
   } while (0)

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass);

VkResult
tu_gralloc_info(struct tu_device *device,
                const VkNativeBufferANDROID *gralloc_info,
@@ -748,13 +109,4 @@ tu_import_memory_from_gralloc_handle(VkDevice device_h,
                                     const VkAllocationCallbacks *alloc,
                                     VkImage image_h);
#endif /* TU_PRIVATE_H */