turnip: add tu_cmd_buffer.h

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17811>
Chia-I Wu
2022-07-29 14:06:04 -07:00
committed by Marge Bot
parent 6666ec3945
commit 8e61bee30c
4 changed files with 673 additions and 649 deletions


@@ -25,6 +25,7 @@
* DEALINGS IN THE SOFTWARE.
*/
#include "tu_cmd_buffer.h"
#include "tu_private.h"
#include "vk_render_pass.h"


@@ -0,0 +1,670 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
*/
#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H
#include "tu_common.h"
#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
TU_DRAW_STATE_RAST,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
struct tu_descriptor_state
{
struct tu_descriptor_set *sets[MAX_SETS];
struct tu_descriptor_set push_set;
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
TU_CMD_DIRTY_VB_STRIDE = BIT(1),
TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
TU_CMD_DIRTY_LRZ = BIT(8),
TU_CMD_DIRTY_VS_PARAMS = BIT(9),
TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
TU_CMD_DIRTY_VIEWPORTS = BIT(11),
TU_CMD_DIRTY_BLEND = BIT(12),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
/* There are only three cache domains we have to care about: the CCU, or
* color cache unit, which is used for color and depth/stencil attachments
* and copy/blit destinations, and is split conceptually into color and depth,
* and the universal cache or UCHE which is used for pretty much everything
* else, except for the CP (uncached) and host. We need to flush whenever data
* crosses these boundaries.
*/
enum tu_cmd_access_mask {
TU_ACCESS_UCHE_READ = 1 << 0,
TU_ACCESS_UCHE_WRITE = 1 << 1,
TU_ACCESS_CCU_COLOR_READ = 1 << 2,
TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
/* Experiments have shown that while it's safe to avoid flushing the CCU
* after each blit/renderpass, it's not safe to assume that subsequent
* lookups with a different attachment state will hit unflushed cache
* entries. That is, the CCU needs to be flushed and possibly invalidated
* when accessing memory with a different attachment state. Writing to an
* attachment under the following conditions after clearing using the
* normal 2d engine path is known to have issues:
*
* - It isn't the 0'th layer.
* - There is more than one attachment, and this isn't the 0'th attachment
* (this seems to also depend on the cpp of the attachments).
*
* Our best guess is that the layer/MRT state is used when computing
* the location of a cache entry in CCU, to avoid conflicts. We assume that
* any access in a renderpass after or before an access by a transfer needs
* a flush/invalidate, and use the _INCOHERENT variants to represent access
* by a renderpass.
*/
TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
/* Accesses which bypass any cache, e.g. writes via the host,
* CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
*/
TU_ACCESS_SYSMEM_READ = 1 << 10,
TU_ACCESS_SYSMEM_WRITE = 1 << 11,
/* Memory writes from the CP start in-order with draws and event writes,
* but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/
TU_ACCESS_CP_WRITE = 1 << 12,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
TU_ACCESS_CCU_DEPTH_READ |
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
TU_ACCESS_CCU_COLOR_WRITE |
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
TU_ACCESS_CCU_DEPTH_WRITE |
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
TU_ACCESS_SYSMEM_WRITE |
TU_ACCESS_CP_WRITE,
TU_ACCESS_ALL =
TU_ACCESS_READ |
TU_ACCESS_WRITE,
};
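/* Illustrative sketch (hypothetical helper, not part of this commit): one way
* the cache-domain model above turns into flush decisions. Data written
* through one domain and then used through another needs the writer's cache
* flushed (and typically the reader's cache invalidated). This simplified
* check ignores the _INCOHERENT variants.
*/
static inline bool
example_needs_ccu_color_flush(enum tu_cmd_access_mask prev_access,
                              enum tu_cmd_access_mask next_access)
{
   const uint32_t ccu_color_domain =
      TU_ACCESS_CCU_COLOR_READ | TU_ACCESS_CCU_COLOR_WRITE;
   /* A CCU color write must be made available before any access outside the
    * CCU color domain can see it.
    */
   return (prev_access & TU_ACCESS_CCU_COLOR_WRITE) &&
          (next_access & ~ccu_color_domain);
}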
/* Starting with a6xx, the pipeline is split into several "clusters" (really
* pipeline stages). Each stage has its own pair of register banks and can
* switch them independently, so that earlier stages can run ahead of later
* ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
* the same time.
*
* As a result of this, we need to insert a WFI when an earlier stage depends
* on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
* pending WFI's to complete before starting, and usually before reading
* indirect params even, so a WFI also acts as a full "pipeline stall".
*
* Note, the names of the stages come from CLUSTER_* in devcoredump. We
* include all the stages for completeness, even ones which do not read/write
* anything.
*/
enum tu_stage {
/* This doesn't correspond to a cluster, but we need it for tracking
* indirect draw parameter reads etc.
*/
TU_STAGE_CP,
/* - Fetch index buffer
* - Fetch vertex attributes, dispatch VS
*/
TU_STAGE_FE,
/* Execute all geometry stages (VS thru GS) */
TU_STAGE_SP_VS,
/* Write to VPC, do primitive assembly. */
TU_STAGE_PC_VS,
/* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
* to devcoredump so presumably this stage stalls for TU_STAGE_PS when
* early depth testing is enabled before dispatching fragments? However
* GRAS reads and writes LRZ directly.
*/
TU_STAGE_GRAS,
/* Execute FS */
TU_STAGE_SP_PS,
/* - Fragment tests
* - Write color/depth
* - Streamout writes (???)
* - Varying interpolation (???)
*/
TU_STAGE_PS,
};
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
TU_CMD_FLAG_ALL_FLUSH =
TU_CMD_FLAG_CCU_FLUSH_DEPTH |
TU_CMD_FLAG_CCU_FLUSH_COLOR |
TU_CMD_FLAG_CACHE_FLUSH |
/* Treat the CP as a sort of "cache" which may need to be "flushed" via
* waiting for writes to land with CP_WAIT_MEM_WRITES.
*/
TU_CMD_FLAG_WAIT_MEM_WRITES,
TU_CMD_FLAG_ALL_INVALIDATE =
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
* heavy, involving a CCU cache flush/invalidate and a WFI in order to change
* which part of the gmem is used by the CCU. Here we keep track of the
* state of the CCU.
*/
enum tu_cmd_ccu_state {
TU_CMD_CCU_SYSMEM,
TU_CMD_CCU_GMEM,
TU_CMD_CCU_UNKNOWN,
};
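/* Illustrative usage sketch (assumed calling convention, not part of this
* commit): before rendering to sysmem or doing a sysmem blit, the required
* mode is requested via tu_emit_cache_flush_ccu(), declared further down in
* this header:
*
*    tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
*
* Because cmd->state.ccu_state tracks the current mode, the expensive
* flush/invalidate + WFI sequence only has to be emitted when the mode
* actually changes.
*/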
struct tu_cache_state {
/* Caches which must be made available (flushed) eventually if there are
* any users outside that cache domain, and caches which must be
* invalidated eventually if there are any reads.
*/
enum tu_cmd_flush_bits pending_flush_bits;
/* Pending flushes */
enum tu_cmd_flush_bits flush_bits;
};
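/* Illustrative sketch (hypothetical helpers, not part of this commit) of the
* intended split between the two fields: a write only makes a flush *pending*;
* it is promoted to flush_bits (and eventually emitted) once a user in a
* different cache domain appears. Invalidates for the reader's domain would be
* added to flush_bits in the same way.
*/
static inline void
example_record_ccu_color_write(struct tu_cache_state *cache)
{
   cache->pending_flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR;
}

static inline void
example_make_available_for_foreign_user(struct tu_cache_state *cache)
{
   cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
   cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
}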
struct tu_vs_params {
uint32_t vertex_offset;
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
* renderpass end time, e.g. to decide whether to use sysmem. This needs
* special handling for secondary cmdbufs and suspending/resuming render
* passes where the state may need to be combined afterwards.
*/
struct tu_render_pass_state
{
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool disable_gmem;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
};
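/* Illustrative sketch (hypothetical helper, not part of this commit): how the
* accumulated sum can be turned into a whole-renderpass bandwidth estimate at
* renderpass end, following the formula in the comment above. The
* zpass_sample_count here would come from a Z-pass query; callers are assumed
* to guard against drawcall_count == 0.
*/
static inline uint64_t
example_estimate_rp_bandwidth(const struct tu_render_pass_state *rp,
                              uint64_t zpass_sample_count)
{
   uint32_t bandwidth_per_sample =
      rp->drawcall_bandwidth_per_sample_sum / rp->drawcall_count;
   return (uint64_t) bandwidth_per_sample * zpass_sample_count;
}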
struct tu_cmd_state
{
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors
* the states for these can be updated partially, so we need to save these
* to be able to emit a complete draw state
*/
struct {
uint64_t base;
uint32_t size;
uint32_t stride;
} vb[MAX_VBS];
VkViewport viewport[MAX_VIEWPORTS];
VkRect2D scissor[MAX_SCISSORS];
uint32_t max_viewport, max_scissor;
/* for dynamic states that can't be emitted directly */
uint32_t dynamic_stencil_mask;
uint32_t dynamic_stencil_wrmask;
uint32_t dynamic_stencil_ref;
uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
uint32_t pc_raster_cntl, vpc_unknown_9107;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
uint32_t rb_mrt_control_rop;
uint32_t rb_blend_cntl, sp_blend_cntl;
uint32_t pipeline_color_write_enable, pipeline_blend_enable;
uint32_t color_write_enable;
bool logic_op_enabled;
bool rop_reads_dst;
enum pc_di_primtype primtype;
bool primitive_restart_enable;
/* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
struct tu_draw_state vertex_buffers;
struct tu_draw_state shader_const;
struct tu_draw_state desc_sets;
struct tu_draw_state vs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;
uint8_t index_size;
/* because streamout base has to be 32-byte aligned
* there is an extra offset to deal with when it is
* unaligned
*/
uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
/* Renderpasses are tricky, because we may need to flush differently if
* using sysmem vs. gmem and therefore we have to delay any flushing that
* happens before a renderpass. So we have to have two copies of the flush
* state, one for intra-renderpass flushes (i.e. renderpass dependencies)
* and one for outside a renderpass.
*/
struct tu_cache_state cache;
struct tu_cache_state renderpass_cache;
enum tu_cmd_ccu_state ccu_state;
/* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
* might get used by tu_store_gmem_attachment().
*/
enum tu_gmem_layout gmem_layout;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
const struct tu_tiling_config *tiling;
VkRect2D render_area;
const struct tu_image_view **attachments;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* or another command buffer if executed on a secondary.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
enum tu_gmem_layout gmem_layout;
const struct tu_image_view **attachments;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum {
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
} suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
struct tu_vs_params last_vs_params;
};
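/* Illustrative sketch (hypothetical helper, not part of this commit): one
* reading of the suspend_resume machine above, showing the transition taken
* when a render pass ends with EndRendering(suspending). A suspending end
* while already inside a chain (SR_IN_PRE_CHAIN, SR_IN_CHAIN, ...) simply
* stays in that chain; a non-suspending end closes the chain again, moving
* back to SR_NONE or SR_AFTER_PRE_CHAIN.
*/
static inline int
example_sr_after_suspending_end(int suspend_resume)
{
   switch (suspend_resume) {
   case SR_NONE:            return SR_IN_CHAIN;
   case SR_AFTER_PRE_CHAIN: return SR_IN_CHAIN_AFTER_PRE_CHAIN;
   default:                 return suspend_resume;
   }
}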
struct tu_cmd_pool
{
struct vk_command_pool vk;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
VK_OBJECT_TYPE_COMMAND_POOL)
enum tu_cmd_buffer_status
{
TU_CMD_BUFFER_STATUS_INVALID,
TU_CMD_BUFFER_STATUS_INITIAL,
TU_CMD_BUFFER_STATUS_RECORDING,
TU_CMD_BUFFER_STATUS_EXECUTABLE,
TU_CMD_BUFFER_STATUS_PENDING,
};
struct tu_cmd_buffer
{
struct vk_command_buffer vk;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
struct u_trace trace;
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
VkCommandBufferUsageFlags usage_flags;
enum tu_cmd_buffer_status status;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
struct tu_render_pass dynamic_pass;
struct tu_subpass dynamic_subpass;
struct tu_framebuffer dynamic_framebuffer;
VkResult record_result;
struct tu_cs cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset[cmd->state.gmem_layout];
}
static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset_stencil[cmd->state.gmem_layout];
}
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags);
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs);
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
enum tu_cmd_ccu_state ccu_state);
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event);
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point)
{
return &cmd_buffer->descriptors[bind_point];
}
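/* Usage sketch (not part of this commit): descriptor state is tracked per bind
* point and indexed directly by the VkPipelineBindPoint value
* (VK_PIPELINE_BIND_POINT_GRAPHICS == 0, VK_PIPELINE_BIND_POINT_COMPUTE == 1),
* e.g.
*
*    struct tu_descriptor_state *descs =
*       tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
*/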
void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
enum a5xx_line_mode line_mode);
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
#endif /* TU_CMD_BUFFER_H */


@@ -116,6 +116,7 @@
struct tu_buffer;
struct tu_buffer_view;
struct tu_cmd_buffer;
struct tu_cmd_pool;
struct tu_descriptor_pool;
struct tu_descriptor_set;
struct tu_descriptor_set_layout;


@@ -31,6 +31,7 @@
#include "tu_common.h"
#include "tu_autotune.h"
#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
@@ -91,651 +92,11 @@ __tu_finishme(const char *file, int line, const char *format, ...)
tu_finishme("stub %s", __func__); \
} while (0)
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
TU_DRAW_STATE_RAST,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass);
struct tu_descriptor_state
{
struct tu_descriptor_set *sets[MAX_SETS];
struct tu_descriptor_set push_set;
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
TU_CMD_DIRTY_VB_STRIDE = BIT(1),
TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
TU_CMD_DIRTY_LRZ = BIT(8),
TU_CMD_DIRTY_VS_PARAMS = BIT(9),
TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
TU_CMD_DIRTY_VIEWPORTS = BIT(11),
TU_CMD_DIRTY_BLEND = BIT(12),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
/* There are only three cache domains we have to care about: the CCU, or
* color cache unit, which is used for color and depth/stencil attachments
* and copy/blit destinations, and is split conceptually into color and depth,
* and the universal cache or UCHE which is used for pretty much everything
* else, except for the CP (uncached) and host. We need to flush whenever data
* crosses these boundaries.
*/
enum tu_cmd_access_mask {
TU_ACCESS_UCHE_READ = 1 << 0,
TU_ACCESS_UCHE_WRITE = 1 << 1,
TU_ACCESS_CCU_COLOR_READ = 1 << 2,
TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
/* Experiments have shown that while it's safe to avoid flushing the CCU
* after each blit/renderpass, it's not safe to assume that subsequent
* lookups with a different attachment state will hit unflushed cache
* entries. That is, the CCU needs to be flushed and possibly invalidated
* when accessing memory with a different attachment state. Writing to an
* attachment under the following conditions after clearing using the
* normal 2d engine path is known to have issues:
*
* - It isn't the 0'th layer.
* - There is more than one attachment, and this isn't the 0'th attachment
* (this seems to also depend on the cpp of the attachments).
*
* Our best guess is that the layer/MRT state is used when computing
* the location of a cache entry in CCU, to avoid conflicts. We assume that
* any access in a renderpass after or before an access by a transfer needs
* a flush/invalidate, and use the _INCOHERENT variants to represent access
* by a renderpass.
*/
TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
/* Accesses which bypass any cache, e.g. writes via the host,
* CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
*/
TU_ACCESS_SYSMEM_READ = 1 << 10,
TU_ACCESS_SYSMEM_WRITE = 1 << 11,
/* Memory writes from the CP start in-order with draws and event writes,
* but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/
TU_ACCESS_CP_WRITE = 1 << 12,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
TU_ACCESS_CCU_DEPTH_READ |
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
TU_ACCESS_CCU_COLOR_WRITE |
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
TU_ACCESS_CCU_DEPTH_WRITE |
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
TU_ACCESS_SYSMEM_WRITE |
TU_ACCESS_CP_WRITE,
TU_ACCESS_ALL =
TU_ACCESS_READ |
TU_ACCESS_WRITE,
};
/* Starting with a6xx, the pipeline is split into several "clusters" (really
* pipeline stages). Each stage has its own pair of register banks and can
* switch them independently, so that earlier stages can run ahead of later
* ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
* the same time.
*
* As a result of this, we need to insert a WFI when an earlier stage depends
* on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
* pending WFI's to complete before starting, and usually before reading
* indirect params even, so a WFI also acts as a full "pipeline stall".
*
* Note, the names of the stages come from CLUSTER_* in devcoredump. We
* include all the stages for completeness, even ones which do not read/write
* anything.
*/
enum tu_stage {
/* This doesn't correspond to a cluster, but we need it for tracking
* indirect draw parameter reads etc.
*/
TU_STAGE_CP,
/* - Fetch index buffer
* - Fetch vertex attributes, dispatch VS
*/
TU_STAGE_FE,
/* Execute all geometry stages (VS thru GS) */
TU_STAGE_SP_VS,
/* Write to VPC, do primitive assembly. */
TU_STAGE_PC_VS,
/* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
* to devcoredump so presumably this stage stalls for TU_STAGE_PS when
* early depth testing is enabled before dispatching fragments? However
* GRAS reads and writes LRZ directly.
*/
TU_STAGE_GRAS,
/* Execute FS */
TU_STAGE_SP_PS,
/* - Fragment tests
* - Write color/depth
* - Streamout writes (???)
* - Varying interpolation (???)
*/
TU_STAGE_PS,
};
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
TU_CMD_FLAG_ALL_FLUSH =
TU_CMD_FLAG_CCU_FLUSH_DEPTH |
TU_CMD_FLAG_CCU_FLUSH_COLOR |
TU_CMD_FLAG_CACHE_FLUSH |
/* Treat the CP as a sort of "cache" which may need to be "flushed" via
* waiting for writes to land with CP_WAIT_MEM_WRITES.
*/
TU_CMD_FLAG_WAIT_MEM_WRITES,
TU_CMD_FLAG_ALL_INVALIDATE =
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
* heavy, involving a CCU cache flush/invalidate and a WFI in order to change
* which part of the gmem is used by the CCU. Here we keep track of the
* state of the CCU.
*/
enum tu_cmd_ccu_state {
TU_CMD_CCU_SYSMEM,
TU_CMD_CCU_GMEM,
TU_CMD_CCU_UNKNOWN,
};
struct tu_cache_state {
/* Caches which must be made available (flushed) eventually if there are
* any users outside that cache domain, and caches which must be
* invalidated eventually if there are any reads.
*/
enum tu_cmd_flush_bits pending_flush_bits;
/* Pending flushes */
enum tu_cmd_flush_bits flush_bits;
};
struct tu_vs_params {
uint32_t vertex_offset;
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
* renderpass end time, e.g. to decide whether to use sysmem. This needs
* special handling for secondary cmdbufs and suspending/resuming render
* passes where the state may need to be combined afterwards.
*/
struct tu_render_pass_state
{
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool disable_gmem;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
};
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
struct tu_cmd_state
{
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors
* the states for these can be updated partially, so we need to save these
* to be able to emit a complete draw state
*/
struct {
uint64_t base;
uint32_t size;
uint32_t stride;
} vb[MAX_VBS];
VkViewport viewport[MAX_VIEWPORTS];
VkRect2D scissor[MAX_SCISSORS];
uint32_t max_viewport, max_scissor;
/* for dynamic states that can't be emitted directly */
uint32_t dynamic_stencil_mask;
uint32_t dynamic_stencil_wrmask;
uint32_t dynamic_stencil_ref;
uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
uint32_t pc_raster_cntl, vpc_unknown_9107;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
uint32_t rb_mrt_control_rop;
uint32_t rb_blend_cntl, sp_blend_cntl;
uint32_t pipeline_color_write_enable, pipeline_blend_enable;
uint32_t color_write_enable;
bool logic_op_enabled;
bool rop_reads_dst;
enum pc_di_primtype primtype;
bool primitive_restart_enable;
/* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
struct tu_draw_state vertex_buffers;
struct tu_draw_state shader_const;
struct tu_draw_state desc_sets;
struct tu_draw_state vs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;
uint8_t index_size;
/* because streamout base has to be 32-byte aligned
* there is an extra offset to deal with when it is
* unaligned
*/
uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
/* Renderpasses are tricky, because we may need to flush differently if
* using sysmem vs. gmem and therefore we have to delay any flushing that
* happens before a renderpass. So we have to have two copies of the flush
* state, one for intra-renderpass flushes (i.e. renderpass dependencies)
* and one for outside a renderpass.
*/
struct tu_cache_state cache;
struct tu_cache_state renderpass_cache;
enum tu_cmd_ccu_state ccu_state;
/* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
* might get used by tu_store_gmem_attachment().
*/
enum tu_gmem_layout gmem_layout;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
const struct tu_tiling_config *tiling;
VkRect2D render_area;
const struct tu_image_view **attachments;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* or another command buffer if executed on a secondary.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
enum tu_gmem_layout gmem_layout;
const struct tu_image_view **attachments;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum {
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
} suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
struct tu_vs_params last_vs_params;
};
struct tu_cmd_pool
{
struct vk_command_pool vk;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
};
enum tu_cmd_buffer_status
{
TU_CMD_BUFFER_STATUS_INVALID,
TU_CMD_BUFFER_STATUS_INITIAL,
TU_CMD_BUFFER_STATUS_RECORDING,
TU_CMD_BUFFER_STATUS_EXECUTABLE,
TU_CMD_BUFFER_STATUS_PENDING,
};
struct tu_cmd_buffer
{
struct vk_command_buffer vk;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
struct u_trace trace;
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
VkCommandBufferUsageFlags usage_flags;
enum tu_cmd_buffer_status status;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
struct tu_render_pass dynamic_pass;
struct tu_subpass dynamic_subpass;
struct tu_framebuffer dynamic_framebuffer;
VkResult record_result;
struct tu_cs cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
};
static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset[cmd->state.gmem_layout];
}
static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset_stencil[cmd->state.gmem_layout];
}
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags);
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs);
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
enum tu_cmd_ccu_state ccu_state);
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event);
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point)
{
return &cmd_buffer->descriptors[bind_point];
}
void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
enum a5xx_line_mode line_mode);
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
VkResult
tu_gralloc_info(struct tu_device *device,
const VkNativeBufferANDROID *gralloc_info,
@@ -748,13 +109,4 @@ tu_import_memory_from_gralloc_handle(VkDevice device_h,
const VkAllocationCallbacks *alloc,
VkImage image_h);
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
VK_OBJECT_TYPE_COMMAND_POOL)
void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
#endif /* TU_PRIVATE_H */