diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index b851e520fdb..55cc2b4f255 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -25,6 +25,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+#include "tu_cmd_buffer.h"
 #include "tu_private.h"
 
 #include "vk_render_pass.h"
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
new file mode 100644
index 00000000000..d37d007b079
--- /dev/null
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -0,0 +1,670 @@
+/*
+ * Copyright © 2016 Red Hat.
+ * Copyright © 2016 Bas Nieuwenhuizen
+ * SPDX-License-Identifier: MIT
+ *
+ * based in part on anv driver which is:
+ * Copyright © 2015 Intel Corporation
+ */
+
+#ifndef TU_CMD_BUFFER_H
+#define TU_CMD_BUFFER_H
+
+#include "tu_common.h"
+
+#include "tu_cs.h"
+#include "tu_descriptor_set.h"
+#include "tu_device.h"
+#include "tu_lrz.h"
+#include "tu_pass.h"
+#include "tu_pipeline.h"
+
+enum tu_draw_state_group_id
+{
+   TU_DRAW_STATE_PROGRAM_CONFIG,
+   TU_DRAW_STATE_PROGRAM,
+   TU_DRAW_STATE_PROGRAM_BINNING,
+   TU_DRAW_STATE_VB,
+   TU_DRAW_STATE_VI,
+   TU_DRAW_STATE_VI_BINNING,
+   TU_DRAW_STATE_RAST,
+   TU_DRAW_STATE_CONST,
+   TU_DRAW_STATE_DESC_SETS,
+   TU_DRAW_STATE_DESC_SETS_LOAD,
+   TU_DRAW_STATE_VS_PARAMS,
+   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
+   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
+   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
+   TU_DRAW_STATE_PRIM_MODE_GMEM,
+   TU_DRAW_STATE_PRIM_MODE_SYSMEM,
+
+   /* dynamic state related draw states */
+   TU_DRAW_STATE_DYNAMIC,
+   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
+};
+
+struct tu_descriptor_state
+{
+   struct tu_descriptor_set *sets[MAX_SETS];
+   struct tu_descriptor_set push_set;
+   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
+};
+
+enum tu_cmd_dirty_bits
+{
+   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
+   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
+   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
+   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
+   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
+   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
+   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
+   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
+   TU_CMD_DIRTY_LRZ = BIT(8),
+   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
+   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
+   TU_CMD_DIRTY_VIEWPORTS = BIT(11),
+   TU_CMD_DIRTY_BLEND = BIT(12),
+   /* all draw states were disabled and need to be re-enabled: */
+   TU_CMD_DIRTY_DRAW_STATE = BIT(13)
+};
+
+/* There are only three cache domains we have to care about: the CCU, or
+ * color cache unit, which is used for color and depth/stencil attachments
+ * and copy/blit destinations, and is split conceptually into color and depth,
+ * and the universal cache or UCHE which is used for pretty much everything
+ * else, except for the CP (uncached) and host. We need to flush whenever data
+ * crosses these boundaries.
+ */
+
+enum tu_cmd_access_mask {
+   TU_ACCESS_UCHE_READ = 1 << 0,
+   TU_ACCESS_UCHE_WRITE = 1 << 1,
+   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
+   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
+   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
+   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
+
+   /* Experiments have shown that while it's safe to avoid flushing the CCU
+    * after each blit/renderpass, it's not safe to assume that subsequent
+    * lookups with a different attachment state will hit unflushed cache
+    * entries. That is, the CCU needs to be flushed and possibly invalidated
+    * when accessing memory with a different attachment state.
+    * Writing to an attachment under the following conditions after
+    * clearing using the normal 2d engine path is known to have issues:
+    *
+    * - It isn't the 0'th layer.
+    * - There is more than one attachment, and this isn't the 0'th attachment
+    *   (this seems to also depend on the cpp of the attachments).
+    *
+    * Our best guess is that the layer/MRT state is used when computing
+    * the location of a cache entry in CCU, to avoid conflicts. We assume that
+    * any access in a renderpass after or before an access by a transfer needs
+    * a flush/invalidate, and use the _INCOHERENT variants to represent access
+    * by a renderpass.
+    */
+   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
+   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
+   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
+   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
+
+   /* Accesses which bypass any cache, e.g. writes via the host,
+    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
+    */
+   TU_ACCESS_SYSMEM_READ = 1 << 10,
+   TU_ACCESS_SYSMEM_WRITE = 1 << 11,
+
+   /* Memory writes from the CP start in-order with draws and event writes,
+    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
+    */
+   TU_ACCESS_CP_WRITE = 1 << 12,
+
+   TU_ACCESS_READ =
+      TU_ACCESS_UCHE_READ |
+      TU_ACCESS_CCU_COLOR_READ |
+      TU_ACCESS_CCU_DEPTH_READ |
+      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
+      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
+      TU_ACCESS_SYSMEM_READ,
+
+   TU_ACCESS_WRITE =
+      TU_ACCESS_UCHE_WRITE |
+      TU_ACCESS_CCU_COLOR_WRITE |
+      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
+      TU_ACCESS_CCU_DEPTH_WRITE |
+      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
+      TU_ACCESS_SYSMEM_WRITE |
+      TU_ACCESS_CP_WRITE,
+
+   TU_ACCESS_ALL =
+      TU_ACCESS_READ |
+      TU_ACCESS_WRITE,
+};
+
+/* Starting with a6xx, the pipeline is split into several "clusters" (really
+ * pipeline stages). Each stage has its own pair of register banks and can
+ * switch them independently, so that earlier stages can run ahead of later
+ * ones; e.g. the FS of draw N and the VS of draw N + 1 can be executing at
+ * the same time.
+ *
+ * As a result of this, we need to insert a WFI when an earlier stage depends
+ * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
+ * pending WFI's to complete before starting, and usually before reading
+ * indirect params even, so a WFI also acts as a full "pipeline stall".
+ *
+ * Note, the names of the stages come from CLUSTER_* in devcoredump. We
+ * include all the stages for completeness, even ones which do not read/write
+ * anything.
+ */
+
+enum tu_stage {
+   /* This doesn't correspond to a cluster, but we need it for tracking
+    * indirect draw parameter reads etc.
+    */
+   TU_STAGE_CP,
+
+   /* - Fetch index buffer
+    * - Fetch vertex attributes, dispatch VS
+    */
+   TU_STAGE_FE,
+
+   /* Execute all geometry stages (VS thru GS) */
+   TU_STAGE_SP_VS,
+
+   /* Write to VPC, do primitive assembly. */
+   TU_STAGE_PC_VS,
+
+   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
+    * to devcoredump, so presumably this stage stalls for TU_STAGE_PS before
+    * dispatching fragments when early depth testing is enabled? However
+    * GRAS reads and writes LRZ directly.
+    */
+   TU_STAGE_GRAS,
+
+   /* Execute FS */
+   TU_STAGE_SP_PS,
+
+   /* - Fragment tests
+    * - Write color/depth
+    * - Streamout writes (???)
+    * - Varying interpolation (???)
+    */
+   TU_STAGE_PS,
+};
+
+enum tu_cmd_flush_bits {
+   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
+   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
+   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
+   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
+   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
+   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
+   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
+   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
+   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
+
+   TU_CMD_FLAG_ALL_FLUSH =
+      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
+      TU_CMD_FLAG_CCU_FLUSH_COLOR |
+      TU_CMD_FLAG_CACHE_FLUSH |
+      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
+       * waiting for writes to land with WAIT_MEM_WRITES.
+       */
+      TU_CMD_FLAG_WAIT_MEM_WRITES,
+
+   TU_CMD_FLAG_ALL_INVALIDATE =
+      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
+      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
+      TU_CMD_FLAG_CACHE_INVALIDATE |
+      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
+       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
+       * insert an extra WAIT_FOR_ME before an indirect command requiring it
+       * in case there was another command before the current command buffer
+       * that it needs to wait for.
+       */
+      TU_CMD_FLAG_WAIT_FOR_ME,
+};
+
+/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
+ * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
+ * which part of the gmem is used by the CCU. Here we keep track of the
+ * state of the CCU.
+ */
+enum tu_cmd_ccu_state {
+   TU_CMD_CCU_SYSMEM,
+   TU_CMD_CCU_GMEM,
+   TU_CMD_CCU_UNKNOWN,
+};
+
+struct tu_cache_state {
+   /* Caches which must be made available (flushed) eventually if there are
+    * any users outside that cache domain, and caches which must be
+    * invalidated eventually if there are any reads.
+    */
+   enum tu_cmd_flush_bits pending_flush_bits;
+   /* Pending flushes */
+   enum tu_cmd_flush_bits flush_bits;
+};
+
+struct tu_vs_params {
+   uint32_t vertex_offset;
+   uint32_t first_instance;
+};
+
+/* This should be for state that is set inside a renderpass and used at
+ * renderpass end time, e.g. to decide whether to use sysmem. This needs
+ * special handling for secondary cmdbufs and suspending/resuming render
+ * passes where the state may need to be combined afterwards.
+ */
+struct tu_render_pass_state
+{
+   bool xfb_used;
+   bool has_tess;
+   bool has_prim_generated_query_in_rp;
+   bool disable_gmem;
+
+   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
+   bool draw_cs_writes_to_cond_pred;
+
+   uint32_t drawcall_count;
+
+   /* A calculated "draw cost" value for renderpass, which tries to
+    * estimate the bandwidth-per-sample of all the draws according
+    * to:
+    *
+    *    foreach_draw (...) {
+    *       sum += pipeline->color_bandwidth_per_sample;
+    *       if (depth_test_enabled)
+    *          sum += pipeline->depth_cpp_per_sample;
+    *       if (depth_write_enabled)
+    *          sum += pipeline->depth_cpp_per_sample;
+    *       if (stencil_write_enabled)
+    *          sum += pipeline->stencil_cpp_per_sample * 2;
+    *    }
+    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
+    *
+    * It allows us to estimate the total bandwidth of drawcalls later, by
+    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
+    *
+    * This does ignore depth buffer traffic for samples which do not
+    * pass due to depth-test fail, and some other details. But it is
+    * just intended to be a rough estimate that is easy to calculate.
+    */
+   uint32_t drawcall_bandwidth_per_sample_sum;
+};
+
+struct tu_cmd_state
+{
+   uint32_t dirty;
+
+   struct tu_pipeline *pipeline;
+   struct tu_pipeline *compute_pipeline;
+
+   struct tu_render_pass_state rp;
+
+   /* Vertex buffers, viewports, and scissors:
+    * the states for these can be updated partially, so we need to save them
+    * to be able to emit a complete draw state
+    */
+   struct {
+      uint64_t base;
+      uint32_t size;
+      uint32_t stride;
+   } vb[MAX_VBS];
+   VkViewport viewport[MAX_VIEWPORTS];
+   VkRect2D scissor[MAX_SCISSORS];
+   uint32_t max_viewport, max_scissor;
+
+   /* for dynamic states that can't be emitted directly */
+   uint32_t dynamic_stencil_mask;
+   uint32_t dynamic_stencil_wrmask;
+   uint32_t dynamic_stencil_ref;
+
+   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
+   uint32_t pc_raster_cntl, vpc_unknown_9107;
+   uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
+   uint32_t rb_mrt_control_rop;
+   uint32_t rb_blend_cntl, sp_blend_cntl;
+   uint32_t pipeline_color_write_enable, pipeline_blend_enable;
+   uint32_t color_write_enable;
+   bool logic_op_enabled;
+   bool rop_reads_dst;
+   enum pc_di_primtype primtype;
+   bool primitive_restart_enable;
+
+   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
+   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
+   struct tu_draw_state vertex_buffers;
+   struct tu_draw_state shader_const;
+   struct tu_draw_state desc_sets;
+
+   struct tu_draw_state vs_params;
+
+   /* Index buffer */
+   uint64_t index_va;
+   uint32_t max_index_count;
+   uint8_t index_size;
+
+   /* because streamout base has to be 32-byte aligned
+    * there is an extra offset to deal with when it is
+    * unaligned
+    */
+   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
+
+   /* Renderpasses are tricky, because we may need to flush differently if
+    * using sysmem vs. gmem and therefore we have to delay any flushing that
+    * happens before a renderpass. So we have to have two copies of the flush
+    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
+    * and one for outside a renderpass.
+    */
+   struct tu_cache_state cache;
+   struct tu_cache_state renderpass_cache;
+
+   enum tu_cmd_ccu_state ccu_state;
+
+   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
+    * might get used by tu_store_gmem_attachment().
+    */
+   enum tu_gmem_layout gmem_layout;
+
+   const struct tu_render_pass *pass;
+   const struct tu_subpass *subpass;
+   const struct tu_framebuffer *framebuffer;
+   const struct tu_tiling_config *tiling;
+   VkRect2D render_area;
+
+   const struct tu_image_view **attachments;
+
+   /* State that in the dynamic case comes from VkRenderingInfo and needs to
+    * be saved/restored when suspending. This holds the state for the last
+    * suspended renderpass, which may point to this command buffer's dynamic_*
+    * or another command buffer if executed on a secondary.
+    */
+   struct {
+      const struct tu_render_pass *pass;
+      const struct tu_subpass *subpass;
+      const struct tu_framebuffer *framebuffer;
+      VkRect2D render_area;
+      enum tu_gmem_layout gmem_layout;
+
+      const struct tu_image_view **attachments;
+
+      struct tu_lrz_state lrz;
+   } suspended_pass;
+
+   bool tessfactor_addr_set;
+   bool predication_active;
+   enum a5xx_line_mode line_mode;
+   bool z_negative_one_to_one;
+
+   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
+    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
+    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
+    */
+   uint32_t prim_counters_running;
+
+   bool prim_generated_query_running_before_rp;
+
+   /* These are the states of the suspend/resume state machine. In addition to
+    * tracking whether we're in the middle of a chain of suspending and
+    * resuming passes that will be merged, we need to track whether the
+    * command buffer begins in the middle of such a chain, for when it gets
+    * merged with other command buffers. We call such a chain that begins
+    * before the command buffer starts a "pre-chain".
+    *
+    * Note that when this command buffer is finished, this state is untouched
+    * but it gains a different meaning. For example, if we finish in state
+    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
+    * there's a suspend/resume chain that extends past the end of the command
+    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
+    * means that there's a suspend/resume chain that extends before the
+    * beginning.
+    */
+   enum {
+      /* Either there are no suspend/resume chains, or they are entirely
+       * contained in the current command buffer.
+       *
+       *   BeginCommandBuffer() <- start of current command buffer
+       *   ...
+       *   // we are here
+       */
+      SR_NONE = 0,
+
+      /* We are in the middle of a suspend/resume chain that starts before the
+       * current command buffer. This happens when the command buffer begins
+       * with a resuming render pass and all of the passes up to the current
+       * one are suspending. In this state, our part of the chain is not saved
+       * and is in the current draw_cs/state.
+       *
+       *   BeginRendering() ... EndRendering(suspending)
+       *   BeginCommandBuffer() <- start of current command buffer
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   ...
+       *   // we are here
+       */
+      SR_IN_PRE_CHAIN,
+
+      /* We are currently outside of any suspend/resume chains, but there is a
+       * chain starting before the current command buffer. It is saved in
+       * pre_chain.
+       *
+       *   BeginRendering() ... EndRendering(suspending)
+       *   BeginCommandBuffer() <- start of current command buffer
+       *   // This part is stashed in pre_chain
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   ...
+       *   BeginRendering(resuming) ... EndRendering() // end of chain
+       *   ...
+       *   // we are here
+       */
+      SR_AFTER_PRE_CHAIN,
+
+      /* We are in the middle of a suspend/resume chain and there is no chain
+       * starting before the current command buffer.
+       *
+       *   BeginCommandBuffer() <- start of current command buffer
+       *   ...
+       *   BeginRendering() ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   ...
+       *   // we are here
+       */
+      SR_IN_CHAIN,
+
+      /* We are in the middle of a suspend/resume chain and there is another,
+       * separate, chain starting before the current command buffer.
+       *
+       *   BeginRendering() ... EndRendering(suspending)
+       *   BeginCommandBuffer() <- start of current command buffer
+       *   // This part is stashed in pre_chain
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   ...
+       *   BeginRendering(resuming) ... EndRendering() // end of chain
+       *   ...
+       *   BeginRendering() ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   BeginRendering(resuming) ... EndRendering(suspending)
+       *   ...
+       *   // we are here
+       */
+      SR_IN_CHAIN_AFTER_PRE_CHAIN,
+   } suspend_resume;
+
+   bool suspending, resuming;
+
+   struct tu_lrz_state lrz;
+
+   struct tu_draw_state lrz_and_depth_plane_state;
+
+   struct tu_vs_params last_vs_params;
+};
+
+struct tu_cmd_pool
+{
+   struct vk_command_pool vk;
+
+   struct list_head cmd_buffers;
+   struct list_head free_cmd_buffers;
+};
+VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
+                               VK_OBJECT_TYPE_COMMAND_POOL)
+
+enum tu_cmd_buffer_status
+{
+   TU_CMD_BUFFER_STATUS_INVALID,
+   TU_CMD_BUFFER_STATUS_INITIAL,
+   TU_CMD_BUFFER_STATUS_RECORDING,
+   TU_CMD_BUFFER_STATUS_EXECUTABLE,
+   TU_CMD_BUFFER_STATUS_PENDING,
+};
+
+struct tu_cmd_buffer
+{
+   struct vk_command_buffer vk;
+
+   struct tu_device *device;
+
+   struct tu_cmd_pool *pool;
+   struct list_head pool_link;
+
+   struct u_trace trace;
+   struct u_trace_iterator trace_renderpass_start;
+   struct u_trace_iterator trace_renderpass_end;
+
+   struct list_head renderpass_autotune_results;
+   struct tu_autotune_results_buffer *autotune_buffer;
+
+   VkCommandBufferUsageFlags usage_flags;
+   enum tu_cmd_buffer_status status;
+
+   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
+
+   struct tu_cmd_state state;
+   uint32_t queue_family_index;
+
+   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
+   VkShaderStageFlags push_constant_stages;
+   struct tu_descriptor_set meta_push_descriptors;
+
+   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
+
+   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
+   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
+   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
+   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
+
+   struct tu_render_pass dynamic_pass;
+   struct tu_subpass dynamic_subpass;
+   struct tu_framebuffer dynamic_framebuffer;
+
+   VkResult record_result;
+
+   struct tu_cs cs;
+   struct tu_cs draw_cs;
+   struct tu_cs tile_store_cs;
+   struct tu_cs draw_epilogue_cs;
+   struct tu_cs sub_cs;
+
+   /* If the first render pass in the command buffer is resuming, then it is
+    * part of a suspend/resume chain that starts before the current command
+    * buffer and needs to be merged later. In this case, its incomplete state
+    * is stored in pre_chain. In the symmetric case where the last render pass
+    * is suspending, we just skip ending the render pass and its state is
+    * stored in draw_cs/the current state. The first and last render pass
+    * might be part of different chains, which is why all the state may need
+    * to be saved separately here.
+    */
+   struct {
+      struct tu_cs draw_cs;
+      struct tu_cs draw_epilogue_cs;
+
+      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
+
+      struct tu_render_pass_state state;
+   } pre_chain;
+
+   uint32_t vsc_draw_strm_pitch;
+   uint32_t vsc_prim_strm_pitch;
+};
+VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
+                       VK_OBJECT_TYPE_COMMAND_BUFFER)
+
+static inline uint32_t
+tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
+                          const struct tu_render_pass_attachment *att)
+{
+   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
+   return att->gmem_offset[cmd->state.gmem_layout];
+}
+
+static inline uint32_t
+tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
+                                  const struct tu_render_pass_attachment *att)
+{
+   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
+   return att->gmem_offset_stencil[cmd->state.gmem_layout];
+}
+
+void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
+                                const struct tu_render_pass_state *src);
+
+VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
+                             VkCommandBufferUsageFlags usage_flags);
+
+void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
+                                    struct tu_cs *cs);
+
+void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
+                             struct tu_cs *cs,
+                             enum tu_cmd_ccu_state ccu_state);
+
+void
+tu_append_pre_chain(struct tu_cmd_buffer *cmd,
+                    struct tu_cmd_buffer *secondary);
+
+void
+tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
+                         struct tu_cmd_buffer *secondary);
+
+void
+tu_append_post_chain(struct tu_cmd_buffer *cmd,
+                     struct tu_cmd_buffer *secondary);
+
+void
+tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
+                          struct tu_cmd_buffer *suspended);
+
+void tu_cmd_render(struct tu_cmd_buffer *cmd);
+
+void
+tu6_emit_event_write(struct tu_cmd_buffer *cmd,
+                     struct tu_cs *cs,
+                     enum vgt_event_type event);
+
+static inline struct tu_descriptor_state *
+tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
+                         VkPipelineBindPoint bind_point)
+{
+   return &cmd_buffer->descriptors[bind_point];
+}
+
+void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
+                   enum a5xx_line_mode line_mode);
+
+void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1,
+                             uint32_t x2, uint32_t y2);
+
+void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
+
+void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
+
+void tu6_apply_depth_bounds_workaround(struct tu_device *device,
+                                       uint32_t *rb_depth_cntl);
+
+void
+update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
+
+#endif /* TU_CMD_BUFFER_H */
diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h
index 983d2057989..bffed1d4914 100644
--- a/src/freedreno/vulkan/tu_common.h
+++ b/src/freedreno/vulkan/tu_common.h
@@ -116,6 +116,7 @@
 struct tu_buffer;
 struct tu_buffer_view;
 struct tu_cmd_buffer;
+struct tu_cmd_pool;
 struct tu_descriptor_pool;
 struct tu_descriptor_set;
 struct tu_descriptor_set_layout;
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 623103668b0..c655e4f0e0d 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -31,6 +31,7 @@
 #include "tu_common.h"
 #include "tu_autotune.h"
 #include "tu_clear_blit.h"
+#include "tu_cmd_buffer.h"
 #include "tu_cs.h"
 #include "tu_descriptor_set.h"
 #include "tu_device.h"
@@ -91,651 +92,11 @@ __tu_finishme(const char *file, int line, const char *format, ...)
tu_finishme("stub %s", __func__); \ } while (0) -enum tu_draw_state_group_id -{ - TU_DRAW_STATE_PROGRAM_CONFIG, - TU_DRAW_STATE_PROGRAM, - TU_DRAW_STATE_PROGRAM_BINNING, - TU_DRAW_STATE_VB, - TU_DRAW_STATE_VI, - TU_DRAW_STATE_VI_BINNING, - TU_DRAW_STATE_RAST, - TU_DRAW_STATE_CONST, - TU_DRAW_STATE_DESC_SETS, - TU_DRAW_STATE_DESC_SETS_LOAD, - TU_DRAW_STATE_VS_PARAMS, - TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, - TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, - TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, - TU_DRAW_STATE_PRIM_MODE_GMEM, - TU_DRAW_STATE_PRIM_MODE_SYSMEM, - - /* dynamic state related draw states */ - TU_DRAW_STATE_DYNAMIC, - TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT, -}; - void tu_framebuffer_tiling_config(struct tu_framebuffer *fb, const struct tu_device *device, const struct tu_render_pass *pass); -struct tu_descriptor_state -{ - struct tu_descriptor_set *sets[MAX_SETS]; - struct tu_descriptor_set push_set; - uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE]; -}; - -enum tu_cmd_dirty_bits -{ - TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0), - TU_CMD_DIRTY_VB_STRIDE = BIT(1), - TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2), - TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3), - TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4), - TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5), - TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6), - TU_CMD_DIRTY_SHADER_CONSTS = BIT(7), - TU_CMD_DIRTY_LRZ = BIT(8), - TU_CMD_DIRTY_VS_PARAMS = BIT(9), - TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10), - TU_CMD_DIRTY_VIEWPORTS = BIT(11), - TU_CMD_DIRTY_BLEND = BIT(12), - /* all draw states were disabled and need to be re-enabled: */ - TU_CMD_DIRTY_DRAW_STATE = BIT(13) -}; - -/* There are only three cache domains we have to care about: the CCU, or - * color cache unit, which is used for color and depth/stencil attachments - * and copy/blit destinations, and is split conceptually into color and depth, - * and the universal cache or UCHE which is used for pretty much everything - * else, except for the CP (uncached) and host. We need to flush whenever data - * crosses these boundaries. - */ - -enum tu_cmd_access_mask { - TU_ACCESS_UCHE_READ = 1 << 0, - TU_ACCESS_UCHE_WRITE = 1 << 1, - TU_ACCESS_CCU_COLOR_READ = 1 << 2, - TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, - TU_ACCESS_CCU_DEPTH_READ = 1 << 4, - TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, - - /* Experiments have shown that while it's safe to avoid flushing the CCU - * after each blit/renderpass, it's not safe to assume that subsequent - * lookups with a different attachment state will hit unflushed cache - * entries. That is, the CCU needs to be flushed and possibly invalidated - * when accessing memory with a different attachment state. Writing to an - * attachment under the following conditions after clearing using the - * normal 2d engine path is known to have issues: - * - * - It isn't the 0'th layer. - * - There are more than one attachment, and this isn't the 0'th attachment - * (this seems to also depend on the cpp of the attachments). - * - * Our best guess is that the layer/MRT state is used when computing - * the location of a cache entry in CCU, to avoid conflicts. We assume that - * any access in a renderpass after or before an access by a transfer needs - * a flush/invalidate, and use the _INCOHERENT variants to represent access - * by a renderpass. - */ - TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, - - /* Accesses which bypasses any cache. e.g. 
-    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
-    */
-   TU_ACCESS_SYSMEM_READ = 1 << 10,
-   TU_ACCESS_SYSMEM_WRITE = 1 << 11,
-
-   /* Memory writes from the CP start in-order with draws and event writes,
-    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
-    */
-   TU_ACCESS_CP_WRITE = 1 << 12,
-
-   TU_ACCESS_READ =
-      TU_ACCESS_UCHE_READ |
-      TU_ACCESS_CCU_COLOR_READ |
-      TU_ACCESS_CCU_DEPTH_READ |
-      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
-      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
-      TU_ACCESS_SYSMEM_READ,
-
-   TU_ACCESS_WRITE =
-      TU_ACCESS_UCHE_WRITE |
-      TU_ACCESS_CCU_COLOR_WRITE |
-      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
-      TU_ACCESS_CCU_DEPTH_WRITE |
-      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
-      TU_ACCESS_SYSMEM_WRITE |
-      TU_ACCESS_CP_WRITE,
-
-   TU_ACCESS_ALL =
-      TU_ACCESS_READ |
-      TU_ACCESS_WRITE,
-};
-
-/* Starting with a6xx, the pipeline is split into several "clusters" (really
- * pipeline stages). Each stage has its own pair of register banks and can
- * switch them independently, so that earlier stages can run ahead of later
- * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
- * the same time.
- *
- * As a result of this, we need to insert a WFI when an earlier stage depends
- * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
- * pending WFI's to complete before starting, and usually before reading
- * indirect params even, so a WFI also acts as a full "pipeline stall".
- *
- * Note, the names of the stages come from CLUSTER_* in devcoredump. We
- * include all the stages for completeness, even ones which do not read/write
- * anything.
- */
-
-enum tu_stage {
-   /* This doesn't correspond to a cluster, but we need it for tracking
-    * indirect draw parameter reads etc.
-    */
-   TU_STAGE_CP,
-
-   /* - Fetch index buffer
-    * - Fetch vertex attributes, dispatch VS
-    */
-   TU_STAGE_FE,
-
-   /* Execute all geometry stages (VS thru GS) */
-   TU_STAGE_SP_VS,
-
-   /* Write to VPC, do primitive assembly. */
-   TU_STAGE_PC_VS,
-
-   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
-    * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
-    * early depth testing is enabled before dispatching fragments? However
-    * GRAS reads and writes LRZ directly.
-    */
-   TU_STAGE_GRAS,
-
-   /* Execute FS */
-   TU_STAGE_SP_PS,
-
-   /* - Fragment tests
-    * - Write color/depth
-    * - Streamout writes (???)
-    * - Varying interpolation (???)
-    */
-   TU_STAGE_PS,
-};
-
-enum tu_cmd_flush_bits {
-   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
-   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
-   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
-   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
-   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
-   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
-   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
-   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
-   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
-
-   TU_CMD_FLAG_ALL_FLUSH =
-      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
-      TU_CMD_FLAG_CCU_FLUSH_COLOR |
-      TU_CMD_FLAG_CACHE_FLUSH |
-      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
-       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
-       */
-      TU_CMD_FLAG_WAIT_MEM_WRITES,
-
-   TU_CMD_FLAG_ALL_INVALIDATE =
-      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
-      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
-      TU_CMD_FLAG_CACHE_INVALIDATE |
-      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
-       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
-       * insert an extra WAIT_FOR_ME before an indirect command requiring it
-       * in case there was another command before the current command buffer
-       * that it needs to wait for.
-       */
-      TU_CMD_FLAG_WAIT_FOR_ME,
-};
-
-/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
- * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
- * which part of the gmem is used by the CCU. Here we keep track of what the
- * state of the CCU.
- */
-enum tu_cmd_ccu_state {
-   TU_CMD_CCU_SYSMEM,
-   TU_CMD_CCU_GMEM,
-   TU_CMD_CCU_UNKNOWN,
-};
-
-struct tu_cache_state {
-   /* Caches which must be made available (flushed) eventually if there are
-    * any users outside that cache domain, and caches which must be
-    * invalidated eventually if there are any reads.
-    */
-   enum tu_cmd_flush_bits pending_flush_bits;
-   /* Pending flushes */
-   enum tu_cmd_flush_bits flush_bits;
-};
-
-struct tu_vs_params {
-   uint32_t vertex_offset;
-   uint32_t first_instance;
-};
-
-/* This should be for state that is set inside a renderpass and used at
- * renderpass end time, e.g. to decide whether to use sysmem. This needs
- * special handling for secondary cmdbufs and suspending/resuming render
- * passes where the state may need to be combined afterwards.
- */
-struct tu_render_pass_state
-{
-   bool xfb_used;
-   bool has_tess;
-   bool has_prim_generated_query_in_rp;
-   bool disable_gmem;
-
-   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
-   bool draw_cs_writes_to_cond_pred;
-
-   uint32_t drawcall_count;
-
-   /* A calculated "draw cost" value for renderpass, which tries to
-    * estimate the bandwidth-per-sample of all the draws according
-    * to:
-    *
-    *    foreach_draw (...) {
-    *       sum += pipeline->color_bandwidth_per_sample;
-    *       if (depth_test_enabled)
-    *          sum += pipeline->depth_cpp_per_sample;
-    *       if (depth_write_enabled)
-    *          sum += pipeline->depth_cpp_per_sample;
-    *       if (stencil_write_enabled)
-    *          sum += pipeline->stencil_cpp_per_sample * 2;
-    *    }
-    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
-    *
-    * It allows us to estimate the total bandwidth of drawcalls later, by
-    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
-    *
-    * This does ignore depth buffer traffic for samples which do not
-    * pass due to depth-test fail, and some other details. But it is
-    * just intended to be a rough estimate that is easy to calculate.
-    */
-   uint32_t drawcall_bandwidth_per_sample_sum;
-};
-
-void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
-                                const struct tu_render_pass_state *src);
-struct tu_cmd_state
-{
-   uint32_t dirty;
-
-   struct tu_pipeline *pipeline;
-   struct tu_pipeline *compute_pipeline;
-
-   struct tu_render_pass_state rp;
-
-   /* Vertex buffers, viewports, and scissors
-    * the states for these can be updated partially, so we need to save these
-    * to be able to emit a complete draw state
-    */
-   struct {
-      uint64_t base;
-      uint32_t size;
-      uint32_t stride;
-   } vb[MAX_VBS];
-   VkViewport viewport[MAX_VIEWPORTS];
-   VkRect2D scissor[MAX_SCISSORS];
-   uint32_t max_viewport, max_scissor;
-
-   /* for dynamic states that can't be emitted directly */
-   uint32_t dynamic_stencil_mask;
-   uint32_t dynamic_stencil_wrmask;
-   uint32_t dynamic_stencil_ref;
-
-   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
-   uint32_t pc_raster_cntl, vpc_unknown_9107;
-   uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
-   uint32_t rb_mrt_control_rop;
-   uint32_t rb_blend_cntl, sp_blend_cntl;
-   uint32_t pipeline_color_write_enable, pipeline_blend_enable;
-   uint32_t color_write_enable;
-   bool logic_op_enabled;
-   bool rop_reads_dst;
-   enum pc_di_primtype primtype;
-   bool primitive_restart_enable;
-
-   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
-   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
-   struct tu_draw_state vertex_buffers;
-   struct tu_draw_state shader_const;
-   struct tu_draw_state desc_sets;
-
-   struct tu_draw_state vs_params;
-
-   /* Index buffer */
-   uint64_t index_va;
-   uint32_t max_index_count;
-   uint8_t index_size;
-
-   /* because streamout base has to be 32-byte aligned
-    * there is an extra offset to deal with when it is
-    * unaligned
-    */
-   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
-
-   /* Renderpasses are tricky, because we may need to flush differently if
-    * using sysmem vs. gmem and therefore we have to delay any flushing that
-    * happens before a renderpass. So we have to have two copies of the flush
-    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
-    * and one for outside a renderpass.
-    */
-   struct tu_cache_state cache;
-   struct tu_cache_state renderpass_cache;
-
-   enum tu_cmd_ccu_state ccu_state;
-
-   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
-    * might get used by tu_store_gmem_attachment().
-    */
-   enum tu_gmem_layout gmem_layout;
-
-   const struct tu_render_pass *pass;
-   const struct tu_subpass *subpass;
-   const struct tu_framebuffer *framebuffer;
-   const struct tu_tiling_config *tiling;
-   VkRect2D render_area;
-
-   const struct tu_image_view **attachments;
-
-   /* State that in the dynamic case comes from VkRenderingInfo and needs to
-    * be saved/restored when suspending. This holds the state for the last
-    * suspended renderpass, which may point to this command buffer's dynamic_*
-    * or another command buffer if executed on a secondary.
-    */
-   struct {
-      const struct tu_render_pass *pass;
-      const struct tu_subpass *subpass;
-      const struct tu_framebuffer *framebuffer;
-      VkRect2D render_area;
-      enum tu_gmem_layout gmem_layout;
-
-      const struct tu_image_view **attachments;
-
-      struct tu_lrz_state lrz;
-   } suspended_pass;
-
-   bool tessfactor_addr_set;
-   bool predication_active;
-   enum a5xx_line_mode line_mode;
-   bool z_negative_one_to_one;
-
-   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
-    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaniously,
-    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
-    */
-   uint32_t prim_counters_running;
-
-   bool prim_generated_query_running_before_rp;
-
-   /* These are the states of the suspend/resume state machine. In addition to
-    * tracking whether we're in the middle of a chain of suspending and
-    * resuming passes that will be merged, we need to track whether the
-    * command buffer begins in the middle of such a chain, for when it gets
-    * merged with other command buffers. We call such a chain that begins
-    * before the command buffer starts a "pre-chain".
-    *
-    * Note that when this command buffer is finished, this state is untouched
-    * but it gains a different meaning. For example, if we finish in state
-    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
-    * there's a suspend/resume chain that extends past the end of the command
-    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
-    * means that there's a suspend/resume chain that extends before the
-    * beginning.
-    */
-   enum {
-      /* Either there are no suspend/resume chains, or they are entirely
-       * contained in the current command buffer.
-       *
-       *   BeginCommandBuffer() <- start of current command buffer
-       *   ...
-       *   // we are here
-       */
-      SR_NONE = 0,
-
-      /* We are in the middle of a suspend/resume chain that starts before the
-       * current command buffer. This happens when the command buffer begins
-       * with a resuming render pass and all of the passes up to the current
-       * one are suspending. In this state, our part of the chain is not saved
-       * and is in the current draw_cs/state.
-       *
-       *   BeginRendering() ... EndRendering(suspending)
-       *   BeginCommandBuffer() <- start of current command buffer
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   ...
-       *   // we are here
-       */
-      SR_IN_PRE_CHAIN,
-
-      /* We are currently outside of any suspend/resume chains, but there is a
-       * chain starting before the current command buffer. It is saved in
-       * pre_chain.
-       *
-       *   BeginRendering() ... EndRendering(suspending)
-       *   BeginCommandBuffer() <- start of current command buffer
-       *   // This part is stashed in pre_chain
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   ...
-       *   BeginRendering(resuming) ... EndRendering() // end of chain
-       *   ...
-       *   // we are here
-       */
-      SR_AFTER_PRE_CHAIN,
-
-      /* We are in the middle of a suspend/resume chain and there is no chain
-       * starting before the current command buffer.
-       *
-       *   BeginCommandBuffer() <- start of current command buffer
-       *   ...
-       *   BeginRendering() ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   ...
-       *   // we are here
-       */
-      SR_IN_CHAIN,
-
-      /* We are in the middle of a suspend/resume chain and there is another,
-       * separate, chain starting before the current command buffer.
-       *
-       *   BeginRendering() ... EndRendering(suspending)
-       *   CommandBufferBegin() <- start of current command buffer
-       *   // This part is stashed in pre_chain
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   ...
-       *   BeginRendering(resuming) ... EndRendering() // end of chain
-       *   ...
-       *   BeginRendering() ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   BeginRendering(resuming) ... EndRendering(suspending)
-       *   ...
-       *   // we are here
-       */
-      SR_IN_CHAIN_AFTER_PRE_CHAIN,
-   } suspend_resume;
-
-   bool suspending, resuming;
-
-   struct tu_lrz_state lrz;
-
-   struct tu_draw_state lrz_and_depth_plane_state;
-
-   struct tu_vs_params last_vs_params;
-};
-
-struct tu_cmd_pool
-{
-   struct vk_command_pool vk;
-
-   struct list_head cmd_buffers;
-   struct list_head free_cmd_buffers;
-};
-
-enum tu_cmd_buffer_status
-{
-   TU_CMD_BUFFER_STATUS_INVALID,
-   TU_CMD_BUFFER_STATUS_INITIAL,
-   TU_CMD_BUFFER_STATUS_RECORDING,
-   TU_CMD_BUFFER_STATUS_EXECUTABLE,
-   TU_CMD_BUFFER_STATUS_PENDING,
-};
-
-struct tu_cmd_buffer
-{
-   struct vk_command_buffer vk;
-
-   struct tu_device *device;
-
-   struct tu_cmd_pool *pool;
-   struct list_head pool_link;
-
-   struct u_trace trace;
-   struct u_trace_iterator trace_renderpass_start;
-   struct u_trace_iterator trace_renderpass_end;
-
-   struct list_head renderpass_autotune_results;
-   struct tu_autotune_results_buffer* autotune_buffer;
-
-   VkCommandBufferUsageFlags usage_flags;
-   enum tu_cmd_buffer_status status;
-
-   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
-
-   struct tu_cmd_state state;
-   uint32_t queue_family_index;
-
-   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
-   VkShaderStageFlags push_constant_stages;
-   struct tu_descriptor_set meta_push_descriptors;
-
-   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
-
-   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
-   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
-   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
-   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
-
-   struct tu_render_pass dynamic_pass;
-   struct tu_subpass dynamic_subpass;
-   struct tu_framebuffer dynamic_framebuffer;
-
-   VkResult record_result;
-
-   struct tu_cs cs;
-   struct tu_cs draw_cs;
-   struct tu_cs tile_store_cs;
-   struct tu_cs draw_epilogue_cs;
-   struct tu_cs sub_cs;
-
-   /* If the first render pass in the command buffer is resuming, then it is
-    * part of a suspend/resume chain that starts before the current command
-    * buffer and needs to be merged later. In this case, its incomplete state
-    * is stored in pre_chain. In the symmetric case where the last render pass
-    * is suspending, we just skip ending the render pass and its state is
-    * stored in draw_cs/the current state. The first and last render pass
-    * might be part of different chains, which is why all the state may need
-    * to be saved separately here.
-    */
-   struct {
-      struct tu_cs draw_cs;
-      struct tu_cs draw_epilogue_cs;
-
-      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
-
-      struct tu_render_pass_state state;
-   } pre_chain;
-
-   uint32_t vsc_draw_strm_pitch;
-   uint32_t vsc_prim_strm_pitch;
-};
-
-static inline uint32_t
-tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
-                          const struct tu_render_pass_attachment *att)
-{
-   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
-   return att->gmem_offset[cmd->state.gmem_layout];
-}
-
-static inline uint32_t
-tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
-                                  const struct tu_render_pass_attachment *att)
-{
-   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
-   return att->gmem_offset_stencil[cmd->state.gmem_layout];
-}
-
-VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
-                             VkCommandBufferUsageFlags usage_flags);
-
-void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
-                                    struct tu_cs *cs);
-
-void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
-                             struct tu_cs *cs,
-                             enum tu_cmd_ccu_state ccu_state);
-
-void
-tu_append_pre_chain(struct tu_cmd_buffer *cmd,
-                    struct tu_cmd_buffer *secondary);
-
-void
-tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
-                         struct tu_cmd_buffer *secondary);
-
-void
-tu_append_post_chain(struct tu_cmd_buffer *cmd,
-                     struct tu_cmd_buffer *secondary);
-
-void
-tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
-                          struct tu_cmd_buffer *suspended);
-
-void tu_cmd_render(struct tu_cmd_buffer *cmd);
-
-void
-tu6_emit_event_write(struct tu_cmd_buffer *cmd,
-                     struct tu_cs *cs,
-                     enum vgt_event_type event);
-
-static inline struct tu_descriptor_state *
-tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
-                         VkPipelineBindPoint bind_point)
-{
-   return &cmd_buffer->descriptors[bind_point];
-}
-
-void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
-                   enum a5xx_line_mode line_mode);
-
-void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1,
-                             uint32_t x2, uint32_t y2);
-
-void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
-
-void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
-
-void tu6_apply_depth_bounds_workaround(struct tu_device *device,
-                                       uint32_t *rb_depth_cntl);
-
 VkResult
 tu_gralloc_info(struct tu_device *device,
                 const VkNativeBufferANDROID *gralloc_info,
@@ -748,13 +109,4 @@ tu_import_memory_from_gralloc_handle(VkDevice device_h,
                                      const VkAllocationCallbacks *alloc,
                                      VkImage image_h);
 
-VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
-                       VK_OBJECT_TYPE_COMMAND_BUFFER)
-
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
-                               VK_OBJECT_TYPE_COMMAND_POOL)
-
-void
-update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
-
 #endif /* TU_PRIVATE_H */
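---

Reviewer's note (illustrative sketch, not part of the patch): the inline
helpers kept in tu_cmd_buffer.h resolve an attachment's GMEM offset against
the tu_gmem_layout recorded in the command buffer for the current render
pass. A minimal sketch of a caller follows; the function name and the
attachment index `a` are hypothetical, while tu_attachment_gmem_offset() and
the cmd->state.pass->attachments array are from the driver:

    /* Hypothetical example: fetch the GMEM base of attachment `a` using the
     * layout recorded in cmd->state.gmem_layout when the pass was begun.
     */
    static uint32_t
    example_attachment_gmem_base(struct tu_cmd_buffer *cmd, uint32_t a)
    {
       const struct tu_render_pass_attachment *att =
          &cmd->state.pass->attachments[a];
       /* tu_attachment_gmem_offset() asserts the layout is in range and
        * indexes att->gmem_offset[] with it.
        */
       return tu_attachment_gmem_offset(cmd, att);
    }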