turnip: add tu_cmd_buffer.h
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17811>
@@ -25,6 +25,7 @@
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_cmd_buffer.h"
#include "tu_private.h"

#include "vk_render_pass.h"

src/freedreno/vulkan/tu_cmd_buffer.h (new file, 670 lines)
@@ -0,0 +1,670 @@
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_PROGRAM,
   TU_DRAW_STATE_PROGRAM_BINNING,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_VI,
   TU_DRAW_STATE_VI_BINNING,
   TU_DRAW_STATE_RAST,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
   TU_CMD_DIRTY_LRZ = BIT(8),
   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
   TU_CMD_DIRTY_VIEWPORTS = BIT(11),
   TU_CMD_DIRTY_BLEND = BIT(12),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
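
/* For example (illustrative; not code from this commit), binding a new
 * vertex buffer would mark the cached vertex-buffer draw state stale with:
 *
 *    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 */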

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* Starting with a6xx, the pipeline is split into several "clusters" (really
 * pipeline stages). Each stage has its own pair of register banks and can
 * switch them independently, so that earlier stages can run ahead of later
 * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
 * the same time.
 *
 * As a result of this, we need to insert a WFI when an earlier stage depends
 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
 * pending WFIs to complete before starting, and usually even before reading
 * indirect params, so a WFI also acts as a full "pipeline stall".
 *
 * Note, the names of the stages come from CLUSTER_* in devcoredump. We
 * include all the stages for completeness, even ones which do not read/write
 * anything.
 */

enum tu_stage {
   /* This doesn't correspond to a cluster, but we need it for tracking
    * indirect draw parameter reads etc.
    */
   TU_STAGE_CP,

   /* - Fetch index buffer
    * - Fetch vertex attributes, dispatch VS
    */
   TU_STAGE_FE,

   /* Execute all geometry stages (VS thru GS) */
   TU_STAGE_SP_VS,

   /* Write to VPC, do primitive assembly. */
   TU_STAGE_PC_VS,

   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
    * to devcoredump, so presumably this stage stalls for TU_STAGE_PS when
    * early depth testing is enabled before dispatching fragments? However
    * GRAS reads and writes LRZ directly.
    */
   TU_STAGE_GRAS,

   /* Execute FS */
   TU_STAGE_SP_PS,

   /* - Fragment tests
    * - Write color/depth
    * - Streamout writes (???)
    * - Varying interpolation (???)
    */
   TU_STAGE_PS,
};
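
/* A minimal sketch (hypothetical helper, not part of this header): because
 * the stages above are declared in pipeline order, a dependency needs a WFI
 * exactly when the consuming stage comes earlier than the producing one:
 *
 *    static inline bool
 *    tu_stage_dep_needs_wfi(enum tu_stage producer, enum tu_stage consumer)
 *    {
 *       return consumer < producer;
 *    }
 */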

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};
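
/* A minimal sketch (hypothetical, not the driver's actual barrier logic) of
 * how an access-mask transition could map onto these flush bits: a write in
 * one domain followed by a use in another flushes the writer's cache and
 * invalidates the reader's, e.g.:
 *
 *    enum tu_cmd_flush_bits flush = 0;
 *    if ((src_mask & TU_ACCESS_CCU_COLOR_WRITE) &&
 *        (dst_mask & ~(TU_ACCESS_CCU_COLOR_READ | TU_ACCESS_CCU_COLOR_WRITE)))
 *       flush |= TU_CMD_FLAG_CCU_FLUSH_COLOR;
 *    if ((src_mask & TU_ACCESS_CP_WRITE) && (dst_mask & TU_ACCESS_READ))
 *       flush |= TU_CMD_FLAG_WAIT_MEM_WRITES;
 *
 * where src_mask/dst_mask are assumed tu_cmd_access_mask values.
 */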

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};
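
/* For example (assumed usage; tu_emit_cache_flush_ccu() is declared further
 * down in this header), a blit recorded outside a render pass would first
 * make sure the CCU is in sysmem mode:
 *
 *    tu_emit_cache_flush_ccu(cmd, &cmd->cs, TU_CMD_CCU_SYSMEM);
 */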

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   enum tu_cmd_flush_bits pending_flush_bits;
   /* Pending flushes */
   enum tu_cmd_flush_bits flush_bits;
};

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed in
    * draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according to:
    *
    *    foreach_draw (...) {
    *       sum += pipeline->color_bandwidth_per_sample;
    *       if (depth_test_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (depth_write_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (stencil_write_enabled)
    *          sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * This allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * It does ignore depth buffer traffic for samples which do not pass due
    * to depth-test fail, and some other details, but it is just intended to
    * be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
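
/* Worked example (hypothetical numbers): two draws, each with
 * color_bandwidth_per_sample = 4 and depth test + write enabled with
 * depth_cpp_per_sample = 4, accumulate sum = 2 * (4 + 4 + 4) = 24, giving
 * drawcall_bandwidth_per_sample = 24 / 2 = 12.
 */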

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_pipeline *pipeline;
   struct tu_pipeline *compute_pipeline;

   struct tu_render_pass_state rp;

   /* Vertex buffers, viewports, and scissors. The state for these can be
    * updated partially, so we need to save it to be able to emit a complete
    * draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
      uint32_t stride;
   } vb[MAX_VBS];
   VkViewport viewport[MAX_VIEWPORTS];
   VkRect2D scissor[MAX_SCISSORS];
   uint32_t max_viewport, max_scissor;

   /* for dynamic states that can't be emitted directly */
   uint32_t dynamic_stencil_mask;
   uint32_t dynamic_stencil_wrmask;
   uint32_t dynamic_stencil_ref;

   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
   uint32_t pc_raster_cntl, vpc_unknown_9107;
   uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
   uint32_t rb_mrt_control_rop;
   uint32_t rb_blend_cntl, sp_blend_cntl;
   uint32_t pipeline_color_write_enable, pipeline_blend_enable;
   uint32_t color_write_enable;
   bool logic_op_enabled;
   bool rop_reads_dst;
   enum pc_di_primtype primtype;
   bool primitive_restart_enable;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;

   struct tu_draw_state vs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an
    * extra offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
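   /* e.g. (hypothetical values) a streamout buffer bound at VA 0x1008 would
    * be programmed with base 0x1000 and streamout_offset = 8.
    */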

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   enum a5xx_line_mode line_mode;
   bool z_negative_one_to_one;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   /* These are the states of the suspend/resume state machine. In addition to
    * tracking whether we're in the middle of a chain of suspending and
    * resuming passes that will be merged, we need to track whether the
    * command buffer begins in the middle of such a chain, for when it gets
    * merged with other command buffers. We call such a chain that begins
    * before the command buffer starts a "pre-chain".
    *
    * Note that when this command buffer is finished, this state is untouched
    * but it gains a different meaning. For example, if we finish in state
    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
    * there's a suspend/resume chain that extends past the end of the command
    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
    * means that there's a suspend/resume chain that extends before the
    * beginning.
    */
   enum {
      /* Either there are no suspend/resume chains, or they are entirely
       * contained in the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *   ...
       *   // we are here
       */
      SR_NONE = 0,

      /* We are in the middle of a suspend/resume chain that starts before the
       * current command buffer. This happens when the command buffer begins
       * with a resuming render pass and all of the passes up to the current
       * one are suspending. In this state, our part of the chain is not saved
       * and is in the current draw_cs/state.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_PRE_CHAIN,

      /* We are currently outside of any suspend/resume chains, but there is a
       * chain starting before the current command buffer. It is saved in
       * pre_chain.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   // This part is stashed in pre_chain
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   BeginRendering(resuming) ... EndRendering() // end of chain
       *   ...
       *   // we are here
       */
      SR_AFTER_PRE_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is no chain
       * starting before the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *   ...
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is another,
       * separate, chain starting before the current command buffer.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   // This part is stashed in pre_chain
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   BeginRendering(resuming) ... EndRendering() // end of chain
       *   ...
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_CHAIN_AFTER_PRE_CHAIN,
   } suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
};

struct tu_cmd_pool
{
   struct vk_command_pool vk;

   struct list_head cmd_buffers;
   struct list_head free_cmd_buffers;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)

enum tu_cmd_buffer_status
{
   TU_CMD_BUFFER_STATUS_INVALID,
   TU_CMD_BUFFER_STATUS_INITIAL,
   TU_CMD_BUFFER_STATUS_RECORDING,
   TU_CMD_BUFFER_STATUS_EXECUTABLE,
   TU_CMD_BUFFER_STATUS_PENDING,
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct tu_cmd_pool *pool;
   struct list_head pool_link;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer *autotune_buffer;

   VkCommandBufferUsageFlags usage_flags;
   enum tu_cmd_buffer_status status;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   VkResult record_result;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout];
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout];
}
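
/* Example (assumed usage; gmem_base is a hypothetical per-tile base address,
 * not a name from this header):
 *
 *    uint32_t addr = gmem_base + tu_attachment_gmem_offset(cmd, att);
 */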

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             VkCommandBufferUsageFlags usage_flags);

void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                                    struct tu_cs *cs);

void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

void tu_cmd_render(struct tu_cmd_buffer *cmd);

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   enum a5xx_line_mode line_mode);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1,
                             uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_CMD_BUFFER_H */

@@ -116,6 +116,7 @@
struct tu_buffer;
struct tu_buffer_view;
struct tu_cmd_buffer;
struct tu_cmd_pool;
struct tu_descriptor_pool;
struct tu_descriptor_set;
struct tu_descriptor_set_layout;

@@ -31,6 +31,7 @@
#include "tu_common.h"
#include "tu_autotune.h"
#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"

@@ -91,651 +92,11 @@ __tu_finishme(const char *file, int line, const char *format, ...)
      tu_finishme("stub %s", __func__); \
   } while (0)

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass);

VkResult
tu_gralloc_info(struct tu_device *device,
                const VkNativeBufferANDROID *gralloc_info,
@@ -748,13 +109,4 @@ tu_import_memory_from_gralloc_handle(VkDevice device_h,
                                     const VkAllocationCallbacks *alloc,
                                     VkImage image_h);
#endif /* TU_PRIVATE_H */