anv/cmd_buffer: Only emit PIPE_CONTROL on-demand

This is in contrast to emitting it directly in vkCmdPipelineBarrier.  This
has a couple of advantages.  First, it means that no matter how many
vkCmdPipelineBarrier calls the application strings together it gets one or
two PIPE_CONTROLs. Second, it allows us to better track when we need to do
stalls because we can flag when a flush has happened and we need a stall.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
This commit is contained in:
Jason Ekstrand
2016-05-20 11:49:12 -07:00
parent 7120c75ec3
commit 3a83c176ea
6 changed files with 140 additions and 70 deletions

View File

@@ -127,6 +127,7 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
state->dirty = 0; state->dirty = 0;
state->vb_dirty = 0; state->vb_dirty = 0;
state->pending_pipe_bits = 0;
state->descriptors_dirty = 0; state->descriptors_dirty = 0;
state->push_constants_dirty = 0; state->push_constants_dirty = 0;
state->pipeline = NULL; state->pipeline = NULL;

View File

@@ -39,6 +39,8 @@ genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
struct anv_subpass *subpass); struct anv_subpass *subpass);
void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);

View File

@@ -1066,6 +1066,45 @@ enum anv_cmd_dirty_bits {
}; };
typedef uint32_t anv_cmd_dirty_mask_t; typedef uint32_t anv_cmd_dirty_mask_t;
enum anv_pipe_bits {
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT = (1 << 0),
ANV_PIPE_STALL_AT_SCOREBOARD_BIT = (1 << 1),
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT = (1 << 2),
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT = (1 << 3),
ANV_PIPE_VF_CACHE_INVALIDATE_BIT = (1 << 4),
ANV_PIPE_DATA_CACHE_FLUSH_BIT = (1 << 5),
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT = (1 << 10),
ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT = (1 << 11),
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT = (1 << 12),
ANV_PIPE_DEPTH_STALL_BIT = (1 << 13),
ANV_PIPE_CS_STALL_BIT = (1 << 20),
/* This bit does not exist directly in PIPE_CONTROL. Instead it means that
* a flush has happened but not a CS stall. The next time we do any sort
* of invalidation we need to insert a CS stall at that time. Otherwise,
* we would have to CS stall on every flush which could be bad.
*/
ANV_PIPE_NEEDS_CS_STALL_BIT = (1 << 21),
};
#define ANV_PIPE_FLUSH_BITS ( \
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
#define ANV_PIPE_STALL_BITS ( \
ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
ANV_PIPE_DEPTH_STALL_BIT | \
ANV_PIPE_CS_STALL_BIT)
#define ANV_PIPE_INVALIDATE_BITS ( \
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \
ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \
ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \
ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)
struct anv_vertex_binding { struct anv_vertex_binding {
struct anv_buffer * buffer; struct anv_buffer * buffer;
VkDeviceSize offset; VkDeviceSize offset;
@@ -1164,6 +1203,7 @@ struct anv_cmd_state {
uint32_t vb_dirty; uint32_t vb_dirty;
anv_cmd_dirty_mask_t dirty; anv_cmd_dirty_mask_t dirty;
anv_cmd_dirty_mask_t compute_dirty; anv_cmd_dirty_mask_t compute_dirty;
enum anv_pipe_bits pending_pipe_bits;
uint32_t num_workgroups_offset; uint32_t num_workgroups_offset;
struct anv_bo *num_workgroups_bo; struct anv_bo *num_workgroups_bo;
VkShaderStageFlags descriptors_dirty; VkShaderStageFlags descriptors_dirty;

View File

@@ -307,6 +307,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
} }
cmd_buffer->state.compute_dirty = 0; cmd_buffer->state.compute_dirty = 0;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
} }
void void

View File

@@ -391,6 +391,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
} }
cmd_buffer->state.compute_dirty = 0; cmd_buffer->state.compute_dirty = 0;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
} }
void genX(CmdSetEvent)( void genX(CmdSetEvent)(

View File

@@ -136,6 +136,82 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
} }
} }
/* Emit the PIPE_CONTROL(s) required by cmd_buffer->state.pending_pipe_bits.
 *
 * Flushes and CS stalls go in a first PIPE_CONTROL, cache invalidations in a
 * second.  Bits that are actually emitted are cleared; anything left over
 * (notably ANV_PIPE_NEEDS_CS_STALL_BIT) is written back to pending_pipe_bits
 * so it is resolved on a later call.
 */
void
genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
{
enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
/* Flushes are pipelined while invalidations are handled immediately.
* Therefore, if we're flushing anything then we need to schedule a stall
* before any invalidations can happen.
*/
if (bits & ANV_PIPE_FLUSH_BITS)
bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
/* If we're going to do an invalidate and we have a pending CS stall that
* has yet to be resolved, we do the CS stall now.
*/
if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
(bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
bits |= ANV_PIPE_CS_STALL_BIT;
bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
}
/* First PIPE_CONTROL: flushes and stalls. */
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
pipe.RenderTargetCacheFlushEnable =
bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
/*
* According to the Broadwell documentation, any PIPE_CONTROL with the
* "Command Streamer Stall" bit set must also have another bit set,
* with six different options:
*
* - Render Target Cache Flush
* - Depth Cache Flush
* - Stall at Pixel Scoreboard
* - Post-Sync Operation
* - Depth Stall
* - DC Flush Enable
*
* I chose "Stall at Pixel Scoreboard" since that's what we use in
* mesa and it seems to work fine. The choice is fairly arbitrary.
*/
if ((bits & ANV_PIPE_CS_STALL_BIT) &&
!(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
pipe.StallAtPixelScoreboard = true;
}
bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
}
/* Second PIPE_CONTROL: cache invalidations.  Emitted separately (and
* after the flush/stall above) so in-flight rendering cannot re-dirty
* the caches after they have been invalidated.
*/
if (bits & ANV_PIPE_INVALIDATE_BITS) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
pipe.StateCacheInvalidationEnable =
bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
pipe.ConstantCacheInvalidationEnable =
bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
pipe.VFCacheInvalidationEnable =
bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
pipe.TextureCacheInvalidationEnable =
bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
pipe.InstructionCacheInvalidateEnable =
bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
}
bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
/* Remember anything we did not emit (e.g. a still-pending CS stall). */
cmd_buffer->state.pending_pipe_bits = bits;
}
void genX(CmdPipelineBarrier)( void genX(CmdPipelineBarrier)(
VkCommandBuffer commandBuffer, VkCommandBuffer commandBuffer,
VkPipelineStageFlags srcStageMask, VkPipelineStageFlags srcStageMask,
@@ -149,7 +225,7 @@ void genX(CmdPipelineBarrier)(
const VkImageMemoryBarrier* pImageMemoryBarriers) const VkImageMemoryBarrier* pImageMemoryBarriers)
{ {
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
uint32_t b, *dw; uint32_t b;
/* XXX: Right now, we're really dumb and just flush whatever categories /* XXX: Right now, we're really dumb and just flush whatever categories
* the app asks for. One of these days we may make this a bit better * the app asks for. One of these days we may make this a bit better
@@ -173,105 +249,50 @@ void genX(CmdPipelineBarrier)(
dst_flags |= pImageMemoryBarriers[i].dstAccessMask; dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
} }
/* Mask out the Source access flags we care about */ enum anv_pipe_bits pipe_bits = 0;
const uint32_t src_mask =
VK_ACCESS_SHADER_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT;
src_flags = src_flags & src_mask;
/* Mask out the destination access flags we care about */
const uint32_t dst_mask =
VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
VK_ACCESS_INDEX_READ_BIT |
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
VK_ACCESS_UNIFORM_READ_BIT |
VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_TRANSFER_READ_BIT;
dst_flags = dst_flags & dst_mask;
/* The src flags represent how things were used previously. This is
* what we use for doing flushes.
*/
struct GENX(PIPE_CONTROL) flush_cmd = {
GENX(PIPE_CONTROL_header),
.PostSyncOperation = NoWrite,
};
for_each_bit(b, src_flags) { for_each_bit(b, src_flags) {
switch ((VkAccessFlagBits)(1 << b)) { switch ((VkAccessFlagBits)(1 << b)) {
case VK_ACCESS_SHADER_WRITE_BIT: case VK_ACCESS_SHADER_WRITE_BIT:
flush_cmd.DCFlushEnable = true; pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
break; break;
case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
flush_cmd.RenderTargetCacheFlushEnable = true; pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
break; break;
case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
flush_cmd.DepthCacheFlushEnable = true; pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
break; break;
case VK_ACCESS_TRANSFER_WRITE_BIT: case VK_ACCESS_TRANSFER_WRITE_BIT:
flush_cmd.RenderTargetCacheFlushEnable = true; pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
flush_cmd.DepthCacheFlushEnable = true; pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
break; break;
default: default:
unreachable("should've masked this out by now"); break; /* Nothing to do */
} }
} }
/* If we end up doing two PIPE_CONTROLs, the first, flushing one also has to
* stall and wait for the flushing to finish, so we don't re-dirty the
* caches with in-flight rendering after the second PIPE_CONTROL
* invalidates.
*/
if (dst_flags)
flush_cmd.CommandStreamerStallEnable = true;
if (src_flags && dst_flags) {
dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &flush_cmd);
}
/* The dst flags represent how things will be used in the future. This
* is what we use for doing cache invalidations.
*/
struct GENX(PIPE_CONTROL) invalidate_cmd = {
GENX(PIPE_CONTROL_header),
.PostSyncOperation = NoWrite,
};
for_each_bit(b, dst_flags) { for_each_bit(b, dst_flags) {
switch ((VkAccessFlagBits)(1 << b)) { switch ((VkAccessFlagBits)(1 << b)) {
case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
case VK_ACCESS_INDEX_READ_BIT: case VK_ACCESS_INDEX_READ_BIT:
case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
invalidate_cmd.VFCacheInvalidationEnable = true; pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
break; break;
case VK_ACCESS_UNIFORM_READ_BIT: case VK_ACCESS_UNIFORM_READ_BIT:
invalidate_cmd.ConstantCacheInvalidationEnable = true; pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
/* fallthrough */ pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
break;
case VK_ACCESS_SHADER_READ_BIT: case VK_ACCESS_SHADER_READ_BIT:
invalidate_cmd.TextureCacheInvalidationEnable = true;
break;
case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
invalidate_cmd.TextureCacheInvalidationEnable = true;
break;
case VK_ACCESS_TRANSFER_READ_BIT: case VK_ACCESS_TRANSFER_READ_BIT:
invalidate_cmd.TextureCacheInvalidationEnable = true; pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
break; break;
default: default:
unreachable("should've masked this out by now"); break; /* Nothing to do */
} }
} }
if (dst_flags) { cmd_buffer->state.pending_pipe_bits |= pipe_bits;
dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &invalidate_cmd);
}
} }
static void static void
@@ -511,6 +532,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
gen7_cmd_buffer_emit_scissor(cmd_buffer); gen7_cmd_buffer_emit_scissor(cmd_buffer);
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
} }
static void static void