radv: Try to do a better job of dealing with L2 coherent images.
Only try to invalidate L2 if we actually hit one of the incoherent images. Note that we may insert some extra flushes at the end of a command buffer so that we can assume the caches are clean at the start of the next command buffer. However, on average I think that case is uncommon enough that being able to make assumptions at the start of a cmdbuffer is beneficial, especially since MSAA is somewhat rare in more recent games.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13239>
commit fd8210f27e
parent 64b237436a
committed by Marge Bot
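Taken together, the hunks below implement a small state machine around the new rb_noncoherent_dirty flag. The following standalone sketch of that lifecycle is illustrative only: the stub struct, helper names, and main() driver are not part of the patch, and the real logic lives in radv_mark_noncoherent_rb, radv_dst_access_flush, si_emit_cache_flush, and radv_EndCommandBuffer.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the one field this patch adds to struct radv_cmd_state. */
struct cmd_state {
   bool rb_noncoherent_dirty; /* data from a non-L2-coherent render target
                                 may still sit in the CB/DB caches */
};

/* Subpass end / in-renderpass barrier: if any bound attachment is not L2
 * coherent (or we cannot tell), mark the state dirty. */
static void mark_noncoherent_rb(struct cmd_state *s, bool any_incoherent)
{
   if (any_incoherent)
      s->rb_noncoherent_dirty = true;
}

/* Cache-flush emission: once an L2 invalidation is actually emitted,
 * nothing stale can remain, so the flag is cleared. */
static void emit_cache_flush(struct cmd_state *s, bool inv_l2)
{
   if (inv_l2)
      s->rb_noncoherent_dirty = false;
}

/* Dst-access flush: while nothing incoherent is dirty, images can be
 * treated as L2 coherent and the L2 invalidation skipped. */
static bool needs_l2_invalidate(const struct cmd_state *s)
{
   return s->rb_noncoherent_dirty;
}

int main(void)
{
   struct cmd_state s = { .rb_noncoherent_dirty = false };

   mark_noncoherent_rb(&s, true);  /* subpass wrote a noncoherent image */
   printf("after incoherent subpass: invalidate? %d\n", needs_l2_invalidate(&s));

   emit_cache_flush(&s, true);     /* barrier emitted RADV_CMD_FLAG_INV_L2 */
   printf("after INV_L2 flush:       invalidate? %d\n", needs_l2_invalidate(&s));
   return 0;
}

At radv_EndCommandBuffer time the same flag triggers one final flush (the extra cost the commit message mentions) so that it can start out false in the next command buffer.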
src/amd/vulkan/radv_cmd_buffer.c
@@ -3777,9 +3777,9 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st
  * images. However, given the existence of memory barriers which do not specify
  * the image/buffer it often devolves to just VRAM/GTT anyway.
  *
- * In practice we can cheat a bit, since the INV_* operations include writebacks.
- * If we know that all the destinations that need the WB do an INV, then we can
- * skip the WB.
+ * To help reducing the invalidations for GPUs that have L2 coherency between the
+ * RB and the shader caches, we always invalidate L2 on the src side, as we can
+ * use our knowledge of past usage to optimize flushes away.
  */

 enum radv_cmd_flush_bits
@@ -3811,6 +3811,10 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flag
             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
          }
       }
+
+      /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
+       * dirtyness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
+       * invalidating. */
       if (!image_is_coherent)
          flush_bits |= RADV_CMD_FLAG_WB_L2;
       break;
@@ -3878,6 +3882,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag
       has_DB_meta = false;
    }

+   /* All the L2 invalidations below are not the CB/DB. So if there are no incoherent images
+    * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. */
+   image_is_coherent |= cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
+                        !cmd_buffer->state.rb_noncoherent_dirty;
+
    u_foreach_bit(b, dst_flags)
    {
       switch ((VkAccessFlagBits)(1 << b)) {
@@ -4741,6 +4750,16 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
     */
    cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

+   /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
+    * command buffer.
+    */
+   if (cmd_buffer->state.rb_noncoherent_dirty &&
+       cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+      cmd_buffer->state.flush_bits |= radv_src_access_flush(
+         cmd_buffer,
+         VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+         NULL);
+
    /* Since NGG streamout uses GDS, we need to make GDS idle when
     * we leave the IB, otherwise another process might overwrite
     * it while our shaders are busy.
@@ -5735,10 +5754,37 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
    assert(cmd_buffer->cs->cdw <= cdw_max);
 }

+static void
+radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
+{
+   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+
+   /* Have to be conservative in cmdbuffers with inherited attachments. */
+   if (!cmd_buffer->state.attachments) {
+      cmd_buffer->state.rb_noncoherent_dirty = true;
+      return;
+   }
+
+   for (uint32_t i = 0; i < subpass->color_count; ++i) {
+      const uint32_t a = subpass->color_attachments[i].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
+         cmd_buffer->state.rb_noncoherent_dirty = true;
+         return;
+      }
+   }
+   if (subpass->depth_stencil_attachment &&
+       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
+           .iview->image->l2_coherent)
+      cmd_buffer->state.rb_noncoherent_dirty = true;
+}
+
 void
 radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                 const struct radv_subpass *subpass)
 {
+   radv_mark_noncoherent_rb(cmd_buffer);
    radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
 }

@@ -5810,6 +5856,8 @@ radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pS
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

+   radv_mark_noncoherent_rb(cmd_buffer);
+
    uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
    radv_cmd_buffer_end_subpass(cmd_buffer);
    radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
@@ -7192,6 +7240,8 @@ radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pS
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

+   radv_mark_noncoherent_rb(cmd_buffer);
+
    radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);

    radv_cmd_buffer_end_subpass(cmd_buffer);
@@ -7574,6 +7624,9 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
    enum radv_cmd_flush_bits src_flush_bits = 0;
    enum radv_cmd_flush_bits dst_flush_bits = 0;

+   if (cmd_buffer->state.subpass)
+      radv_mark_noncoherent_rb(cmd_buffer);
+
    radv_describe_barrier_start(cmd_buffer, info->reason);

    for (unsigned i = 0; i < info->eventCount; ++i) {
src/amd/vulkan/radv_private.h
@@ -1396,6 +1396,9 @@ struct radv_cmd_state {
    /* Whether CP DMA is busy/idle. */
    bool dma_is_busy;

+   /* Whether any images that are not L2 coherent are dirty from the CB. */
+   bool rb_noncoherent_dirty;
+
    /* Conditional rendering info. */
    uint8_t predication_op; /* 32-bit or 64-bit predicate value */
    int predication_type;   /* -1: disabled, 0: normal, 1: inverted */
src/amd/vulkan/si_cmd_buffer.c
@@ -1357,6 +1357,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
    if (unlikely(cmd_buffer->device->trace_bo))
       radv_cmd_buffer_trace_emit(cmd_buffer);

+   if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
+      cmd_buffer->state.rb_noncoherent_dirty = false;
+
    /* Clear the caches that have been flushed to avoid syncing too much
     * when there is some pending active queries.
     */