radv: Try to do a better job of dealing with L2 coherent images.

Only try to invalidate L2 if we actually hit one of the incoherent images.

Note we may actually insert some extra flushes at the end of a command
buffer so that we can assume the caches are clean at the start of the
next command buffer. However, on average I think that case is uncommon
enough that being able to make assumptions at the start of a cmdbuffer
is beneficial, especially since MSAA is somewhat rare in more recent
games.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13239>
Author: Bas Nieuwenhuizen
Date: 2021-10-05 03:10:20 +02:00
Committed-by: Marge Bot
Commit: fd8210f27e (parent: 64b237436a)
3 changed files with 62 additions and 3 deletions
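
The change boils down to a small state machine around a single dirty bit. Below is a hypothetical, simplified sketch of that scheme; the struct and helper names (other than rb_noncoherent_dirty) are made up for illustration, and the real logic lives in radv_src_access_flush/radv_dst_access_flush, radv_mark_noncoherent_rb and si_emit_cache_flush as shown in the diff below.

#include <stdbool.h>

/* Hypothetical stand-in for the relevant part of radv_cmd_state. */
struct cmd_state {
   /* Set when the RB may have left a non-L2-coherent image in L2. */
   bool rb_noncoherent_dirty;
};

/* Subpass end / barrier: remember whether any bound render target was not
 * L2 coherent (the patch walks the subpass attachments to decide this). */
void mark_noncoherent_rb(struct cmd_state *s, bool any_target_incoherent)
{
   if (any_target_incoherent)
      s->rb_noncoherent_dirty = true;
}

/* Barrier dst side on GFX9+: if nothing incoherent is dirty, everything in
 * L2 is already usable by the other L2 clients and the invalidation can be
 * skipped. */
bool needs_l2_invalidate(const struct cmd_state *s)
{
   return s->rb_noncoherent_dirty;
}

/* Any INV_L2, including the extra flush emitted at vkEndCommandBuffer, makes
 * the caches clean again, so the next command buffer starts with a clean
 * slate. */
void on_l2_invalidated(struct cmd_state *s)
{
   s->rb_noncoherent_dirty = false;
}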


@@ -3777,9 +3777,9 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st
* images. However, given the existence of memory barriers which do not specify
* the image/buffer it often devolves to just VRAM/GTT anyway.
*
- * In practice we can cheat a bit, since the INV_* operations include writebacks.
- * If we know that all the destinations that need the WB do an INV, then we can
- * skip the WB.
+ * To help reduce the invalidations for GPUs that have L2 coherency between the
+ * RB and the shader caches, we always invalidate L2 on the src side, as we can
+ * use our knowledge of past usage to optimize flushes away.
*/
enum radv_cmd_flush_bits
@@ -3811,6 +3811,10 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flag
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
}
}
/* This is valid even for the rb_noncoherent_dirty case, because of how we account for
 * dirtiness: if the image isn't dirty it doesn't contain the data at all and hence doesn't
 * need invalidating. */
if (!image_is_coherent)
flush_bits |= RADV_CMD_FLAG_WB_L2;
break;
@@ -3878,6 +3882,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag
has_DB_meta = false;
}
/* None of the L2 invalidations below apply to the CB/DB. So if no incoherent images are
 * in the L2 cache in CB/DB mode, the data is already usable from all the other L2 clients. */
image_is_coherent |= cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
!cmd_buffer->state.rb_noncoherent_dirty;
u_foreach_bit(b, dst_flags)
{
switch ((VkAccessFlagBits)(1 << b)) {
@@ -4741,6 +4750,16 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
*/
cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
/* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
* command buffer.
*/
if (cmd_buffer->state.rb_noncoherent_dirty &&
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
cmd_buffer->state.flush_bits |= radv_src_access_flush(
cmd_buffer,
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
NULL);
/* Since NGG streamout uses GDS, we need to make GDS idle when
* we leave the IB, otherwise another process might overwrite
* it while our shaders are busy.
@@ -5735,10 +5754,37 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
assert(cmd_buffer->cs->cdw <= cdw_max);
}
static void
radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
{
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
/* Have to be conservative in cmdbuffers with inherited attachments. */
if (!cmd_buffer->state.attachments) {
cmd_buffer->state.rb_noncoherent_dirty = true;
return;
}
for (uint32_t i = 0; i < subpass->color_count; ++i) {
const uint32_t a = subpass->color_attachments[i].attachment;
if (a == VK_ATTACHMENT_UNUSED)
continue;
if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
cmd_buffer->state.rb_noncoherent_dirty = true;
return;
}
}
if (subpass->depth_stencil_attachment &&
!cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
.iview->image->l2_coherent)
cmd_buffer->state.rb_noncoherent_dirty = true;
}
void
radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
const struct radv_subpass *subpass)
{
radv_mark_noncoherent_rb(cmd_buffer);
radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}
@@ -5810,6 +5856,8 @@ radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pS
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
radv_mark_noncoherent_rb(cmd_buffer);
uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
radv_cmd_buffer_end_subpass(cmd_buffer);
radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
@@ -7192,6 +7240,8 @@ radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pS
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
radv_mark_noncoherent_rb(cmd_buffer);
radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
radv_cmd_buffer_end_subpass(cmd_buffer);
@@ -7574,6 +7624,9 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
enum radv_cmd_flush_bits src_flush_bits = 0;
enum radv_cmd_flush_bits dst_flush_bits = 0;
if (cmd_buffer->state.subpass)
radv_mark_noncoherent_rb(cmd_buffer);
radv_describe_barrier_start(cmd_buffer, info->reason);
for (unsigned i = 0; i < info->eventCount; ++i) {
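
Taken together, the src/dst halves above reduce to roughly the following condensation. This is a hypothetical sketch for illustration only: the flag names mirror the driver's but the values are invented, and the real functions take VkAccessFlags plus an image pointer and handle many more cache flags.

#include <stdbool.h>

/* Flag names mirror the driver's; the values here are made up. */
#define RADV_CMD_FLAG_FLUSH_AND_INV_CB (1u << 0)
#define RADV_CMD_FLAG_WB_L2            (1u << 1)
#define RADV_CMD_FLAG_INV_L2           (1u << 2)

/* src side: after a CB write, write back L2 only if the image is not L2
 * coherent; per the comment above, a non-dirty image never reached L2 at
 * all. */
unsigned src_flush_for_cb_write(bool image_is_coherent)
{
   unsigned bits = RADV_CMD_FLAG_FLUSH_AND_INV_CB;
   if (!image_is_coherent)
      bits |= RADV_CMD_FLAG_WB_L2;
   return bits;
}

/* dst side: on GFX9+ a clean RB means anything still sitting in L2 is
 * coherent, so shader-side reads need no L2 invalidation. */
unsigned dst_flush_for_shader_read(bool gfx9_plus, bool rb_noncoherent_dirty)
{
   bool image_is_coherent = gfx9_plus && !rb_noncoherent_dirty;
   return image_is_coherent ? 0u : RADV_CMD_FLAG_INV_L2;
}

Because radv_EndCommandBuffer forces the write-back whenever rb_noncoherent_dirty is still set, dst_flush_for_shader_read can assume the bit starts out false in every command buffer.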


@@ -1396,6 +1396,9 @@ struct radv_cmd_state {
/* Whether CP DMA is busy/idle. */
bool dma_is_busy;
/* Whether any images that are not L2 coherent are dirty from the CB. */
bool rb_noncoherent_dirty;
/* Conditional rendering info. */
uint8_t predication_op; /* 32-bit or 64-bit predicate value */
int predication_type; /* -1: disabled, 0: normal, 1: inverted */


@@ -1357,6 +1357,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
if (unlikely(cmd_buffer->device->trace_bo))
radv_cmd_buffer_trace_emit(cmd_buffer);
if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
cmd_buffer->state.rb_noncoherent_dirty = false;
/* Clear the caches that have been flushed to avoid syncing too much
* when there are pending active queries.
*/