radv: Try to do a better job of dealing with L2 coherent images.
Only try to invalidate L2 if we actually hit one of the incoherent images. Note that we may insert some extra flushes at the end of a command buffer so that we can assume the caches are clean at the start of the next command buffer. On average, however, that case is uncommon enough that being able to make assumptions at the start of a command buffer is beneficial, especially since MSAA is somewhat rare in more recent games.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13239>

commit fd8210f27e
parent 64b237436a
committed by Marge Bot
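The hunks below implement this by tracking a single per-command-buffer flag, rb_noncoherent_dirty: it is set whenever a subpass renders to an image that is not L2 coherent, it lets the destination-side flush skip the L2 invalidation when nothing incoherent is dirty, and it is cleared once an L2 invalidation is actually emitted. What follows is a minimal standalone sketch of that lifecycle, not the real radv code: image_sketch, cmd_state_sketch and the helper names are made-up stand-ins, and the GFX9+ check and the end-of-command-buffer writeback are left out.

/* Condensed sketch of the rb_noncoherent_dirty tracking scheme (illustrative only). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum {
   FLAG_INV_L2 = 1u << 0, /* invalidate the GPU L2 cache */
};

struct image_sketch {
   bool l2_coherent; /* RB writes to this image are coherent with shader L2 reads */
};

struct cmd_state_sketch {
   bool rb_noncoherent_dirty; /* a non-L2-coherent image may be dirty from the RB */
   uint32_t flush_bits;
};

/* Subpass begin/end and barriers: remember when any bound render target is not
 * L2 coherent (the radv_mark_noncoherent_rb() hunk below). */
static void mark_noncoherent_rb(struct cmd_state_sketch *s,
                                const struct image_sketch *targets, unsigned count)
{
   for (unsigned i = 0; i < count; ++i) {
      if (!targets[i].l2_coherent) {
         s->rb_noncoherent_dirty = true;
         return;
      }
   }
}

/* Destination-side flush for a shader read: the L2 invalidate can be skipped
 * when nothing incoherent is dirty, since every L2 client already sees the
 * data (the "image_is_coherent |= ..." hunk below). */
static uint32_t dst_access_flush(const struct cmd_state_sketch *s, bool image_is_coherent)
{
   if (!s->rb_noncoherent_dirty)
      image_is_coherent = true;
   return image_is_coherent ? 0 : FLAG_INV_L2;
}

/* Actually emitting an L2 invalidate makes the tracked state clean again
 * (the si_emit_cache_flush() hunk below). */
static void emit_cache_flush(struct cmd_state_sketch *s)
{
   if (s->flush_bits & FLAG_INV_L2)
      s->rb_noncoherent_dirty = false;
   s->flush_bits = 0;
}

int main(void)
{
   struct image_sketch msaa_rt = {.l2_coherent = false};
   struct cmd_state_sketch s = {0};

   mark_noncoherent_rb(&s, &msaa_rt, 1);        /* render to an incoherent RT */
   s.flush_bits |= dst_access_flush(&s, false); /* first read needs INV_L2 */
   printf("first read:  0x%x\n", s.flush_bits); /* 0x1 */
   emit_cache_flush(&s);                        /* INV_L2 emitted, state clean again */
   printf("second read: 0x%x\n", dst_access_flush(&s, false)); /* 0x0, invalidate skipped */
   return 0;
}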
@@ -3777,9 +3777,9 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st
 * images. However, given the existence of memory barriers which do not specify
 * the image/buffer it often devolves to just VRAM/GTT anyway.
 *
 * In practice we can cheat a bit, since the INV_* operations include writebacks.
 * If we know that all the destinations that need the WB do an INV, then we can
 * skip the WB.
 * To help reducing the invalidations for GPUs that have L2 coherency between the
 * RB and the shader caches, we always invalidate L2 on the src side, as we can
 * use our knowledge of past usage to optimize flushes away.
 */

enum radv_cmd_flush_bits

@@ -3811,6 +3811,10 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flag
            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
         }
      }

      /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
       * dirtyness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
       * invalidating. */
      if (!image_is_coherent)
         flush_bits |= RADV_CMD_FLAG_WB_L2;
      break;

@@ -3878,6 +3882,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag
      has_DB_meta = false;
   }

   /* All the L2 invalidations below are not the CB/DB. So if there are no incoherent images
    * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. */
   image_is_coherent |= cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
                        !cmd_buffer->state.rb_noncoherent_dirty;

   u_foreach_bit(b, dst_flags)
   {
      switch ((VkAccessFlagBits)(1 << b)) {

@@ -4741,6 +4750,16 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
       */
      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

      /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
       * command buffer.
       */
      if (cmd_buffer->state.rb_noncoherent_dirty &&
          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
         cmd_buffer->state.flush_bits |= radv_src_access_flush(
            cmd_buffer,
            VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
            NULL);

      /* Since NGG streamout uses GDS, we need to make GDS idle when
       * we leave the IB, otherwise another process might overwrite
       * it while our shaders are busy.

@@ -5735,10 +5754,37 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
   assert(cmd_buffer->cs->cdw <= cdw_max);
}

static void
radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   /* Have to be conservative in cmdbuffers with inherited attachments. */
   if (!cmd_buffer->state.attachments) {
      cmd_buffer->state.rb_noncoherent_dirty = true;
      return;
   }

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      const uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;
      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
         cmd_buffer->state.rb_noncoherent_dirty = true;
         return;
      }
   }
   if (subpass->depth_stencil_attachment &&
       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
           .iview->image->l2_coherent)
      cmd_buffer->state.rb_noncoherent_dirty = true;
}

void
radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                const struct radv_subpass *subpass)
{
   radv_mark_noncoherent_rb(cmd_buffer);
   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}

@@ -5810,6 +5856,8 @@ radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pS
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
   radv_cmd_buffer_end_subpass(cmd_buffer);
   radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);

@@ -7192,6 +7240,8 @@ radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pS
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);

   radv_cmd_buffer_end_subpass(cmd_buffer);

@@ -7574,6 +7624,9 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
   enum radv_cmd_flush_bits src_flush_bits = 0;
   enum radv_cmd_flush_bits dst_flush_bits = 0;

   if (cmd_buffer->state.subpass)
      radv_mark_noncoherent_rb(cmd_buffer);

   radv_describe_barrier_start(cmd_buffer, info->reason);

   for (unsigned i = 0; i < info->eventCount; ++i) {

@@ -1396,6 +1396,9 @@ struct radv_cmd_state {
   /* Whether CP DMA is busy/idle. */
   bool dma_is_busy;

   /* Whether any images that are not L2 coherent are dirty from the CB. */
   bool rb_noncoherent_dirty;

   /* Conditional rendering info. */
   uint8_t predication_op; /* 32-bit or 64-bit predicate value */
   int predication_type; /* -1: disabled, 0: normal, 1: inverted */

@@ -1357,6 +1357,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);

   if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
      cmd_buffer->state.rb_noncoherent_dirty = false;

   /* Clear the caches that have been flushed to avoid syncing too much
    * when there is some pending active queries.
    */