From fc5cb5400801a4476ae9148c6085f06738fa4602 Mon Sep 17 00:00:00 2001 From: Felix DeGrood Date: Thu, 11 Mar 2021 08:40:56 -0800 Subject: [PATCH] anv: Add debug messages for DEBUG_PIPE_CONTROL Enable with INTEL_DEBUG=pc. Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/vulkan/anv_blorp.c | 68 +++++++---- src/intel/vulkan/anv_private.h | 17 +++ src/intel/vulkan/anv_util.c | 29 +++++ src/intel/vulkan/genX_blorp_exec.c | 21 ++-- src/intel/vulkan/genX_cmd_buffer.c | 174 +++++++++++++++++++++-------- src/intel/vulkan/genX_query.c | 9 +- 6 files changed, 237 insertions(+), 81 deletions(-) diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index f985bf50922..2deeaa63953 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -875,7 +875,9 @@ void anv_CmdUpdateBuffer( /* We're about to read data that was written from the CPU. Flush the * texture cache so we don't get anything stale. */ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, + "before UpdateBuffer"); while (dataSize) { const uint32_t copy_size = MIN2(dataSize, max_update_size); @@ -1513,11 +1515,12 @@ anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer, /* We don't know who touched the main surface last so flush a bunch of * caches to ensure we get good data. 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | - ANV_PIPE_DATA_CACHE_FLUSH_BIT | - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | - ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, + "before copy_to_shadow"); struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, @@ -1553,8 +1556,9 @@ anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer, } /* We just wrote to the buffer with the render cache. Flush it. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "after copy_to_shadow"); blorp_batch_finish(&batch); } @@ -1632,8 +1636,10 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, * performance. If it does this, we need to flush it out of the depth * cache before rendering to it. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before clear DS"); blorp_clear_depth_stencil(&batch, &depth, &stencil, level, base_layer, layer_count, @@ -1649,8 +1655,10 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, * performance. If it does this, we need to flush it out of the render * cache before someone starts trying to do stencil on it. 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after clear DS"); struct blorp_surf stencil_shadow; if ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && @@ -1749,8 +1757,10 @@ anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer, * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional * hangs when doing a clear with WM_HZ_OP. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT, + "before clear hiz"); blorp_hiz_clear_depth_stencil(&batch, &depth, &stencil, level, base_layer, layer_count, @@ -1780,8 +1790,10 @@ anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer, * supposedly unnecessary, we choose to perform the flush unconditionally * just to be safe. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT, + "after clear hiz"); } void @@ -1832,8 +1844,10 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, * resolve and then use a second PIPE_CONTROL after the resolve to ensure * that it is completed before any additional drawing occurs. 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before fast clear mcs"); switch (mcs_op) { case ISL_AUX_OP_FAST_CLEAR: @@ -1851,8 +1865,10 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, unreachable("Unsupported MCS operation"); } - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after fast clear mcs"); blorp_batch_finish(&batch); } @@ -1913,8 +1929,10 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, * resolve and then use a second PIPE_CONTROL after the resolve to ensure * that it is completed before any additional drawing occurs. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before fast clear ccs"); switch (ccs_op) { case ISL_AUX_OP_FAST_CLEAR: @@ -1937,8 +1955,10 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, unreachable("Unsupported CCS operation"); } - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after fast clear ccs"); blorp_batch_finish(&batch); } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 6ec8f31047c..f0ba263e9cf 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -4541,6 +4541,23 @@ anv_device_entrypoint_is_enabled(int index, uint32_t core_version, const struct vk_device_dispatch_table * anv_get_device_dispatch_table(const 
struct intel_device_info *devinfo); +void +anv_dump_pipe_bits(enum anv_pipe_bits bits); + +static inline void +anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer, + enum anv_pipe_bits bits, + const char* reason) +{ + cmd_buffer->state.pending_pipe_bits |= bits; + if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL) && bits) + { + fputs("pc: add ", stderr); + anv_dump_pipe_bits(bits); + fprintf(stderr, "reason: %s\n", reason); + } +} + static inline uint32_t anv_get_subpass_id(const struct anv_cmd_state * const cmd_state) { diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c index a1b5715396d..0d171264667 100644 --- a/src/intel/vulkan/anv_util.c +++ b/src/intel/vulkan/anv_util.c @@ -113,3 +113,32 @@ __vk_errorf(struct anv_instance *instance, return error; } + +void +anv_dump_pipe_bits(enum anv_pipe_bits bits) +{ + if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT) + fputs("+depth_flush ", stderr); + if (bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT) + fputs("+dc_flush ", stderr); + if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + fputs("+rt_flush ", stderr); + if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT) + fputs("+tile_flush ", stderr); + if (bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT) + fputs("+state_inval ", stderr); + if (bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT) + fputs("+const_inval ", stderr); + if (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT) + fputs("+vf_inval ", stderr); + if (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT) + fputs("+tex_inval ", stderr); + if (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT) + fputs("+ic_inval ", stderr); + if (bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT) + fputs("+pb_stall ", stderr); + if (bits & ANV_PIPE_DEPTH_STALL_BIT) + fputs("+depth_stall ", stderr); + if (bits & ANV_PIPE_CS_STALL_BIT) + fputs("+cs_stall ", stderr); +} diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 7966b30feb6..9ca1fb66284 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ 
b/src/intel/vulkan/genX_blorp_exec.c @@ -258,9 +258,10 @@ genX(blorp_exec)(struct blorp_batch *batch, * is set due to new association of BTI, PS Scoreboard Stall bit must * be set in this packet." */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | - ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "before blorp BTI change"); #endif #if GFX_VERx10 == 120 @@ -285,8 +286,11 @@ genX(blorp_exec)(struct blorp_batch *batch, * See genX(cmd_buffer_mi_memcpy) for more details. */ if (params->src.clear_color_addr.buffer || - params->dst.clear_color_addr.buffer) - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + params->dst.clear_color_addr.buffer) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "before blorp prep fast clear"); + } #endif genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -311,9 +315,10 @@ genX(blorp_exec)(struct blorp_batch *batch, * is set due to new association of BTI, PS Scoreboard Stall bit must * be set in this packet." */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | - ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "after blorp BTI change"); #endif cmd_buffer->state.gfx.vb_dirty = ~0; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 04eb12340b0..157fee90951 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -50,6 +50,41 @@ static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, uint32_t pipeline); +/* Collect the flush/invalidate/stall enables of a PIPE_CONTROL into + * anv_pipe_bits form so anv_dump_pipe_bits() can print them. + */ +static enum anv_pipe_bits +convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) { + enum anv_pipe_bits bits = 0; + bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0; + bits |= (pc->DCFlushEnable) ? 
ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0; +#if GFX_VER >= 12 + bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0; +#endif + bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0; + bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0; + bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0; + bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0; + return bits; +} + +/* Trace a PIPE_CONTROL to stderr when DEBUG_PIPE_CONTROL is set in INTEL_DEBUG. + * Wrapped in do/while (0) so the macro expands to exactly one statement. + */ +#define anv_debug_dump_pc(pc) \ + do { \ + if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL)) { \ + fputs("pc: emit PC=( ", stderr); \ + anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \ + fprintf(stderr, ") reason: %s\n", __FUNCTION__); \ + } \ + } while (0) + void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { @@ -87,6 +113,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) if (devinfo->revision == 0 /* A0 */) pc.HDCPipelineFlushEnable = true; #endif + anv_debug_dump_pc(pc); } #if GFX_VER == 12 @@ -236,6 +263,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) pc.TextureCacheInvalidationEnable = true; pc.ConstantCacheInvalidationEnable = true; pc.StateCacheInvalidationEnable = true; + anv_debug_dump_pc(pc); } } @@ -477,7 +505,9 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, * with not having this stall in some cases if we were really careful but * it's better to play it safe. Full stall the GPU. 
*/ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before update AUX-TT"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); struct mi_builder b; @@ -548,7 +578,9 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, } } - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, + "after update AUX-TT"); } #endif /* GFX_VER == 12 */ @@ -1069,7 +1101,9 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, * In order to work around this issue, we emit a PIPE_CONTROL with the * command streamer stall bit set. */ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after copy_fast_clear_dwords. Avoid potential hang"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); #endif @@ -1092,8 +1126,9 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, * * In testing, SKL doesn't actually seem to need this, but HSW does. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, + "after copy_fast_clear_dwords surface state update"); } } @@ -1426,8 +1461,10 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, * resolve and the second likely ensures that the resolve is complete before * we do any more rendering or clearing. 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before transition RT"); for (uint32_t l = 0; l < level_count; l++) { uint32_t level = base_level + l; @@ -1472,8 +1509,10 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } } - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after transition RT"); } static VkResult @@ -1718,14 +1757,19 @@ genX(BeginCommandBuffer)( * VF cache occasionally. It's easier if we can assume we start with a * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).) */ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "new cmd buffer"); /* Re-emit the aux table register in every command buffer. This way we're * ensured that we have the table even if this command buffer doesn't * initialize any images. 
*/ - if (cmd_buffer->device->info.has_aux_map) - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; + if (cmd_buffer->device->info.has_aux_map) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, + "new cmd buffer with aux-tt"); + } /* We send an "Indirect State Pointers Disable" packet at * EndCommandBuffer, so all push contant packets are ignored during a @@ -1842,10 +1886,12 @@ emit_isp_disable(struct anv_cmd_buffer *cmd_buffer) anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.StallAtPixelScoreboard = true; pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.IndirectStatePointersDisable = true; pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); } } @@ -1954,8 +2000,9 @@ genX(CmdExecuteCommands)( * invalidate the whole thing. */ if (GFX_VER >= 8 && GFX_VER <= 9) { - primary->state.pending_pipe_bits |= - ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(primary, + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "Secondary cmd buffer not tracked in VF cache"); } /* The secondary may have selected a different pipeline (3D or compute) and @@ -2008,6 +2055,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, pc.DCFlushEnable = true; pc.PostSyncOperation = NoWrite; pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); } /* ...followed by a second pipelined PIPE_CONTROL that initiates @@ -2030,6 +2078,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, pc.InstructionCacheInvalidateEnable = true; pc.StateCacheInvalidationEnable = true; pc.PostSyncOperation = NoWrite; + anv_debug_dump_pc(pc); } /* Now send a third stalling flush to make sure that invalidation is @@ -2039,6 +2088,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, pc.DCFlushEnable = true; pc.PostSyncOperation = NoWrite; pc.CommandStreamerStallEnable = 
true; + anv_debug_dump_pc(pc); } genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg); @@ -2238,6 +2288,7 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) !pipe.DepthStallEnable && !pipe.DCFlushEnable) pipe.StallAtPixelScoreboard = true; + anv_debug_dump_pc(pipe); } /* If a render target flush was emitted, then we can toggle off the bit @@ -2326,6 +2377,7 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) pipe.PostSyncOperation = WriteImmediateData; pipe.Address = cmd_buffer->device->workaround_address; } + anv_debug_dump_pc(pipe); } #if GFX_VER == 12 @@ -2426,9 +2478,10 @@ void genX(CmdPipelineBarrier)( } } - cmd_buffer->state.pending_pipe_bits |= - anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | - anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags); + anv_add_pending_pipe_bits(cmd_buffer, + anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | + anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags), + "pipe barrier"); } static void @@ -3617,8 +3670,11 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) } /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ - if (GFX_VER >= 10) - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + if (GFX_VER >= 10) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after 3DSTATE_SO_BUFFER call"); + } } if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { @@ -3655,6 +3711,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) pc.DepthStallEnable = true; pc.PostSyncOperation = WriteImmediateData; pc.Address = cmd_buffer->device->workaround_address; + anv_debug_dump_pc(pc); } } #endif @@ -4388,7 +4445,9 @@ void genX(CmdBeginTransformFeedbackEXT)( * process or otherwise pending at the point that the MI_LOAD/STORE * commands are processed. This will likely require a pipeline flush." 
*/ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "begin transform feedback"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) { @@ -4441,7 +4500,9 @@ void genX(CmdEndTransformFeedbackEXT)( * process or otherwise pending at the point that the MI_LOAD/STORE * commands are processed. This will likely require a pipeline flush." */ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "end transform feedback"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) { @@ -4497,7 +4558,9 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) * these scoreboard related states, a MEDIA_STATE_FLUSH is * sufficient." */ - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "flush compute state"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); @@ -4940,6 +5003,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, */ pc.DepthStallEnable = true; #endif + anv_debug_dump_pc(pc); } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { @@ -4951,6 +5015,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, #if GFX_VER >= 12 pc.TileCacheFlushEnable = true; #endif + anv_debug_dump_pc(pc); } anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { @@ -5013,15 +5078,18 @@ genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) */ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthStallEnable = true; + anv_debug_dump_pc(pipe); } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthCacheFlushEnable = true; #if GFX_VER >= 12 pipe.TileCacheFlushEnable = true; 
#endif + anv_debug_dump_pc(pipe); } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthStallEnable = true; + anv_debug_dump_pc(pipe); } } @@ -5096,8 +5164,10 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer /* If our range is larger than 32 bits, we have to flush */ assert(bound->end - bound->start <= (1ull << 32)); if (dirty->end - dirty->start > (1ull << 32)) { - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "vb > 32b range"); } } @@ -5212,8 +5282,10 @@ genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, if (cmd_buffer->state.current_hash_scale != scale && (width > min_size[idx][0] || height > min_size[idx][1])) { - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change pixel hash mode"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) { @@ -5398,8 +5470,9 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; /* Accumulate any subpass flushes that need to happen before the subpass */ - cmd_buffer->state.pending_pipe_bits |= - cmd_buffer->state.pass->subpass_flushes[subpass_id]; + anv_add_pending_pipe_bits(cmd_buffer, + cmd_buffer->state.pass->subpass_flushes[subpass_id], + "begin subpass deps/attachments"); VkRect2D render_area = cmd_buffer->state.render_area; struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; @@ -5741,9 +5814,10 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, * is set due to new association of BTI, PS Scoreboard Stall bit must * be set in this packet." 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | - ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change RT"); #endif #if GFX_VERx10 == 120 @@ -5754,10 +5828,11 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, * we want to do a depth flush and stall, so the pipeline is not using these * settings while we change the registers. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | - ANV_PIPE_DEPTH_STALL_BIT | - ANV_PIPE_END_OF_PIPE_SYNC_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "change DS"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); #endif @@ -5860,9 +5935,10 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) * result of writes to the MSAA color attachments show up in the sampler * when we blit to the single-sampled resolve target. */ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "MSAA resolve"); for (uint32_t i = 0; i < subpass->color_count; ++i) { uint32_t src_att = subpass->color_attachments[i].attachment; @@ -5919,9 +5995,10 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) * result of writes to the MSAA depth attachments show up in the sampler * when we blit to the single-sampled resolve target. 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, + "MSAA resolve"); uint32_t src_att = subpass->depth_stencil_attachment->attachment; uint32_t dst_att = subpass->ds_resolve_attachment->attachment; @@ -6151,8 +6228,9 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up * ORing the bits in twice so it's harmless. */ - cmd_buffer->state.pending_pipe_bits |= - cmd_buffer->state.pass->subpass_flushes[subpass_id + 1]; + anv_add_pending_pipe_bits(cmd_buffer, + cmd_buffer->state.pass->subpass_flushes[subpass_id + 1], + "end subpass deps/attachments"); } void genX(CmdBeginRenderPass2)( @@ -6337,6 +6415,7 @@ void genX(CmdSetEvent)( event->state.offset }; pc.ImmediateData = VK_EVENT_SET; + anv_debug_dump_pc(pc); } } @@ -6364,6 +6443,7 @@ void genX(CmdResetEvent)( event->state.offset }; pc.ImmediateData = VK_EVENT_RESET; + anv_debug_dump_pc(pc); } } @@ -6436,9 +6516,10 @@ VkResult genX(CmdSetPerformanceOverrideINTEL)( case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: if (pOverrideInfo->enable) { /* FLUSH ALL THE THINGS! As requested by the MDAPI team. 
*/ - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_FLUSH_BITS | - ANV_PIPE_INVALIDATE_BITS; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS, + "perf counter isolation"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } break; @@ -6466,5 +6547,6 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch, pc.CommandStreamerStallEnable = true; pc.PostSyncOperation = WriteTimestamp; pc.Address = (struct anv_address) {bo, offset}; + anv_debug_dump_pc(pc); } } diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 26db49d51db..ab1d4e9571e 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -1374,8 +1374,9 @@ void genX(CmdCopyQueryPoolResults)( * command streamer. */ if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) { - cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "CopyQueryPoolResults"); } if ((flags & VK_QUERY_RESULT_WAIT_BIT) || @@ -1393,7 +1394,9 @@ void genX(CmdCopyQueryPoolResults)( */ pool->type == VK_QUERY_TYPE_OCCLUSION || pool->type == VK_QUERY_TYPE_TIMESTAMP) { - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "CopyQueryPoolResults"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); }