tu: Integrate WFI/WAIT_FOR_ME/WAIT_MEM_WRITES with cache tracking
Track them via pending_flush_bits. Previously WFI was only tracked in
flush_bits and WAIT_FOR_ME was emitted directly. This means that we
don't emit WAIT_FOR_ME or WAIT_FOR_IDLE if there wasn't a cache flush
or other write by the GPU. Also split up host writes from sysmem
writes, as only the latter require WFI/WAIT_FOR_ME.

Along the way, I also realized that we were missing proper handling of
transform feedback counter writes, which require WAIT_MEM_WRITES. Plumb
that through as well. And CmdDrawIndirectByteCountEXT needs a
WAIT_FOR_ME, as it does not wait for WFI internally.

As an example of what this does, a typical barrier for transform
feedback with srcAccess = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT
and dstAccess = VK_ACCESS_INDIRECT_COMMAND_READ_BIT used to emit on
A650:

- WAIT_FOR_IDLE

and now we emit:

- WAIT_MEM_WRITES
- WAIT_FOR_ME

So we've eliminated a useless WFI and added some necessary waits.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6007>
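For illustration, the barrier in the example above could be recorded by an application like this (hypothetical application-side code with made-up handle names, not part of the patch):

   /* Make a streamout counter write visible to a subsequent indirect draw.
    * With this change, turnip emits CP_WAIT_MEM_WRITES + CP_WAIT_FOR_ME for
    * this barrier instead of an unconditional WAIT_FOR_IDLE. */
   VkMemoryBarrier barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
      .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT,
      .dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT,
   };
   vkCmdPipelineBarrier(cmd_buf,
                        VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT,
                        VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
                        0, 1, &barrier, 0, NULL, 0, NULL);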
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -159,8 +159,12 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
       tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
    if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
       tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
-   if (flushes & TU_CMD_FLAG_WFI)
+   if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
+      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+   if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
       tu_cs_emit_wfi(cs);
+   if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
+      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
 }
 
 /* "Normal" cache flushes, that don't require any special handling */
@@ -214,10 +218,11 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
       flushes |=
          TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
          TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
-         TU_CMD_FLAG_WFI;
+         TU_CMD_FLAG_WAIT_FOR_IDLE;
       cmd_buffer->state.cache.pending_flush_bits &= ~(
          TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
-         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH);
+         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
+         TU_CMD_FLAG_WAIT_FOR_IDLE);
    }
 
    tu6_emit_flushes(cmd_buffer, cs, flushes);
@@ -2192,10 +2197,28 @@ tu_flush_for_access(struct tu_cache_state *cache,
 {
    enum tu_cmd_flush_bits flush_bits = 0;
 
+   if (src_mask & TU_ACCESS_HOST_WRITE) {
+      /* Host writes are always visible to CP, so only invalidate GPU caches */
+      cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE;
+   }
+
    if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
+      /* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as
+       * well.
+       */
       cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
    }
 
+   if (src_mask & TU_ACCESS_CP_WRITE) {
+      /* Flush the CP write queue. However a WFI shouldn't be necessary as
+       * WAIT_MEM_WRITES should cover it.
+       */
+      cache->pending_flush_bits |=
+         TU_CMD_FLAG_WAIT_MEM_WRITES |
+         TU_CMD_FLAG_GPU_INVALIDATE |
+         TU_CMD_FLAG_WAIT_FOR_ME;
+   }
+
 #define SRC_FLUSH(domain, flush, invalidate) \
    if (src_mask & TU_ACCESS_##domain##_WRITE) { \
       cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
@@ -2220,7 +2243,11 @@ tu_flush_for_access(struct tu_cache_state *cache,
 
 #undef SRC_INCOHERENT_FLUSH
 
-   if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
+   /* Treat host & sysmem write accesses the same, since the kernel implicitly
+    * drains the queue before signalling completion to the host.
+    */
+   if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE |
+                   TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) {
       flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
    }
 
@@ -2252,7 +2279,13 @@ tu_flush_for_access(struct tu_cache_state *cache,
 #undef DST_INCOHERENT_FLUSH
 
    if (dst_mask & TU_ACCESS_WFI_READ) {
-      flush_bits |= TU_CMD_FLAG_WFI;
+      flush_bits |= cache->pending_flush_bits &
+         (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE);
    }
 
+   if (dst_mask & TU_ACCESS_WFM_READ) {
+      flush_bits |= cache->pending_flush_bits &
+         (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME);
+   }
+
    cache->flush_bits |= flush_bits;
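The pending_flush_bits/flush_bits split above is the core mechanism: a write records the waits it would eventually require in pending_flush_bits, and a later barrier promotes into flush_bits only the bits its destination access actually needs. The following standalone toy program illustrates that promotion for the CP-write case; the flag names mirror the driver's, but this is a sketch, not driver code:

   #include <stdio.h>

   enum {
      WAIT_MEM_WRITES = 1 << 0,   /* CP_WAIT_MEM_WRITES */
      WAIT_FOR_IDLE   = 1 << 1,   /* WFI                */
      WAIT_FOR_ME     = 1 << 2,   /* CP_WAIT_FOR_ME     */
   };

   static unsigned pending_flush_bits, flush_bits;

   /* A CP write (e.g. the streamout counter) queues its required waits. */
   static void cp_write(void)
   {
      pending_flush_bits |= WAIT_MEM_WRITES | WAIT_FOR_ME;
   }

   /* A barrier whose destination is a CP read promotes only what it needs. */
   static void barrier_for_cp_read(void)
   {
      flush_bits |= pending_flush_bits & (WAIT_MEM_WRITES | WAIT_FOR_ME);
      pending_flush_bits &= ~(WAIT_MEM_WRITES | WAIT_FOR_ME);
   }

   int main(void)
   {
      cp_write();
      barrier_for_cp_read();
      /* Prints CP_WAIT_MEM_WRITES and CP_WAIT_FOR_ME, but no WFI, matching
       * the commit-message example. */
      if (flush_bits & WAIT_MEM_WRITES) puts("CP_WAIT_MEM_WRITES");
      if (flush_bits & WAIT_FOR_IDLE)   puts("CP_WAIT_FOR_IDLE");
      if (flush_bits & WAIT_FOR_ME)     puts("CP_WAIT_FOR_ME");
      return 0;
   }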
@@ -2265,30 +2298,45 @@ vk2tu_access(VkAccessFlags flags, bool gmem)
    enum tu_cmd_access_mask mask = 0;
 
    /* If the GPU writes a buffer that is then read by an indirect draw
-    * command, we theoretically need a WFI + WAIT_FOR_ME combination to
-    * wait for the writes to complete. The WAIT_FOR_ME is performed as part
-    * of the draw by the firmware, so we just need to execute a WFI.
+    * command, we theoretically need to emit a WFI to wait for any cache
+    * flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI to
+    * complete. Waiting for the WFI to complete is performed as part of the
+    * draw by the firmware, so we just need to execute the WFI.
+    *
+    * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
+    * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
     */
    if (flags &
        (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
+        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
         VK_ACCESS_MEMORY_READ_BIT)) {
       mask |= TU_ACCESS_WFI_READ;
    }
 
    if (flags &
        (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
-        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */
         VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
-        VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */
+        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */
         VK_ACCESS_MEMORY_READ_BIT)) {
       mask |= TU_ACCESS_SYSMEM_READ;
    }
 
    if (flags &
-       (VK_ACCESS_HOST_WRITE_BIT |
-        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */
+       (VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
         VK_ACCESS_MEMORY_WRITE_BIT)) {
-      mask |= TU_ACCESS_SYSMEM_WRITE;
+      mask |= TU_ACCESS_CP_WRITE;
+   }
+
+   if (flags &
+       (VK_ACCESS_HOST_READ_BIT |
+        VK_ACCESS_MEMORY_WRITE_BIT)) {
+      mask |= TU_ACCESS_HOST_READ;
+   }
+
+   if (flags &
+       (VK_ACCESS_HOST_WRITE_BIT |
+        VK_ACCESS_MEMORY_WRITE_BIT)) {
+      mask |= TU_ACCESS_HOST_WRITE;
    }
 
    if (flags &
@@ -3166,6 +3214,20 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
    tu_cs_emit(cs, cmd->state.max_index_count);
 }
 
+/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
+ * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
+ * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
+ * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
+ * before draw opcodes that don't need it.
+ */
+static void
+draw_wfm(struct tu_cmd_buffer *cmd)
+{
+   cmd->state.renderpass_cache.flush_bits |=
+      cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
+   cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
+}
+
 void
 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                    VkBuffer _buffer,
@@ -3179,15 +3241,17 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
 
    cmd->state.vs_params = (struct tu_draw_state) {};
 
-   tu6_draw_common(cmd, cs, false, 0);
-
-   /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
-    * doesn't wait for WFIs to be completed and leads to GPU fault/hang
-    * TODO: this could be worked around in a more performant way,
-    * or there may exist newer firmware that has been fixed
+   /* The latest known a630_sqe.fw fails to wait for WFI before reading the
+    * indirect buffer when using CP_DRAW_INDIRECT_MULTI, so we have to fall
+    * back to CP_WAIT_FOR_ME except for a650 which has a fixed firmware.
+    *
+    * TODO: There may be newer a630_sqe.fw released in the future which fixes
+    * this, if so we should detect it and avoid this workaround.
     */
    if (cmd->device->physical_device->gpu_id != 650)
-      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+      draw_wfm(cmd);
+
+   tu6_draw_common(cmd, cs, false, 0);
 
    tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
@@ -3213,15 +3277,10 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
 
    cmd->state.vs_params = (struct tu_draw_state) {};
 
-   tu6_draw_common(cmd, cs, true, 0);
-
-   /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
-    * doesn't wait for WFIs to be completed and leads to GPU fault/hang
-    * TODO: this could be worked around in a more performant way,
-    * or there may exist newer firmware that has been fixed
-    */
    if (cmd->device->physical_device->gpu_id != 650)
-      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+      draw_wfm(cmd);
+
+   tu6_draw_common(cmd, cs, true, 0);
 
    tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
@@ -3248,6 +3307,13 @@ void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
+   /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
+    * Plus, for the common case where the counter buffer is written by
+    * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
+    * complete which means we need a WAIT_FOR_ME anyway.
+    */
+   draw_wfm(cmd);
+
    cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance);
 
    tu6_draw_common(cmd, cs, false, 0);
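For context, the common pattern this protects is drawing with a vertex count produced by an earlier transform feedback pass: the counter is written by the CP at vkCmdEndTransformFeedbackEXT and read back by CP_DRAW_AUTO. A sketch of the application-side sequence (hypothetical buffer/stride variables; the intervening barrier is omitted):

   /* Capture writes the byte count into counter_buf at counter_offset... */
   vkCmdEndTransformFeedbackEXT(cmd_buf, 0, 1, &counter_buf, &counter_offset);
   /* ...and a later draw consumes it without a CPU round trip. */
   vkCmdDrawIndirectByteCountEXT(cmd_buf,
                                 1 /* instanceCount */, 0 /* firstInstance */,
                                 counter_buf, counter_offset,
                                 0 /* counterOffset */, vertex_stride);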
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -752,13 +752,34 @@ enum tu_cmd_access_mask {
    TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
    TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
 
-   TU_ACCESS_SYSMEM_READ = 1 << 10,
-   TU_ACCESS_SYSMEM_WRITE = 1 << 11,
+   /* Accesses by the host */
+   TU_ACCESS_HOST_READ = 1 << 10,
+   TU_ACCESS_HOST_WRITE = 1 << 11,
 
-   /* Set if a WFI is required due to data being read by the CP or the 2D
-    * engine.
+   /* Accesses by a GPU engine which bypasses any cache. e.g. writes via
+    * CP_EVENT_WRITE::BLIT and the CP are SYSMEM_WRITE.
     */
-   TU_ACCESS_WFI_READ = 1 << 12,
+   TU_ACCESS_SYSMEM_READ = 1 << 12,
+   TU_ACCESS_SYSMEM_WRITE = 1 << 13,
+
+   /* Set if a WFI is required. This can be required for:
+    * - 2D engine which (on some models) doesn't wait for flushes to complete
+    *   before starting
+    * - CP draw indirect opcodes, where we need to wait for any flushes to
+    *   complete but the CP implicitly waits for WFI's to complete and
+    *   therefore we only need a WFI after the flushes.
+    */
+   TU_ACCESS_WFI_READ = 1 << 14,
+
+   /* Set if a CP_WAIT_FOR_ME is required due to the data being read by the CP
+    * without it waiting for any WFI.
+    */
+   TU_ACCESS_WFM_READ = 1 << 15,
+
+   /* Memory writes from the CP start in-order with draws and event writes,
+    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
+    */
+   TU_ACCESS_CP_WRITE = 1 << 16,
 
    TU_ACCESS_READ =
       TU_ACCESS_UCHE_READ |
@@ -766,7 +787,10 @@ enum tu_cmd_access_mask {
       TU_ACCESS_CCU_DEPTH_READ |
       TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
       TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
-      TU_ACCESS_SYSMEM_READ,
+      TU_ACCESS_HOST_READ |
+      TU_ACCESS_SYSMEM_READ |
+      TU_ACCESS_WFI_READ |
+      TU_ACCESS_WFM_READ,
 
    TU_ACCESS_WRITE =
       TU_ACCESS_UCHE_WRITE |
@@ -774,7 +798,9 @@ enum tu_cmd_access_mask {
       TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
       TU_ACCESS_CCU_DEPTH_WRITE |
       TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
-      TU_ACCESS_SYSMEM_WRITE,
+      TU_ACCESS_HOST_WRITE |
+      TU_ACCESS_SYSMEM_WRITE |
+      TU_ACCESS_CP_WRITE,
 
    TU_ACCESS_ALL =
       TU_ACCESS_READ |
@@ -788,18 +814,31 @@ enum tu_cmd_flush_bits {
    TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
    TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
    TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
+   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
+   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
+   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
 
    TU_CMD_FLAG_ALL_FLUSH =
       TU_CMD_FLAG_CCU_FLUSH_DEPTH |
       TU_CMD_FLAG_CCU_FLUSH_COLOR |
-      TU_CMD_FLAG_CACHE_FLUSH,
+      TU_CMD_FLAG_CACHE_FLUSH |
+      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
+       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
+       */
+      TU_CMD_FLAG_WAIT_MEM_WRITES,
 
-   TU_CMD_FLAG_ALL_INVALIDATE =
+   TU_CMD_FLAG_GPU_INVALIDATE =
       TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
       TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
       TU_CMD_FLAG_CACHE_INVALIDATE,
 
-   TU_CMD_FLAG_WFI = 1 << 6,
+   TU_CMD_FLAG_ALL_INVALIDATE =
+      TU_CMD_FLAG_GPU_INVALIDATE |
+      /* Treat the CP as a sort of "cache" which may need to be "invalidated"
+       * via waiting for UCHE/CCU flushes to land with WFI/WFM.
+       */
+      TU_CMD_FLAG_WAIT_FOR_IDLE |
+      TU_CMD_FLAG_WAIT_FOR_ME,
 };
 
 /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
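Putting the pieces together, the example from the commit message traces roughly as follows through the new tracking (a sketch based on the hunks above):

- srcAccess = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT maps to
  TU_ACCESS_CP_WRITE, so tu_flush_for_access() records
  TU_CMD_FLAG_WAIT_MEM_WRITES | TU_CMD_FLAG_GPU_INVALIDATE |
  TU_CMD_FLAG_WAIT_FOR_ME in pending_flush_bits.
- dstAccess = VK_ACCESS_INDIRECT_COMMAND_READ_BIT maps to
  TU_ACCESS_SYSMEM_READ and TU_ACCESS_WFI_READ, which promote the pending
  WAIT_MEM_WRITES (now part of TU_CMD_FLAG_ALL_FLUSH) into flush_bits; the
  pending WAIT_FOR_ME is promoted for a CP read that does not itself wait
  for WFI (TU_ACCESS_WFM_READ). No UCHE/CCU flush is pending, so no
  WAIT_FOR_IDLE is promoted.
- tu6_emit_flushes() then emits CP_WAIT_MEM_WRITES followed by
  CP_WAIT_FOR_ME, matching the before/after in the commit message.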