radv: Record cache flushes for RGP.

Not recording the EOP TS cacheflush event, because that breaks wave
counting in RGP for some reason. But the rest looks to be all there.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6550>
commit 78165ea3e2 (parent cc73182152)
Author: Bas Nieuwenhuizen
Date: 2020-09-01 21:28:16 +02:00
Committed-by: Marge Bot

6 changed files with 126 additions and 10 deletions
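The pattern the diff introduces is a small accumulate-then-translate scheme: every
emitter that writes a cache-flush or sync packet also ORs a matching bit into an
enum rgp_flush_bits accumulator passed in by pointer, and the delayed barrier-end
marker later converts the accumulated bits into the boolean fields of the RGP
barrier-end marker, with the accumulator cleared again at the next barrier start.
A minimal standalone C sketch of that scheme follows; the types and helper
functions below are simplified stand-ins for illustration, not the actual radv code:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins; the real enum and marker layout are in the diff below. */
enum rgp_flush_bits {
   RGP_FLUSH_INVAL_ICACHE = 0x80,
   RGP_FLUSH_INVAL_L2 = 0x400,
};

struct barrier_end_marker {
   bool inval_sqI;  /* instruction cache invalidated */
   bool inval_tcc;  /* L2 invalidated */
};

/* Emitters record what they flush while writing the packets. */
static void emit_cache_flush(bool inv_icache, bool inv_l2,
                             enum rgp_flush_bits *sqtt_flush_bits)
{
   if (inv_icache) /* the real code emits the ICACHE invalidation packet here */
      *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
   if (inv_l2)     /* the real code emits the L2 writeback+invalidate here */
      *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
}

/* The delayed barrier-end marker translates bits into marker fields. */
static void describe_barrier_end(enum rgp_flush_bits bits,
                                 struct barrier_end_marker *marker)
{
   if (bits & RGP_FLUSH_INVAL_ICACHE)
      marker->inval_sqI = true;
   if (bits & RGP_FLUSH_INVAL_L2)
      marker->inval_tcc = true;
}

int main(void)
{
   enum rgp_flush_bits bits = 0;
   struct barrier_end_marker marker = {0};

   emit_cache_flush(true, true, &bits);
   describe_barrier_end(bits, &marker);
   bits = 0; /* radv clears the accumulator at the next barrier start */

   printf("inval_sqI=%d inval_tcc=%d\n", marker.inval_sqI, marker.inval_tcc);
   return 0;
}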

View File

@@ -300,7 +300,7 @@ struct rgp_sqtt_marker_barrier_end {
    union {
       struct {
          uint32_t sync_cp_dma : 1;
-         uint32_t inval_ccp : 1;
+         uint32_t inval_tcp : 1;
          uint32_t inval_sqI : 1;
          uint32_t inval_sqK : 1;
          uint32_t flush_tcc : 1;
@@ -526,6 +526,38 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
    marker.num_layout_transitions = cmd_buffer->state.num_layout_transitions;

    /* TODO: fill pipeline stalls, cache flushes, etc */
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_WAIT_ON_EOP_TS)
+      marker.wait_on_eop_ts = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_VS_PARTIAL_FLUSH)
+      marker.vs_partial_flush = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PS_PARTIAL_FLUSH)
+      marker.ps_partial_flush = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_CS_PARTIAL_FLUSH)
+      marker.cs_partial_flush = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PFP_SYNC_ME)
+      marker.pfp_sync_me = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_SYNC_CP_DMA)
+      marker.sync_cp_dma = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_VMEM_L0)
+      marker.inval_tcp = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_ICACHE)
+      marker.inval_sqI = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_SMEM_L0)
+      marker.inval_sqK = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_L2)
+      marker.flush_tcc = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L2)
+      marker.inval_tcc = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_CB)
+      marker.flush_cb = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_CB)
+      marker.inval_cb = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_DB)
+      marker.flush_db = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_DB)
+      marker.inval_db = true;
+   if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1)
+      marker.inval_gl1 = true;

    radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker, sizeof(marker) / 4);
@@ -543,6 +575,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
       return;

    radv_describe_barrier_end_delayed(cmd_buffer);
+   cmd_buffer->state.sqtt_flush_bits = 0;

    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
    marker.cb_id = 0;

View File

@@ -606,6 +606,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
    }

    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
+      enum rgp_flush_bits sqtt_flush_bits = 0;
       assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                       RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
@@ -617,7 +618,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
                              &cmd_buffer->gfx9_fence_idx,
                              cmd_buffer->gfx9_fence_va,
                              radv_cmd_buffer_uses_mec(cmd_buffer),
-                             flags, cmd_buffer->gfx9_eop_bug_va);
+                             flags, &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
    }

    if (unlikely(cmd_buffer->device->trace_bo))

View File

@@ -3767,6 +3767,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
    }

    for(int i = 0; i < 3; ++i) {
+      enum rgp_flush_bits sqtt_flush_bits = 0;
       struct radeon_cmdbuf *cs = NULL;
       cs = queue->device->ws->cs_create(queue->device->ws,
                                         queue->queue_family_index ? RING_COMPUTE : RING_GFX);
@@ -3832,7 +3833,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                RADV_CMD_FLAG_INV_SCACHE |
                                RADV_CMD_FLAG_INV_VCACHE |
                                RADV_CMD_FLAG_INV_L2 |
-                               RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+                               RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
       } else if (i == 1) {
          si_cs_emit_cache_flush(cs,
                                 queue->device->physical_device->rad_info.chip_class,
@@ -3843,7 +3844,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                RADV_CMD_FLAG_INV_SCACHE |
                                RADV_CMD_FLAG_INV_VCACHE |
                                RADV_CMD_FLAG_INV_L2 |
-                               RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+                               RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
       }

       if (queue->device->ws->cs_finalize(cs) != VK_SUCCESS)

View File

@@ -1301,6 +1301,25 @@ struct radv_subpass_sample_locs_state {
    struct radv_sample_locations_state sample_location;
 };

+enum rgp_flush_bits {
+   RGP_FLUSH_WAIT_ON_EOP_TS = 0x1,
+   RGP_FLUSH_VS_PARTIAL_FLUSH = 0x2,
+   RGP_FLUSH_PS_PARTIAL_FLUSH = 0x4,
+   RGP_FLUSH_CS_PARTIAL_FLUSH = 0x8,
+   RGP_FLUSH_PFP_SYNC_ME = 0x10,
+   RGP_FLUSH_SYNC_CP_DMA = 0x20,
+   RGP_FLUSH_INVAL_VMEM_L0 = 0x40,
+   RGP_FLUSH_INVAL_ICACHE = 0x80,
+   RGP_FLUSH_INVAL_SMEM_L0 = 0x100,
+   RGP_FLUSH_FLUSH_L2 = 0x200,
+   RGP_FLUSH_INVAL_L2 = 0x400,
+   RGP_FLUSH_FLUSH_CB = 0x800,
+   RGP_FLUSH_INVAL_CB = 0x1000,
+   RGP_FLUSH_FLUSH_DB = 0x2000,
+   RGP_FLUSH_INVAL_DB = 0x4000,
+   RGP_FLUSH_INVAL_L1 = 0x8000,
+};
+
 struct radv_cmd_state {
    /* Vertex descriptors */
    uint64_t vb_va;
@@ -1370,6 +1389,7 @@ struct radv_cmd_state {
    uint32_t num_events;
    uint32_t num_layout_transitions;
    bool pending_sqtt_barrier_end;
+   enum rgp_flush_bits sqtt_flush_bits;
 };

 struct radv_cmd_pool {
@@ -1487,6 +1507,7 @@ void si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                             uint32_t *fence_ptr, uint64_t va,
                             bool is_mec,
                             enum radv_cmd_flush_bits flush_bits,
+                            enum rgp_flush_bits *sqtt_flush_bits,
                             uint64_t gfx9_eop_bug_va);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,

View File

@@ -390,6 +390,7 @@ static void
 radv_emit_wait_for_idle(struct radv_device *device,
                         struct radeon_cmdbuf *cs, int family)
 {
+   enum rgp_flush_bits sqtt_flush_bits = 0;
    si_cs_emit_cache_flush(cs, device->physical_device->rad_info.chip_class,
                           NULL, 0,
                           family == RING_COMPUTE &&
@@ -400,7 +401,7 @@ radv_emit_wait_for_idle(struct radv_device *device,
                           RADV_CMD_FLAG_INV_ICACHE |
                           RADV_CMD_FLAG_INV_SCACHE |
                           RADV_CMD_FLAG_INV_VCACHE |
-                          RADV_CMD_FLAG_INV_L2, 0);
+                          RADV_CMD_FLAG_INV_L2, &sqtt_flush_bits, 0);
 }

 static void

View File

@@ -1040,6 +1040,7 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                           uint64_t flush_va,
                           bool is_mec,
                           enum radv_cmd_flush_bits flush_bits,
+                          enum rgp_flush_bits *sqtt_flush_bits,
                           uint64_t gfx9_eop_bug_va)
 {
    uint32_t gcr_cntl = 0;
@@ -1048,26 +1049,38 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
    /* We don't need these. */
    assert(!(flush_bits & (RADV_CMD_FLAG_VGT_STREAMOUT_SYNC)));

-   if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+   if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
       gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+   }
    if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
       /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
        * to FORWARD when both L1 and L2 are written out (WB or INV).
        */
       gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
    }
-   if (flush_bits & RADV_CMD_FLAG_INV_VCACHE)
+   if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
       gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1;
+   }
    if (flush_bits & RADV_CMD_FLAG_INV_L2) {
       /* Writeback and invalidate everything in L2. */
       gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
                   S_586_GLM_INV(1) | S_586_GLM_WB(1);
+
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
    } else if (flush_bits & RADV_CMD_FLAG_WB_L2) {
       /* Writeback but do not invalidate.
        * GLM doesn't support WB alone. If WB is set, INV must be set too.
        */
       gcr_cntl |= S_586_GL2_WB(1) |
                   S_586_GLM_WB(1) | S_586_GLM_INV(1);
+
+      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2;
    }

    /* TODO: Implement this new flag for GFX9+.
@@ -1082,6 +1095,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
                       EVENT_INDEX(0));
+
+      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
    }

    /* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */
@@ -1090,6 +1105,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
                       EVENT_INDEX(0));
+
+      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
    }

    /* First flush CB/DB, then L1/L2. */
@@ -1110,15 +1127,21 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
       if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
          radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+         *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
       } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
          radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+         *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
       }
    }

    if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
+
+      *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
    }

    if (cb_db_event) {
@@ -1197,6 +1220,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
       /* We need to ensure that PFP waits as well. */
       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
       radeon_emit(cs, 0);
+
+      *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
    }

    if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
@@ -1217,6 +1242,7 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits,
+                       enum rgp_flush_bits *sqtt_flush_bits,
                        uint64_t gfx9_eop_bug_va)
 {
    unsigned cp_coher_cntl = 0;
@@ -1226,14 +1252,19 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
    if (chip_class >= GFX10) {
       /* GFX10 cache flush handling is quite different. */
       gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va,
-                                is_mec, flush_bits, gfx9_eop_bug_va);
+                                is_mec, flush_bits, sqtt_flush_bits,
+                                gfx9_eop_bug_va);
       return;
    }

-   if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+   if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
       cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-   if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+   }
+   if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
       cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
+   }

    if (chip_class <= GFX8) {
       if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
@@ -1259,34 +1290,48 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                0,
                                gfx9_eop_bug_va);
          }
+
+         *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
       }
       if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
          cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                           S_0085F0_DB_DEST_BASE_ENA(1);
+
+         *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
       }
    }

    if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+
+      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
    }

    if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+
+      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
    }

    if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+      *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
    } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+      *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
    }

    if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+      *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
    }

    if (chip_class == GFX9 && flush_cb_db) {
@@ -1310,6 +1355,9 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
       tc_flags = EVENT_TC_ACTION_ENA |
                  EVENT_TC_MD_ACTION_ENA;

+      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB |
+                          RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
+
       /* Ideally flush TC together with CB/DB. */
       if (flush_bits & RADV_CMD_FLAG_INV_L2) {
          /* Writeback and invalidate everything in L2 & L1. */
@@ -1321,6 +1369,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
          flush_bits &= ~(RADV_CMD_FLAG_INV_L2 |
                          RADV_CMD_FLAG_WB_L2 |
                          RADV_CMD_FLAG_INV_VCACHE);
+
+         *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
       }
       assert(flush_cnt);
       (*flush_cnt)++;
@@ -1357,6 +1407,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
        !is_mec) {
       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
       radeon_emit(cs, 0);
+
+      *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
    }

    if ((flush_bits & RADV_CMD_FLAG_INV_L2) ||
@@ -1367,6 +1419,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                           S_0085F0_TCL1_ACTION_ENA(1) |
                           S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8));
       cp_coher_cntl = 0;
+
+      *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0;
    } else {
       if(flush_bits & RADV_CMD_FLAG_WB_L2) {
          /* WB = write-back
@@ -1381,6 +1435,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                               S_0301F0_TC_WB_ACTION_ENA(1) |
                               S_0301F0_TC_NC_ACTION_ENA(1));
          cp_coher_cntl = 0;
+
+         *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0;
       }
       if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
          si_emit_acquire_mem(cs, is_mec,
@@ -1388,6 +1444,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                              cp_coher_cntl |
                              S_0085F0_TCL1_ACTION_ENA(1));
          cp_coher_cntl = 0;
+
+         *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0;
       }
    }
@@ -1437,6 +1495,7 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                           cmd_buffer->gfx9_fence_va,
                           radv_cmd_buffer_uses_mec(cmd_buffer),
                           cmd_buffer->state.flush_bits,
+                          &cmd_buffer->state.sqtt_flush_bits,
                           cmd_buffer->gfx9_eop_bug_va);