radeonsi: handle deferred cache flushes as a state (si_atom)

This allows us to remove a little bit of code from si_draw, and enable
removing more code in the future.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
Marek Olšák
2023-07-16 10:38:20 -04:00
committed by Marge Bot
parent c3129b2b83
commit 1e4b539042
16 changed files with 117 additions and 27 deletions
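In outline: instead of hot paths like si_draw checking sctx->flags and calling the flush emitter directly, call sites now set the flags and mark a new cache_flush atom dirty; the flush is then emitted together with all other dirty atoms. Below is a minimal standalone sketch of that pattern with mock types and values — only si_atom, dirty_atoms, si_mark_atom_dirty, and the shape of the emit callback mirror the driver:

   #include <stdint.h>
   #include <stdio.h>

   struct si_context;

   /* An atom is a piece of state with a callback that writes it to the command stream. */
   struct si_atom {
      void (*emit)(struct si_context *sctx, unsigned index);
   };

   struct si_context {
      unsigned flags;          /* pending SI_CONTEXT_* flush flags */
      uint64_t dirty_atoms;    /* one bit per atom */
      struct si_atom atoms[1]; /* the real driver uses union si_state_atoms */
   };

   static void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
   {
      sctx->dirty_atoms |= 1ull << (unsigned)(atom - sctx->atoms);
   }

   /* The cache_flush atom's emit callback runs the pre-existing flush logic. */
   static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index)
   {
      (void)index;
      printf("flushing caches, flags = 0x%x\n", sctx->flags);
      sctx->flags = 0;
   }

   /* Emit every dirty atom; the cache flush now rides along with the rest. */
   static void si_emit_all_states(struct si_context *sctx)
   {
      while (sctx->dirty_atoms) {
         unsigned i = (unsigned)__builtin_ctzll(sctx->dirty_atoms);
         sctx->dirty_atoms &= sctx->dirty_atoms - 1;
         sctx->atoms[i].emit(sctx, i);
      }
   }

   int main(void)
   {
      struct si_context ctx = { .atoms = { { si_emit_cache_flush_state } } };

      ctx.flags |= 0x1;                        /* e.g. SI_CONTEXT_FLUSH_AND_INV_CB */
      si_mark_atom_dirty(&ctx, &ctx.atoms[0]); /* deferred: just set a dirty bit */
      si_emit_all_states(&ctx);                /* the flush happens here */
      return 0;
   }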

View File

@@ -496,16 +496,20 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture
    /* Required before and after FMASK and DCC_DECOMPRESS. */
    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    si_blitter_begin(sctx, SI_DECOMPRESS);
    util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
    si_blitter_end(sctx);

    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    /* When running FMASK decompression with DCC, we need to run the "eliminate fast clear" pass
     * separately because FMASK decompression doesn't eliminate DCC fast clear. This makes
@@ -1036,6 +1040,7 @@ static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_inf
 {
    /* Required before and after CB_RESOLVE. */
    sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    si_blitter_begin(
       sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));

View File

@@ -55,6 +55,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_INV_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
       if (info[i].is_dcc_msaa) {
@@ -83,6 +85,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    /* GFX6-8: CB and DB don't use L2. */
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_WB_L2;
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
@@ -1162,8 +1166,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
       }

-      if (needs_db_flush)
+      if (needs_db_flush) {
          sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }

    if (unlikely(sctx->sqtt_enabled)) {

View File

@@ -947,8 +947,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
                            info->block[0] * info->block[1] * info->block[2] > 256;

-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
       return;
@@ -976,6 +978,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
       if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
         sctx->flags |= SI_CONTEXT_WB_L2;
+        si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
         si_resource(info->indirect)->TC_L2_dirty = false;
      }
   }
@@ -1024,7 +1027,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    /* Registers that are not read from memory should be set before this: */
    if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);

    if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
       sctx->atoms.s.render_cond.emit(sctx, -1);
@@ -1060,8 +1063,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    sctx->compute_is_busy = true;
    sctx->num_compute_calls++;

-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 }

 void si_destroy_compute(struct si_compute *program)

View File

@@ -163,6 +163,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
    if (sctx->num_hw_pipestat_streamout_queries)
       sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    if (!(flags & SI_OP_CS_RENDER_COND_ENABLE))
       sctx->render_cond_enabled = false;
@@ -213,6 +216,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
          sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
+
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
@@ -220,8 +226,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
                                    unsigned num_buffers, const struct pipe_shader_buffer *buffers,
                                    unsigned writeable_bitmask)
 {
-   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
       sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    /* Save states. */
    struct pipe_shader_buffer saved_sb[3] = {};
@@ -243,8 +251,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
    /* Do cache flushing at the end. */
    if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) {
-      if (flags & SI_OP_SYNC_AFTER)
+      if (flags & SI_OP_SYNC_AFTER) {
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    } else {
       while (writeable_bitmask)
          si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true;

View File

@@ -144,7 +144,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * Also wait for the previous CP DMA operations.
     */
    if (*is_first && sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);

    if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR))
       *packet_flags |= CP_DMA_RAW_WAIT;
@@ -192,6 +192,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
    }

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
@@ -330,6 +333,9 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* This is the main part doing the copying. Src is always aligned. */
    main_dst_offset = dst_offset + skipped_size;
    main_src_offset = src_offset + skipped_size;

View File

@@ -1926,7 +1926,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
     * descriptors directly in memory, in case the GPU is using them.
     */
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
-   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   si_emit_cache_flush_direct(sctx);

    util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
       unsigned desc_slot = (*tex_handle)->desc_slot;
@@ -1950,6 +1950,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
    /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
    sctx->flags |= SI_CONTEXT_INV_SCACHE;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    sctx->bindless_descriptors_dirty = false;
 }

View File

@@ -108,7 +108,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
    /* Wait for draw calls to finish if needed. */
    if (wait_flags) {
       ctx->flags |= wait_flags;
-      ctx->emit_cache_flush(ctx, &ctx->gfx_cs);
+      si_emit_cache_flush_direct(ctx);
    }

    ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
@@ -396,6 +396,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
       ctx->flags |= SI_CONTEXT_VGT_FLUSH;

+   si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
+
    if (ctx->screen->attribute_ring) {
       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,
                                 RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
@@ -658,6 +660,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    unsigned cb_db_event = 0;
    unsigned flags = ctx->flags;

+   if (!flags)
+      return;
+
    if (!ctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
@@ -911,10 +916,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    ctx->flags = 0;
 }

-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
 {
    uint32_t flags = sctx->flags;

+   if (!flags)
+      return;
+
    if (!sctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
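Two related changes here: si_emit_cache_flush is renamed to gfx6_emit_cache_flush, mirroring gfx10_emit_cache_flush, and both emitters now return early when no flags are set, so the new call sites (the cache_flush atom callback and si_emit_cache_flush_direct) emit nothing when there is nothing pending.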

View File

@@ -611,7 +611,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    if (sctx->gfx_level >= GFX10)
       sctx->emit_cache_flush = gfx10_emit_cache_flush;
    else
-      sctx->emit_cache_flush = si_emit_cache_flush;
+      sctx->emit_cache_flush = gfx6_emit_cache_flush;

    sctx->b.emit_string_marker = si_emit_string_marker;
    sctx->b.set_debug_callback = si_set_debug_callback;

View File

@@ -1531,7 +1531,7 @@ void si_trace_emit(struct si_context *sctx);
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                           unsigned cp_coher_cntl);
 void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);

 /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement
  * optimizations without affecting the normal draw_vbo functions perf.
  */
@@ -1851,6 +1851,8 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
@@ -1876,6 +1878,8 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
@@ -2116,6 +2120,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim,
    }
 }

+/* There are 3 ways to flush caches and all of them are correct.
+ *
+ * 1) sctx->flags |= ...;
+ *    si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); // deferred
+ *
+ * 2) sctx->flags |= ...;
+ *    si_emit_cache_flush_direct(sctx); // immediate
+ *
+ * 3) sctx->flags |= ...;
+ *    sctx->emit_cache_flush(sctx, cs); // immediate (2 is better though)
+ */
+static inline void si_emit_cache_flush_direct(struct si_context *sctx)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   sctx->dirty_atoms &= ~SI_ATOM_BIT(cache_flush);
+}
+
 #define PRINT_ERR(fmt, args...) \
    fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
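Of the three options, si_emit_cache_flush_direct also clears the cache_flush bit from dirty_atoms, so a flush that was already requested via option 1 is performed now rather than emitted a second time when the remaining dirty atoms are processed. Option 3 flushes too but leaves the dirty bit set, which is presumably why the comment marks option 2 as better.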

View File

@@ -859,9 +859,11 @@ static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type,
       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
 }
@@ -1569,6 +1571,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
    }

    sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
@@ -1664,6 +1667,7 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que
       /* Settings this in the render cond atom is too late,
        * so set it here. */
       sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

       sctx->render_cond_enabled = old_render_cond_enabled;
    }

View File

@@ -1494,11 +1494,13 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
       if (sctx->num_hw_pipestat_streamout_queries) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    } else {
       if (sctx->num_hw_pipestat_streamout_queries) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
@@ -2893,6 +2895,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
       }

       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

       /* u_blitter doesn't invoke depth decompression when it does multiple
        * blits in a row, but the only case when it matters for DB is when
@@ -2910,6 +2913,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        * Flushing DB metadata works around the problem.
        */
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

    /* Take the maximum of the old and new count. If the new count is lower,
@@ -5390,6 +5394,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
    /* Indirect buffers use TC L2 on GFX9, but not older hw. */
    if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
       sctx->flags |= SI_CONTEXT_WB_L2;
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
@@ -5402,6 +5408,11 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
    return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }

+static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+}
+
 void si_init_state_compute_functions(struct si_context *sctx)
 {
    sctx->b.create_sampler_state = si_create_sampler_state;
@@ -5434,6 +5445,7 @@ void si_init_state_functions(struct si_context *sctx)
    sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
    sctx->atoms.s.clip_state.emit = si_emit_clip_state;
    sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+   sctx->atoms.s.cache_flush.emit = si_emit_cache_flush_state;

    sctx->b.create_blend_state = si_create_blend_state;
    sctx->b.bind_blend_state = si_bind_blend_state;

View File

@@ -208,6 +208,7 @@ union si_state_atoms {
       struct si_atom ngg_cull_state;
       struct si_atom vgt_pipeline_state;
       struct si_atom tess_io_layout;
+      struct si_atom cache_flush;
    } s;

    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };
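The union gives each atom two addresses: by name through atoms.s and by index through atoms.array, which is what lets a per-atom dirty bit be derived. The SI_ATOM_BIT macro used by si_emit_cache_flush_direct is not part of this diff; a plausible reconstruction of the mapping, as an assumption rather than the verbatim macro:

   /* Hypothetical reconstruction: an atom's bit index is its offset inside
    * the named struct, measured in si_atom units. */
   #define SI_ATOM_BIT(name) \
      (1ull << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))

   /* si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush) then amounts to
    * sctx->dirty_atoms |= SI_ATOM_BIT(cache_flush); */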

View File

@@ -833,8 +833,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
       if (GFX_VERSION == GFX7 &&
           sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
           num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
-                                                              instance_count, 2, sctx->patch_vertices))
+                                                              instance_count, 2, sctx->patch_vertices)) {
          sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }

    return ia_multi_vgt_param;
@@ -2086,6 +2088,7 @@ static void si_draw(struct pipe_context *ctx,
          /* GFX8 reads index buffers through TC L2, so it doesn't
           * need this. */
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
          si_resource(indexbuf)->TC_L2_dirty = false;
       }
    }
@@ -2098,12 +2101,14 @@ static void si_draw(struct pipe_context *ctx,
       if (GFX_VERSION <= GFX8) {
          if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
            si_resource(indirect->buffer)->TC_L2_dirty = false;
         }

         if (indirect->indirect_draw_count &&
             si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
            si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
         }
      }
@@ -2260,18 +2265,17 @@ static void si_draw(struct pipe_context *ctx,
    /* Emit all states except possibly render condition. */
    si_emit_rasterizer_prim_state<GFX_VERSION, HAS_GS, NGG, IS_BLIT>(sctx);
-   si_emit_all_states(sctx, masked_atoms);
-
-   /* Emit draw states. */
-   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
-      (sctx, index_size);
+   /* This must be done before si_emit_all_states because it can set cache flush flags. */
    si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
       (sctx, indirect, prim, index_size, instance_count, primitive_restart,
        info->restart_index, min_direct_count);

+   /* This emits states and flushes caches. */
+   si_emit_all_states(sctx, masked_atoms);
+   /* <-- CUs are idle here if the cache_flush state waited. */
-   if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
-   /* <-- CUs are idle here if we waited. */
+
+   /* This must be done after si_emit_all_states, which can affect this. */
+   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
+      (sctx, index_size);

    /* If we haven't emitted the render condition state (because it depends on cache flushes),
     * do it now.
@@ -2328,6 +2332,7 @@ static void si_draw(struct pipe_context *ctx,
                 (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) &&
        si_get_strmout_en(sctx)) {
       sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

    if (unlikely(IS_BLIT && sctx->decompression_enabled)) {
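The reordering above is where si_draw actually sheds code: si_emit_draw_registers runs first because it can still set flush flags, si_emit_all_states then emits the cache flush as just another dirty atom (making the old explicit "if (sctx->flags) emit_cache_flush" step redundant), and si_emit_vs_state moves last because, per the new comment, states emitted by si_emit_all_states can affect it.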

View File

@@ -3427,6 +3427,8 @@ bool si_update_ngg(struct si_context *sctx)
     */
    if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) {
       sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
       if (sctx->gfx_level == GFX10) {
          /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */
          si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

View File

@@ -115,6 +115,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
                      SI_CONTEXT_PFP_SYNC_ME;
    }

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* Streamout buffers must be bound in 2 places:
     * 1) in VGT by setting the VGT_STRMOUT registers
     * 2) as shader resources
@@ -193,7 +196,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

    if (wait_now)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
 }

 static void si_flush_vgt_streamout(struct si_context *sctx)
@@ -309,7 +312,7 @@ void si_emit_streamout_end(struct si_context *sctx)
    if (sctx->gfx_level >= GFX11) {
       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
       sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
    } else {
       si_flush_vgt_streamout(sctx);
    }
@@ -326,6 +329,7 @@ void si_emit_streamout_end(struct si_context *sctx)
                       COPY_DATA_REG, NULL,
                       (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
       sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    } else {
       radeon_begin(cs);
       radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));

View File

@@ -153,7 +153,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
          sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                         SI_CONTEXT_FLUSH_AND_INV_CB |
                         SI_CONTEXT_FLUSH_AND_INV_DB;
-         sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+         si_emit_cache_flush_direct(sctx);

          struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
          ctx->begin_query(ctx, q);
@@ -217,7 +217,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
             sctx->flags |= SI_CONTEXT_INV_VCACHE |
                            (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                            SI_CONTEXT_CS_PARTIAL_FLUSH;
-            sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+            si_emit_cache_flush_direct(sctx);
          }

          ctx->end_query(ctx, q);