radeonsi: handle deferred cache flushes as a state (si_atom)
This allows us to remove a little bit of code from si_draw, and enable removing more code in the future.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
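For illustration, a minimal sketch of the two flush idioms this change distinguishes (a hedged example: the identifiers are taken from the diff below, and the surrounding si_context/atom definitions are assumed):

    /* Deferred: record the flush bits and mark the new cache_flush atom dirty;
     * the flush is then emitted together with the other dirty states at the
     * next draw or dispatch. */
    sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
    si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    /* Immediate: emit the flush right away and clear the atom's dirty bit so
     * it is not emitted a second time (see si_emit_cache_flush_direct below). */
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
    si_emit_cache_flush_direct(sctx);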
@@ -496,16 +496,20 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture
    /* Required before and after FMASK and DCC_DECOMPRESS. */
    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
    si_blitter_begin(sctx, SI_DECOMPRESS);
    util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
    si_blitter_end(sctx);
 
    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
    /* When running FMASK decompression with DCC, we need to run the "eliminate fast clear" pass
     * separately because FMASK decompression doesn't eliminate DCC fast clear. This makes
@@ -1036,6 +1040,7 @@ static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_inf
 {
    /* Required before and after CB_RESOLVE. */
    sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 
    si_blitter_begin(
       sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));

@@ -55,6 +55,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_INV_L2;
 
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
       if (info[i].is_dcc_msaa) {
@@ -83,6 +85,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    /* GFX6-8: CB and DB don't use L2. */
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_WB_L2;
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
 static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
@@ -1162,8 +1166,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
       }
 
-      if (needs_db_flush)
+      if (needs_db_flush) {
          sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }
 
    if (unlikely(sctx->sqtt_enabled)) {

@@ -947,8 +947,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
                            info->block[0] * info->block[1] * info->block[2] > 256;
 
-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
    if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
       return;
@@ -976,6 +978,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
       if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
          si_resource(info->indirect)->TC_L2_dirty = false;
       }
    }
@@ -1024,7 +1027,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
 
    /* Registers that are not read from memory should be set before this: */
-   if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   si_emit_cache_flush_direct(sctx);
 
    if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
       sctx->atoms.s.render_cond.emit(sctx, -1);
@@ -1060,8 +1063,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    sctx->compute_is_busy = true;
    sctx->num_compute_calls++;
 
-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 }
 
 void si_destroy_compute(struct si_compute *program)

@@ -163,6 +163,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
    if (sctx->num_hw_pipestat_streamout_queries)
       sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
 
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    if (!(flags & SI_OP_CS_RENDER_COND_ENABLE))
      sctx->render_cond_enabled = false;
 
@@ -213,6 +216,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
      }
   }
+
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
@@ -220,8 +226,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
                                    unsigned num_buffers, const struct pipe_shader_buffer *buffers,
                                    unsigned writeable_bitmask)
 {
-   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
       sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
    /* Save states. */
    struct pipe_shader_buffer saved_sb[3] = {};
@@ -243,8 +251,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
 
    /* Do cache flushing at the end. */
    if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) {
-      if (flags & SI_OP_SYNC_AFTER)
+      if (flags & SI_OP_SYNC_AFTER) {
         sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    } else {
      while (writeable_bitmask)
         si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true;

@@ -144,7 +144,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * Also wait for the previous CP DMA operations.
     */
    if (*is_first && sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
 
    if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR))
       *packet_flags |= CP_DMA_RAW_WAIT;
@@ -192,6 +192,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
    }
 
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
@@ -330,6 +333,9 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
 
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* This is the main part doing the copying. Src is always aligned. */
    main_dst_offset = dst_offset + skipped_size;
    main_src_offset = src_offset + skipped_size;

@@ -1926,7 +1926,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
     * descriptors directly in memory, in case the GPU is using them.
     */
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
-   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   si_emit_cache_flush_direct(sctx);
 
    util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
       unsigned desc_slot = (*tex_handle)->desc_slot;
@@ -1950,6 +1950,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
 
    /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
    sctx->flags |= SI_CONTEXT_INV_SCACHE;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    sctx->bindless_descriptors_dirty = false;
 }
 

@@ -108,7 +108,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
    /* Wait for draw calls to finish if needed. */
    if (wait_flags) {
       ctx->flags |= wait_flags;
-      ctx->emit_cache_flush(ctx, &ctx->gfx_cs);
+      si_emit_cache_flush_direct(ctx);
    }
    ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
 
@@ -396,6 +396,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
       ctx->flags |= SI_CONTEXT_VGT_FLUSH;
 
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
+
    if (ctx->screen->attribute_ring) {
       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,
                                 RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
@@ -658,6 +660,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    unsigned cb_db_event = 0;
    unsigned flags = ctx->flags;
 
+   if (!flags)
+      return;
+
    if (!ctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
@@ -911,10 +916,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    ctx->flags = 0;
 }
 
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
 {
    uint32_t flags = sctx->flags;
 
+   if (!flags)
+      return;
+
    if (!sctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |

@@ -611,7 +611,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    if (sctx->gfx_level >= GFX10)
       sctx->emit_cache_flush = gfx10_emit_cache_flush;
    else
-      sctx->emit_cache_flush = si_emit_cache_flush;
+      sctx->emit_cache_flush = gfx6_emit_cache_flush;
 
    sctx->b.emit_string_marker = si_emit_string_marker;
    sctx->b.set_debug_callback = si_set_debug_callback;

@@ -1531,7 +1531,7 @@ void si_trace_emit(struct si_context *sctx);
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                           unsigned cp_coher_cntl);
 void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
 /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement
  * optimizations without affecting the normal draw_vbo functions perf.
  */
@@ -1851,6 +1851,8 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
 static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
@@ -1876,6 +1878,8 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
 static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
@@ -2116,6 +2120,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim,
    }
 }
 
+/* There are 3 ways to flush caches and all of them are correct.
+ *
+ * 1) sctx->flags |= ...;
+ *    si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); // deferred
+ *
+ * 2) sctx->flags |= ...;
+ *    si_emit_cache_flush_direct(sctx); // immediate
+ *
+ * 3) sctx->flags |= ...;
+ *    sctx->emit_cache_flush(sctx, cs); // immediate (2 is better though)
+ */
+static inline void si_emit_cache_flush_direct(struct si_context *sctx)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   sctx->dirty_atoms &= ~SI_ATOM_BIT(cache_flush);
+}
+
 #define PRINT_ERR(fmt, args...) \
    fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
 

@@ -859,9 +859,11 @@ static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type,
       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
 }
@@ -1569,6 +1571,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
    }
 
    sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 
    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
@@ -1664,6 +1667,7 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que
    /* Settings this in the render cond atom is too late,
     * so set it here. */
    sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 
    sctx->render_cond_enabled = old_render_cond_enabled;
 }

@@ -1494,11 +1494,13 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
       if (sctx->num_hw_pipestat_streamout_queries) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    } else {
       if (sctx->num_hw_pipestat_streamout_queries) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
 
@@ -2893,6 +2895,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
       }
 
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 
       /* u_blitter doesn't invoke depth decompression when it does multiple
        * blits in a row, but the only case when it matters for DB is when
@@ -2910,6 +2913,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        * Flushing DB metadata works around the problem.
        */
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }
 
    /* Take the maximum of the old and new count. If the new count is lower,
@@ -5390,6 +5394,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
    /* Indirect buffers use TC L2 on GFX9, but not older hw. */
    if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
       sctx->flags |= SI_CONTEXT_WB_L2;
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
@@ -5402,6 +5408,11 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
    return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }
 
+static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+}
+
 void si_init_state_compute_functions(struct si_context *sctx)
 {
    sctx->b.create_sampler_state = si_create_sampler_state;
@@ -5434,6 +5445,7 @@ void si_init_state_functions(struct si_context *sctx)
    sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
    sctx->atoms.s.clip_state.emit = si_emit_clip_state;
    sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+   sctx->atoms.s.cache_flush.emit = si_emit_cache_flush_state;
 
    sctx->b.create_blend_state = si_create_blend_state;
    sctx->b.bind_blend_state = si_bind_blend_state;

@@ -208,6 +208,7 @@ union si_state_atoms {
       struct si_atom ngg_cull_state;
       struct si_atom vgt_pipeline_state;
       struct si_atom tess_io_layout;
+      struct si_atom cache_flush;
    } s;
    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };

@@ -833,8 +833,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
       if (GFX_VERSION == GFX7 &&
           sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
           num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
-                                                              instance_count, 2, sctx->patch_vertices))
+                                                              instance_count, 2, sctx->patch_vertices)) {
          sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }
 
    return ia_multi_vgt_param;
@@ -2086,6 +2088,7 @@ static void si_draw(struct pipe_context *ctx,
          /* GFX8 reads index buffers through TC L2, so it doesn't
           * need this. */
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
          si_resource(indexbuf)->TC_L2_dirty = false;
       }
    }
@@ -2098,12 +2101,14 @@ static void si_draw(struct pipe_context *ctx,
       if (GFX_VERSION <= GFX8) {
          if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
             sctx->flags |= SI_CONTEXT_WB_L2;
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
             si_resource(indirect->buffer)->TC_L2_dirty = false;
          }
 
          if (indirect->indirect_draw_count &&
             si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
             sctx->flags |= SI_CONTEXT_WB_L2;
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
             si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
          }
       }
@@ -2260,18 +2265,17 @@ static void si_draw(struct pipe_context *ctx,
 
    /* Emit all states except possibly render condition. */
    si_emit_rasterizer_prim_state<GFX_VERSION, HAS_GS, NGG, IS_BLIT>(sctx);
-   si_emit_all_states(sctx, masked_atoms);
-
-   /* Emit draw states. */
-   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
-      (sctx, index_size);
+   /* This must be done before si_emit_all_states because it can set cache flush flags. */
    si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
       (sctx, indirect, prim, index_size, instance_count, primitive_restart,
        info->restart_index, min_direct_count);
+   /* This emits states and flushes caches. */
+   si_emit_all_states(sctx, masked_atoms);
+   /* <-- CUs are idle here if the cache_flush state waited. */
 
-   if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
-   /* <-- CUs are idle here if we waited. */
+   /* This must be done after si_emit_all_states, which can affect this. */
+   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
+      (sctx, index_size);
 
    /* If we haven't emitted the render condition state (because it depends on cache flushes),
    * do it now.
@@ -2328,6 +2332,7 @@ static void si_draw(struct pipe_context *ctx,
        (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) &&
       si_get_strmout_en(sctx)) {
      sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
+     si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }
 
    if (unlikely(IS_BLIT && sctx->decompression_enabled)) {

@@ -3427,6 +3427,8 @@ bool si_update_ngg(struct si_context *sctx)
     */
    if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) {
       sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
       if (sctx->gfx_level == GFX10) {
          /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */
          si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

@@ -115,6 +115,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
                      SI_CONTEXT_PFP_SYNC_ME;
    }
 
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* Streamout buffers must be bound in 2 places:
     * 1) in VGT by setting the VGT_STRMOUT registers
     * 2) as shader resources
@@ -193,7 +196,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
 
    if (wait_now)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
 }
 
 static void si_flush_vgt_streamout(struct si_context *sctx)
@@ -309,7 +312,7 @@ void si_emit_streamout_end(struct si_context *sctx)
    if (sctx->gfx_level >= GFX11) {
       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
       sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
    } else {
       si_flush_vgt_streamout(sctx);
    }
@@ -326,6 +329,7 @@ void si_emit_streamout_end(struct si_context *sctx)
                          COPY_DATA_REG, NULL,
                          (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
          sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       } else {
          radeon_begin(cs);
          radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));

@@ -153,7 +153,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
             sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                            SI_CONTEXT_FLUSH_AND_INV_CB |
                            SI_CONTEXT_FLUSH_AND_INV_DB;
-            sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+            si_emit_cache_flush_direct(sctx);
 
             struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
             ctx->begin_query(ctx, q);
@@ -217,7 +217,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
               sctx->flags |= SI_CONTEXT_INV_VCACHE |
                              (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                              SI_CONTEXT_CS_PARTIAL_FLUSH;
-              sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+              si_emit_cache_flush_direct(sctx);
            }
 
            ctx->end_query(ctx, q);