radeonsi: handle deferred cache flushes as a state (si_atom)

This allows us to remove a little bit of code from si_draw, and enable
removing more code in the future.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
Marek Olšák
2023-07-16 10:38:20 -04:00
committed by Marge Bot
parent c3129b2b83
commit 1e4b539042
16 changed files with 117 additions and 27 deletions
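In outline: instead of hot paths like si_draw checking sctx->flags and calling the flush emitter directly, call sites now set the flags and mark a new cache_flush atom dirty; the flush is then emitted together with all other dirty atoms. Below is a minimal standalone sketch of that pattern with mock types and values — only si_atom, dirty_atoms, si_mark_atom_dirty, and the shape of the emit callback mirror the driver:

   #include <stdint.h>
   #include <stdio.h>

   struct si_context;

   /* An atom is a piece of state with a callback that writes it to the command stream. */
   struct si_atom {
      void (*emit)(struct si_context *sctx, unsigned index);
   };

   struct si_context {
      unsigned flags;          /* pending SI_CONTEXT_* flush flags */
      uint64_t dirty_atoms;    /* one bit per atom */
      struct si_atom atoms[1]; /* the real driver uses union si_state_atoms */
   };

   static void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
   {
      sctx->dirty_atoms |= 1ull << (unsigned)(atom - sctx->atoms);
   }

   /* The cache_flush atom's emit callback runs the pre-existing flush logic. */
   static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index)
   {
      (void)index;
      printf("flushing caches, flags = 0x%x\n", sctx->flags);
      sctx->flags = 0;
   }

   /* Emit every dirty atom; the cache flush now rides along with the rest. */
   static void si_emit_all_states(struct si_context *sctx)
   {
      while (sctx->dirty_atoms) {
         unsigned i = (unsigned)__builtin_ctzll(sctx->dirty_atoms);
         sctx->dirty_atoms &= sctx->dirty_atoms - 1;
         sctx->atoms[i].emit(sctx, i);
      }
   }

   int main(void)
   {
      struct si_context ctx = { .atoms = { { si_emit_cache_flush_state } } };

      ctx.flags |= 0x1;                        /* e.g. SI_CONTEXT_FLUSH_AND_INV_CB */
      si_mark_atom_dirty(&ctx, &ctx.atoms[0]); /* deferred: just set a dirty bit */
      si_emit_all_states(&ctx);                /* the flush happens here */
      return 0;
   }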

View File

@@ -496,16 +496,20 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture
    /* Required before and after FMASK and DCC_DECOMPRESS. */
    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    si_blitter_begin(sctx, SI_DECOMPRESS);
    util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
    si_blitter_end(sctx);

    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    /* When running FMASK decompression with DCC, we need to run the "eliminate fast clear" pass
     * separately because FMASK decompression doesn't eliminate DCC fast clear. This makes
@@ -1036,6 +1040,7 @@ static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_inf
 {
    /* Required before and after CB_RESOLVE. */
    sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    si_blitter_begin(
       sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));

View File

@@ -55,6 +55,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_INV_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
       if (info[i].is_dcc_msaa) {
@@ -83,6 +85,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    /* GFX6-8: CB and DB don't use L2. */
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_WB_L2;
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
@@ -1162,8 +1166,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
       }

-      if (needs_db_flush)
+      if (needs_db_flush) {
          sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }

    if (unlikely(sctx->sqtt_enabled)) {

View File

@@ -947,8 +947,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
                            info->block[0] * info->block[1] * info->block[2] > 256;

-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
       return;
@@ -976,6 +978,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
       if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
         sctx->flags |= SI_CONTEXT_WB_L2;
+        si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
         si_resource(info->indirect)->TC_L2_dirty = false;
      }
   }
@@ -1024,7 +1027,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    /* Registers that are not read from memory should be set before this: */
    if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);

    if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
       sctx->atoms.s.render_cond.emit(sctx, -1);
@@ -1060,8 +1063,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    sctx->compute_is_busy = true;
    sctx->num_compute_calls++;

-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 }

 void si_destroy_compute(struct si_compute *program)

View File

@@ -163,6 +163,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
    if (sctx->num_hw_pipestat_streamout_queries)
       sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    if (!(flags & SI_OP_CS_RENDER_COND_ENABLE))
       sctx->render_cond_enabled = false;
@@ -213,6 +216,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
          sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
+
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
@@ -220,8 +226,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
                                    unsigned num_buffers, const struct pipe_shader_buffer *buffers,
                                    unsigned writeable_bitmask)
 {
-   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
       sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

    /* Save states. */
    struct pipe_shader_buffer saved_sb[3] = {};
@@ -243,8 +251,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
    /* Do cache flushing at the end. */
    if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) {
-      if (flags & SI_OP_SYNC_AFTER)
+      if (flags & SI_OP_SYNC_AFTER) {
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    } else {
       while (writeable_bitmask)
          si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true;

View File

@@ -144,7 +144,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * Also wait for the previous CP DMA operations.
     */
    if (*is_first && sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);

    if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR))
       *packet_flags |= CP_DMA_RAW_WAIT;
@@ -192,6 +192,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
    }

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
@@ -330,6 +333,9 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* This is the main part doing the copying. Src is always aligned. */
    main_dst_offset = dst_offset + skipped_size;
    main_src_offset = src_offset + skipped_size;

View File

@@ -1926,7 +1926,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
     * descriptors directly in memory, in case the GPU is using them.
     */
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
-   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   si_emit_cache_flush_direct(sctx);

    util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
       unsigned desc_slot = (*tex_handle)->desc_slot;
@@ -1950,6 +1950,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
    /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
    sctx->flags |= SI_CONTEXT_INV_SCACHE;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    sctx->bindless_descriptors_dirty = false;
 }

View File

@@ -108,7 +108,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
    /* Wait for draw calls to finish if needed. */
    if (wait_flags) {
       ctx->flags |= wait_flags;
-      ctx->emit_cache_flush(ctx, &ctx->gfx_cs);
+      si_emit_cache_flush_direct(ctx);
    }

    ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
@@ -396,6 +396,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
       ctx->flags |= SI_CONTEXT_VGT_FLUSH;

+   si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
+
    if (ctx->screen->attribute_ring) {
       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,
                                 RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
@@ -658,6 +660,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    unsigned cb_db_event = 0;
    unsigned flags = ctx->flags;

+   if (!flags)
+      return;
+
    if (!ctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
@@ -911,10 +916,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    ctx->flags = 0;
 }

-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
 {
    uint32_t flags = sctx->flags;

+   if (!flags)
+      return;
+
    if (!sctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
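Two related changes here: si_emit_cache_flush is renamed to gfx6_emit_cache_flush, mirroring gfx10_emit_cache_flush, and both emitters now return early when no flags are set, so the new call sites (the cache_flush atom callback and si_emit_cache_flush_direct) emit nothing when there is nothing pending.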

View File

@@ -611,7 +611,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    if (sctx->gfx_level >= GFX10)
       sctx->emit_cache_flush = gfx10_emit_cache_flush;
    else
-      sctx->emit_cache_flush = si_emit_cache_flush;
+      sctx->emit_cache_flush = gfx6_emit_cache_flush;

    sctx->b.emit_string_marker = si_emit_string_marker;
    sctx->b.set_debug_callback = si_set_debug_callback;

View File

@@ -1531,7 +1531,7 @@ void si_trace_emit(struct si_context *sctx);
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                           unsigned cp_coher_cntl);
 void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);

 /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement
  * optimizations without affecting the normal draw_vbo functions perf.
  */
@@ -1851,6 +1851,8 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
@@ -1876,6 +1878,8 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
@@ -2116,6 +2120,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim,
    }
 }

+/* There are 3 ways to flush caches and all of them are correct.
+ *
+ * 1) sctx->flags |= ...;
+ *    si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); // deferred
+ *
+ * 2) sctx->flags |= ...;
+ *    si_emit_cache_flush_direct(sctx); // immediate
+ *
+ * 3) sctx->flags |= ...;
+ *    sctx->emit_cache_flush(sctx, cs); // immediate (2 is better though)
+ */
+static inline void si_emit_cache_flush_direct(struct si_context *sctx)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   sctx->dirty_atoms &= ~SI_ATOM_BIT(cache_flush);
+}
+
 #define PRINT_ERR(fmt, args...) \
    fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
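Of the three options, si_emit_cache_flush_direct also clears the cache_flush bit from dirty_atoms, so a flush that was already requested via option 1 is performed now rather than emitted a second time when the remaining dirty atoms are processed. Option 3 flushes too but leaves the dirty bit set, which is presumably why the comment marks option 2 as better.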

View File

@@ -859,9 +859,11 @@ static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type,
       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
 }
@@ -1569,6 +1571,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
    }

    sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
@@ -1664,6 +1667,7 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que
       /* Settings this in the render cond atom is too late,
        * so set it here. */
       sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

       sctx->render_cond_enabled = old_render_cond_enabled;
    }

View File

@@ -1494,11 +1494,13 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
       if (sctx->num_hw_pipestat_streamout_queries) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    } else {
       if (sctx->num_hw_pipestat_streamout_queries) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
@@ -2893,6 +2895,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
       }

       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

       /* u_blitter doesn't invoke depth decompression when it does multiple
        * blits in a row, but the only case when it matters for DB is when
@@ -2910,6 +2913,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        * Flushing DB metadata works around the problem.
        */
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

    /* Take the maximum of the old and new count. If the new count is lower,
@@ -5390,6 +5394,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
    /* Indirect buffers use TC L2 on GFX9, but not older hw. */
    if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
       sctx->flags |= SI_CONTEXT_WB_L2;
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
@@ -5402,6 +5408,11 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
    return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }

+static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+}
+
 void si_init_state_compute_functions(struct si_context *sctx)
 {
    sctx->b.create_sampler_state = si_create_sampler_state;
@@ -5434,6 +5445,7 @@ void si_init_state_functions(struct si_context *sctx)
    sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
    sctx->atoms.s.clip_state.emit = si_emit_clip_state;
    sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+   sctx->atoms.s.cache_flush.emit = si_emit_cache_flush_state;

    sctx->b.create_blend_state = si_create_blend_state;
    sctx->b.bind_blend_state = si_bind_blend_state;

View File

@@ -208,6 +208,7 @@ union si_state_atoms {
       struct si_atom ngg_cull_state;
       struct si_atom vgt_pipeline_state;
       struct si_atom tess_io_layout;
+      struct si_atom cache_flush;
    } s;

    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };
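The union gives each atom two addresses: by name through atoms.s and by index through atoms.array, which is what lets a per-atom dirty bit be derived. The SI_ATOM_BIT macro used by si_emit_cache_flush_direct is not part of this diff; a plausible reconstruction of the mapping, as an assumption rather than the verbatim macro:

   /* Hypothetical reconstruction: an atom's bit index is its offset inside
    * the named struct, measured in si_atom units. */
   #define SI_ATOM_BIT(name) \
      (1ull << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))

   /* si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush) then amounts to
    * sctx->dirty_atoms |= SI_ATOM_BIT(cache_flush); */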

View File

@@ -833,8 +833,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
       if (GFX_VERSION == GFX7 &&
           sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
           num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
-                                                              instance_count, 2, sctx->patch_vertices))
+                                                              instance_count, 2, sctx->patch_vertices)) {
          sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }

    return ia_multi_vgt_param;
@@ -2086,6 +2088,7 @@ static void si_draw(struct pipe_context *ctx,
          /* GFX8 reads index buffers through TC L2, so it doesn't
           * need this. */
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
          si_resource(indexbuf)->TC_L2_dirty = false;
       }
    }
@@ -2098,12 +2101,14 @@ static void si_draw(struct pipe_context *ctx,
       if (GFX_VERSION <= GFX8) {
          if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
            si_resource(indirect->buffer)->TC_L2_dirty = false;
         }

         if (indirect->indirect_draw_count &&
             si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
            si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
         }
      }
@@ -2260,18 +2265,17 @@ static void si_draw(struct pipe_context *ctx,
    /* Emit all states except possibly render condition. */
    si_emit_rasterizer_prim_state<GFX_VERSION, HAS_GS, NGG, IS_BLIT>(sctx);
-   si_emit_all_states(sctx, masked_atoms);
-
-   /* Emit draw states. */
-   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
-      (sctx, index_size);
+   /* This must be done before si_emit_all_states because it can set cache flush flags. */
    si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
       (sctx, indirect, prim, index_size, instance_count, primitive_restart,
        info->restart_index, min_direct_count);

+   /* This emits states and flushes caches. */
+   si_emit_all_states(sctx, masked_atoms);
+   /* <-- CUs are idle here if the cache_flush state waited. */
-   if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
-   /* <-- CUs are idle here if we waited. */
+
+   /* This must be done after si_emit_all_states, which can affect this. */
+   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
+      (sctx, index_size);

    /* If we haven't emitted the render condition state (because it depends on cache flushes),
     * do it now.
@@ -2328,6 +2332,7 @@ static void si_draw(struct pipe_context *ctx,
                 (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) &&
        si_get_strmout_en(sctx)) {
       sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

    if (unlikely(IS_BLIT && sctx->decompression_enabled)) {
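The reordering above is where si_draw actually sheds code: si_emit_draw_registers runs first because it can still set flush flags, si_emit_all_states then emits the cache flush as just another dirty atom (making the old explicit "if (sctx->flags) emit_cache_flush" step redundant), and si_emit_vs_state moves last because, per the new comment, states emitted by si_emit_all_states can affect it.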

View File

@@ -3427,6 +3427,8 @@ bool si_update_ngg(struct si_context *sctx)
     */
    if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) {
       sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
       if (sctx->gfx_level == GFX10) {
          /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */
          si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

View File

@@ -115,6 +115,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
                      SI_CONTEXT_PFP_SYNC_ME;
    }

+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+
    /* Streamout buffers must be bound in 2 places:
     * 1) in VGT by setting the VGT_STRMOUT registers
     * 2) as shader resources
@@ -193,7 +196,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

    if (wait_now)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
 }

 static void si_flush_vgt_streamout(struct si_context *sctx)
@@ -309,7 +312,7 @@ void si_emit_streamout_end(struct si_context *sctx)
    if (sctx->gfx_level >= GFX11) {
       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
       sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
    } else {
       si_flush_vgt_streamout(sctx);
    }
@@ -326,6 +329,7 @@ void si_emit_streamout_end(struct si_context *sctx)
                       COPY_DATA_REG, NULL,
                       (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
       sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    } else {
       radeon_begin(cs);
       radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));

View File

@@ -153,7 +153,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
          sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                         SI_CONTEXT_FLUSH_AND_INV_CB |
                         SI_CONTEXT_FLUSH_AND_INV_DB;
-         sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+         si_emit_cache_flush_direct(sctx);

          struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
          ctx->begin_query(ctx, q);
@@ -217,7 +217,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
             sctx->flags |= SI_CONTEXT_INV_VCACHE |
                            (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                            SI_CONTEXT_CS_PARTIAL_FLUSH;
-            sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+            si_emit_cache_flush_direct(sctx);
          }

          ctx->end_query(ctx, q);