From 1537b9355aa112d2d83ed5787cb18a3c564c1107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 23 Aug 2024 08:06:02 -0400 Subject: [PATCH] ac,radeonsi: update comments related to the L2 cache, use "L2", not "TC" "GL2" is also OK. "TC-compatible" is also OK. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_surface.c | 4 ++-- src/amd/common/ac_surface.h | 2 +- src/gallium/drivers/radeonsi/si_clear.c | 2 +- src/gallium/drivers/radeonsi/si_compute.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 8 ++++---- src/gallium/drivers/radeonsi/si_state.c | 19 +++++-------------- .../drivers/radeonsi/si_state_draw.cpp | 6 +++--- .../drivers/radeonsi/si_state_streamout.c | 7 ++++--- 8 files changed, 21 insertions(+), 29 deletions(-) diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c index 492cba3c401..5616dc0663f 100644 --- a/src/amd/common/ac_surface.c +++ b/src/amd/common/ac_surface.c @@ -1548,7 +1548,7 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *i */ if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && surf->meta_size && config->info.levels > 1) { /* The smallest miplevels that are never compressed by DCC - * still read the DCC buffer via TC if the base level uses DCC, + * still read the DCC buffer from memory if the base level uses DCC, * and for some reason the DCC buffer needs to be larger if * the miptree uses non-zero tile_swizzle. Otherwise there are * VM faults. @@ -2192,7 +2192,7 @@ static int gfx9_compute_miptree(struct ac_addrlib *addrlib, const struct radeon_ * * Alternative solutions that also work but are worse: * - Disable DCC entirely. - * - Flush TC L2 after rendering. + * - Flush the L2 cache after rendering. 
*/ for (unsigned i = 0; i < in->numMipLevels; i++) { surf->u.gfx9.meta_levels[i].offset = meta_mip_info[i].offset; diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h index 4e529af9a82..e8cbe929de2 100644 --- a/src/amd/common/ac_surface.h +++ b/src/amd/common/ac_surface.h @@ -153,7 +153,7 @@ enum gfx9_resource_type struct gfx9_surf_meta_flags { uint8_t rb_aligned : 1; /* optimal for RBs */ - uint8_t pipe_aligned : 1; /* optimal for TC */ + uint8_t pipe_aligned : 1; /* optimal for L2 */ uint8_t independent_64B_blocks : 1; uint8_t independent_128B_blocks : 1; uint8_t max_compressed_block_size : 2; diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 5beff4db8f4..aefe034bc38 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -66,7 +66,7 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, sctx->framebuffer.DB_has_shader_readable_metadata); } - /* Flush caches in case we use compute. */ + /* Invalidate the VMEM cache because we always use compute. */ sctx->flags |= SI_CONTEXT_INV_VCACHE; /* GFX6-8: CB and DB don't use L2. */ diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 22a64784fe4..2ff851e8be8 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -1213,7 +1213,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info } if (info->indirect) { - /* Indirect buffers use TC L2 on GFX9-GFX11, but not other hw. */ + /* Indirect buffers are read through L2 on GFX9-GFX11, but not other hw. 
*/ if ((sctx->gfx_level <= GFX8 || sctx->gfx_level == GFX12) && si_resource(info->indirect)->TC_L2_dirty) { sctx->flags |= SI_CONTEXT_WB_L2 | SI_CONTEXT_PFP_SYNC_ME; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index c07ba4276e1..f1480f376cd 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -331,14 +331,14 @@ struct si_resource { struct util_range valid_buffer_range; /* For buffers only. This indicates that a write operation has been - * performed by TC L2, but the cache hasn't been flushed. - * Any hw block which doesn't use or bypasses TC L2 should check this + * performed by L2, but the cache hasn't been flushed. + * Any hw block which doesn't use or bypasses L2 should check this * flag and flush the cache before using the buffer. * - * For example, TC L2 must be flushed if a buffer which has been + * For example, L2 must be flushed if a buffer which has been * modified by a shader store instruction is about to be used as * an index buffer. The reason is that VGT DMA index fetching doesn't - * use TC L2. + * use L2. */ bool TC_L2_dirty; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 39730e76f69..8121f30fa47 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2627,20 +2627,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, } } - /* Only flush TC when changing the framebuffer state, because - * the only client not using TC that can change textures is - * the framebuffer. - * - * Wait for compute shaders because of possible transitions: - * - FB write -> shader read - * - shader write -> FB read - * - * Wait for draws because of possible transitions: - * - texture -> render (eg: glBlitFramebuffer(with src=dst) then glDraw*) - * - * DB caches are flushed on demand (using si_decompress_textures). 
- * - * When MSAA is enabled, CB and TC caches are flushed on demand + /* When MSAA is enabled, CB and L2 caches are flushed on demand * (after FMASK decompression). Shader write -> FB read transitions * cannot happen for MSAA textures, because MSAA shader images are * not supported. @@ -2653,9 +2640,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.all_DCC_pipe_aligned); } + /* Wait for CS because: shader write -> FB read + * Wait for PS because: texture -> render (eg: glBlitFramebuffer(with src=dst) then glDraw*) + */ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH; si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + /* DB caches are flushed on demand (using si_decompress_textures) except in the cases below. */ if (sctx->gfx_level >= GFX12) { si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, true, false); } else if (sctx->generate_mipmap_for_depth) { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index bd2c936306b..6062f4f80af 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -594,7 +594,7 @@ static void si_prefetch_shaders(struct si_context *sctx) if (GFX_VERSION < GFX7 || !mask) return; - /* Prefetch shaders and VBO descriptors to TC L2. */ + /* Prefetch shaders and VBO descriptors into L2. */ if (GFX_VERSION >= GFX11) { if (HAS_TESS && mask & SI_PREFETCH_HS) si_prefetch_shader_async(sctx, sctx->queued.named.hs); @@ -2134,7 +2134,7 @@ static void si_draw(struct pipe_context *ctx, index_offset -= start_offset; } else if ((GFX_VERSION <= GFX7 || GFX_VERSION == GFX12) && si_resource(indexbuf)->TC_L2_dirty) { - /* GFX8-GFX11 reads index buffers through TC L2, so it doesn't + /* GFX8-GFX11 reads index buffers through L2, so it doesn't * need this. 
*/ sctx->flags |= SI_CONTEXT_WB_L2 | SI_CONTEXT_PFP_SYNC_ME; si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); @@ -2146,7 +2146,7 @@ static void si_draw(struct pipe_context *ctx, unsigned total_direct_count = 0; if (!IS_DRAW_VERTEX_STATE && indirect) { - /* Indirect buffers use TC L2 on GFX9-GFX11, but not other hw. */ + /* Indirect buffers use L2 on GFX9-GFX11, but not other hw. */ if (GFX_VERSION <= GFX8 || GFX_VERSION == GFX12) { if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) { sctx->flags |= SI_CONTEXT_WB_L2 | SI_CONTEXT_PFP_SYNC_ME; diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 8047965ff84..8858a68c5fa 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -75,13 +75,13 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ /* Stop streamout. */ si_emit_streamout_end(sctx); - /* Since streamout uses vector writes which go through TC L2 - * and most other clients can use TC L2 as well, we don't need + /* Since streamout uses vector writes which go through L2 + * and most other clients can use L2 as well, we don't need * to flush it. * * The only cases which requires flushing it is VGT DMA index * fetching (on <= GFX7) and indirect draw data, which are rare - * cases. Thus, flag the TC L2 dirtiness in the resource and + * cases. Thus, flag the L2 dirtiness in the resource and * handle it at draw call time. */ for (i = 0; i < old_num_targets; i++) @@ -387,6 +387,7 @@ void si_emit_streamout_end(struct si_context *sctx) t[i]->buf_filled_size, t[i]->buf_filled_size_offset, COPY_DATA_REG, NULL, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i); + /* For DrawTF reading buf_filled_size: */ sctx->flags |= SI_CONTEXT_PFP_SYNC_ME; si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } else {