radeonsi: add a fail_if_slow parameter to compute_clear/copy_buffer

and move all failure paths into it.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30173>
Marek Olšák
2024-04-27 08:27:36 -04:00
committed by Marge Bot
parent e42a25aea1
commit 8df427f162
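
A minimal sketch of the pattern this diff implements: the compute path rejects cases it cannot handle at all, and, when the caller passes fail_if_slow, it also declines cases where CP DMA is expected to be faster, so the caller can fall back. The names, types and the 8 KiB cutoff below are illustrative stand-ins (the cutoff echoes the dGPU copy threshold in the diff), not the actual radeonsi helpers.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the two hardware paths. */
static void launch_compute_copy(unsigned size) { printf("compute copy: %u bytes\n", size); }
static void cp_dma_copy(unsigned size)         { printf("CP DMA copy: %u bytes\n", size); }

/* Compute path: returns false instead of doing a slow or unsupported job,
 * mirroring the fail_if_slow contract added by this commit. */
static bool compute_copy_buffer(unsigned offset, unsigned size, bool fail_if_slow)
{
   /* Cases the shader cannot handle at all (dword alignment). */
   if (offset % 4 || size % 4)
      return false;

   /* Cases where CP DMA is expected to be faster; only honoured when the
    * caller asked for the check. The real driver also looks at VRAM
    * placement, GPU generation, render-condition use, etc. */
   if (fail_if_slow && size <= 8192)
      return false;

   launch_compute_copy(size);
   return true;
}

static void copy_buffer(unsigned offset, unsigned size)
{
   /* Prefer compute, but let it decline; fall back to CP DMA. */
   if (compute_copy_buffer(offset, size, /*fail_if_slow=*/true))
      return;
   cp_dma_copy(size);
}

int main(void)
{
   copy_buffer(0, 4096);   /* small copy: compute declines, CP DMA runs */
   copy_buffer(0, 65536);  /* large copy: compute path runs */
   return 0;
}

The same shape appears in si_copy_buffer below, where si_cp_dma_copy_buffer is the fallback taken when si_compute_clear_or_copy_buffer returns false.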

@@ -257,20 +257,44 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
1, &sb, 0x1);
}
static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
unsigned dst_offset, struct pipe_resource *src,
unsigned src_offset, unsigned size,
const uint32_t *clear_value, unsigned clear_value_size,
unsigned flags, enum si_coherency coher)
static bool si_compute_clear_or_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
unsigned dst_offset, struct pipe_resource *src,
unsigned src_offset, unsigned size,
const uint32_t *clear_value, unsigned clear_value_size,
unsigned flags, enum si_coherency coher,
bool fail_if_slow)
{
assert(src_offset % 4 == 0);
assert(dst_offset % 4 == 0);
assert(size % 4 == 0);
bool is_copy = src != NULL;
if (src_offset % 4 || dst_offset % 4 || size % 4 || clear_value_size % 4)
return false;
/* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
* support the render condition.
*/
if (fail_if_slow && !(flags & SI_OP_CS_RENDER_COND_ENABLE) && sctx->screen->info.has_cp_dma &&
!sctx->screen->info.cp_sdma_ge_use_system_memory_scope) {
if (is_copy) {
/* Only use compute for large VRAM copies on dGPUs. */
if (size <= 8192 || !sctx->screen->info.has_dedicated_vram ||
!(si_resource(dst)->domains & RADEON_DOMAIN_VRAM) ||
!(si_resource(src)->domains & RADEON_DOMAIN_VRAM))
return false;
} else {
/* Buffer clear.
*
* CP DMA clears are terribly slow with GTT on GFX6-8, which can be encountered with any
* buffer due to BO evictions, so never use CP DMA clears on GFX6-8. On GFX9+, use CP DMA
* clears if the size is small.
*/
if (sctx->gfx_level >= GFX9 && clear_value_size <= 4 && size <= 4096)
return false;
}
}
assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
assert(!src || src_offset + size <= src->width0);
bool is_copy = src != NULL;
unsigned dwords_per_thread = clear_value_size == 12 ? 3 : 4;
unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4);
@@ -301,6 +325,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, is_copy ? 2 : 1, sb,
is_copy ? 0x2 : 0x1);
return true;
}
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
@@ -318,52 +343,28 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
assert(offset % clear_alignment == 0);
assert(size % clear_alignment == 0);
assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
assert(offset < (UINT32_MAX & ~0x3)); /* the limit of pipe_shader_buffer::buffer_size */
assert(align(size, 16) < UINT32_MAX); /* we round up the size to 16 for compute */
uint32_t clamped;
if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped))
clear_value = &clamped;
uint64_t aligned_size = size & ~3ull;
if (aligned_size >= 4) {
uint64_t compute_min_size;
if (sctx->gfx_level <= GFX8) {
/* CP DMA clears are terribly slow with GTT on GFX6-8, which can always
* happen due to BO evictions.
*/
compute_min_size = 0;
} else {
/* Use a small enough size because CP DMA is slower than compute with bigger sizes. */
compute_min_size = 4 * 1024;
}
/* TODO: use compute for 8-bit and 16-bit clear values */
if (method == SI_AUTO_SELECT_CLEAR_METHOD &&
/* CP DMA doesn't support the render condition. */
(flags & SI_OP_CS_RENDER_COND_ENABLE ||
/* CP DMA doesn't support large clear value sizes. */
clear_value_size > 4 ||
/* Use compute if the size is large enough. Always prefer compute on GFX12. */
(clear_value_size == 4 && offset % 4 == 0 &&
(!sctx->screen->info.has_cp_dma ||
sctx->screen->info.cp_sdma_ge_use_system_memory_scope || size > compute_min_size))))
method = SI_COMPUTE_CLEAR_METHOD;
if (method == SI_COMPUTE_CLEAR_METHOD) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
clear_value_size, flags, coher);
} else {
assert(clear_value_size == 4);
assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value,
flags, coher, get_cache_policy(sctx, coher, size));
}
offset += aligned_size;
size -= aligned_size;
if (aligned_size &&
(method == SI_CP_DMA_CLEAR_METHOD ||
!si_compute_clear_or_copy_buffer(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
clear_value_size, flags, coher,
method == SI_AUTO_SELECT_CLEAR_METHOD))) {
assert(clear_value_size == 4);
assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value,
flags, coher, get_cache_policy(sctx, coher, size));
}
offset += aligned_size;
size -= aligned_size;
/* Handle non-dword alignment. */
if (size) {
assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
@@ -398,22 +399,15 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
enum si_coherency coher = SI_COHERENCY_SHADER;
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
uint64_t compute_min_size = 8 * 1024;
si_improve_sync_flags(sctx, dst, src, &flags);
/* Only use compute for VRAM copies on dGPUs. */
/* TODO: use compute for unaligned big sizes */
if (dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
(!sctx->screen->info.has_cp_dma || sctx->screen->info.cp_sdma_ge_use_system_memory_scope ||
(sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size))) {
si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
flags, coher);
} else {
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
flags, coher, cache_policy);
}
if (si_compute_clear_or_copy_buffer(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
flags, coher, true))
return;
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags, coher,
cache_policy);
}
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,