radeonsi: clear buffers with a 12B clear value by clearing 4 dwords per thread

It's faster than clearing 3 dwords per thread. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30173>
2024-04-28 06:32:39 -04:00
parent 9fa0cb8aa8
commit 65b09edff2
3 changed files with 30 additions and 5 deletions
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -280,12 +280,13 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
         assert(!"clear_value_size must be <= dwords_per_thread");
         return false; /* invalid value */
      }
-
-      if (clear_value_size == 12 && dwords_per_thread != 3)
-         return false; /* unimplemented (yet) */
   } else {
      /* Set default optimal settings. */
-      dwords_per_thread = clear_value_size == 12 ? 3 : 4;
+      /* Clearing 4 dwords per thread with a 3-dword clear value is slightly faster with big sizes. */
+      if (!is_copy && clear_value_size == 12)
+         dwords_per_thread = size <= 4096 ? 3 : 4;
+      else
+         dwords_per_thread = 4;
   }

   /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
@@ -342,6 +343,7 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
   key.is_clear = !is_copy;
   assert(dwords_per_thread && dwords_per_thread <= 4);
   key.dwords_per_thread = dwords_per_thread;
+   key.clear_value_size_is_12 = !is_copy && clear_value_size == 12;

   void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key);
   if (!shader) {
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1645,6 +1645,7 @@ union si_cs_clear_copy_buffer_key {
   struct {
      bool is_clear:1;
      unsigned dwords_per_thread:3; /* 1..4 allowed */
+      unsigned clear_value_size_is_12:1;
   };
   uint64_t key;
 };
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -234,6 +234,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
      fprintf(stderr, "Internal shader: dma\n");
      fprintf(stderr, "   key.is_clear = %u\n", key->is_clear);
      fprintf(stderr, "   key.dwords_per_thread = %u\n", key->dwords_per_thread);
+      fprintf(stderr, "   key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12);
      fprintf(stderr, "\n");
   }

@@ -245,7 +246,8 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
-   b.shader->info.cs.user_data_components_amd = key->is_clear ? key->dwords_per_thread : 0;
+   b.shader->info.cs.user_data_components_amd =
+      key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0;

   nir_def *thread_id = ac_get_global_ids(&b, 1, 32);
   /* Convert the global thread ID into bytes. */
@@ -254,6 +256,26 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co

   if (key->is_clear) {
      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), key->dwords_per_thread);
+
+      /* We store 4 dwords per thread, but the clear value has 3 dwords. Swizzle it to 4 dwords.
+       * Storing 4 dwords per thread is faster even when the ALU cost is worse.
+       */
+      if (key->clear_value_size_is_12 && key->dwords_per_thread == 4) {
+         nir_def *dw_offset = nir_imul_imm(&b, thread_id, key->dwords_per_thread);
+         nir_def *vec[3];
+
+         /* Swizzle a 3-component clear value to get a 4-component clear value. Example:
+          * 0 1 2 3 | 4 5 6 7 | 8 9 10 11  // dw_offset
+          *              |
+          *              V
+          * 0 1 2 0 | 1 2 0 1 | 2 0 1 2    // clear value component indices
+          */
+         for (unsigned i = 0; i < 3; i++) {
+            vec[i] = nir_vector_extract(&b, value,
+                                        nir_umod_imm(&b, nir_iadd_imm(&b, dw_offset, i), 3));
+         }
+         value = nir_vec4(&b, vec[0], vec[1], vec[2], vec[0]);
+      }
   } else {
      value = nir_load_ssbo(&b, key->dwords_per_thread, 32, nir_imm_int(&b, 0), offset,
                            .access = ACCESS_RESTRICT);