diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 60d7461dac5..783ef9a8dbe 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -280,12 +280,13 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
          assert(!"clear_value_size must be <= dwords_per_thread");
          return false; /* invalid value */
       }
-
-      if (clear_value_size == 12 && dwords_per_thread != 3)
-         return false; /* unimplemented (yet) */
    } else {
       /* Set default optimal settings. */
-      dwords_per_thread = clear_value_size == 12 ? 3 : 4;
+      /* Clearing 4 dwords per thread with a 3-dword clear value is slightly faster with big sizes. */
+      if (!is_copy && clear_value_size == 12)
+         dwords_per_thread = size <= 4096 ? 3 : 4;
+      else
+         dwords_per_thread = 4;
    }
 
    /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
@@ -342,6 +343,7 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
    key.is_clear = !is_copy;
    assert(dwords_per_thread && dwords_per_thread <= 4);
    key.dwords_per_thread = dwords_per_thread;
+   key.clear_value_size_is_12 = !is_copy && clear_value_size == 12;
 
    void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key);
    if (!shader) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index af064e418bc..c59d2ee1cf6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1645,6 +1645,7 @@ union si_cs_clear_copy_buffer_key {
    struct {
       bool is_clear:1;
       unsigned dwords_per_thread:3; /* 1..4 allowed */
+      unsigned clear_value_size_is_12:1;
    };
    uint64_t key;
 };
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index c1db6f601e6..2efa7b9f0cf 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -234,6 +234,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
       fprintf(stderr, "Internal shader: dma\n");
       fprintf(stderr, "   key.is_clear = %u\n", key->is_clear);
       fprintf(stderr, "   key.dwords_per_thread = %u\n", key->dwords_per_thread);
+      fprintf(stderr, "   key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12);
       fprintf(stderr, "\n");
    }
 
@@ -245,7 +246,8 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
-   b.shader->info.cs.user_data_components_amd = key->is_clear ? key->dwords_per_thread : 0;
+   b.shader->info.cs.user_data_components_amd =
+      key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0;
 
    nir_def *thread_id = ac_get_global_ids(&b, 1, 32);
    /* Convert the global thread ID into bytes. */
@@ -254,6 +256,26 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
 
    if (key->is_clear) {
       value = nir_trim_vector(&b, nir_load_user_data_amd(&b), key->dwords_per_thread);
+
+      /* We store 4 dwords per thread, but the clear value has 3 dwords. Swizzle it to 4 dwords.
+       * Storing 4 dwords per thread is faster even when the ALU cost is worse.
+       */
+      if (key->clear_value_size_is_12 && key->dwords_per_thread == 4) {
+         nir_def *dw_offset = nir_imul_imm(&b, thread_id, key->dwords_per_thread);
+         nir_def *vec[3];
+
+         /* Swizzle a 3-component clear value to get a 4-component clear value. Example:
+          * 0 1 2 3 | 4 5 6 7 | 8 9 10 11  // dw_offset
+          *              |
+          *              V
+          * 0 1 2 0 | 1 2 0 1 | 2 0 1 2    // clear value component indices
+          */
+         for (unsigned i = 0; i < 3; i++) {
+            vec[i] = nir_vector_extract(&b, value,
+                                        nir_umod_imm(&b, nir_iadd_imm(&b, dw_offset, i), 3));
+         }
+         value = nir_vec4(&b, vec[0], vec[1], vec[2], vec[0]);
+      }
    } else {
       value = nir_load_ssbo(&b, key->dwords_per_thread, 32, nir_imm_int(&b, 0), offset,
                             .access = ACCESS_RESTRICT);