radeonsi: change the compute blit to clear/blit multiple pixels per lane

The target is 8-16B per lane regardless of the format and number of
samples. This is needed to fully utilize the memory bandwidth instead
of using only a small fraction of it. These block sizes were identified
as optimal by benchmarking.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
Author: Marek Olšák
Date: 2024-04-12 22:00:52 -04:00
Committed by: Marge Bot
Parent: d4c066abaf
Commit: 5b3e1a0532
3 changed files with 411 additions and 46 deletions
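To illustrate the sizing rule from the commit message, here is a minimal sketch (a hypothetical helper, not code from this commit; the commit instead selects hand-tuned 3D block sizes per tiling mode, as seen in the first hunk below). A lane covering a block of pixels moves block * bpe * samples bytes, so the block is grown until that lands in the 8-16B window:

/* Hypothetical sketch of the 8-16B-per-lane target. */
static unsigned sketch_pixels_per_lane(unsigned bpe, unsigned samples)
{
   unsigned pixels = 1;
   /* Grow the per-lane block until a lane loads/stores at least 8 bytes,
    * without overshooting the 16-byte upper target. */
   while (pixels * bpe * samples < 8 && pixels * 2 * bpe * samples <= 16)
      pixels *= 2;
   return pixels; /* e.g. bpe = 2, 1 sample -> 4 pixels = 8B per lane */
}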


@@ -859,6 +859,52 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
return success;
}
/* Return the largest power of two that divides x, i.e. its power-of-two alignment (1 << 31 for x == 0). */
static unsigned compute_alignment(unsigned x)
{
return x ? BITFIELD_BIT(ffs(x) - 1) : BITFIELD_BIT(31);
}
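BITFIELD_BIT(ffs(x) - 1) isolates the lowest set bit of x, which is the largest power of two dividing x. A standalone equivalent for illustration (alignment_of is a hypothetical name):

/* x & -x isolates the lowest set bit:
 * alignment_of(24) == 8, alignment_of(5) == 1, alignment_of(0) == 1u << 31. */
static unsigned alignment_of(unsigned x)
{
   return x ? x & -x : 1u << 31;
}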
/* Set the blit info, but change the dst box and trim the src box according to the new dst box. */
static void set_trimmed_blit(const struct pipe_blit_info *old, const struct pipe_box *box,
bool is_clear, struct pipe_blit_info *out)
{
assert(old->dst.box.x <= box->x);
assert(old->dst.box.y <= box->y);
assert(old->dst.box.z <= box->z);
assert(box->x + box->width <= old->dst.box.x + old->dst.box.width);
assert(box->y + box->height <= old->dst.box.y + old->dst.box.height);
assert(box->z + box->depth <= old->dst.box.z + old->dst.box.depth);
/* No scaling. */
assert(is_clear || old->dst.box.width == abs(old->src.box.width));
assert(is_clear || old->dst.box.height == abs(old->src.box.height));
assert(is_clear || old->dst.box.depth == abs(old->src.box.depth));
*out = *old;
out->dst.box = *box;
if (!is_clear) {
if (out->src.box.width > 0) {
out->src.box.x += box->x - old->dst.box.x;
out->src.box.width = box->width;
} else {
out->src.box.x -= box->x - old->dst.box.x;
out->src.box.width = -box->width;
}
if (out->src.box.height > 0) {
out->src.box.y += box->y - old->dst.box.y;
out->src.box.height = box->height;
} else {
out->src.box.y -= box->y - old->dst.box.y;
out->src.box.height = -box->height;
}
out->src.box.z += box->z - old->dst.box.z;
out->src.box.depth = box->depth;
}
}
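To make the sign handling concrete, a worked example with hypothetical values:

/* Unflipped source: old->dst.box = {x = 10, width = 20}, old->src.box = {x = 100, width = 20};
 * trimming to box = {x = 12, width = 16} gives:
 *   out->src.box.x     = 100 + (12 - 10) = 102
 *   out->src.box.width = 16
 * Flipped source (old->src.box = {x = 120, width = -20}), same trim:
 *   out->src.box.x     = 120 - (12 - 10) = 118
 *   out->src.box.width = -16
 */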
typedef struct {
unsigned x, y, z;
} uvec3;
@@ -873,6 +919,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
bool is_clear = !info->src.resource;
unsigned dst_samples = MAX2(1, sdst->buffer.b.b.nr_samples);
unsigned src_samples = is_clear ? 1 : MAX2(1, ssrc->buffer.b.b.nr_samples);
bool is_resolve = !is_clear && dst_samples == 1 && src_samples >= 2 &&
!util_format_is_pure_integer(info->dst.format);
bool sample0_only = src_samples >= 2 && dst_samples == 1 &&
(info->sample0_only || util_format_is_pure_integer(info->dst.format));
/* Get the channel sizes. */
@@ -934,6 +982,252 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
unsigned width = info->dst.box.width;
unsigned height = info->dst.box.height;
unsigned depth = info->dst.box.depth;
uvec3 lane_size = (uvec3){1, 1, 1};
/* Determine the size of the block of pixels that will be processed by a single lane.
* Generally we want to load and store about 8-16B per lane, but there are exceptions.
* The block sizes were fine-tuned for Navi31, and might be suboptimal on different generations.
*/
if (sdst->surface.bpe <= 8 && (is_resolve ? src_samples : dst_samples) <= 4 &&
/* Small blits don't benefit. */
width * height * depth * sdst->surface.bpe * dst_samples > 128 * 1024) {
if (is_3d_tiling) {
/* Thick tiling. */
if (!is_clear && ssrc->surface.is_linear) {
/* Linear -> Thick. */
if (sdst->surface.bpe == 4)
lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
else if (sdst->surface.bpe == 2)
lane_size = (uvec3){2, 1, 2}; /* 8B per lane */
else if (sdst->surface.bpe == 1)
lane_size = (uvec3){4, 1, 2}; /* 8B per lane */
} else {
if (sdst->surface.bpe == 8)
lane_size = (uvec3){1, 1, 2}; /* 16B per lane */
else if (sdst->surface.bpe == 4)
lane_size = (uvec3){1, 2, 2}; /* 16B per lane */
else if (sdst->surface.bpe == 2)
lane_size = (uvec3){1, 2, 4}; /* 16B per lane */
else
lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
}
} else if (sdst->surface.is_linear) {
/* Linear layout. */
if (!is_clear && !ssrc->surface.is_linear) {
/* Tiled -> Linear. */
if (sdst->surface.bpe == 8 && !ssrc->surface.thick_tiling)
lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
else if (sdst->surface.bpe == 4)
lane_size = (uvec3){1, 2, 1}; /* 8B per lane */
else if (sdst->surface.bpe == 2 && ssrc->surface.thick_tiling)
lane_size = (uvec3){2, 2, 1}; /* 8B per lane */
else if (sdst->surface.bpe == 1 && ssrc->surface.thick_tiling)
lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
else if (sdst->surface.bpe <= 2)
lane_size = (uvec3){2, 4, 1}; /* 8-16B per lane */
} else {
/* Clear or Linear -> Linear. */
if (sdst->surface.bpe == 8)
lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
else if (sdst->surface.bpe == 4)
lane_size = (uvec3){4, 1, 1}; /* 16B per lane */
else if (sdst->surface.bpe == 2)
lane_size = (uvec3){4, 2, 1}; /* 16B per lane */
else
lane_size = (uvec3){8, 1, 1}; /* 8B per lane */
}
} else {
/* Thin tiling. */
if (is_resolve) {
if (sdst->surface.bpe == 8 && src_samples == 2) {
lane_size = (uvec3){1, 2, 1}; /* 32B->16B per lane */
} else if (sdst->surface.bpe == 4) {
lane_size = (uvec3){2, 1, 1}; /* 32B->8B for 4 samples, 16B->8B for 2 samples */
} else if (sdst->surface.bpe <= 2) {
if (src_samples == 4)
lane_size = (uvec3){2, 1, 1}; /* 16B->4B for 16bpp, 8B->2B for 8bpp */
else
lane_size = (uvec3){2, 2, 1}; /* 16B->8B for 16bpp, 8B->4B for 8bpp */
}
} else {
if (sdst->surface.bpe == 8 && dst_samples == 1)
lane_size = (uvec3){1, 2, 1}; /* 16B per lane */
else if (sdst->surface.bpe == 4) {
if (dst_samples == 2)
lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
else if (dst_samples == 1)
lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
} else if (sdst->surface.bpe == 2) {
if (dst_samples == 4 || (!is_clear && ssrc->surface.is_linear))
lane_size = (uvec3){2, 1, 1}; /* 16B per lane (4B for linear src) */
else if (dst_samples == 2)
lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
else
lane_size = (uvec3){2, 4, 1}; /* 16B per lane */
} else if (sdst->surface.bpe == 1) {
if (dst_samples == 4)
lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
else if (dst_samples == 2 || (!is_clear && ssrc->surface.is_linear))
lane_size = (uvec3){2, 2, 1}; /* 8B per lane (4B for linear src) */
else
lane_size = (uvec3){2, 4, 1}; /* 8B per lane */
}
}
}
}
/* Check that the lane size fits into the shader key. */
static const union si_compute_blit_shader_key max_lane_size = {
.log_lane_width = ~0,
.log_lane_height = ~0,
.log_lane_depth = ~0,
};
assert(util_logbase2(lane_size.x) <= max_lane_size.log_lane_width);
assert(util_logbase2(lane_size.y) <= max_lane_size.log_lane_height);
assert(util_logbase2(lane_size.z) <= max_lane_size.log_lane_depth);
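The max_lane_size initializer relies on a C bitfield property: assigning ~0 saturates the field to its all-ones (maximum) value, so each log_lane_* member holds the largest value its field can encode. A self-contained sketch of the same trick, with a hypothetical 3-bit field:

#include <assert.h>

union tiny_key {
   struct {
      unsigned log_w : 3; /* mirrors log_lane_width */
   };
   unsigned raw;
};

int main(void)
{
   /* ~0 is truncated to the field width: 0b111 == 7. */
   union tiny_key max_key = { .log_w = ~0 };
   assert(max_key.log_w == 7);
   return 0;
}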
/* If the shader blits a block of pixels per lane, it must have the dst box aligned to that
* block because it can't blit a subset of pixels per lane.
*
* If the blit dst box is not aligned to the lane size, split it into multiple blits by cutting
* off the unaligned sides of the box and blitting the middle that's aligned to the lane size,
* then blit the unaligned sides separately. This splits the blit into up to 7 blits for 3D,
* and 5 blits for 2D.
*/
if (info->dst.box.x % lane_size.x ||
info->dst.box.y % lane_size.y ||
info->dst.box.z % lane_size.z ||
info->dst.box.width % lane_size.x ||
info->dst.box.height % lane_size.y ||
info->dst.box.depth % lane_size.z) {
struct pipe_box middle;
/* Cut off unaligned regions on the sides of the box. */
middle.x = align(info->dst.box.x, lane_size.x);
middle.y = align(info->dst.box.y, lane_size.y);
middle.z = align(info->dst.box.z, lane_size.z);
middle.width = info->dst.box.width - (middle.x - info->dst.box.x);
if (middle.width > 0)
middle.width -= middle.width % lane_size.x;
middle.height = info->dst.box.height - (middle.y - info->dst.box.y);
if (middle.height > 0)
middle.height -= middle.height % lane_size.y;
middle.depth = info->dst.box.depth - (middle.z - info->dst.box.z);
if (middle.depth > 0)
middle.depth -= middle.depth % lane_size.z;
/* Only a few cases are regressed by this. The vast majority benefits a lot.
* This was fine-tuned for Navi31, and might be suboptimal on different generations.
*/
bool slow = (sdst->surface.is_linear && !is_clear && ssrc->surface.is_linear && depth > 1) ||
(sdst->surface.thick_tiling &&
((sdst->surface.bpe == 8 && is_clear) ||
(sdst->surface.bpe == 4 &&
(sdst->surface.is_linear || (!is_clear && ssrc->surface.is_linear))) ||
(sdst->surface.bpe == 2 && sdst->surface.is_linear && !is_clear &&
ssrc->surface.is_linear))) ||
(!sdst->surface.thick_tiling &&
((sdst->surface.bpe == 4 && sdst->surface.is_linear && !is_clear &&
ssrc->surface.is_linear) ||
(sdst->surface.bpe == 8 && !is_clear &&
sdst->surface.is_linear != ssrc->surface.is_linear) ||
(is_resolve && sdst->surface.bpe == 4 && src_samples == 4) ||
(is_resolve && sdst->surface.bpe == 8 && src_samples == 2)));
/* Only use this if the middle blit is large enough. */
if (!slow && middle.width > 0 && middle.height > 0 && middle.depth > 0 &&
middle.width * middle.height * middle.depth * sdst->surface.bpe * dst_samples >
128 * 1024) {
/* Compute the size of unaligned regions on all sides of the box. */
struct pipe_box top, left, right, bottom, front, back;
assert(!(flags & SI_OP_IS_NESTED));
top = info->dst.box;
top.height = middle.y - top.y;
bottom = info->dst.box;
bottom.y = middle.y + middle.height;
bottom.height = info->dst.box.height - top.height - middle.height;
left = info->dst.box;
left.y = middle.y;
left.height = middle.height;
left.width = middle.x - left.x;
right = info->dst.box;
right.y = middle.y;
right.height = middle.height;
right.x = middle.x + middle.width;
right.width = info->dst.box.width - left.width - middle.width;
front = info->dst.box;
front.x = middle.x;
front.y = middle.y;
front.width = middle.width;
front.height = middle.height;
front.depth = middle.z - front.z;
back = info->dst.box;
back.x = middle.x;
back.y = middle.y;
back.width = middle.width;
back.height = middle.height;
back.z = middle.z + middle.depth;
back.depth = info->dst.box.depth - front.depth - middle.depth;
struct pipe_box boxes[] = {middle, top, bottom, left, right, front, back};
int last = -1;
/* Verify that the boxes don't intersect. */
for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
for (unsigned j = i + 1; j < ARRAY_SIZE(boxes); j++) {
if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0 &&
boxes[j].width > 0 && boxes[j].height > 0 && boxes[j].depth > 0) {
if (u_box_test_intersection_3d(&boxes[i], &boxes[j])) {
printf("\b (%u, %u, %u) -> (%u, %u, %u) | (%u, %u, %u) -> (%u, %u, %u)\n",
boxes[i].x, boxes[i].y, boxes[i].z,
boxes[i].x + boxes[i].width - 1,
boxes[i].y + boxes[i].height - 1,
boxes[i].z + boxes[i].depth - 1,
boxes[j].x, boxes[j].y, boxes[j].z,
boxes[j].x + boxes[j].width - 1,
boxes[j].y + boxes[j].height - 1,
boxes[j].z + boxes[j].depth - 1);
assert(0);
}
}
}
}
for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0)
last = i;
}
assert(last > 0);
for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0) {
struct pipe_blit_info new_info;
ASSERTED bool ok;
set_trimmed_blit(info, &boxes[i], is_clear, &new_info);
ok = si_compute_blit(sctx, &new_info, clear_color, dst_access, src_access,
(flags & ~SI_OP_SYNC_BEFORE_AFTER) | SI_OP_IS_NESTED |
(i == 0 ? flags & SI_OP_SYNC_BEFORE : 0) |
(i == last ? flags & SI_OP_SYNC_AFTER : 0));
assert(ok);
}
}
return true;
}
}
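To make the split arithmetic concrete, a worked 1D example with hypothetical numbers (y and z behave the same way):

/* lane_size.x = 2, dst box x = 3, width = 10 (pixels 3..12):
 *   middle.x     = align(3, 2) = 4
 *   middle.width = 10 - (4 - 3) = 9, then 9 - 9 % 2 = 8   (pixels 4..11)
 *   left.width   = middle.x - box.x = 1                   (pixel 3)
 *   right.width  = 10 - 1 - 8 = 1                         (pixel 12)
 * Each resulting sub-box is aligned to the lane size, so each sub-blit can
 * process whole pixel blocks per lane.
 */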
/* If the box can't be split, at least reduce the lane size to the alignment of the box. */
lane_size.x = MIN3(lane_size.x, compute_alignment(info->dst.box.x), compute_alignment(width));
lane_size.y = MIN3(lane_size.y, compute_alignment(info->dst.box.y), compute_alignment(height));
lane_size.z = MIN3(lane_size.z, compute_alignment(info->dst.box.z), compute_alignment(depth));
/* Determine the alignment of coordinates of the first thread of each wave. The alignment should be
* to a 256B block or the size of 1 wave, whichever is less, but there are a few exceptions.
@@ -958,10 +1252,10 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
align = (uvec3){2, 2, 4};
}
-/* Clamp the alignment to the size of 1 wave. */
-align.x = MIN2(align.x, 4);
-align.y = MIN2(align.y, 4);
-align.z = MIN2(align.z, 4);
+/* Clamp the alignment to the expected size of 1 wave. */
+align.x = MIN2(align.x, 4 * lane_size.x);
+align.y = MIN2(align.y, 4 * lane_size.y);
+align.z = MIN2(align.z, 4 * lane_size.z);
} else if (sdst->surface.is_linear) {
/* 1D blits from linear to linear are faster unaligned.
* 1D image clears don't benefit from any alignment.
@@ -969,8 +1263,10 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
if (height == 1 && depth == 1 && (is_clear || ssrc->surface.is_linear)) {
align = (uvec3){1, 1, 1};
} else {
-/* Linear blits should use the cache line size instead of 256B alignment. */
-align.x = MIN2(64, sctx->screen->info.tcc_cache_line_size / sdst->surface.bpe);
+/* Linear blits should use the cache line size instead of 256B alignment.
+ * Clamp it to the expected size of 1 wave.
+ */
+align.x = MIN2(sctx->screen->info.tcc_cache_line_size / sdst->surface.bpe, 64 * lane_size.x);
align.y = 1;
align.z = 1;
}
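As a worked example of the cache-line clamp (hypothetical numbers):

/* tcc_cache_line_size = 128, bpe = 4, lane_size.x = 4:
 *   align.x = MIN2(128 / 4, 64 * 4) = MIN2(32, 256) = 32 texels,
 * so each wave starts on a cache-line boundary. The 64 * lane_size.x bound
 * only wins when a cache line holds more texels than one wave processes,
 * e.g. bpe = 1, lane_size.x = 1: MIN2(128, 64) = 64.
 */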
@@ -1015,9 +1311,9 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
align = (uvec3){4, 4, 1};
}
-/* Clamp the alignment to the size of 1 wave. */
-align.x = MIN2(align.x, 8);
-align.y = MIN2(align.y, 8);
+/* Clamp the alignment to the expected size of 1 wave. */
+align.x = MIN2(align.x, 8 * lane_size.x);
+align.y = MIN2(align.y, 8 * lane_size.y);
}
/* If we don't have much to copy, don't align. The threshold is guessed and isn't covered
@@ -1045,6 +1341,21 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
height += start_y;
depth += start_z;
/* Divide the dispatch parameters by the lane size. */
assert(start_x % lane_size.x == 0);
assert(start_y % lane_size.y == 0);
assert(start_z % lane_size.z == 0);
assert(width % lane_size.x == 0);
assert(height % lane_size.y == 0);
assert(depth % lane_size.z == 0);
start_x /= lane_size.x;
start_y /= lane_size.y;
start_z /= lane_size.z;
width /= lane_size.x;
height /= lane_size.y;
depth /= lane_size.z;
/* Choose the block (i.e. wave) dimensions based on the copy area size and the image layout
* of dst.
*/
@@ -1094,6 +1405,9 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
options.is_clear = is_clear;
options.wg_dim = wg_dim;
options.has_start_xyz = start_x || start_y || start_z;
options.log_lane_width = util_logbase2(lane_size.x);
options.log_lane_height = util_logbase2(lane_size.y);
options.log_lane_depth = util_logbase2(lane_size.z);
options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
options.dst_is_msaa = dst_samples > 1;
@@ -1141,7 +1455,6 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
options.use_integer_one = util_format_is_pure_integer(info->dst.format) &&
options.last_src_channel < options.last_dst_channel &&
options.last_dst_channel == 3;
bool is_resolve = options.src_is_msaa && !options.dst_is_msaa && !options.sample0_only;
options.d16 = has_d16 &&
/* Blitting FP16 using D16 has precision issues. Resolving has precision
* issues all the way down to R11G11B10_FLOAT. */


@@ -1479,6 +1479,7 @@ void si_destroy_compute(struct si_compute *program);
#define SI_OP_SYNC_GE_BEFORE (1 << 8) /* only sync VS, TCS, TES, GS */
/* Only for si_compute_blit: */
#define SI_OP_FAIL_IF_SLOW (1 << 9)
#define SI_OP_IS_NESTED (1 << 10)
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
enum si_cache_policy cache_policy);
@@ -1634,6 +1635,7 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *
void *si_create_passthrough_tcs(struct si_context *sctx);
void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim);
#define SI_MAX_COMPUTE_BLIT_LANE_SIZE 16
#define SI_MAX_COMPUTE_BLIT_SAMPLES 8
union si_compute_blit_shader_key {
@@ -1641,6 +1643,10 @@ union si_compute_blit_shader_key {
/* Workgroup settings. */
uint8_t wg_dim:2; /* 1, 2, or 3 */
bool has_start_xyz:1;
/* The size of a block of pixels that a single thread will process. */
uint8_t log_lane_width:3;
uint8_t log_lane_height:2;
uint8_t log_lane_depth:2;
/* Declaration modifiers. */
bool is_clear:1;
bool src_is_1d:1;


@@ -343,6 +343,12 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
img_dst->data.binding = image_dst_index;
unsigned lane_width = 1 << options->log_lane_width;
unsigned lane_height = 1 << options->log_lane_height;
unsigned lane_depth = 1 << options->log_lane_depth;
unsigned lane_size = lane_width * lane_height * lane_depth;
assert(lane_size <= SI_MAX_COMPUTE_BLIT_LANE_SIZE);
nir_def *zero = nir_imm_int(&b, 0);
/* Instructions. */
@@ -365,6 +371,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
if_positive = nir_push_if(&b, is_positive);
}
dst_xyz = nir_imul(&b, dst_xyz, nir_imm_ivec3(&b, lane_width, lane_height, lane_depth));
nir_def *src_xyz = dst_xyz;
/* Flip src coordinates. */
@@ -378,7 +385,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
* Therefore do: x = -x - 1, which becomes (width - 1) to 0 after we add box.x = width.
*/
nir_def *comp = nir_channel(&b, src_xyz, i);
-comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1);
+comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -(int)(i ? lane_height : lane_width));
src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
}
}
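Why the bias is the lane size rather than 1: negating the block's base coordinate must yield the base (lowest coordinate) of the mirrored block, which sits lane_width - 1 below the mirror of the base pixel. A worked example with hypothetical numbers:

/* width = 8, lane_width = 2, lane base x = 2 (dst pixels {2, 3}):
 *   x' = -x - lane_width = -4; adding box.x = width gives 8 - 4 = 4,
 * so the lane reads src pixels {4, 5}. The tmp_x flip applied when
 * initializing coord_src then pairs dst 2 with src 5 and dst 3 with src 4,
 * i.e. the full-image mirror x -> width - 1 - x.
 */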
@@ -394,9 +401,16 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
base_coord_src = nir_pad_vector(&b, base_coord_src, 4);
base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4);
-/* NOTE: This will be changed to a more complex loop in the future. */
-#define foreach_sample(num_samples, sample) \
-for (unsigned sample = 0; sample < (num_samples); sample++)
+/* Iterate over all pixels in the lane. num_samples is the only input.
+ * (sample, x, y, z) are generated coordinates, while "i" is the coordinates converted to
+ * an absolute index.
+ */
+#define foreach_pixel_in_lane(num_samples, sample, x, y, z, i) \
+for (unsigned z = 0; z < lane_depth; z++) \
+for (unsigned y = 0; y < lane_height; y++) \
+for (unsigned x = 0; x < lane_width; x++) \
+for (unsigned i = ((z * lane_height + y) * lane_width + x) * (num_samples), sample = 0; \
+sample < (num_samples); sample++, i++)
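The generated index i is the usual row-major flattening with samples innermost; a standalone sketch (plain C, hypothetical dimensions) showing that i densely covers 0..lane_size * num_samples - 1 in loop order:

#include <assert.h>

int main(void)
{
   const unsigned lane_width = 2, lane_height = 2, lane_depth = 1;
   const unsigned num_samples = 2;
   unsigned expected = 0;

   for (unsigned z = 0; z < lane_depth; z++)
      for (unsigned y = 0; y < lane_height; y++)
         for (unsigned x = 0; x < lane_width; x++)
            for (unsigned i = ((z * lane_height + y) * lane_width + x) * num_samples,
                          sample = 0;
                 sample < num_samples; sample++, i++) {
               assert(i == expected); /* 0, 1, 2, ..., 7 */
               expected++;
            }
   return 0;
}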
/* Swizzle coordinates for 1D_ARRAY. */
static const unsigned swizzle_xz[] = {0, 2, 0, 0};
@@ -409,8 +423,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
unsigned src_samples = options->src_is_msaa && !options->sample0_only &&
!options->is_clear ? num_samples : 1;
unsigned dst_samples = options->dst_is_msaa ? num_samples : 1;
-nir_def *color[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
-nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
+nir_def *color[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
+nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
nir_def *src_resinfo = NULL;
if (options->is_clear) {
@@ -419,16 +433,31 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
BITFIELD_RANGE(4, options->d16 ? 2 : 4));
if (options->d16)
color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0]));
foreach_pixel_in_lane(1, sample, x, y, z, i) {
color[i] = color[0];
}
} else {
-nir_def *coord_src[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
+nir_def *coord_src[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
/* Initialize src coordinates, one vector per pixel. */
-foreach_sample(src_samples, i) {
-coord_src[i] = base_coord_src;
+foreach_pixel_in_lane(src_samples, sample, x, y, z, i) {
unsigned tmp_x = x;
unsigned tmp_y = y;
/* Change the order from 0..N to N..0 for flipped blits. */
if (options->flip_x)
tmp_x = lane_width - 1 - x;
if (options->flip_y)
tmp_y = lane_height - 1 - y;
coord_src[i] = nir_iadd(&b, base_coord_src,
nir_imm_ivec4(&b, tmp_x, tmp_y, z, 0));
if (options->src_is_1d)
coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4);
if (options->src_is_msaa) {
-coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], nir_imm_int(&b, i),
-num_src_coords - 1);
+coord_src[i] = nir_vector_insert_imm(&b, coord_src[i],
+nir_imm_int(&b, sample),
+num_src_coords - 1);
}
@@ -451,8 +480,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
}
/* We don't want the computation of src coordinates to be interleaved with loads. */
-if (src_samples > 1) {
-optimization_barrier_vgpr_array(sctx, &b, coord_src, src_samples,
-num_src_coords);
+if (lane_size > 1 || src_samples > 1) {
+optimization_barrier_vgpr_array(sctx, &b, coord_src, lane_size * src_samples,
+num_src_coords);
}
@@ -460,29 +489,35 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
bool is_resolve = src_samples > 1 && dst_samples == 1;
bool uses_samples_identical = sctx->gfx_level < GFX11 &&
!(sctx->screen->debug_flags & DBG(NO_FMASK)) && is_resolve;
-nir_def *samples_identical = NULL, *sample0 = {0};
+nir_def *samples_identical = NULL, *sample0[SI_MAX_COMPUTE_BLIT_LANE_SIZE] = {0};
nir_if *if_identical = NULL;
if (uses_samples_identical) {
-samples_identical = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src),
-coord_src[0],
-.image_dim = GLSL_SAMPLER_DIM_MS);
+samples_identical = nir_imm_true(&b);
+/* If we are resolving multiple pixels per lane, AND all results of "samples_identical". */
+foreach_pixel_in_lane(1, sample, x, y, z, i) {
+nir_def *iden = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src),
+coord_src[i * src_samples],
+.image_dim = GLSL_SAMPLER_DIM_MS);
+samples_identical = nir_iand(&b, samples_identical, iden);
+}
/* If all samples are identical, load only sample 0. */
if_identical = nir_push_if(&b, samples_identical);
{
-sample0 = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size,
-deref_ssa(&b, img_src), coord_src[0],
-nir_channel(&b, coord_src[0],
-num_src_coords - 1), zero,
-.image_dim = img_src->type->sampler_dimensionality,
-.image_array = img_src->type->sampler_array);
+foreach_pixel_in_lane(1, sample, x, y, z, i) {
+sample0[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size,
+deref_ssa(&b, img_src), coord_src[i * src_samples],
+nir_channel(&b, coord_src[i * src_samples],
+num_src_coords - 1), zero,
+.image_dim = img_src->type->sampler_dimensionality,
+.image_array = img_src->type->sampler_array);
+}
nir_push_else(&b, if_identical);
}
/* Load src pixels, one per sample. */
-foreach_sample(src_samples, i) {
+foreach_pixel_in_lane(src_samples, sample, x, y, z, i) {
color[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size,
deref_ssa(&b, img_src), coord_src[i],
nir_channel(&b, coord_src[i], num_src_coords - 1), zero,
@@ -493,50 +528,61 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
/* Resolve MSAA if necessary. */
if (is_resolve) {
/* We don't want the averaging of samples to be interleaved with image loads. */
-optimization_barrier_vgpr_array(sctx, &b, color, src_samples,
-options->last_src_channel + 1);
+optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples,
+options->last_src_channel + 1);
-color[0] = average_samples(&b, color, src_samples);
+/* This reduces the "color" array from "src_samples * lane_size" elements to only
+ * "lane_size" elements.
+ */
+foreach_pixel_in_lane(1, sample, x, y, z, i) {
+color[i] = average_samples(&b, &color[i * src_samples], src_samples);
+}
src_samples = 1;
}
if (uses_samples_identical) {
nir_pop_if(&b, if_identical);
-color[0] = nir_if_phi(&b, sample0, color[0]);
+foreach_pixel_in_lane(1, sample, x, y, z, i) {
+color[i] = nir_if_phi(&b, sample0[i], color[i]);
+}
}
}
/* We need to load the descriptor here, otherwise the load would be after optimization
* barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0).
*/
nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst));
if (lane_size > 1 && !sctx->screen->use_aco)
img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc);
/* Apply the blit output modifiers, once per sample. */
-foreach_sample(src_samples, i) {
+foreach_pixel_in_lane(src_samples, sample, x, y, z, i) {
color[i] = apply_blit_output_modifiers(&b, color[i], options);
}
/* Initialize dst coordinates, one vector per pixel. */
-foreach_sample(dst_samples, i) {
-coord_dst[i] = base_coord_dst;
+foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) {
+coord_dst[i] = nir_iadd(&b, base_coord_dst, nir_imm_ivec4(&b, x, y, z, 0));
if (options->dst_is_1d)
coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4);
if (options->dst_is_msaa) {
-coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i],
-nir_imm_int(&b, i),
-num_dst_coords - 1);
+coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i], nir_imm_int(&b, sample),
+num_dst_coords - 1);
}
}
/* We don't want the computation of dst coordinates to be interleaved with stores. */
-if (dst_samples > 1)
-optimization_barrier_vgpr_array(sctx, &b, coord_dst, dst_samples, num_dst_coords);
+if (lane_size > 1 || dst_samples > 1)
+optimization_barrier_vgpr_array(sctx, &b, coord_dst, lane_size * dst_samples, num_dst_coords);
/* We don't want the application of blit output modifiers to be interleaved with stores. */
-if (!options->is_clear && MIN2(src_samples, dst_samples) > 1) {
-optimization_barrier_vgpr_array(sctx, &b, color, src_samples,
-options->last_dst_channel + 1);
+if (!options->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) {
+optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples,
+options->last_dst_channel + 1);
}
/* Store the pixels, one per sample. */
-foreach_sample(dst_samples, i) {
+foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) {
nir_bindless_image_store(&b, img_dst_desc, coord_dst[i],
nir_channel(&b, coord_dst[i], num_dst_coords - 1),
src_samples > 1 ? color[i] : color[i / dst_samples], zero,