diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index b8087231df7..11edf8e72e0 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -859,6 +859,52 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u return success; } +/* Return a power-of-two alignment of a number: the bit at the position of the lowest set bit of x (ffs(x) - 1). Zero has no set bit, so BITFIELD_BIT(31) is returned for it. */ +static unsigned compute_alignment(unsigned x) +{ + return x ? BITFIELD_BIT(ffs(x) - 1) : BITFIELD_BIT(31); +} + +/* Set the blit info, but change the dst box and trim the src box according to the new dst box. The new dst box must lie within old->dst.box (asserted). Unless is_clear is set, the blit must not scale (asserted) and the src box is moved and resized by the same amounts as the dst box; for a non-positive src width or height the src origin moves in the opposite direction and the size stays negative (presumably a flipped blit). The result is written to the out parameter. */ +static void set_trimmed_blit(const struct pipe_blit_info *old, const struct pipe_box *box, + bool is_clear, struct pipe_blit_info *out) +{ + assert(old->dst.box.x <= box->x); + assert(old->dst.box.y <= box->y); + assert(old->dst.box.z <= box->z); + assert(box->x + box->width <= old->dst.box.x + old->dst.box.width); + assert(box->y + box->height <= old->dst.box.y + old->dst.box.height); + assert(box->z + box->depth <= old->dst.box.z + old->dst.box.depth); + /* No scaling. 
*/ + assert(is_clear || old->dst.box.width == abs(old->src.box.width)); + assert(is_clear || old->dst.box.height == abs(old->src.box.height)); + assert(is_clear || old->dst.box.depth == abs(old->src.box.depth)); + + *out = *old; + out->dst.box = *box; + + if (!is_clear) { + if (out->src.box.width > 0) { + out->src.box.x += box->x - old->dst.box.x; + out->src.box.width = box->width; + } else { + out->src.box.x -= box->x - old->dst.box.x; + out->src.box.width = -box->width; + } + + if (out->src.box.height > 0) { + out->src.box.y += box->y - old->dst.box.y; + out->src.box.height = box->height; + } else { + out->src.box.y -= box->y - old->dst.box.y; + out->src.box.height = -box->height; + } + + out->src.box.z += box->z - old->dst.box.z; + out->src.box.depth = box->depth; + } +} + typedef struct { unsigned x, y, z; } uvec3; @@ -873,6 +919,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, bool is_clear = !info->src.resource; unsigned dst_samples = MAX2(1, sdst->buffer.b.b.nr_samples); unsigned src_samples = is_clear ? 1 : MAX2(1, ssrc->buffer.b.b.nr_samples); + bool is_resolve = !is_clear && dst_samples == 1 && src_samples >= 2 && + !util_format_is_pure_integer(info->dst.format); /* NOTE(review): the is_resolve definition removed further down by this patch also required !options.sample0_only, so it was presumably false when info->sample0_only was set; this one is not - confirm the change is intended. */ bool sample0_only = src_samples >= 2 && dst_samples == 1 && (info->sample0_only || util_format_is_pure_integer(info->dst.format)); /* Get the channel sizes. */ @@ -934,6 +982,252 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, unsigned width = info->dst.box.width; unsigned height = info->dst.box.height; unsigned depth = info->dst.box.depth; + uvec3 lane_size = (uvec3){1, 1, 1}; + + /* Determine the size of the block of pixels that will be processed by a single lane. + * Generally we want to load and store about 8-16B per lane, but there are exceptions. + * The block sizes were fine-tuned for Navi31, and might be suboptimal on different generations. + */ + if (sdst->surface.bpe <= 8 && (is_resolve ? 
src_samples : dst_samples) <= 4 && + /* Small blits don't benefit. */ + width * height * depth * sdst->surface.bpe * dst_samples > 128 * 1024) { + if (is_3d_tiling) { + /* Thick tiling. */ + if (!is_clear && ssrc->surface.is_linear) { + /* Linear -> Thick. */ + if (sdst->surface.bpe == 4) + lane_size = (uvec3){2, 1, 1}; /* 8B per lane */ + else if (sdst->surface.bpe == 2) + lane_size = (uvec3){2, 1, 2}; /* 8B per lane */ + else if (sdst->surface.bpe == 1) + lane_size = (uvec3){4, 1, 2}; /* 8B per lane */ + } else { + if (sdst->surface.bpe == 8) + lane_size = (uvec3){1, 1, 2}; /* 16B per lane */ + else if (sdst->surface.bpe == 4) + lane_size = (uvec3){1, 2, 2}; /* 16B per lane */ + else if (sdst->surface.bpe == 2) + lane_size = (uvec3){1, 2, 4}; /* 16B per lane */ + else + lane_size = (uvec3){2, 2, 2}; /* 8B per lane */ + } + } else if (sdst->surface.is_linear) { + /* Linear layout. */ + if (!is_clear && !ssrc->surface.is_linear) { + /* Tiled -> Linear. */ + if (sdst->surface.bpe == 8 && !ssrc->surface.thick_tiling) + lane_size = (uvec3){2, 1, 1}; /* 16B per lane */ + else if (sdst->surface.bpe == 4) + lane_size = (uvec3){1, 2, 1}; /* 8B per lane */ + else if (sdst->surface.bpe == 2 && ssrc->surface.thick_tiling) + lane_size = (uvec3){2, 2, 1}; /* 8B per lane */ + else if (sdst->surface.bpe == 1 && ssrc->surface.thick_tiling) + lane_size = (uvec3){2, 2, 2}; /* 8B per lane */ + else if (sdst->surface.bpe <= 2) + lane_size = (uvec3){2, 4, 1}; /* 8-16B per lane */ + } else { + /* Clear or Linear -> Linear. */ + if (sdst->surface.bpe == 8) + lane_size = (uvec3){2, 1, 1}; /* 16B per lane */ + else if (sdst->surface.bpe == 4) + lane_size = (uvec3){4, 1, 1}; /* 16B per lane */ + else if (sdst->surface.bpe == 2) + lane_size = (uvec3){4, 2, 1}; /* 16B per lane */ + else + lane_size = (uvec3){8, 1, 1}; /* 8B per lane */ + } + } else { + /* Thin tiling. 
*/ + if (is_resolve) { + if (sdst->surface.bpe == 8 && src_samples == 2) { + lane_size = (uvec3){1, 2, 1}; /* 32B->16B per lane */ + } else if (sdst->surface.bpe == 4) { + lane_size = (uvec3){2, 1, 1}; /* 32B->8B for 4 samples, 16B->8B for 2 samples */ + } else if (sdst->surface.bpe <= 2) { + if (src_samples == 4) + lane_size = (uvec3){2, 1, 1}; /* 16B->4B for 16bpp, 8B->2B for 8bpp */ + else + lane_size = (uvec3){2, 2, 1}; /* 16B->8B for 16bpp, 8B->4B for 8bpp */ + } + } else { + if (sdst->surface.bpe == 8 && dst_samples == 1) + lane_size = (uvec3){1, 2, 1}; /* 16B per lane */ + else if (sdst->surface.bpe == 4) { + if (dst_samples == 2) + lane_size = (uvec3){2, 1, 1}; /* 16B per lane */ + else if (dst_samples == 1) + lane_size = (uvec3){2, 2, 1}; /* 16B per lane */ + } else if (sdst->surface.bpe == 2) { + if (dst_samples == 4 || (!is_clear && ssrc->surface.is_linear)) + lane_size = (uvec3){2, 1, 1}; /* 16B per lane (4B for linear src) */ + else if (dst_samples == 2) + lane_size = (uvec3){2, 2, 1}; /* 16B per lane */ + else + lane_size = (uvec3){2, 4, 1}; /* 16B per lane */ + } else if (sdst->surface.bpe == 1) { + if (dst_samples == 4) + lane_size = (uvec3){2, 1, 1}; /* 8B per lane */ + else if (dst_samples == 2 || (!is_clear && ssrc->surface.is_linear)) + lane_size = (uvec3){2, 2, 1}; /* 8B per lane (4B for linear src) */ + else + lane_size = (uvec3){2, 4, 1}; /* 8B per lane */ + } + } + } + } + + /* Check that the lane size fits into the shader key. */ + static const union si_compute_blit_shader_key max_lane_size = { + .log_lane_width = ~0, + .log_lane_height = ~0, + .log_lane_depth = ~0, + }; + assert(util_logbase2(lane_size.x) <= max_lane_size.log_lane_width); + assert(util_logbase2(lane_size.y) <= max_lane_size.log_lane_height); + assert(util_logbase2(lane_size.z) <= max_lane_size.log_lane_depth); + + /* If the shader blits a block of pixels per lane, it must have the dst box aligned to that + * block because it can't blit a subset of pixels per lane. 
+ * + * If the blit dst box is not aligned to the lane size, split it into multiple blits by cutting + * off the unaligned sides of the box and blitting the middle that's aligned to the lane size, + * then blit the unaligned sides separately. This splits the blit into up to 7 blits for 3D, + * and 5 blits for 2D. + */ + if (info->dst.box.x % lane_size.x || + info->dst.box.y % lane_size.y || + info->dst.box.z % lane_size.z || + info->dst.box.width % lane_size.x || + info->dst.box.height % lane_size.y || + info->dst.box.depth % lane_size.z) { + struct pipe_box middle; + + /* Cut off unaligned regions on the sides of the box. */ + middle.x = align(info->dst.box.x, lane_size.x); + middle.y = align(info->dst.box.y, lane_size.y); + middle.z = align(info->dst.box.z, lane_size.z); + + middle.width = info->dst.box.width - (middle.x - info->dst.box.x); + if (middle.width > 0) + middle.width -= middle.width % lane_size.x; + middle.height = info->dst.box.height - (middle.y - info->dst.box.y); + if (middle.height > 0) + middle.height -= middle.height % lane_size.y; + middle.depth = info->dst.box.depth - (middle.z - info->dst.box.z); + if (middle.depth > 0) + middle.depth -= middle.depth % lane_size.z; + + /* Only a few cases are regressed by this. The vast majority benefits a lot. + * This was fine-tuned for Navi31, and might be suboptimal on different generations. 
+ */ + bool slow = (sdst->surface.is_linear && !is_clear && ssrc->surface.is_linear && depth > 1) || + (sdst->surface.thick_tiling && + ((sdst->surface.bpe == 8 && is_clear) || + (sdst->surface.bpe == 4 && + (sdst->surface.is_linear || (!is_clear && ssrc->surface.is_linear))) || + (sdst->surface.bpe == 2 && sdst->surface.is_linear && !is_clear && + ssrc->surface.is_linear))) || + (!sdst->surface.thick_tiling && + ((sdst->surface.bpe == 4 && sdst->surface.is_linear && !is_clear && + ssrc->surface.is_linear) || + (sdst->surface.bpe == 8 && !is_clear && + sdst->surface.is_linear != ssrc->surface.is_linear) || + (is_resolve && sdst->surface.bpe == 4 && src_samples == 4) || + (is_resolve && sdst->surface.bpe == 8 && src_samples == 2))); + + /* Only use this if the middle blit is large enough. */ + if (!slow && middle.width > 0 && middle.height > 0 && middle.depth > 0 && + middle.width * middle.height * middle.depth * sdst->surface.bpe * dst_samples > + 128 * 1024) { + /* Compute the size of unaligned regions on all sides of the box. 
*/ + struct pipe_box top, left, right, bottom, front, back; + + assert(!(flags & SI_OP_IS_NESTED)); + + top = info->dst.box; + top.height = middle.y - top.y; + + bottom = info->dst.box; + bottom.y = middle.y + middle.height; + bottom.height = info->dst.box.height - top.height - middle.height; + + left = info->dst.box; + left.y = middle.y; + left.height = middle.height; + left.width = middle.x - left.x; + + right = info->dst.box; + right.y = middle.y; + right.height = middle.height; + right.x = middle.x + middle.width; + right.width = info->dst.box.width - left.width - middle.width; + + front = info->dst.box; + front.x = middle.x; + front.y = middle.y; + front.width = middle.width; + front.height = middle.height; + front.depth = middle.z - front.z; + + back = info->dst.box; + back.x = middle.x; + back.y = middle.y; + back.width = middle.width; + back.height = middle.height; + back.z = middle.z + middle.depth; + back.depth = info->dst.box.depth - front.depth - middle.depth; + + struct pipe_box boxes[] = {middle, top, bottom, left, right, front, back}; + int last = -1; + + /* Verify that the boxes don't intersect. 
*/ + for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) { + for (unsigned j = i + 1; j < ARRAY_SIZE(boxes); j++) { + if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0 && + boxes[j].width > 0 && boxes[j].height > 0 && boxes[j].depth > 0) { + if (u_box_test_intersection_3d(&boxes[i], &boxes[j])) { + printf("\b (%u, %u, %u) -> (%u, %u, %u) | (%u, %u, %u) -> (%u, %u, %u)\n", + boxes[i].x, boxes[i].y, boxes[i].z, + boxes[i].x + boxes[i].width - 1, + boxes[i].y + boxes[i].height - 1, + boxes[i].z + boxes[i].depth - 1, + boxes[j].x, boxes[j].y, boxes[j].z, + boxes[j].x + boxes[j].width - 1, + boxes[j].y + boxes[j].height - 1, + boxes[j].z + boxes[j].depth - 1); /* both boxes are printed with inclusive end coordinates */ + assert(0); + } + } + } + } + + for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) { + if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0) + last = i; /* remember the last non-empty box; it is the only one that keeps SI_OP_SYNC_AFTER */ + } + assert(last > 0); /* index 0 is the middle box; the dst box is unaligned here, so at least one side box must be non-empty */ + + for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) { + if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0) { + struct pipe_blit_info new_info; + ASSERTED bool ok; + + set_trimmed_blit(info, &boxes[i], is_clear, &new_info); + ok = si_compute_blit(sctx, &new_info, clear_color, dst_access, src_access, + (flags & ~SI_OP_SYNC_BEFORE_AFTER) | SI_OP_IS_NESTED | + (i == 0 ? flags & SI_OP_SYNC_BEFORE : 0) | + (i == last ? flags & SI_OP_SYNC_AFTER : 0)); /* the caller's sync flags are stripped from the nested blits, except SI_OP_SYNC_BEFORE on the first one (the middle box) and SI_OP_SYNC_AFTER on the last one */ + assert(ok); + } + } + return true; + } + } + + /* If the blit can't be split, at least reduce the lane size to the alignment of the box. */ + lane_size.x = MIN3(lane_size.x, compute_alignment(info->dst.box.x), compute_alignment(width)); + lane_size.y = MIN3(lane_size.y, compute_alignment(info->dst.box.y), compute_alignment(height)); + lane_size.z = MIN3(lane_size.z, compute_alignment(info->dst.box.z), compute_alignment(depth)); /* Determine the alignment of coordinates of the first thread of each wave. The alignment should be * to a 256B block or the size of 1 wave, whichever is less, but there are a few exceptions. 
@@ -958,10 +1252,10 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, align = (uvec3){2, 2, 4}; } - /* Clamp the alignment to the size of 1 wave. */ - align.x = MIN2(align.x, 4); - align.y = MIN2(align.y, 4); - align.z = MIN2(align.z, 4); + /* Clamp the alignment to the expected size of 1 wave. */ + align.x = MIN2(align.x, 4 * lane_size.x); + align.y = MIN2(align.y, 4 * lane_size.y); + align.z = MIN2(align.z, 4 * lane_size.z); } else if (sdst->surface.is_linear) { /* 1D blits from linear to linear are faster unaligned. * 1D image clears don't benefit from any alignment. @@ -969,8 +1263,10 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, if (height == 1 && depth == 1 && (is_clear || ssrc->surface.is_linear)) { align = (uvec3){1, 1, 1}; } else { - /* Linear blits should use the cache line size instead of 256B alignment. */ - align.x = MIN2(64, sctx->screen->info.tcc_cache_line_size / sdst->surface.bpe); + /* Linear blits should use the cache line size instead of 256B alignment. + * Clamp it to the expected size of 1 wave. + */ + align.x = MIN2(sctx->screen->info.tcc_cache_line_size / sdst->surface.bpe, 64 * lane_size.x); align.y = 1; align.z = 1; } @@ -1015,9 +1311,9 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, align = (uvec3){4, 4, 1}; } - /* Clamp the alignment to the size of 1 wave. */ - align.x = MIN2(align.x, 8); - align.y = MIN2(align.y, 8); + /* Clamp the alignment to the expected size of 1 wave. */ + align.x = MIN2(align.x, 8 * lane_size.x); + align.y = MIN2(align.y, 8 * lane_size.y); } /* If we don't have much to copy, don't align. The threshold is guessed and isn't covered @@ -1045,6 +1341,21 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, height += start_y; depth += start_z; + /* Divide the dispatch parameters by the lane size because each lane processes a whole lane_size block of pixels. 
*/ + assert(start_x % lane_size.x == 0); + assert(start_y % lane_size.y == 0); + assert(start_z % lane_size.z == 0); + assert(width % lane_size.x == 0); + assert(height % lane_size.y == 0); + assert(depth % lane_size.z == 0); + + start_x /= lane_size.x; + start_y /= lane_size.y; + start_z /= lane_size.z; + width /= lane_size.x; + height /= lane_size.y; + depth /= lane_size.z; + /* Choose the block (i.e. wave) dimensions based on the copy area size and the image layout * of dst. */ @@ -1094,6 +1405,9 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, options.is_clear = is_clear; options.wg_dim = wg_dim; options.has_start_xyz = start_x || start_y || start_z; + options.log_lane_width = util_logbase2(lane_size.x); + options.log_lane_height = util_logbase2(lane_size.y); + options.log_lane_depth = util_logbase2(lane_size.z); options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D || info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY; options.dst_is_msaa = dst_samples > 1; @@ -1141,7 +1455,6 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, options.use_integer_one = util_format_is_pure_integer(info->dst.format) && options.last_src_channel < options.last_dst_channel && options.last_dst_channel == 3; - bool is_resolve = options.src_is_msaa && !options.dst_is_msaa && !options.sample0_only; options.d16 = has_d16 && /* Blitting FP16 using D16 has precision issues. Resolving has precision * issues all the way down to R11G11B10_FLOAT. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index c824d001800..b575e5f4889 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1479,6 +1479,7 @@ void si_destroy_compute(struct si_compute *program); #define SI_OP_SYNC_GE_BEFORE (1 << 8) /* only sync VS, TCS, TES, GS */ /* Only for si_compute_blit: */ #define SI_OP_FAIL_IF_SLOW (1 << 9) +#define SI_OP_IS_NESTED (1 << 10) unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy); @@ -1634,6 +1635,7 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture * void *si_create_passthrough_tcs(struct si_context *sctx); void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim); +#define SI_MAX_COMPUTE_BLIT_LANE_SIZE 16 #define SI_MAX_COMPUTE_BLIT_SAMPLES 8 union si_compute_blit_shader_key { @@ -1641,6 +1643,10 @@ union si_compute_blit_shader_key { /* Workgroup settings. */ uint8_t wg_dim:2; /* 1, 2, or 3 */ bool has_start_xyz:1; + /* The size of a block of pixels that a single thread will process. */ + uint8_t log_lane_width:3; + uint8_t log_lane_height:2; + uint8_t log_lane_depth:2; /* Declaration modifiers. 
*/ bool is_clear:1; bool src_is_1d:1; diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 823a473ba0c..d754b60fd67 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -343,6 +343,12 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1"); img_dst->data.binding = image_dst_index; + unsigned lane_width = 1 << options->log_lane_width; + unsigned lane_height = 1 << options->log_lane_height; + unsigned lane_depth = 1 << options->log_lane_depth; + unsigned lane_size = lane_width * lane_height * lane_depth; + assert(lane_size <= SI_MAX_COMPUTE_BLIT_LANE_SIZE); + nir_def *zero = nir_imm_int(&b, 0); /* Instructions. */ @@ -365,6 +371,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha if_positive = nir_push_if(&b, is_positive); } + dst_xyz = nir_imul(&b, dst_xyz, nir_imm_ivec3(&b, lane_width, lane_height, lane_depth)); nir_def *src_xyz = dst_xyz; /* Flip src coordinates. */ @@ -378,7 +385,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha * Therefore do: x = -x - 1, which becomes (width - 1) to 0 after we add box.x = width. */ nir_def *comp = nir_channel(&b, src_xyz, i); - comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1); + comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -(int)(i ? lane_height : lane_width)); src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i); } } @@ -394,9 +401,16 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha base_coord_src = nir_pad_vector(&b, base_coord_src, 4); base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4); -/* NOTE: This will be changed to a more complex loop in the future. 
*/ -#define foreach_sample(num_samples, sample) \ - for (unsigned sample = 0; sample < (num_samples); sample++) +/* Iterate over all pixels in the lane. num_samples is the only input. + * (sample, x, y, z) are generated coordinates, while "i" is the coordinates converted to + * an absolute index. + */ +#define foreach_pixel_in_lane(num_samples, sample, x, y, z, i) \ + for (unsigned z = 0; z < lane_depth; z++) \ + for (unsigned y = 0; y < lane_height; y++) \ + for (unsigned x = 0; x < lane_width; x++) \ + for (unsigned i = ((z * lane_height + y) * lane_width + x) * (num_samples), sample = 0; \ + sample < (num_samples); sample++, i++) \ /* Swizzle coordinates for 1D_ARRAY. */ static const unsigned swizzle_xz[] = {0, 2, 0, 0}; @@ -409,8 +423,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha unsigned src_samples = options->src_is_msaa && !options->sample0_only && !options->is_clear ? num_samples : 1; unsigned dst_samples = options->dst_is_msaa ? num_samples : 1; - nir_def *color[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; - nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *color[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; nir_def *src_resinfo = NULL; if (options->is_clear) { @@ -419,16 +433,31 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha BITFIELD_RANGE(4, options->d16 ? 2 : 4)); if (options->d16) color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0])); + + foreach_pixel_in_lane(1, sample, x, y, z, i) { + color[i] = color[0]; + } } else { - nir_def *coord_src[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *coord_src[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; /* Initialize src coordinates, one vector per pixel. 
*/ - foreach_sample(src_samples, i) { - coord_src[i] = base_coord_src; + foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { + unsigned tmp_x = x; + unsigned tmp_y = y; + + /* Change the order from 0..N to N..0 for flipped blits. */ + if (options->flip_x) + tmp_x = lane_width - 1 - x; + if (options->flip_y) + tmp_y = lane_height - 1 - y; + + coord_src[i] = nir_iadd(&b, base_coord_src, + nir_imm_ivec4(&b, tmp_x, tmp_y, z, 0)); if (options->src_is_1d) coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4); if (options->src_is_msaa) { - coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], nir_imm_int(&b, i), + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], + nir_imm_int(&b, sample), num_src_coords - 1); } @@ -451,8 +480,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha } /* We don't want the computation of src coordinates to be interleaved with loads. */ - if (src_samples > 1) { - optimization_barrier_vgpr_array(sctx, &b, coord_src, src_samples, + if (lane_size > 1 || src_samples > 1) { + optimization_barrier_vgpr_array(sctx, &b, coord_src, lane_size * src_samples, num_src_coords); } @@ -460,29 +489,35 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha bool is_resolve = src_samples > 1 && dst_samples == 1; bool uses_samples_identical = sctx->gfx_level < GFX11 && !(sctx->screen->debug_flags & DBG(NO_FMASK)) && is_resolve; - nir_def *samples_identical = NULL, *sample0 = {0}; + nir_def *samples_identical = NULL, *sample0[SI_MAX_COMPUTE_BLIT_LANE_SIZE] = {0}; nir_if *if_identical = NULL; if (uses_samples_identical) { - samples_identical = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src), - coord_src[0], + samples_identical = nir_imm_true(&b); + + /* If we are resolving multiple pixels per lane, AND all results of "samples_identical". 
*/ + foreach_pixel_in_lane(1, sample, x, y, z, i) { + nir_def *iden = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src), + coord_src[i * src_samples], .image_dim = GLSL_SAMPLER_DIM_MS); + samples_identical = nir_iand(&b, samples_identical, iden); + } /* If all samples are identical, load only sample 0. */ if_identical = nir_push_if(&b, samples_identical); - { - sample0 = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, - deref_ssa(&b, img_src), coord_src[0], - nir_channel(&b, coord_src[0], - num_src_coords - 1), zero, - .image_dim = img_src->type->sampler_dimensionality, - .image_array = img_src->type->sampler_array); + foreach_pixel_in_lane(1, sample, x, y, z, i) { + sample0[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, + deref_ssa(&b, img_src), coord_src[i * src_samples], + nir_channel(&b, coord_src[i * src_samples], + num_src_coords - 1), zero, + .image_dim = img_src->type->sampler_dimensionality, + .image_array = img_src->type->sampler_array); } nir_push_else(&b, if_identical); } /* Load src pixels, one per sample. */ - foreach_sample(src_samples, i) { + foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { color[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, deref_ssa(&b, img_src), coord_src[i], nir_channel(&b, coord_src[i], num_src_coords - 1), zero, @@ -493,50 +528,61 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha /* Resolve MSAA if necessary. */ if (is_resolve) { /* We don't want the averaging of samples to be interleaved with image loads. */ - optimization_barrier_vgpr_array(sctx, &b, color, src_samples, + optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples, options->last_src_channel + 1); - color[0] = average_samples(&b, color, src_samples); + /* This reduces the "color" array from "src_samples * lane_size" elements to only + * "lane_size" elements. 
+ */ + foreach_pixel_in_lane(1, sample, x, y, z, i) { + color[i] = average_samples(&b, &color[i * src_samples], src_samples); + } src_samples = 1; } if (uses_samples_identical) { nir_pop_if(&b, if_identical); - color[0] = nir_if_phi(&b, sample0, color[0]); + foreach_pixel_in_lane(1, sample, x, y, z, i) { + color[i] = nir_if_phi(&b, sample0[i], color[i]); + } } } + /* We need to load the descriptor here, otherwise the load would be after optimization + * barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0). + */ nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst)); + if (lane_size > 1 && !sctx->screen->use_aco) + img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc); /* Apply the blit output modifiers, once per sample. */ - foreach_sample(src_samples, i) { + foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { color[i] = apply_blit_output_modifiers(&b, color[i], options); } /* Initialize dst coordinates, one vector per pixel. */ - foreach_sample(dst_samples, i) { - coord_dst[i] = base_coord_dst; + foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) { + coord_dst[i] = nir_iadd(&b, base_coord_dst, nir_imm_ivec4(&b, x, y, z, 0)); if (options->dst_is_1d) coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4); if (options->dst_is_msaa) { - coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i], - nir_imm_int(&b, i), + coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i], nir_imm_int(&b, sample), num_dst_coords - 1); } } /* We don't want the computation of dst coordinates to be interleaved with stores. */ - if (dst_samples > 1) - optimization_barrier_vgpr_array(sctx, &b, coord_dst, dst_samples, num_dst_coords); + if (lane_size > 1 || dst_samples > 1) + optimization_barrier_vgpr_array(sctx, &b, coord_dst, lane_size * dst_samples, num_dst_coords); /* We don't want the application of blit output modifiers to be interleaved with stores. 
*/ - if (!options->is_clear && MIN2(src_samples, dst_samples) > 1) { - optimization_barrier_vgpr_array(sctx, &b, color, src_samples, + if (!options->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) { + optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples, options->last_dst_channel + 1); } /* Store the pixels, one per sample. */ - foreach_sample(dst_samples, i) { + foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) { nir_bindless_image_store(&b, img_dst_desc, coord_dst[i], nir_channel(&b, coord_dst[i], num_dst_coords - 1), src_samples > 1 ? color[i] : color[i / dst_samples], zero,