diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index f0adffb7a9c..8d1348dc27c 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -899,6 +899,7 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, if (info->dst.format == PIPE_FORMAT_A8R8_UNORM || /* This format fails AMD_TEST=imagecopy. */ max_dst_chan_size == 5 || /* PIPE_FORMAT_R5G5B5A1_UNORM has precision issues */ util_format_is_depth_or_stencil(info->dst.resource->format) || + dst_samples > SI_MAX_COMPUTE_BLIT_SAMPLES || info->dst_sample != 0 || /* Image stores support DCC since GFX10. Return only for gfx queues. DCC is disabled * for compute queues farther below. */ @@ -912,7 +913,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, (info->dst.box.width != abs(info->src.box.width) || info->dst.box.height != abs(info->src.box.height) || info->dst.box.depth != abs(info->src.box.depth) || - util_format_is_depth_or_stencil(info->src.resource->format)))) + util_format_is_depth_or_stencil(info->src.resource->format) || + src_samples > SI_MAX_COMPUTE_BLIT_SAMPLES))) return false; /* Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit. @@ -1104,7 +1106,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, bool has_d16 = sctx->gfx_level >= (sctx->screen->use_aco ? GFX9 : GFX8); if (is_clear) { - options.log2_samples = util_logbase2(dst_samples); + assert(dst_samples <= 8); + options.log_samples = util_logbase2(dst_samples); options.d16 = has_d16 && max_dst_chan_size <= (util_format_is_float(info->dst.format) || util_format_is_pure_integer(info->dst.format) ? 16 : 11); @@ -1117,10 +1120,11 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, info->src.resource->target == PIPE_TEXTURE_1D_ARRAY || info->src.resource->target == PIPE_TEXTURE_2D_ARRAY || info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY; - /* Resolving integer formats only copies sample 0. log2_samples is then unused. */ + /* Resolving integer formats only copies sample 0. log_samples is then unused. */ options.sample0_only = sample0_only; unsigned num_samples = MAX2(src_samples, dst_samples); - options.log2_samples = sample0_only ? 0 : util_logbase2(num_samples); + assert(num_samples <= 8); + options.log_samples = sample0_only ? 0 : util_logbase2(num_samples); options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); options.flip_x = info->src.box.width < 0; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 3468742eb05..8a4b798c8e9 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1632,6 +1632,8 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture * void *si_create_passthrough_tcs(struct si_context *sctx); void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim); +#define SI_MAX_COMPUTE_BLIT_SAMPLES 8 + union si_compute_blit_shader_key { struct { /* Workgroup settings. */ @@ -1646,7 +1648,7 @@ union si_compute_blit_shader_key { bool src_has_z:1; bool dst_has_z:1; bool d16:1; - uint8_t log2_samples:4; + uint8_t log_samples:2; bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */ /* Source coordinate modifiers. */ bool x_clamp_to_edge:1; @@ -1658,7 +1660,7 @@ union si_compute_blit_shader_key { bool uint_to_sint:1; bool dst_is_srgb:1; bool use_integer_one:1; - uint8_t last_src_channel:2; + uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ uint8_t last_dst_channel:2; }; uint64_t key; diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 0df3ca85199..823a473ba0c 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -195,13 +195,12 @@ static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input) /* There are small precision differences compared to CB, so the gfx blit will return slightly * different results. */ + for (unsigned i = 0; i < MIN2(3, input->num_components); i++) { + input = nir_vector_insert_imm(b, input, + nir_format_linear_to_srgb(b, nir_channel(b, input, i)), i); + } - nir_def *comp[4]; - for (unsigned i = 0; i < 3; i++) - comp[i] = nir_format_linear_to_srgb(b, nir_channel(b, input, i)); - comp[3] = nir_channel(b, input, 3); - - return nir_vec(b, comp, 4); + return input; } static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_samples) @@ -228,61 +227,6 @@ static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_ return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */ } -static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, nir_variable *img, - unsigned num_samples, nir_def *coord, unsigned bit_size) -{ - nir_def *zero = nir_imm_int(b, 0); - nir_def *result = NULL; - nir_variable *var = NULL; - - /* Gfx11 doesn't support samples_identical, so we can't use it. */ - if (sscreen->info.gfx_level < GFX11) { - /* We need a local variable to get the result out of conditional branches in SSA. */ - var = nir_local_variable_create(b->impl, - bit_size == 16 ? glsl_f16vec_type(4) : glsl_vec4_type(), - NULL); - - /* If all samples are identical, load only sample 0. */ - nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord)); - result = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img), coord, zero, zero); - nir_store_var(b, var, result, 0xf); - - nir_push_else(b, NULL); - } - - nir_def *sample_index[16]; - for (unsigned i = 0; i < num_samples; i++) - sample_index[i] = nir_imm_int(b, i); - - /* We need to hide the constant sample indices behind the optimization barrier, otherwise - * LLVM doesn't put loads into the same clause. - * - * TODO: nir_group_loads could do this. - */ - if (!sscreen->use_aco) { - for (unsigned i = 0; i < num_samples; i++) - sample_index[i] = nir_optimization_barrier_vgpr_amd(b, bit_size, sample_index[i]); - } - - /* Load all samples. */ - nir_def *samples[16]; - for (unsigned i = 0; i < num_samples; i++) { - samples[i] = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img), - coord, sample_index[i], zero); - } - - result = average_samples(b, samples, num_samples); - - if (sscreen->info.gfx_level < GFX11) { - /* Exit the conditional branch and get the result out of the branch. */ - nir_store_var(b, var, result, 0xf); - nir_pop_if(b, NULL); - result = nir_load_var(b, var); - } - - return result; -} - static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, const union si_compute_blit_shader_key *options) { @@ -304,12 +248,24 @@ static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, nir_def *one = options->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) : nir_imm_floatN_t(b, 1, bit_size); - /* Set channels not present in src to 0 or 1. This will eliminate code loading and resolving - * those channels. - */ - if (!options->is_clear) { - for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++) - color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan); + if (options->is_clear) { + if (options->last_dst_channel < 3) + color = nir_trim_vector(b, color, options->last_dst_channel + 1); + } else { + assert(options->last_src_channel <= options->last_dst_channel); + assert(color->num_components == options->last_src_channel + 1); + + /* Set channels not present in src to 0 or 1. */ + if (options->last_src_channel < options->last_dst_channel) { + color = nir_pad_vector(b, color, options->last_dst_channel + 1); + + for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++) + color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan); + } + + /* Discard channels not present in dst. The hardware fills unstored channels with 0. */ + if (options->last_dst_channel < options->last_src_channel) + color = nir_trim_vector(b, color, options->last_dst_channel + 1); } /* Discard channels not present in dst. The hardware fills unstored channels with 0. */ @@ -319,6 +275,27 @@ static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, return color; } +static void optimization_barrier_vgpr_array(struct si_context *sctx, nir_builder *b, + nir_def **array, unsigned num_elements, + unsigned num_components) +{ + /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its + * instruction scheduling options. + * + * VMEM clauses are supported since GFX10. It's not recommended to use the optimization + * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization + * barriers would unnecessarily increase VGPR usage for MSAA resources. + */ + if (!sctx->screen->use_aco && sctx->gfx_level >= GFX10) { + for (unsigned i = 0; i < num_elements; i++) { + unsigned prev_num = array[i]->num_components; + array[i] = nir_trim_vector(b, array[i], num_components); + array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]); + array[i] = nir_pad_vector(b, array[i], prev_num); + } + } +} + /* The compute blit shader. * * Implementation details: @@ -407,96 +384,164 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha } /* Add box.xyz. */ - nir_def *coord_src = NULL, *coord_dst = NULL, *dim = NULL; + nir_def *base_coord_src = NULL, *base_coord_dst = NULL; unpack_2x16_signed(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3), - &coord_src, &coord_dst); - coord_dst = nir_iadd(&b, coord_dst, dst_xyz); - coord_src = nir_iadd(&b, coord_src, src_xyz); - - /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */ - for (unsigned i = 0; i < 2; i++) { - if (i ? options->y_clamp_to_edge : options->x_clamp_to_edge) { - assert(!options->src_is_1d || i == 0); - - if (!dim) - dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero); - - nir_def *tmp = nir_channel(&b, coord_src, i); - tmp = nir_imax(&b, tmp, nir_imm_int(&b, 0)); - tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, dim, i), -1)); - coord_src = nir_vector_insert_imm(&b, coord_src, tmp, i); - } - } - - /* Swizzle coordinates for 1D_ARRAY. */ - static unsigned swizzle_xz[] = {0, 2, 0, 0}; - - if (options->src_is_1d) - coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4); - if (options->dst_is_1d) - coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4); + &base_coord_src, &base_coord_dst); + base_coord_dst = nir_iadd(&b, base_coord_dst, dst_xyz); + base_coord_src = nir_iadd(&b, base_coord_src, src_xyz); /* Coordinates must have 4 channels in NIR. */ - coord_src = nir_pad_vector(&b, coord_src, 4); - coord_dst = nir_pad_vector(&b, coord_dst, 4); + base_coord_src = nir_pad_vector(&b, base_coord_src, 4); + base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4); - /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */ +/* NOTE: This will be changed to a more complex loop in the future. */ +#define foreach_sample(num_samples, sample) \ + for (unsigned sample = 0; sample < (num_samples); sample++) - /* Execute the image loads and stores. */ + /* Swizzle coordinates for 1D_ARRAY. */ + static const unsigned swizzle_xz[] = {0, 2, 0, 0}; + + /* Execute image loads and stores. */ + unsigned num_src_coords = (options->src_is_1d ? 1 : 2) + options->src_has_z + options->src_is_msaa; + unsigned num_dst_coords = (options->dst_is_1d ? 1 : 2) + options->dst_has_z + options->dst_is_msaa; unsigned bit_size = options->d16 ? 16 : 32; - unsigned num_samples = 1 << options->log2_samples; - nir_def *color = NULL; + unsigned num_samples = 1 << options->log_samples; + unsigned src_samples = options->src_is_msaa && !options->sample0_only && + !options->is_clear ? num_samples : 1; + unsigned dst_samples = options->dst_is_msaa ? num_samples : 1; + nir_def *color[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *src_resinfo = NULL; if (options->is_clear) { - /* The clear color start at component 4 of user data. */ - color = nir_channels(&b, nir_load_user_data_amd(&b), - BITFIELD_RANGE(4, options->d16 ? 2 : 4)); + /* The clear color starts at component 4 of user data. */ + color[0] = nir_channels(&b, nir_load_user_data_amd(&b), + BITFIELD_RANGE(4, options->d16 ? 2 : 4)); if (options->d16) - color = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color)); + color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0])); + } else { + nir_def *coord_src[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + + /* Initialize src coordinates, one vector per pixel. */ + foreach_sample(src_samples, i) { + coord_src[i] = base_coord_src; + if (options->src_is_1d) + coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4); + if (options->src_is_msaa) { + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], nir_imm_int(&b, i), + num_src_coords - 1); + } + + /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */ + for (unsigned chan = 0; chan < 2; chan++) { + if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) { + assert(!options->src_is_1d || chan == 0); + + if (!src_resinfo) { + src_resinfo = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), + zero); + } + + nir_def *tmp = nir_channel(&b, coord_src[i], chan); + tmp = nir_imax_imm(&b, tmp, 0); + tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, src_resinfo, chan), -1)); + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], tmp, chan); + } + } + } + + /* We don't want the computation of src coordinates to be interleaved with loads. */ + if (src_samples > 1) { + optimization_barrier_vgpr_array(sctx, &b, coord_src, src_samples, + num_src_coords); + } + + /* Use "samples_identical" for MSAA resolving if it's supported. */ + bool is_resolve = src_samples > 1 && dst_samples == 1; + bool uses_samples_identical = sctx->gfx_level < GFX11 && + !(sctx->screen->debug_flags & DBG(NO_FMASK)) && is_resolve; + nir_def *samples_identical = NULL, *sample0 = {0}; + nir_if *if_identical = NULL; + + if (uses_samples_identical) { + samples_identical = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src), + coord_src[0], + .image_dim = GLSL_SAMPLER_DIM_MS); + + /* If all samples are identical, load only sample 0. */ + if_identical = nir_push_if(&b, samples_identical); + { + sample0 = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, + deref_ssa(&b, img_src), coord_src[0], + nir_channel(&b, coord_src[0], + num_src_coords - 1), zero, + .image_dim = img_src->type->sampler_dimensionality, + .image_array = img_src->type->sampler_array); + } + nir_push_else(&b, if_identical); + } + + /* Load src pixels, one per sample. */ + foreach_sample(src_samples, i) { + color[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, + deref_ssa(&b, img_src), coord_src[i], + nir_channel(&b, coord_src[i], num_src_coords - 1), zero, + .image_dim = img_src->type->sampler_dimensionality, + .image_array = img_src->type->sampler_array); + } + + /* Resolve MSAA if necessary. */ + if (is_resolve) { + /* We don't want the averaging of samples to be interleaved with image loads. */ + optimization_barrier_vgpr_array(sctx, &b, color, src_samples, + options->last_src_channel + 1); + + color[0] = average_samples(&b, color, src_samples); + src_samples = 1; + } + + if (uses_samples_identical) { + nir_pop_if(&b, if_identical); + color[0] = nir_if_phi(&b, sample0, color[0]); + } } - if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) { - /* MSAA resolving (downsampling). */ - assert(num_samples > 1 && !options->is_clear); - color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src, bit_size); - color = apply_blit_output_modifiers(&b, color, options); - nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero); + nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst)); - } else if (options->src_is_msaa && options->dst_is_msaa) { - /* MSAA copy. */ - nir_def *color[16]; - assert(num_samples > 1 && !options->is_clear); - /* Group loads together and then stores. */ - for (unsigned i = 0; i < num_samples; i++) { - color[i] = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, - nir_imm_int(&b, i), zero); - } - for (unsigned i = 0; i < num_samples; i++) - color[i] = apply_blit_output_modifiers(&b, color[i], options); - for (unsigned i = 0; i < num_samples; i++) { - nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, - nir_imm_int(&b, i), color[i], zero); - } - } else if (!options->src_is_msaa && options->dst_is_msaa) { - /* MSAA upsampling. */ - assert(num_samples > 1); - if (!options->is_clear) - color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero); + /* Apply the blit output modifiers, once per sample. */ + foreach_sample(src_samples, i) { + color[i] = apply_blit_output_modifiers(&b, color[i], options); + } - color = apply_blit_output_modifiers(&b, color, options); - for (unsigned i = 0; i < num_samples; i++) { - nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, - nir_imm_int(&b, i), color, zero); + /* Initialize dst coordinates, one vector per pixel. */ + foreach_sample(dst_samples, i) { + coord_dst[i] = base_coord_dst; + if (options->dst_is_1d) + coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4); + if (options->dst_is_msaa) { + coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i], + nir_imm_int(&b, i), + num_dst_coords - 1); } - } else { - /* Non-MSAA copy or read sample 0 only. */ - /* src2 = sample_index (zero), src3 = lod (zero) */ - assert(num_samples == 1); - if (!options->is_clear) - color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero); + } - color = apply_blit_output_modifiers(&b, color, options); - nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero); + /* We don't want the computation of dst coordinates to be interleaved with stores. */ + if (dst_samples > 1) + optimization_barrier_vgpr_array(sctx, &b, coord_dst, dst_samples, num_dst_coords); + + /* We don't want the application of blit output modifiers to be interleaved with stores. */ + if (!options->is_clear && MIN2(src_samples, dst_samples) > 1) { + optimization_barrier_vgpr_array(sctx, &b, color, src_samples, + options->last_dst_channel + 1); + } + + /* Store the pixels, one per sample. */ + foreach_sample(dst_samples, i) { + nir_bindless_image_store(&b, img_dst_desc, coord_dst[i], + nir_channel(&b, coord_dst[i], num_dst_coords - 1), + src_samples > 1 ? color[i] : color[i / dst_samples], zero, + .image_dim = glsl_get_sampler_dim(img_type[1]), + .image_array = glsl_sampler_type_is_array(img_type[1])); } if (options->has_start_xyz)