From 1becc6953cbd656f1e6172fc425c37fadb3cf41f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 22 Apr 2024 03:39:26 -0400 Subject: [PATCH] ac/nir: import the MSAA resolving pixel shader from radeonsi It has a lot of options for efficiency. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_nir.c | 48 ++++ src/amd/common/ac_nir_helpers.h | 8 + src/amd/common/ac_nir_meta.h | 41 +++ src/amd/common/ac_nir_meta_ps_resolve.c | 166 ++++++++++++ src/amd/common/meson.build | 2 + src/gallium/drivers/radeonsi/si_blit.c | 58 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 17 +- .../drivers/radeonsi/si_shaderlib_nir.c | 251 ++---------------- 8 files changed, 326 insertions(+), 265 deletions(-) create mode 100644 src/amd/common/ac_nir_meta.h create mode 100644 src/amd/common/ac_nir_meta_ps_resolve.c diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index 8e5603029e8..aa17c2f8224 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "ac_gpu_info.h" #include "ac_nir.h" #include "ac_nir_helpers.h" #include "sid.h" @@ -1509,3 +1510,50 @@ ac_nir_opt_pack_half(nir_shader *shader, enum amd_gfx_level gfx_level) } return progress; } + +nir_def * +ac_average_samples(nir_builder *b, nir_def **samples, unsigned num_samples) +{ + /* This works like add-reduce by computing the sum of each pair independently, and then + * computing the sum of each pair of sums, and so on, to get better instruction-level + * parallelism. + */ + if (num_samples == 16) { + for (unsigned i = 0; i < 8; i++) + samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); + } + if (num_samples >= 8) { + for (unsigned i = 0; i < 4; i++) + samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); + } + if (num_samples >= 4) { + for (unsigned i = 0; i < 2; i++) + samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); + } + if (num_samples >= 2) + samples[0] = nir_fadd(b, samples[0], samples[1]); + + return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */ +} + +void +ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder *b, + nir_def **array, unsigned num_elements, + unsigned num_components) +{ + /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its + * instruction scheduling options. + * + * VMEM clauses are supported since GFX10. It's not recommended to use the optimization + * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization + * barriers would unnecessarily increase VGPR usage for MSAA resources. 
+ */ + if (!b->shader->info.use_aco_amd && info->gfx_level >= GFX10) { + for (unsigned i = 0; i < num_elements; i++) { + unsigned prev_num = array[i]->num_components; + array[i] = nir_trim_vector(b, array[i], num_components); + array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]); + array[i] = nir_pad_vector(b, array[i], prev_num); + } + } +} diff --git a/src/amd/common/ac_nir_helpers.h b/src/amd/common/ac_nir_helpers.h index e7e1537b822..53ce943e09a 100644 --- a/src/amd/common/ac_nir_helpers.h +++ b/src/amd/common/ac_nir_helpers.h @@ -136,6 +136,14 @@ ac_nir_cull_primitive(nir_builder *b, void ac_nir_sleep(nir_builder *b, unsigned num_cycles); +nir_def * +ac_average_samples(nir_builder *b, nir_def **samples, unsigned num_samples); + +void +ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder *b, + nir_def **array, unsigned num_elements, + unsigned num_components); + #ifdef __cplusplus } #endif diff --git a/src/amd/common/ac_nir_meta.h b/src/amd/common/ac_nir_meta.h new file mode 100644 index 00000000000..be231f24754 --- /dev/null +++ b/src/amd/common/ac_nir_meta.h @@ -0,0 +1,41 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef AC_NIR_META_H +#define AC_NIR_META_H + +#include "ac_gpu_info.h" +#include "nir.h" + +union ac_ps_resolve_key { + struct { + bool use_aco:1; + bool src_is_array:1; + uint8_t log_samples:2; + uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ + uint8_t last_dst_channel:2; + bool x_clamp_to_edge:1; + bool y_clamp_to_edge:1; + bool a16:1; + bool d16:1; + }; + uint64_t key; /* use with hash_table_u64 */ +}; + +/* Only immutable settings. */ +struct ac_ps_resolve_options { + const nir_shader_compiler_options *nir_options; + const struct radeon_info *info; + bool use_aco; /* global driver setting */ + bool no_fmask; /* FMASK disabled by a debug option, ignored on GFX11+ */ + bool print_key; /* print ac_ps_resolve_key into stderr */ +}; + +nir_shader * +ac_create_resolve_ps(const struct ac_ps_resolve_options *options, + const union ac_ps_resolve_key *key); + +#endif diff --git a/src/amd/common/ac_nir_meta_ps_resolve.c b/src/amd/common/ac_nir_meta_ps_resolve.c new file mode 100644 index 00000000000..bccb3bf50fb --- /dev/null +++ b/src/amd/common/ac_nir_meta_ps_resolve.c @@ -0,0 +1,166 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include "ac_nir_meta.h" +#include "ac_nir_helpers.h" +#include "nir_builder.h" +#include "compiler/aco_interface.h" + +static nir_def * +build_tex_load_ms(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_deref_instr *tex_deref, nir_def *coord, nir_def *sample_index) +{ + nir_tex_src srcs[] = { + nir_tex_src_for_ssa(nir_tex_src_coord, coord), + nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_index), + }; + nir_def *result = nir_build_tex_deref_instr(b, nir_texop_txf_ms, tex_deref, tex_deref, + ARRAY_SIZE(srcs), srcs); + + nir_tex_instr *tex = nir_instr_as_tex(result->parent_instr); + + assert(bit_size == 32 || bit_size == 16); + if (bit_size == 16) { + tex->dest_type = nir_type_float16; + tex->def.bit_size = 16; + } + return nir_trim_vector(b, result, num_components); +} + +nir_shader * +ac_create_resolve_ps(const struct ac_ps_resolve_options *options, + const union ac_ps_resolve_key *key) +{ + if (options->print_key) { + fprintf(stderr, "Internal shader: resolve_ps\n"); + fprintf(stderr, " key.use_aco = %u\n", key->use_aco); + fprintf(stderr, " key.src_is_array = %u\n", key->src_is_array); + fprintf(stderr, " key.log_samples = %u\n", key->log_samples); + fprintf(stderr, " key.last_src_channel = %u\n", key->last_src_channel); + fprintf(stderr, " key.x_clamp_to_edge = %u\n", key->x_clamp_to_edge); + fprintf(stderr, " key.y_clamp_to_edge = %u\n", key->y_clamp_to_edge); + fprintf(stderr, " key.d16 = %u\n", key->d16); + fprintf(stderr, " key.a16 = %u\n", key->a16); + fprintf(stderr, "\n"); + } + + nir_builder b = + nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options->nir_options, "ac_resolve_ps"); + b.shader->info.use_aco_amd = options->use_aco || + (key->use_aco && aco_is_gpu_supported(options->info)); + BITSET_SET(b.shader->info.textures_used, 1); + + const struct glsl_type *sampler_type = + glsl_sampler_type(GLSL_SAMPLER_DIM_MS, /*shadow*/ false, /*is_array*/ key->src_is_array, + GLSL_TYPE_FLOAT); + nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, sampler_type, "samp0"); + sampler->data.binding = 0; + + nir_deref_instr *deref = nir_build_deref_var(&b, sampler); + nir_def *zero = nir_imm_int(&b, 0); + nir_def *baryc = nir_load_barycentric_pixel(&b, 32, .interp_mode = INTERP_MODE_SMOOTH); + nir_def *coord = nir_load_interpolated_input(&b, 2 + key->src_is_array, 32, baryc, zero, + .dest_type = nir_type_float32, + .io_semantics = (nir_io_semantics){ + .location = VARYING_SLOT_VAR0, + .num_slots = 1}); + + /* Nearest filtering floors and then converts to integer, and then + * applies clamp to edge as clamp(coord, 0, dim - 1). + */ + coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 0)), 0); + coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 1)), 1); + coord = nir_f2iN(&b, coord, key->a16 ? 16 : 32); + + /* Clamp to edge only for X and Y because Z can't be out of bounds. */ + nir_def *resinfo = NULL; + for (unsigned chan = 0; chan < 2; chan++) { + if (chan ? 
key->y_clamp_to_edge : key->x_clamp_to_edge) { + if (!resinfo) { + resinfo = nir_build_tex_deref_instr(&b, nir_texop_txs, deref, deref, 0, NULL); + + if (key->a16) { + resinfo = nir_umin_imm(&b, resinfo, INT16_MAX); + resinfo = nir_i2i16(&b, resinfo); + } + } + + nir_def *tmp = nir_channel(&b, coord, chan); + tmp = nir_imax_imm(&b, tmp, 0); + tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, resinfo, chan), -1)); + coord = nir_vector_insert_imm(&b, coord, tmp, chan); + } + } + + /* Use samples_identical if it's supported. */ + bool uses_samples_identical = options->info->gfx_level < GFX11 && !options->no_fmask; + nir_def *sample0 = NULL; + nir_if *if_identical = NULL; + + assert(key->last_src_channel <= key->last_dst_channel); + + if (uses_samples_identical) { + nir_tex_src iden_srcs[] = { + nir_tex_src_for_ssa(nir_tex_src_coord, coord), + }; + nir_def *samples_identical = + nir_build_tex_deref_instr(&b, nir_texop_samples_identical, deref, deref, + ARRAY_SIZE(iden_srcs), iden_srcs); + + /* If all samples are identical, load only sample 0. */ + if_identical = nir_push_if(&b, samples_identical); + { + sample0 = build_tex_load_ms(&b, key->last_src_channel + 1, key->d16 ? 16 : 32, + deref, coord, nir_imm_intN_t(&b, 0, coord->bit_size)); + } + nir_push_else(&b, if_identical); + } + + /* Insert the sample index into the coordinates. */ + unsigned num_src_coords = 2 + key->src_is_array + 1; + unsigned num_samples = 1 << key->log_samples; + nir_def *coord_src[16] = {0}; + + for (unsigned i = 0; i < num_samples; i++) { + coord_src[i] = nir_pad_vector(&b, coord, num_src_coords); + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], + nir_imm_intN_t(&b, i, coord->bit_size), + num_src_coords - 1); + } + + /* We need this because LLVM interleaves coordinate computations with image loads, which breaks + * VMEM clauses. + */ + ac_optimization_barrier_vgpr_array(options->info, &b, coord_src, num_samples, num_src_coords); + + nir_def *samples[16] = {0}; + for (unsigned i = 0; i < num_samples; i++) { + samples[i] = build_tex_load_ms(&b, key->last_src_channel + 1, key->d16 ? 16 : 32, + deref, nir_trim_vector(&b, coord_src[i], num_src_coords - 1), + nir_channel(&b, coord_src[i], num_src_coords - 1)); + } + nir_def *result = ac_average_samples(&b, samples, num_samples); + + if (uses_samples_identical) { + nir_pop_if(&b, if_identical); + result = nir_if_phi(&b, sample0, result); + } + + result = nir_pad_vector(&b, result, key->last_dst_channel + 1); + for (unsigned i = key->last_src_channel + 1; i <= key->last_dst_channel; i++) { + result = nir_vector_insert_imm(&b, result, + nir_imm_floatN_t(&b, i == 3 ? 1 : 0, result->bit_size), i); + } + + nir_store_output(&b, result, zero, + .write_mask = BITFIELD_MASK(key->last_dst_channel + 1), + .src_type = key->d16 ? 
nir_type_float16 : nir_type_float32, + .io_semantics = (nir_io_semantics){ + .location = FRAG_RESULT_DATA0, + .num_slots = 1}); + + return b.shader; +} diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 72a5660b840..96ed5b3d24a 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -113,6 +113,8 @@ amd_common_files = files( 'ac_nir_lower_tex.c', 'ac_nir_lower_ngg.c', 'ac_nir_lower_ps.c', + 'ac_nir_meta.h', + 'ac_nir_meta_ps_resolve.c', 'amd_family.c', 'ac_parse_ib.c', 'ac_perfcounter.c', diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 5039bc05d73..b19337750a6 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -10,6 +10,7 @@ #include "util/u_log.h" #include "util/u_surface.h" #include "util/hash_table.h" +#include "ac_nir_meta.h" enum { @@ -1303,29 +1304,29 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) /* No scaling */ (info->dst.box.width == abs(info->src.box.width) && info->dst.box.height == abs(info->src.box.height)))) { - union si_resolve_ps_key options; - options.key = 0; + union ac_ps_resolve_key key; + key.key = 0; /* LLVM is slower on GFX10.3 and older because it doesn't form VMEM clauses and it's more * difficult to force them with optimization barriers when FMASK is used. */ - options.use_aco = true; - options.src_is_array = info->src.resource->target == PIPE_TEXTURE_1D_ARRAY || - info->src.resource->target == PIPE_TEXTURE_2D_ARRAY || - info->src.resource->target == PIPE_TEXTURE_CUBE || - info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY; - options.log_samples = util_logbase2(info->src.resource->nr_samples); - options.last_dst_channel = util_format_get_last_component(info->dst.format); - options.last_src_channel = util_format_get_last_component(info->src.format); - options.last_src_channel = MIN2(options.last_src_channel, options.last_dst_channel); - options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); - options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); - options.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) && - util_is_box_sint16(&info->src.box); + key.use_aco = true; + key.src_is_array = info->src.resource->target == PIPE_TEXTURE_1D_ARRAY || + info->src.resource->target == PIPE_TEXTURE_2D_ARRAY || + info->src.resource->target == PIPE_TEXTURE_CUBE || + info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY; + key.log_samples = util_logbase2(info->src.resource->nr_samples); + key.last_dst_channel = util_format_get_last_component(info->dst.format); + key.last_src_channel = util_format_get_last_component(info->src.format); + key.last_src_channel = MIN2(key.last_src_channel, key.last_dst_channel); + key.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); + key.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); + key.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) && + util_is_box_sint16(&info->src.box); unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format); unsigned max_src_chan_size = util_format_get_max_channel_size(info->src.format); - if (options.use_aco && util_format_is_float(info->dst.format) && max_dst_chan_size == 32) { + if (key.use_aco && util_format_is_float(info->dst.format) && max_dst_chan_size == 32) { /* TODO: ACO doesn't meet precision expectations of this test when the destination format * is R32G32B32A32_FLOAT, the source 
format is R8G8B8A8_UNORM, and the resolving math uses * FP16. It's theoretically arguable whether FP16 is legal in this case. LLVM passes @@ -1333,19 +1334,28 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) * * piglit/bin/copyteximage CUBE -samples=2 -auto */ - options.d16 = 0; + key.d16 = 0; } else { /* Resolving has precision issues all the way down to R11G11B10_FLOAT. */ - options.d16 = ((!options.use_aco && !sctx->screen->use_aco && sctx->gfx_level >= GFX8) || - /* ACO doesn't support D16 on GFX8 */ - ((options.use_aco || sctx->screen->use_aco) && sctx->gfx_level >= GFX9)) && - MIN2(max_dst_chan_size, max_src_chan_size) <= 10; + key.d16 = ((!key.use_aco && !sctx->screen->use_aco && sctx->gfx_level >= GFX8) || + /* ACO doesn't support D16 on GFX8 */ + ((key.use_aco || sctx->screen->use_aco) && sctx->gfx_level >= GFX9)) && + MIN2(max_dst_chan_size, max_src_chan_size) <= 10; } - fs = _mesa_hash_table_u64_search(sctx->ps_resolve_shaders, options.key); + fs = _mesa_hash_table_u64_search(sctx->ps_resolve_shaders, key.key); if (!fs) { - fs = si_create_resolve_ps(sctx, &options); - _mesa_hash_table_u64_insert(sctx->ps_resolve_shaders, options.key, fs); + struct ac_ps_resolve_options options = { + .nir_options = sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, + PIPE_SHADER_FRAGMENT), + .info = &sctx->screen->info, + .use_aco = sctx->screen->use_aco, + .no_fmask = sctx->screen->debug_flags & DBG(NO_FMASK), + .print_key = si_can_dump_shader(sctx->screen, MESA_SHADER_FRAGMENT, SI_DUMP_SHADER_KEY), + }; + + fs = si_create_shader_state(sctx, ac_create_resolve_ps(&options, &key)); + _mesa_hash_table_u64_insert(sctx->ps_resolve_shaders, key.key, fs); } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 742763d0b5c..9ff7cedc006 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1634,6 +1634,7 @@ void si_suspend_queries(struct si_context *sctx); void si_resume_queries(struct si_context *sctx); /* si_shaderlib_nir.c */ +void *si_create_shader_state(struct si_context *sctx, struct nir_shader *nir); void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf); void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex); void *si_create_passthrough_tcs(struct si_context *sctx); @@ -1680,23 +1681,7 @@ union si_compute_blit_shader_key { uint64_t key; }; -union si_resolve_ps_key { - struct { - bool use_aco:1; - bool src_is_array:1; - uint8_t log_samples:2; - uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ - uint8_t last_dst_channel:2; - bool x_clamp_to_edge:1; - bool y_clamp_to_edge:1; - bool a16:1; - bool d16:1; - }; - uint64_t key; -}; - void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options); -void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options); void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 1cdfdd0f23c..820bf569f8e 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -11,8 +11,9 @@ #include "si_query.h" #include "aco_interface.h" #include "nir_format_convert.h" +#include 
"ac_nir_helpers.h" -static void *create_shader_state(struct si_context *sctx, nir_shader *nir) +void *si_create_shader_state(struct si_context *sctx, nir_shader *nir) { sctx->b.screen->finalize_nir(sctx->b.screen, (void*)nir); return pipe_shader_from_nir(&sctx->b, nir); @@ -106,7 +107,7 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf) zero, zero, zero); /* z, sample, pipe_xor */ nir_store_ssbo(&b, value, zero, dst_offset, .write_mask=0x1, .align_mul=1); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex) @@ -150,7 +151,7 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture * */ nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ @@ -183,7 +184,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx) nir_store_ssbo(&b, data, zero, address, .align_mul = 4); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* This is used when TCS is NULL in the VS->TCS->TES chain. In this case, @@ -202,7 +203,7 @@ void *si_create_passthrough_tcs(struct si_context *sctx) nir_shader *tcs = nir_create_passthrough_tcs_impl(sctx->screen->nir_options, locations, info->num_outputs, sctx->patch_vertices); - return create_shader_state(sctx, tcs); + return si_create_shader_state(sctx, tcs); } static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input) @@ -218,30 +219,6 @@ static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input) return input; } -static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_samples) -{ - /* This works like add-reduce by computing the sum of each pair independently, and then - * computing the sum of each pair of sums, and so on, to get better instruction-level - * parallelism. - */ - if (num_samples == 16) { - for (unsigned i = 0; i < 8; i++) - samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); - } - if (num_samples >= 8) { - for (unsigned i = 0; i < 4; i++) - samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); - } - if (num_samples >= 4) { - for (unsigned i = 0; i < 2; i++) - samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]); - } - if (num_samples >= 2) - samples[0] = nir_fadd(b, samples[0], samples[1]); - - return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */ -} - static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, const union si_compute_blit_shader_key *options) { @@ -290,27 +267,6 @@ static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, return color; } -static void optimization_barrier_vgpr_array(struct si_context *sctx, nir_builder *b, - nir_def **array, unsigned num_elements, - unsigned num_components) -{ - /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its - * instruction scheduling options. - * - * VMEM clauses are supported since GFX10. It's not recommended to use the optimization - * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization - * barriers would unnecessarily increase VGPR usage for MSAA resources. 
- */ - if (!b->shader->info.use_aco_amd && sctx->gfx_level >= GFX10) { - for (unsigned i = 0; i < num_elements; i++) { - unsigned prev_num = array[i]->num_components; - array[i] = nir_trim_vector(b, array[i], num_components); - array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]); - array[i] = nir_pad_vector(b, array[i], prev_num); - } - } -} - /* The compute blit shader. * * Implementation details: @@ -541,8 +497,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha /* We don't want the computation of src coordinates to be interleaved with loads. */ if (lane_size > 1 || src_samples > 1) { - optimization_barrier_vgpr_array(sctx, &b, coord_src, lane_size * src_samples, - num_src_coords); + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, coord_src, + lane_size * src_samples, num_src_coords); } /* Use "samples_identical" for MSAA resolving if it's supported. */ @@ -588,14 +544,14 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha /* Resolve MSAA if necessary. */ if (is_resolve) { /* We don't want the averaging of samples to be interleaved with image loads. */ - optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples, - options->last_src_channel + 1); + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, color, lane_size * src_samples, + options->last_src_channel + 1); /* This reduces the "color" array from "src_samples * lane_size" elements to only * "lane_size" elements. */ foreach_pixel_in_lane(1, sample, x, y, z, i) { - color[i] = average_samples(&b, &color[i * src_samples], src_samples); + color[i] = ac_average_samples(&b, &color[i * src_samples], src_samples); } src_samples = 1; } @@ -634,13 +590,15 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha } /* We don't want the computation of dst coordinates to be interleaved with stores. */ - if (lane_size > 1 || dst_samples > 1) - optimization_barrier_vgpr_array(sctx, &b, coord_dst, lane_size * dst_samples, num_dst_coords); + if (lane_size > 1 || dst_samples > 1) { + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, coord_dst, lane_size * dst_samples, + num_dst_coords); + } /* We don't want the application of blit output modifiers to be interleaved with stores. */ if (!options->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) { - optimization_barrier_vgpr_array(sctx, &b, color, lane_size * src_samples, - options->last_dst_channel + 1); + ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, color, lane_size * src_samples, + options->last_dst_channel + 1); } /* Store the pixels, one per sample. */ @@ -655,7 +613,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha if (options->has_start_xyz) nir_pop_if(&b, if_positive); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Store the clear color at the beginning of every 256B block. 
This is required when we clear DCC @@ -694,7 +652,7 @@ void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, un nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0), clear_color, nir_imm_int(&b, 0)); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) @@ -714,7 +672,7 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0), store_address, .access = ACCESS_RESTRICT); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ @@ -745,7 +703,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_ nir_store_ssbo(&b, value, nir_imm_int(&b, !is_clear), offset, .access = ACCESS_RESTRICT); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Load samples from the image, and copy them to the same image. This looks like @@ -764,7 +722,7 @@ void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, b /* Return an empty compute shader */ if (num_samples == 0) - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); b.shader->info.num_images = 1; @@ -803,7 +761,7 @@ void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, b .image_array = is_array); } - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* This is just a pass-through shader with 1-3 MOV instructions. */ @@ -873,7 +831,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, SYSTEM_VALUE_INSTANCE_ID, glsl_int_type())); } - *vs = create_shader_state(sctx, b.shader); + *vs = si_create_shader_state(sctx, b.shader); return *vs; } @@ -1251,7 +1209,7 @@ void *si_create_query_result_cs(struct si_context *sctx) } nir_pop_if(&b, if_acc_chaining); - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); } /* Create the compute shader that is used to collect the results of gfx10+ @@ -1499,162 +1457,5 @@ void *gfx11_create_sh_query_result_cs(struct si_context *sctx) } nir_pop_if(&b, if_write_summary_buffer); - return create_shader_state(sctx, b.shader); -} - -static nir_def *build_tex_load_ms(nir_builder *b, unsigned num_components, unsigned bit_size, - nir_deref_instr *tex_deref, nir_def *coord, nir_def *sample_index) -{ - nir_tex_src srcs[] = { - nir_tex_src_for_ssa(nir_tex_src_coord, coord), - nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_index), - }; - nir_def *result = nir_build_tex_deref_instr(b, nir_texop_txf_ms, tex_deref, tex_deref, - ARRAY_SIZE(srcs), srcs); - - nir_tex_instr *tex = nir_instr_as_tex(result->parent_instr); - - assert(bit_size == 32 || bit_size == 16); - if (bit_size == 16) { - tex->dest_type = nir_type_float16; - tex->def.bit_size = 16; - } - return nir_trim_vector(b, result, num_components); -} - -void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options) -{ - if (si_can_dump_shader(sctx->screen, MESA_SHADER_FRAGMENT, SI_DUMP_SHADER_KEY)) { - fprintf(stderr, "Internal shader: resolve_ps\n"); - fprintf(stderr, " options.use_aco = %u\n", options->use_aco); - fprintf(stderr, " options.src_is_array = %u\n", options->src_is_array); - fprintf(stderr, " 
options.log_samples = %u\n", options->log_samples); - fprintf(stderr, " options.last_src_channel = %u\n", options->last_src_channel); - fprintf(stderr, " options.x_clamp_to_edge = %u\n", options->x_clamp_to_edge); - fprintf(stderr, " options.y_clamp_to_edge = %u\n", options->y_clamp_to_edge); - fprintf(stderr, " options.d16 = %u\n", options->d16); - fprintf(stderr, " options.a16 = %u\n", options->a16); - fprintf(stderr, "\n"); - } - - const nir_shader_compiler_options *nir_options = - sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_FRAGMENT); - nir_builder b = - nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options, "si_resolve_ps"); - - b.shader->info.use_aco_amd = sctx->screen->use_aco || - (options->use_aco && aco_is_gpu_supported(&sctx->screen->info)); - BITSET_SET(b.shader->info.textures_used, 1); - - const struct glsl_type *sampler_type = - glsl_sampler_type(GLSL_SAMPLER_DIM_MS, /*shadow*/ false, /*is_array*/ options->src_is_array, - GLSL_TYPE_FLOAT); - nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, sampler_type, "samp0"); - sampler->data.binding = 0; - - nir_deref_instr *deref = nir_build_deref_var(&b, sampler); - nir_def *zero = nir_imm_int(&b, 0); - nir_def *baryc = nir_load_barycentric_pixel(&b, 32, .interp_mode = INTERP_MODE_SMOOTH); - nir_def *coord = nir_load_interpolated_input(&b, 2 + options->src_is_array, 32, baryc, zero, - .dest_type = nir_type_float32, - .io_semantics = (nir_io_semantics){ - .location = VARYING_SLOT_VAR0, - .num_slots = 1}); - - /* Nearest filtering floors and then converts to integer, and then - * applies clamp to edge as clamp(coord, 0, dim - 1). - */ - coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 0)), 0); - coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 1)), 1); - coord = nir_f2iN(&b, coord, options->a16 ? 16 : 32); - - /* Clamp to edge only for X and Y because Z can't be out of bounds. */ - nir_def *resinfo = NULL; - for (unsigned chan = 0; chan < 2; chan++) { - if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) { - if (!resinfo) { - resinfo = nir_build_tex_deref_instr(&b, nir_texop_txs, deref, deref, 0, NULL); - - if (options->a16) { - resinfo = nir_umin_imm(&b, resinfo, INT16_MAX); - resinfo = nir_i2i16(&b, resinfo); - } - } - - nir_def *tmp = nir_channel(&b, coord, chan); - tmp = nir_imax_imm(&b, tmp, 0); - tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, resinfo, chan), -1)); - coord = nir_vector_insert_imm(&b, coord, tmp, chan); - } - } - - /* Use samples_identical if it's supported. */ - bool uses_samples_identical = sctx->gfx_level < GFX11 && - !(sctx->screen->debug_flags & DBG(NO_FMASK)); - nir_def *sample0 = NULL; - nir_if *if_identical = NULL; - - assert(options->last_src_channel <= options->last_dst_channel); - - if (uses_samples_identical) { - nir_tex_src iden_srcs[] = { - nir_tex_src_for_ssa(nir_tex_src_coord, coord), - }; - nir_def *samples_identical = - nir_build_tex_deref_instr(&b, nir_texop_samples_identical, deref, deref, - ARRAY_SIZE(iden_srcs), iden_srcs); - - /* If all samples are identical, load only sample 0. */ - if_identical = nir_push_if(&b, samples_identical); - { - sample0 = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32, - deref, coord, nir_imm_intN_t(&b, 0, coord->bit_size)); - } - nir_push_else(&b, if_identical); - } - - /* Insert the sample index into the coordinates. 
*/ - unsigned num_src_coords = 2 + options->src_is_array + 1; - unsigned num_samples = 1 << options->log_samples; - nir_def *coord_src[16] = {0}; - - for (unsigned i = 0; i < num_samples; i++) { - coord_src[i] = nir_pad_vector(&b, coord, num_src_coords); - coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], - nir_imm_intN_t(&b, i, coord->bit_size), - num_src_coords - 1); - } - - /* We need this because LLVM interleaves coordinate computations with image loads, which breaks - * VMEM clauses. - */ - optimization_barrier_vgpr_array(sctx, &b, coord_src, num_samples, num_src_coords); - - nir_def *samples[16] = {0}; - for (unsigned i = 0; i < num_samples; i++) { - samples[i] = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32, - deref, nir_trim_vector(&b, coord_src[i], num_src_coords - 1), - nir_channel(&b, coord_src[i], num_src_coords - 1)); - } - nir_def *result = average_samples(&b, samples, num_samples); - - if (uses_samples_identical) { - nir_pop_if(&b, if_identical); - result = nir_if_phi(&b, sample0, result); - } - - result = nir_pad_vector(&b, result, options->last_dst_channel + 1); - for (unsigned i = options->last_src_channel + 1; i <= options->last_dst_channel; i++) { - result = nir_vector_insert_imm(&b, result, - nir_imm_floatN_t(&b, i == 3 ? 1 : 0, result->bit_size), i); - } - - nir_store_output(&b, result, zero, - .write_mask = BITFIELD_MASK(options->last_dst_channel + 1), - .src_type = options->d16 ? nir_type_float16 : nir_type_float32, - .io_semantics = (nir_io_semantics){ - .location = FRAG_RESULT_DATA0, - .num_slots = 1}); - - return create_shader_state(sctx, b.shader); + return si_create_shader_state(sctx, b.shader); }
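
The driver-facing flow, as the si_blit.c hunk above shows, is: fill a per-variant ac_ps_resolve_key, describe the immutable compiler and GPU settings once in ac_ps_resolve_options, and cache the compiled shader by the raw 64-bit key. Below is a minimal sketch of that sequence in plain C. The names resolve_ps_cache, gpu_info, nir_opts and lookup_or_build_resolve_ps are hypothetical stand-ins for what radeonsi pulls from its screen/context, and the sketch caches the raw NIR for brevity where radeonsi actually caches the CSO returned by si_create_shader_state().

#include "util/hash_table.h"   /* _mesa_hash_table_u64_search/insert */
#include "util/u_math.h"       /* util_logbase2(), MIN2() */
#include "ac_nir_meta.h"       /* ac_ps_resolve_{key,options}, ac_create_resolve_ps() */

/* Hypothetical driver-side state; radeonsi gets the equivalents from
 * si_screen/si_context (see the si_blit.c hunk above). */
extern struct hash_table_u64 *resolve_ps_cache;
extern const struct radeon_info *gpu_info;
extern const nir_shader_compiler_options *nir_opts;

static nir_shader *
lookup_or_build_resolve_ps(unsigned nr_samples, bool src_is_array,
                           unsigned last_src_channel, unsigned last_dst_channel)
{
   union ac_ps_resolve_key key;
   key.key = 0; /* zero the whole u64 so unused bits hash consistently */
   key.use_aco = true;
   key.src_is_array = src_is_array;
   key.log_samples = util_logbase2(nr_samples);
   key.last_dst_channel = last_dst_channel;
   /* The key invariant: last_src_channel must not exceed last_dst_channel. */
   key.last_src_channel = MIN2(last_src_channel, last_dst_channel);

   nir_shader *ps = _mesa_hash_table_u64_search(resolve_ps_cache, key.key);
   if (!ps) {
      const struct ac_ps_resolve_options options = {
         .nir_options = nir_opts,
         .info = gpu_info,
         .use_aco = false,    /* global driver setting */
         .no_fmask = false,   /* set when FMASK is disabled by a debug option */
         .print_key = false,
      };
      ps = ac_create_resolve_ps(&options, &key);
      _mesa_hash_table_u64_insert(resolve_ps_cache, key.key, ps);
   }
   return ps;
}

The union-with-uint64_t layout is what makes this cheap: hash_table_u64 hashes and compares the whole key as a single integer, so adding a key bit never touches the caching code.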
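
One detail worth spelling out from the imported ac_average_samples() helper: a serial sum would create a dependency chain of num_samples - 1 fadds, while the pairwise scheme sums disjoint pairs, then pairs of those sums, so the adds form a log2(num_samples)-deep tree of independent instructions. A self-contained scalar model of the same reduction, assuming plain floats where the real helper operates on nir_def values (the rolled loop computes exactly what the unrolled Mesa version does):

#include <stdio.h>

/* Scalar model of ac_average_samples(): sum adjacent pairs in place, then
 * pairs of those sums, halving the live count each round; num_samples is a
 * power of two (1, 2, 4, 8 or 16), as MSAA sample counts always are. */
static float
average_samples(float *s, unsigned num_samples)
{
   for (unsigned n = num_samples; n > 1; n /= 2) {
      for (unsigned i = 0; i < n / 2; i++)
         s[i] = s[i * 2] + s[i * 2 + 1];
   }
   return s[0] * (1.0f / num_samples); /* average the sum */
}

int main(void)
{
   float samples[8] = {1, 2, 3, 4, 5, 6, 7, 8};
   printf("%g\n", average_samples(samples, 8)); /* prints 4.5 */
   return 0;
}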