radeonsi: use MIMG D16 (16-bit data) for image instructions in compute blits
This reduces VGPR usage.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
This commit is contained in:
@@ -1037,6 +1037,9 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
|
||||
{
|
||||
struct si_texture *sdst = (struct si_texture *)info->dst.resource;
|
||||
bool is_3d_tiling = sdst->surface.thick_tiling;
|
||||
/* Get the channel sizes. */
|
||||
unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format);
|
||||
unsigned max_src_chan_size = util_format_get_max_channel_size(info->src.format);
|
||||
|
||||
/* Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
|
||||
* The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
|
||||
@@ -1170,6 +1173,16 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
|
||||
options.use_integer_one = util_format_is_pure_integer(info->dst.format) &&
|
||||
options.last_src_channel < options.last_dst_channel &&
|
||||
options.last_dst_channel == 3;
|
||||
bool is_resolve = options.src_is_msaa && !options.dst_is_msaa && !options.sample0_only;
|
||||
/* ACO doesn't support D16 on GFX8 */
|
||||
bool has_d16 = sctx->gfx_level >= (sctx->screen->use_aco ? GFX9 : GFX8);
|
||||
options.d16 = has_d16 &&
|
||||
/* Blitting FP16 using D16 has precision issues. Resolving has precision
|
||||
* issues all the way down to R11G11B10_FLOAT. */
|
||||
MIN2(max_dst_chan_size, max_src_chan_size) <=
|
||||
(util_format_is_pure_integer(info->dst.format) ?
|
||||
(options.sint_to_uint || options.uint_to_sint ? 10 : 16) :
|
||||
(is_resolve ? 10 : 11));
|
||||
|
||||
struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
|
||||
(void*)(uintptr_t)options.key);
|
||||
|
@@ -1647,6 +1647,7 @@ union si_compute_blit_shader_key {
|
||||
bool dst_is_msaa:1;
|
||||
bool src_has_z:1;
|
||||
bool dst_has_z:1;
|
||||
bool d16:1;
|
||||
uint8_t log2_samples:4;
|
||||
bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */
|
||||
/* Source coordinate modifiers. */
|
||||
|
@@ -294,7 +294,7 @@ static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_
|
||||
}
|
||||
|
||||
static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, nir_variable *img,
|
||||
unsigned num_samples, nir_def *coord)
|
||||
unsigned num_samples, nir_def *coord, unsigned bit_size)
|
||||
{
|
||||
nir_def *zero = nir_imm_int(b, 0);
|
||||
nir_def *result = NULL;
|
||||
@@ -303,11 +303,13 @@ static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, ni
|
||||
/* Gfx11 doesn't support samples_identical, so we can't use it. */
|
||||
if (sscreen->info.gfx_level < GFX11) {
|
||||
/* We need a local variable to get the result out of conditional branches in SSA. */
|
||||
var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);
|
||||
var = nir_local_variable_create(b->impl,
|
||||
bit_size == 16 ? glsl_f16vec_type(4) : glsl_vec4_type(),
|
||||
NULL);
|
||||
|
||||
/* If all samples are identical, load only sample 0. */
|
||||
nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
|
||||
result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
|
||||
result = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img), coord, zero, zero);
|
||||
nir_store_var(b, var, result, 0xf);
|
||||
|
||||
nir_push_else(b, NULL);
|
||||
@@ -324,13 +326,13 @@ static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, ni
|
||||
*/
|
||||
if (!sscreen->use_aco) {
|
||||
for (unsigned i = 0; i < num_samples; i++)
|
||||
sample_index[i] = nir_optimization_barrier_vgpr_amd(b, 32, sample_index[i]);
|
||||
sample_index[i] = nir_optimization_barrier_vgpr_amd(b, bit_size, sample_index[i]);
|
||||
}
|
||||
|
||||
/* Load all samples. */
|
||||
nir_def *samples[16];
|
||||
for (unsigned i = 0; i < num_samples; i++) {
|
||||
samples[i] = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
|
||||
samples[i] = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img),
|
||||
coord, sample_index[i], zero);
|
||||
}
|
||||
|
||||
@@ -349,17 +351,23 @@ static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, ni
|
||||
static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
|
||||
const union si_compute_blit_shader_key *options)
|
||||
{
|
||||
if (options->sint_to_uint)
|
||||
color = nir_imax(b, color, nir_imm_int(b, 0));
|
||||
unsigned bit_size = color->bit_size;
|
||||
nir_def *zero = nir_imm_intN_t(b, 0, bit_size);
|
||||
|
||||
if (options->uint_to_sint)
|
||||
color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));
|
||||
if (options->sint_to_uint)
|
||||
color = nir_imax(b, color, zero);
|
||||
|
||||
if (options->uint_to_sint) {
|
||||
color = nir_umin(b, color,
|
||||
nir_imm_intN_t(b, bit_size == 16 ? INT16_MAX : INT32_MAX,
|
||||
bit_size));
|
||||
}
|
||||
|
||||
if (options->dst_is_srgb)
|
||||
color = convert_linear_to_srgb(b, color);
|
||||
|
||||
nir_def *zero = nir_imm_int(b, 0);
|
||||
nir_def *one = options->use_integer_one ? nir_imm_int(b, 1) : nir_imm_float(b, 1);
|
||||
nir_def *one = options->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) :
|
||||
nir_imm_floatN_t(b, 1, bit_size);
|
||||
|
||||
/* Set channels not present in src to 0 or 1. This will eliminate code loading and resolving
|
||||
* those channels.
|
||||
@@ -479,13 +487,14 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
||||
/* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */
|
||||
|
||||
/* Execute the image loads and stores. */
|
||||
unsigned bit_size = options->d16 ? 16 : 32;
|
||||
unsigned num_samples = 1 << options->log2_samples;
|
||||
nir_def *color;
|
||||
|
||||
if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
|
||||
/* MSAA resolving (downsampling). */
|
||||
assert(num_samples > 1);
|
||||
color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src);
|
||||
color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src, bit_size);
|
||||
color = apply_blit_output_modifiers(&b, color, options);
|
||||
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
|
||||
|
||||
@@ -495,7 +504,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
||||
assert(num_samples > 1);
|
||||
/* Group loads together and then stores. */
|
||||
for (unsigned i = 0; i < num_samples; i++) {
|
||||
color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
|
||||
color[i] = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src,
|
||||
nir_imm_int(&b, i), zero);
|
||||
}
|
||||
for (unsigned i = 0; i < num_samples; i++)
|
||||
@@ -507,7 +516,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
||||
} else if (!options->src_is_msaa && options->dst_is_msaa) {
|
||||
/* MSAA upsampling. */
|
||||
assert(num_samples > 1);
|
||||
color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
|
||||
color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero);
|
||||
color = apply_blit_output_modifiers(&b, color, options);
|
||||
for (unsigned i = 0; i < num_samples; i++) {
|
||||
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
|
||||
@@ -517,7 +526,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
||||
/* Non-MSAA copy or read sample 0 only. */
|
||||
/* src2 = sample_index (zero), src3 = lod (zero) */
|
||||
assert(num_samples == 1);
|
||||
color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
|
||||
color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero);
|
||||
color = apply_blit_output_modifiers(&b, color, options);
|
||||
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
|
||||
}
|
||||
|
Reference in New Issue
Block a user