radeonsi: use MIMG D16 (16-bit data) for image instructions in compute blits

This reduces VGPR usage. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
2024-03-25 02:29:39 -04:00
parent d3638a9f58
commit 2423c5ad2f
3 changed files with 38 additions and 15 deletions
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -1037,6 +1037,9 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
 {
   struct si_texture *sdst = (struct si_texture *)info->dst.resource;
   bool is_3d_tiling = sdst->surface.thick_tiling;
+   /* Get the channel sizes. */
+   unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format);
+   unsigned max_src_chan_size = util_format_get_max_channel_size(info->src.format);

   /* Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
    * The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
@@ -1170,6 +1173,16 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   options.use_integer_one = util_format_is_pure_integer(info->dst.format) &&
                             options.last_src_channel < options.last_dst_channel &&
                             options.last_dst_channel == 3;
+   bool is_resolve = options.src_is_msaa && !options.dst_is_msaa && !options.sample0_only;
+   /* ACO doesn't support D16 on GFX8 */
+   bool has_d16 = sctx->gfx_level >= (sctx->screen->use_aco ? GFX9 : GFX8);
+   options.d16 = has_d16 &&
+                 /* Blitting FP16 using D16 has precision issues. Resolving has precision
+                  * issues all the way down to R11G11B10_FLOAT. */
+                 MIN2(max_dst_chan_size, max_src_chan_size) <=
+                 (util_format_is_pure_integer(info->dst.format) ?
+                     (options.sint_to_uint || options.uint_to_sint ? 10 : 16) :
+                     (is_resolve ? 10 : 11));

   struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
                                                      (void*)(uintptr_t)options.key);
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1647,6 +1647,7 @@ union si_compute_blit_shader_key {
      bool dst_is_msaa:1;
      bool src_has_z:1;
      bool dst_has_z:1;
+      bool d16:1;
      uint8_t log2_samples:4;
      bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */
      /* Source coordinate modifiers. */
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -294,7 +294,7 @@ static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_
 }

 static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, nir_variable *img,
-                                   unsigned num_samples, nir_def *coord)
+                                   unsigned num_samples, nir_def *coord, unsigned bit_size)
 {
   nir_def *zero = nir_imm_int(b, 0);
   nir_def *result = NULL;
@@ -303,11 +303,13 @@ static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, ni
   /* Gfx11 doesn't support samples_identical, so we can't use it. */
   if (sscreen->info.gfx_level < GFX11) {
      /* We need a local variable to get the result out of conditional branches in SSA. */
-      var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);
+      var = nir_local_variable_create(b->impl,
+                                      bit_size == 16 ? glsl_f16vec_type(4) : glsl_vec4_type(),
+                                      NULL);

      /* If all samples are identical, load only sample 0. */
      nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
-      result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
+      result = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img), coord, zero, zero);
      nir_store_var(b, var, result, 0xf);

      nir_push_else(b, NULL);
@@ -324,13 +326,13 @@ static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, ni
    */
   if (!sscreen->use_aco) {
      for (unsigned i = 0; i < num_samples; i++)
-         sample_index[i] = nir_optimization_barrier_vgpr_amd(b, 32, sample_index[i]);
+         sample_index[i] = nir_optimization_barrier_vgpr_amd(b, bit_size, sample_index[i]);
   }

   /* Load all samples. */
   nir_def *samples[16];
   for (unsigned i = 0; i < num_samples; i++) {
-      samples[i] = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
+      samples[i] = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img),
                                        coord, sample_index[i], zero);
   }

@@ -349,17 +351,23 @@ static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, ni
 static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
                                                const union si_compute_blit_shader_key *options)
 {
-   if (options->sint_to_uint)
-      color = nir_imax(b, color, nir_imm_int(b, 0));
+   unsigned bit_size = color->bit_size;
+   nir_def *zero = nir_imm_intN_t(b, 0, bit_size);

-   if (options->uint_to_sint)
-      color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));
+   if (options->sint_to_uint)
+      color = nir_imax(b, color, zero);
+
+   if (options->uint_to_sint) {
+      color = nir_umin(b, color,
+                       nir_imm_intN_t(b, bit_size == 16 ? INT16_MAX : INT32_MAX,
+                                      bit_size));
+   }

   if (options->dst_is_srgb)
      color = convert_linear_to_srgb(b, color);

-   nir_def *zero = nir_imm_int(b, 0);
-   nir_def *one = options->use_integer_one ? nir_imm_int(b, 1) : nir_imm_float(b, 1);
+   nir_def *one = options->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) :
+                                             nir_imm_floatN_t(b, 1, bit_size);

   /* Set channels not present in src to 0 or 1. This will eliminate code loading and resolving
    * those channels.
@@ -479,13 +487,14 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
   /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */

   /* Execute the image loads and stores. */
+   unsigned bit_size = options->d16 ? 16 : 32;
   unsigned num_samples = 1 << options->log2_samples;
   nir_def *color;

   if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
      /* MSAA resolving (downsampling). */
      assert(num_samples > 1);
-      color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src);
+      color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src, bit_size);
      color = apply_blit_output_modifiers(&b, color, options);
      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);

@@ -495,7 +504,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
      assert(num_samples > 1);
      /* Group loads together and then stores. */
      for (unsigned i = 0; i < num_samples; i++) {
-         color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
+         color[i] = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src,
                                         nir_imm_int(&b, i), zero);
      }
      for (unsigned i = 0; i < num_samples; i++)
@@ -507,7 +516,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
   } else if (!options->src_is_msaa && options->dst_is_msaa) {
      /* MSAA upsampling. */
      assert(num_samples > 1);
-      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
+      color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero);
      color = apply_blit_output_modifiers(&b, color, options);
      for (unsigned i = 0; i < num_samples; i++) {
         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
@@ -517,7 +526,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
      /* Non-MSAA copy or read sample 0 only. */
      /* src2 = sample_index (zero), src3 = lod (zero) */
      assert(num_samples == 1);
-      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
+      color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero);
      color = apply_blit_output_modifiers(&b, color, options);
      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
   }