radeonsi: restructure (rewrite) the compute blit shader

This merges the separate MSAA, downsampling, upsampling, and non-MSAA blocks.
It's not meant to change behavior, but some changes are necessary:
- disallow 16 samples
- loads only load the number of components that we need
- optimization barriers are placed optimally and include the sample index
  in the same vector as the coordinates, so that LLVM is forced to form VMEM
  clauses for loads and stores
- the shader queries the descriptor for the dst image manually and passes
  it to the image store instead of the image variable (this is needed to get
  latency hiding for scalar loads in the presence of optimization barriers)

This is a prerequisite for blitting multiple pixels per lane.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
Author: Marek Olšák
Date: 2024-03-25 00:13:51 -04:00
Committed by: Marge Bot
Parent: d2ce5fc07a
Commit: 30af861bff
3 changed files with 199 additions and 148 deletions

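The third and fourth bullet points of the message describe the core scheduling trick. Condensed from the hunks below (an illustrative excerpt stitched together from the diff, not a separate implementation), the pattern is:

   /* Put the sample index into the last channel of the coordinate vector. */
   foreach_sample(src_samples, i) {
      coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], nir_imm_int(&b, i),
                                           num_src_coords - 1);
   }
   /* Barrier between address math and the loads, so LLVM must finish all
    * addresses first and can then clause the loads together. */
   optimization_barrier_vgpr_array(sctx, &b, coord_src, src_samples, num_src_coords);
   foreach_sample(src_samples, i) {
      color[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size,
                                      deref_ssa(&b, img_src), coord_src[i],
                                      nir_channel(&b, coord_src[i], num_src_coords - 1), zero);
   }

Because the sample index travels inside the same vector as the clamped coordinates, the barrier covers both, and no address computation can be rematerialized between the loads.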
File 1 of 3

@@ -899,6 +899,7 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   if (info->dst.format == PIPE_FORMAT_A8R8_UNORM || /* This format fails AMD_TEST=imagecopy. */
       max_dst_chan_size == 5 || /* PIPE_FORMAT_R5G5B5A1_UNORM has precision issues */
       util_format_is_depth_or_stencil(info->dst.resource->format) ||
+      dst_samples > SI_MAX_COMPUTE_BLIT_SAMPLES ||
       info->dst_sample != 0 ||
       /* Image stores support DCC since GFX10. Return only for gfx queues. DCC is disabled
        * for compute queues farther below. */
@@ -912,7 +913,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
        (info->dst.box.width != abs(info->src.box.width) ||
         info->dst.box.height != abs(info->src.box.height) ||
         info->dst.box.depth != abs(info->src.box.depth) ||
-        util_format_is_depth_or_stencil(info->src.resource->format))))
+        util_format_is_depth_or_stencil(info->src.resource->format) ||
+        src_samples > SI_MAX_COMPUTE_BLIT_SAMPLES)))
      return false;

   /* Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
@@ -1104,7 +1106,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   bool has_d16 = sctx->gfx_level >= (sctx->screen->use_aco ? GFX9 : GFX8);

   if (is_clear) {
-      options.log2_samples = util_logbase2(dst_samples);
+      assert(dst_samples <= 8);
+      options.log_samples = util_logbase2(dst_samples);
      options.d16 = has_d16 &&
                    max_dst_chan_size <= (util_format_is_float(info->dst.format) ||
                                          util_format_is_pure_integer(info->dst.format) ? 16 : 11);
@@ -1117,10 +1120,11 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
                          info->src.resource->target == PIPE_TEXTURE_1D_ARRAY ||
                          info->src.resource->target == PIPE_TEXTURE_2D_ARRAY ||
                          info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY;
-      /* Resolving integer formats only copies sample 0. log2_samples is then unused. */
+      /* Resolving integer formats only copies sample 0. log_samples is then unused. */
      options.sample0_only = sample0_only;
      unsigned num_samples = MAX2(src_samples, dst_samples);
-      options.log2_samples = sample0_only ? 0 : util_logbase2(num_samples);
+      assert(num_samples <= 8);
+      options.log_samples = sample0_only ? 0 : util_logbase2(num_samples);
      options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0));
      options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1));
      options.flip_x = info->src.box.width < 0;

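A note on the d16 threshold in the hunk above: FP16 carries 11 bits of significand precision, which is why normalized formats qualify only up to 11 bits per channel while float and pure-integer formats qualify up to 16. A self-contained probe (illustrative only; assumes a compiler with _Float16 support, such as recent GCC or Clang) that measures which UNORM widths survive an FP16 round trip:

#include <math.h>
#include <stdio.h>

int main(void)
{
   for (int bits = 8; bits <= 13; bits++) {
      unsigned max = (1u << bits) - 1;
      int exact = 1;

      for (unsigned i = 0; i <= max; i++) {
         _Float16 h = (_Float16)((float)i / (float)max);    /* encode to FP16 */
         if ((unsigned)lrintf((float)h * (float)max) != i)  /* decode + round */
            exact = 0;
      }
      printf("%2d-bit UNORM round-trips through FP16: %s\n", bits, exact ? "yes" : "no");
   }
   return 0;
}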
File 2 of 3

@@ -1632,6 +1632,8 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *
void *si_create_passthrough_tcs(struct si_context *sctx);
void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim);

+#define SI_MAX_COMPUTE_BLIT_SAMPLES 8
+
union si_compute_blit_shader_key {
   struct {
      /* Workgroup settings. */
@@ -1646,7 +1648,7 @@ union si_compute_blit_shader_key {
      bool src_has_z:1;
      bool dst_has_z:1;
      bool d16:1;
-      uint8_t log2_samples:4;
+      uint8_t log_samples:2;
      bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */
      /* Source coordinate modifiers. */
      bool x_clamp_to_edge:1;
@@ -1658,7 +1660,7 @@ union si_compute_blit_shader_key {
      bool uint_to_sint:1;
      bool dst_is_srgb:1;
      bool use_integer_one:1;
-      uint8_t last_src_channel:2;
+      uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */
      uint8_t last_dst_channel:2;
   };
   uint64_t key;

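The shrink from log2_samples:4 to log_samples:2 is safe because of the new SI_MAX_COMPUTE_BLIT_SAMPLES cap: util_logbase2 of a sample count of at most 8 is at most 3, the largest value a 2-bit field can hold. A minimal standalone check of that reasoning (illustrative; the loop stands in for util_logbase2 on powers of two):

#include <assert.h>

int main(void)
{
   /* Sample counts 1, 2, 4, 8 map to log values 0..3, which fit in 2 bits. */
   for (unsigned samples = 1; samples <= 8; samples *= 2) {
      unsigned log = 0;
      while ((1u << log) < samples)
         log++;
      assert(log <= 3);
   }
   return 0;
}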
File 3 of 3

@@ -195,13 +195,12 @@ static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input)
   /* There are small precision differences compared to CB, so the gfx blit will return slightly
    * different results.
    */
-   nir_def *comp[4];
-   for (unsigned i = 0; i < 3; i++)
-      comp[i] = nir_format_linear_to_srgb(b, nir_channel(b, input, i));
-   comp[3] = nir_channel(b, input, 3);
-
-   return nir_vec(b, comp, 4);
+   for (unsigned i = 0; i < MIN2(3, input->num_components); i++) {
+      input = nir_vector_insert_imm(b, input,
+                                    nir_format_linear_to_srgb(b, nir_channel(b, input, i)), i);
+   }
+
+   return input;
}
static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_samples)
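For reference, nir_format_linear_to_srgb in the hunk above applies the standard IEC 61966-2-1 sRGB encoding per channel; a scalar C equivalent of that curve (illustrative, not the NIR lowering itself):

#include <math.h>

/* Linear -> sRGB encode for one channel in [0, 1]. */
static float linear_to_srgb(float x)
{
   return x <= 0.0031308f ? 12.92f * x
                          : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
}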
@@ -228,61 +227,6 @@ static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_
   return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */
}

-static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, nir_variable *img,
-                                   unsigned num_samples, nir_def *coord, unsigned bit_size)
-{
-   nir_def *zero = nir_imm_int(b, 0);
-   nir_def *result = NULL;
-   nir_variable *var = NULL;
-
-   /* Gfx11 doesn't support samples_identical, so we can't use it. */
-   if (sscreen->info.gfx_level < GFX11) {
-      /* We need a local variable to get the result out of conditional branches in SSA. */
-      var = nir_local_variable_create(b->impl,
-                                      bit_size == 16 ? glsl_f16vec_type(4) : glsl_vec4_type(),
-                                      NULL);
-
-      /* If all samples are identical, load only sample 0. */
-      nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
-      result = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img), coord, zero, zero);
-      nir_store_var(b, var, result, 0xf);
-      nir_push_else(b, NULL);
-   }
-
-   nir_def *sample_index[16];
-   for (unsigned i = 0; i < num_samples; i++)
-      sample_index[i] = nir_imm_int(b, i);
-
-   /* We need to hide the constant sample indices behind the optimization barrier, otherwise
-    * LLVM doesn't put loads into the same clause.
-    *
-    * TODO: nir_group_loads could do this.
-    */
-   if (!sscreen->use_aco) {
-      for (unsigned i = 0; i < num_samples; i++)
-         sample_index[i] = nir_optimization_barrier_vgpr_amd(b, bit_size, sample_index[i]);
-   }
-
-   /* Load all samples. */
-   nir_def *samples[16];
-   for (unsigned i = 0; i < num_samples; i++) {
-      samples[i] = nir_image_deref_load(b, 4, bit_size, deref_ssa(b, img),
-                                        coord, sample_index[i], zero);
-   }
-
-   result = average_samples(b, samples, num_samples);
-
-   if (sscreen->info.gfx_level < GFX11) {
-      /* Exit the conditional branch and get the result out of the branch. */
-      nir_store_var(b, var, result, 0xf);
-      nir_pop_if(b, NULL);
-      result = nir_load_var(b, var);
-   }
-
-   return result;
-}
static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
const union si_compute_blit_shader_key *options)
{
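The tail of average_samples at the top of the previous hunk shows the resolve arithmetic: sum the samples, then scale once by 1/num_samples instead of dividing each sample. A scalar sketch of the same idea (illustrative):

/* Sum first, then one multiply at the end ("average the sum" above). */
static float average(const float *samples, unsigned n)
{
   float sum = 0.0f;
   for (unsigned i = 0; i < n; i++)
      sum += samples[i];
   return sum * (1.0f / n);
}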
@@ -304,12 +248,24 @@ static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
   nir_def *one = options->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) :
                                             nir_imm_floatN_t(b, 1, bit_size);

-   /* Set channels not present in src to 0 or 1. This will eliminate code loading and resolving
-    * those channels.
-    */
-   if (!options->is_clear) {
-      for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++)
-         color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan);
+   if (options->is_clear) {
+      if (options->last_dst_channel < 3)
+         color = nir_trim_vector(b, color, options->last_dst_channel + 1);
+   } else {
+      assert(options->last_src_channel <= options->last_dst_channel);
+      assert(color->num_components == options->last_src_channel + 1);
+
+      /* Set channels not present in src to 0 or 1. */
+      if (options->last_src_channel < options->last_dst_channel) {
+         color = nir_pad_vector(b, color, options->last_dst_channel + 1);
+
+         for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++)
+            color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan);
+      }
+
+      /* Discard channels not present in dst. The hardware fills unstored channels with 0. */
+      if (options->last_dst_channel < options->last_src_channel)
+         color = nir_trim_vector(b, color, options->last_dst_channel + 1);
+   }

-   /* Discard channels not present in dst. The hardware fills unstored channels with 0. */
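The rule the new branch implements is simple: grow the vector to the destination's channel count, filling missing color channels with 0 and a missing alpha with 1, then drop anything the destination does not store. A scalar mock of that rule (hypothetical standalone helper, not the NIR code):

/* Mock of the channel fix-up above. Channels past last_dst are simply
 * never stored; the hardware fills them with 0. */
static void fixup_channels(float *color, unsigned last_src, unsigned last_dst)
{
   for (unsigned chan = last_src + 1; chan <= last_dst; chan++)
      color[chan] = (chan == 3) ? 1.0f : 0.0f;   /* alpha defaults to 1 */
}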
@@ -319,6 +275,27 @@ static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
return color;
}
+static void optimization_barrier_vgpr_array(struct si_context *sctx, nir_builder *b,
+                                            nir_def **array, unsigned num_elements,
+                                            unsigned num_components)
+{
+   /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its
+    * instruction scheduling options.
+    *
+    * VMEM clauses are supported since GFX10. It's not recommended to use the optimization
+    * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization
+    * barriers would unnecessarily increase VGPR usage for MSAA resources.
+    */
+   if (!sctx->screen->use_aco && sctx->gfx_level >= GFX10) {
+      for (unsigned i = 0; i < num_elements; i++) {
+         unsigned prev_num = array[i]->num_components;
+         array[i] = nir_trim_vector(b, array[i], num_components);
+         array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]);
+         array[i] = nir_pad_vector(b, array[i], prev_num);
+      }
+   }
+}
/* The compute blit shader.
*
* Implementation details:
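The VGPR optimization barrier added in the hunk above acts like an empty asm clobber in C: the value must be fully materialized at the barrier, so the compiler can neither sink its producers past it nor hoist its consumers above it. A rough standalone analogy (the GCC/Clang "+r" constraint stands in for nir_optimization_barrier_vgpr_amd; an analogy, not the same mechanism):

/* The empty asm "uses and redefines" x, splitting the schedule here the
 * same way the VGPR barrier splits address math from the memory clause. */
static inline int optimization_barrier(int x)
{
   __asm__ volatile("" : "+r"(x));
   return x;
}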
@@ -407,96 +384,164 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
}
/* Add box.xyz. */
-   nir_def *coord_src = NULL, *coord_dst = NULL, *dim = NULL;
+   nir_def *base_coord_src = NULL, *base_coord_dst = NULL;

   unpack_2x16_signed(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
-                      &coord_src, &coord_dst);
-   coord_dst = nir_iadd(&b, coord_dst, dst_xyz);
-   coord_src = nir_iadd(&b, coord_src, src_xyz);
-
-   /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
-   for (unsigned i = 0; i < 2; i++) {
-      if (i ? options->y_clamp_to_edge : options->x_clamp_to_edge) {
-         assert(!options->src_is_1d || i == 0);
-
-         if (!dim)
-            dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero);
-
-         nir_def *tmp = nir_channel(&b, coord_src, i);
-         tmp = nir_imax(&b, tmp, nir_imm_int(&b, 0));
-         tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, dim, i), -1));
-         coord_src = nir_vector_insert_imm(&b, coord_src, tmp, i);
-      }
-   }
-
-   /* Swizzle coordinates for 1D_ARRAY. */
-   static unsigned swizzle_xz[] = {0, 2, 0, 0};
-
-   if (options->src_is_1d)
-      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
-   if (options->dst_is_1d)
-      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);
+                      &base_coord_src, &base_coord_dst);
+   base_coord_dst = nir_iadd(&b, base_coord_dst, dst_xyz);
+   base_coord_src = nir_iadd(&b, base_coord_src, src_xyz);

   /* Coordinates must have 4 channels in NIR. */
-   coord_src = nir_pad_vector(&b, coord_src, 4);
-   coord_dst = nir_pad_vector(&b, coord_dst, 4);
+   base_coord_src = nir_pad_vector(&b, base_coord_src, 4);
+   base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4);
   /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */

+   /* NOTE: This will be changed to a more complex loop in the future. */
+#define foreach_sample(num_samples, sample) \
+   for (unsigned sample = 0; sample < (num_samples); sample++)

-   /* Execute the image loads and stores. */
+   /* Swizzle coordinates for 1D_ARRAY. */
+   static const unsigned swizzle_xz[] = {0, 2, 0, 0};
+
+   /* Execute image loads and stores. */
+   unsigned num_src_coords = (options->src_is_1d ? 1 : 2) + options->src_has_z + options->src_is_msaa;
+   unsigned num_dst_coords = (options->dst_is_1d ? 1 : 2) + options->dst_has_z + options->dst_is_msaa;
   unsigned bit_size = options->d16 ? 16 : 32;
-   unsigned num_samples = 1 << options->log2_samples;
-   nir_def *color = NULL;
+   unsigned num_samples = 1 << options->log_samples;
+   unsigned src_samples = options->src_is_msaa && !options->sample0_only &&
+                          !options->is_clear ? num_samples : 1;
+   unsigned dst_samples = options->dst_is_msaa ? num_samples : 1;
+   nir_def *color[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
+   nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
+   nir_def *src_resinfo = NULL;
   if (options->is_clear) {
-      /* The clear color start at component 4 of user data. */
-      color = nir_channels(&b, nir_load_user_data_amd(&b),
-                           BITFIELD_RANGE(4, options->d16 ? 2 : 4));
+      /* The clear color starts at component 4 of user data. */
+      color[0] = nir_channels(&b, nir_load_user_data_amd(&b),
+                              BITFIELD_RANGE(4, options->d16 ? 2 : 4));
      if (options->d16)
-         color = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color));
+         color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0]));
   } else {
+      nir_def *coord_src[SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
+
+      /* Initialize src coordinates, one vector per pixel. */
+      foreach_sample(src_samples, i) {
+         coord_src[i] = base_coord_src;
+         if (options->src_is_1d)
+            coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4);
+         if (options->src_is_msaa) {
+            coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], nir_imm_int(&b, i),
+                                                 num_src_coords - 1);
+         }
+
+         /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
+         for (unsigned chan = 0; chan < 2; chan++) {
+            if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) {
+               assert(!options->src_is_1d || chan == 0);
+
+               if (!src_resinfo) {
+                  src_resinfo = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src),
+                                                     zero);
+               }
+
+               nir_def *tmp = nir_channel(&b, coord_src[i], chan);
+               tmp = nir_imax_imm(&b, tmp, 0);
+               tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, src_resinfo, chan), -1));
+               coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], tmp, chan);
+            }
+         }
+      }
+
+      /* We don't want the computation of src coordinates to be interleaved with loads. */
+      if (src_samples > 1) {
+         optimization_barrier_vgpr_array(sctx, &b, coord_src, src_samples,
+                                         num_src_coords);
+      }
+
+      /* Use "samples_identical" for MSAA resolving if it's supported. */
+      bool is_resolve = src_samples > 1 && dst_samples == 1;
+      bool uses_samples_identical = sctx->gfx_level < GFX11 &&
+                                    !(sctx->screen->debug_flags & DBG(NO_FMASK)) && is_resolve;
+      nir_def *samples_identical = NULL, *sample0 = {0};
+      nir_if *if_identical = NULL;
+
+      if (uses_samples_identical) {
+         samples_identical = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src),
+                                                               coord_src[0],
+                                                               .image_dim = GLSL_SAMPLER_DIM_MS);
+
+         /* If all samples are identical, load only sample 0. */
+         if_identical = nir_push_if(&b, samples_identical);
+         {
+            sample0 = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size,
+                                           deref_ssa(&b, img_src), coord_src[0],
+                                           nir_channel(&b, coord_src[0],
+                                                       num_src_coords - 1), zero,
+                                           .image_dim = img_src->type->sampler_dimensionality,
+                                           .image_array = img_src->type->sampler_array);
+         }
+         nir_push_else(&b, if_identical);
+      }
+
+      /* Load src pixels, one per sample. */
+      foreach_sample(src_samples, i) {
+         color[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size,
+                                         deref_ssa(&b, img_src), coord_src[i],
+                                         nir_channel(&b, coord_src[i], num_src_coords - 1), zero,
+                                         .image_dim = img_src->type->sampler_dimensionality,
+                                         .image_array = img_src->type->sampler_array);
+      }
+
+      /* Resolve MSAA if necessary. */
+      if (is_resolve) {
+         /* We don't want the averaging of samples to be interleaved with image loads. */
+         optimization_barrier_vgpr_array(sctx, &b, color, src_samples,
+                                         options->last_src_channel + 1);
+         color[0] = average_samples(&b, color, src_samples);
+         src_samples = 1;
+      }
+
+      if (uses_samples_identical) {
+         nir_pop_if(&b, if_identical);
+         color[0] = nir_if_phi(&b, sample0, color[0]);
+      }
   }
-   if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
-      /* MSAA resolving (downsampling). */
-      assert(num_samples > 1 && !options->is_clear);
-      color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src, bit_size);
-      color = apply_blit_output_modifiers(&b, color, options);
-      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
+   nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst));
-   } else if (options->src_is_msaa && options->dst_is_msaa) {
-      /* MSAA copy. */
-      nir_def *color[16];
-      assert(num_samples > 1 && !options->is_clear);
-      /* Group loads together and then stores. */
-      for (unsigned i = 0; i < num_samples; i++) {
-         color[i] = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src,
-                                         nir_imm_int(&b, i), zero);
-      }
-      for (unsigned i = 0; i < num_samples; i++)
-         color[i] = apply_blit_output_modifiers(&b, color[i], options);
-      for (unsigned i = 0; i < num_samples; i++) {
-         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
-                               nir_imm_int(&b, i), color[i], zero);
-      }
-   } else if (!options->src_is_msaa && options->dst_is_msaa) {
-      /* MSAA upsampling. */
-      assert(num_samples > 1);
-      if (!options->is_clear)
-         color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero);
+   /* Apply the blit output modifiers, once per sample. */
+   foreach_sample(src_samples, i) {
+      color[i] = apply_blit_output_modifiers(&b, color[i], options);
+   }
-      color = apply_blit_output_modifiers(&b, color, options);
-      for (unsigned i = 0; i < num_samples; i++) {
-         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
-                               nir_imm_int(&b, i), color, zero);
+   /* Initialize dst coordinates, one vector per pixel. */
+   foreach_sample(dst_samples, i) {
+      coord_dst[i] = base_coord_dst;
+      if (options->dst_is_1d)
+         coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4);
+      if (options->dst_is_msaa) {
+         coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i],
+                                              nir_imm_int(&b, i),
+                                              num_dst_coords - 1);
+      }
+   }
-   } else {
-      /* Non-MSAA copy or read sample 0 only. */
-      /* src2 = sample_index (zero), src3 = lod (zero) */
-      assert(num_samples == 1);
-      if (!options->is_clear)
-         color = nir_image_deref_load(&b, 4, bit_size, deref_ssa(&b, img_src), coord_src, zero, zero);
-   }
-
-   color = apply_blit_output_modifiers(&b, color, options);
-   nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
+   /* We don't want the computation of dst coordinates to be interleaved with stores. */
+   if (dst_samples > 1)
+      optimization_barrier_vgpr_array(sctx, &b, coord_dst, dst_samples, num_dst_coords);
+
+   /* We don't want the application of blit output modifiers to be interleaved with stores. */
+   if (!options->is_clear && MIN2(src_samples, dst_samples) > 1) {
+      optimization_barrier_vgpr_array(sctx, &b, color, src_samples,
+                                      options->last_dst_channel + 1);
+   }
+
+   /* Store the pixels, one per sample. */
+   foreach_sample(dst_samples, i) {
+      nir_bindless_image_store(&b, img_dst_desc, coord_dst[i],
+                               nir_channel(&b, coord_dst[i], num_dst_coords - 1),
+                               src_samples > 1 ? color[i] : color[i / dst_samples], zero,
+                               .image_dim = glsl_get_sampler_dim(img_type[1]),
+                               .image_array = glsl_sampler_type_is_array(img_type[1]));
+   }
if (options->has_start_xyz)
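Taken together, the resolve path in the last hunk reduces to the following scalar mock (hypothetical load_sample/samples_identical helpers; the real FMASK query is the nir_image_deref_samples_identical call above):

#include <stdbool.h>

extern bool samples_identical(unsigned x, unsigned y);        /* hypothetical FMASK query */
extern float load_sample(unsigned x, unsigned y, unsigned s); /* hypothetical */

static float resolve(unsigned x, unsigned y, unsigned num_samples)
{
   /* Fast path: all samples of this pixel hold the same value. */
   if (samples_identical(x, y))
      return load_sample(x, y, 0);

   float sum = 0.0f;
   for (unsigned s = 0; s < num_samples; s++)
      sum += load_sample(x, y, s);
   return sum * (1.0f / num_samples);
}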