radv: Lower non-dynamic VS inputs in NIR.
Add a new RADV specific NIR pass which lowers VS input loads to
AMD specific buffer load instructions.

We do this because we want to remove the RADV specific VS input
handling from the shader compiler back-ends.

Fossil DB stats on Rembrandt (GFX10.3):

Totals from 32507 (24.09% of 134913) affected shaders:
VGPRs: 1245512 -> 1245344 (-0.01%); split: -0.35%, +0.34%
SpillSGPRs: 1068 -> 1102 (+3.18%)
CodeSize: 90333192 -> 90327232 (-0.01%); split: -0.07%, +0.06%
MaxWaves: 881816 -> 881388 (-0.05%); split: +0.23%, -0.28%
Instrs: 17264710 -> 17264562 (-0.00%); split: -0.09%, +0.09%
Latency: 87300501 -> 86586480 (-0.82%); split: -1.07%, +0.25%
InvThroughput: 13700046 -> 13685931 (-0.10%); split: -0.20%, +0.10%
VClause: 361520 -> 361301 (-0.06%); split: -1.32%, +1.26%
SClause: 441018 -> 441505 (+0.11%); split: -0.54%, +0.65%
Copies: 1371477 -> 1373838 (+0.17%); split: -0.57%, +0.75%
Branches: 496639 -> 496611 (-0.01%); split: -0.01%, +0.00%
PreSGPRs: 1122956 -> 1122663 (-0.03%); split: -0.09%, +0.06%
PreVGPRs: 976051 -> 995717 (+2.01%); split: -0.12%, +2.14%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16805>
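For context on the index math the new pass emits: per-vertex attributes are indexed with first_vertex + vertex_id, and instance-rate attributes with base_instance + instance_id / divisor (or just base_instance when the divisor is 0). A minimal standalone sketch of the instance-rate computation follows; the function name is illustrative and not part of this change:

    #include <stdint.h>

    /* Mirrors calc_vs_input_index_instance_rate() from the pass below:
     * base_instance when the divisor is 0, otherwise
     * base_instance + instance_id / divisor (integer division).
     */
    static uint32_t
    instance_rate_index(uint32_t base_instance, uint32_t instance_id, uint32_t divisor)
    {
       if (divisor == 0)
          return base_instance;
       return base_instance + instance_id / divisor;
    }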
@@ -33,7 +33,7 @@ typedef struct {
    const struct radv_shader_args *args;
    const struct radv_shader_info *info;
    const struct radv_pipeline_key *pl_key;
-   uint32_t address32_hi;
+   const struct radeon_info *rad_info;
 } lower_vs_inputs_state;
 
 static nir_ssa_def *
@@ -80,6 +80,245 @@ lower_load_vs_input_from_prolog(nir_builder *b, nir_intrinsic_instr *intrin,
    return extracted;
 }
 
+static nir_ssa_def *
+calc_vs_input_index_instance_rate(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
+{
+   const uint32_t divisor = s->pl_key->vs.instance_rate_divisors[location];
+   nir_ssa_def *start_instance = nir_load_base_instance(b);
+
+   if (divisor == 0)
+      return start_instance;
+
+   nir_ssa_def *instance_id = nir_udiv_imm(b, nir_load_instance_id(b), divisor);
+   return nir_iadd(b, start_instance, instance_id);
+}
+
+static nir_ssa_def *
+calc_vs_input_index(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
+{
+   if (s->pl_key->vs.instance_rate_inputs & BITFIELD_BIT(location))
+      return calc_vs_input_index_instance_rate(b, location, s);
+
+   return nir_iadd(b, nir_load_first_vertex(b), nir_load_vertex_id_zero_base(b));
+}
+
+static bool
+can_use_untyped_load(const struct util_format_description *f, const unsigned bit_size)
+{
+   /* All components must have same size and type. */
+   if (!f->is_array)
+      return false;
+
+   const struct util_format_channel_description *c = &f->channel[0];
+   return c->size == bit_size && bit_size >= 32;
+}
+
+static nir_ssa_def *
+oob_input_load_value(nir_builder *b, const unsigned channel_idx, const unsigned bit_size,
+                     const bool is_float)
+{
+   /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
+    * For 64-bit data types, no default attribute values are provided. Input variables
+    * must not use more components than provided by the attribute.
+    */
+   if (bit_size == 64)
+      return nir_ssa_undef(b, 1, bit_size);
+
+   if (channel_idx == 3) {
+      if (is_float)
+         return nir_imm_floatN_t(b, 1.0, bit_size);
+      else
+         return nir_imm_intN_t(b, 1, bit_size);
+   }
+
+   return nir_imm_intN_t(b, 0, bit_size);
+}
+
+static unsigned
+count_format_bytes(const struct util_format_description *f, const unsigned first_channel,
+                   const unsigned num_channels)
+{
+   if (!num_channels)
+      return 0;
+
+   const unsigned last_channel = first_channel + num_channels - 1;
+   assert(last_channel < f->nr_channels);
+   unsigned bits = 0;
+   for (unsigned i = first_channel; i <= last_channel; ++i) {
+      bits += f->channel[i].size;
+   }
+
+   assert(bits % 8 == 0);
+   return bits / 8;
+}
+
+static nir_ssa_def *
+lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
+{
+   nir_src *offset_src = nir_get_io_offset_src(intrin);
+   assert(nir_src_is_const(*offset_src));
+
+   const unsigned base = nir_intrinsic_base(intrin);
+   const unsigned base_offset = nir_src_as_uint(*offset_src);
+   const unsigned location = base + base_offset - VERT_ATTRIB_GENERIC0;
+   const unsigned bit_size = intrin->dest.ssa.bit_size;
+   const unsigned dest_num_components = intrin->dest.ssa.num_components;
+
+   /* Convert the component offset to bit_size units.
+    * (Intrinsic component offset is in 32-bit units.)
+    *
+    * Small bitsize inputs consume the same space as 32-bit inputs,
+    * but 64-bit inputs consume twice as many.
+    * 64-bit variables must not have a component of 1 or 3.
+    * (See VK spec 15.1.5 "Component Assignment")
+    */
+   const unsigned component = nir_intrinsic_component(intrin) / (MAX2(32, bit_size) / 32);
+
+   /* Bitmask of components in bit_size units
+    * of the current input load that are actually used.
+    */
+   const unsigned dest_use_mask = nir_ssa_def_components_read(&intrin->dest.ssa) << component;
+
+   /* If the input is entirely unused, just replace it with undef.
+    * This is just in case we debug this pass without running DCE first.
+    */
+   if (!dest_use_mask)
+      return nir_ssa_undef(b, dest_num_components, bit_size);
+
+   const uint32_t attrib_binding = s->pl_key->vs.vertex_attribute_bindings[location];
+   const uint32_t attrib_offset = s->pl_key->vs.vertex_attribute_offsets[location];
+   const uint32_t attrib_stride = s->pl_key->vs.vertex_attribute_strides[location];
+   const enum pipe_format attrib_format = s->pl_key->vs.vertex_attribute_formats[location];
+   const struct util_format_description *f = util_format_description(attrib_format);
+   const unsigned binding_index =
+      s->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
+   const unsigned desc_index =
+      util_bitcount(s->info->vs.vb_desc_usage_mask & u_bit_consecutive(0, binding_index));
+
+   nir_ssa_def *vertex_buffers_arg = ac_nir_load_arg(b, &s->args->ac, s->args->ac.vertex_buffers);
+   nir_ssa_def *vertex_buffers =
+      nir_pack_64_2x32_split(b, vertex_buffers_arg, nir_imm_int(b, s->rad_info->address32_hi));
+   nir_ssa_def *descriptor =
+      nir_load_smem_amd(b, 4, vertex_buffers, nir_imm_int(b, desc_index * 16));
+   nir_ssa_def *base_index = calc_vs_input_index(b, location, s);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
+   /* Try to shrink the load format by skipping unused components from the start.
+    * Beneficial because the backend may be able to emit fewer HW instructions.
+    * Only possible with array formats.
+    */
+   const unsigned first_used_channel = ffs(dest_use_mask) - 1;
+   const unsigned skipped_start = f->is_array ? first_used_channel : 0;
+
+   /* Number of channels we actually use and load.
+    * Don't shrink the format here because this might allow the backend to
+    * emit fewer (but larger than needed) HW instructions.
+    */
+   const unsigned first_trailing_unused_channel = util_last_bit(dest_use_mask);
+   const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
+   const unsigned fetch_num_channels =
+      skipped_start >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;
+
+   /* Load VS inputs from VRAM.
+    *
+    * For the vast majority of cases this will only create 1x load_(typed)_buffer_amd
+    * intrinsic and the backend is responsible for further splitting that
+    * to as many HW instructions as needed based on alignment.
+    *
+    * Take care to prevent loaded components from failing the range check,
+    * by emitting several load intrinsics with different index sources.
+    * This is necessary because the backend can't further roll the const offset
+    * into the index source of MUBUF / MTBUF instructions.
+    */
+   nir_ssa_def *loads[NIR_MAX_VEC_COMPONENTS] = {0};
+   unsigned num_loads = 0;
+   for (unsigned x = 0, channels; x < fetch_num_channels; x += channels) {
+      channels = fetch_num_channels - x;
+      const unsigned start = skipped_start + x;
+      enum pipe_format fetch_format = attrib_format;
+      nir_ssa_def *index = base_index;
+
+      /* Add excess constant offset to the index. */
+      unsigned const_off = attrib_offset + count_format_bytes(f, 0, start);
+      if (attrib_stride && const_off > attrib_stride) {
+         index = nir_iadd_imm(b, base_index, const_off / attrib_stride);
+         const_off %= attrib_stride;
+      }
+
+      /* Reduce the number of loaded channels until we can pass the range check.
+       * Only for array formats. VK spec mandates proper alignment for packed formats.
+       * Note, NONE seems to occur in real use and is considered an array format.
+       */
+      if (f->is_array && fetch_format != PIPE_FORMAT_NONE) {
+         while (channels > 1 && attrib_stride &&
+                (const_off + count_format_bytes(f, start, channels)) > attrib_stride) {
+            channels--;
+         }
+
+         /* Keep the fetch format as large as possible to let the backend emit
+          * larger load instructions when it deems them beneficial.
+          */
+         fetch_format =
+            util_format_get_array(f->channel[0].type, f->channel[0].size, f->nr_channels - start,
+                                  f->is_unorm || f->is_snorm, f->channel[0].pure_integer);
+      }
+
+      assert(f->is_array || channels == fetch_num_channels);
+
+      /* Prefer using untyped buffer loads if possible, to avoid potential alignment issues.
+       * Typed loads can cause GPU hangs when used with improper alignment.
+       */
+      if (can_use_untyped_load(f, bit_size)) {
+         loads[num_loads++] =
+            nir_load_buffer_amd(b, channels, bit_size, descriptor, zero, zero, index,
+                                .base = const_off, .memory_modes = nir_var_shader_in);
+      } else {
+         const unsigned align_mul = MAX2(1, s->pl_key->vs.vertex_binding_align[attrib_binding]);
+         const unsigned align_offset = const_off % align_mul;
+
+         loads[num_loads++] = nir_load_typed_buffer_amd(
+            b, channels, bit_size, descriptor, zero, zero, index, .base = const_off,
+            .format = fetch_format, .align_mul = align_mul, .align_offset = align_offset,
+            .memory_modes = nir_var_shader_in);
+      }
+   }
+
+   nir_ssa_def *load = loads[0];
+
+   /* Extract the channels we actually need when we couldn't skip starting
+    * components or had to emit more than one load intrinsic.
+    */
+   if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
+      load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
+                              max_loaded_channels - first_used_channel, bit_size);
+
+   /* Return early if possible to avoid generating unnecessary IR. */
+   if (num_loads > 0 && first_used_channel == component &&
+       load->num_components == dest_num_components)
+      return load;
+
+   /* Fill unused and OOB components.
+    */
+   const nir_alu_type dst_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin));
+   nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
+   for (unsigned i = 0; i < dest_num_components; ++i) {
+      const unsigned c = i + component;
+      if (!(dest_use_mask & BITFIELD_BIT(c))) {
+         /* Fill unused channels with zero. */
+         channels[i] = nir_imm_zero(b, 1, bit_size);
+      } else if (c < max_loaded_channels) {
+         /* Use channels that were loaded from VRAM. */
+         assert(c >= first_used_channel);
+         channels[i] = nir_channel(b, load, c - first_used_channel);
+      } else {
+         /* Handle input loads that are larger than their format. */
+         channels[i] = oob_input_load_value(b, c, bit_size, dst_type == nir_type_float);
+      }
+   }
+
+   return nir_vec(b, channels, dest_num_components);
+}
+
 static bool
 lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
 {
@@ -99,8 +338,7 @@ lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
    if (s->info->vs.dynamic_inputs) {
       replacement = lower_load_vs_input_from_prolog(b, intrin, s);
    } else {
-      /* TODO: lower non-dynamic inputs */
-      return false;
+      replacement = lower_load_vs_input(b, intrin, s);
    }
 
    nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
@@ -112,7 +350,7 @@ lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
 
 bool
 radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_stage *vs_stage,
-                         const struct radv_pipeline_key *pl_key, uint32_t address32_hi)
+                         const struct radv_pipeline_key *pl_key, const struct radeon_info *rad_info)
 {
    assert(shader->info.stage == MESA_SHADER_VERTEX);
 
@@ -120,7 +358,7 @@ radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_stage *v
       .info = &vs_stage->info,
       .args = &vs_stage->args,
       .pl_key = pl_key,
-      .address32_hi = address32_hi,
+      .rad_info = rad_info,
    };
 
    return nir_shader_instructions_pass(shader, lower_vs_input_instr,
@@ -3239,7 +3239,7 @@ radv_postprocess_nir(struct radv_pipeline *pipeline,
     */
    if (stage->stage == MESA_SHADER_VERTEX) {
       NIR_PASS(_, stage->nir, radv_nir_lower_vs_inputs, stage, pipeline_key,
-               device->physical_device->rad_info.address32_hi);
+               &device->physical_device->rad_info);
    }
 
    /* Lower I/O intrinsics to memory instructions. */
@@ -557,7 +557,8 @@ void radv_nir_lower_abi(nir_shader *shader, enum amd_gfx_level gfx_level,
                         const struct radv_pipeline_key *pl_key, uint32_t address32_hi);
 
 bool radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_stage *vs_stage,
-                              const struct radv_pipeline_key *key, uint32_t address32_hi);
+                              const struct radv_pipeline_key *key,
+                              const struct radeon_info *rad_info);
 
 void radv_init_shader_arenas(struct radv_device *device);
 void radv_destroy_shader_arenas(struct radv_device *device);