ir3: use isam.v for multi-component SSBO loads

Since a7xx, isam.v can be used to perform multi-component SSBO loads.
Use this whenever possible to prevent excessive scalarization. isam.v
also uses only a single coordinate (as opposed to a 2-dimensional
coordinate for isam) so this reduces register pressure as well.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28664>
This commit is contained in:
Job Noorman
2024-04-10 10:43:38 +02:00
committed by Marge Bot
parent 455ebcccfb
commit c4fe247e62
8 changed files with 41 additions and 7 deletions

View File

@@ -175,6 +175,8 @@ struct fd_dev_info {
/* See ir3_compiler::has_scalar_alu. */
bool has_scalar_alu;
bool has_isam_v;
/* Whether writing to UBWC attachment and reading the same image as input
* attachment or as a texture reads correct values from the image.
* If this is false, we may read stale values from the flag buffer,

View File

@@ -414,6 +414,7 @@ a6xx_gen4 = A6XXProps(
has_lrz_dir_tracking = True,
has_per_view_viewport = True,
has_scalar_alu = True,
has_isam_v = True,
)
a6xx_a690_quirk = A6XXProps(
@@ -794,6 +795,7 @@ a7xx_base = A6XXProps(
line_width_max = 127.5,
has_scalar_alu = True,
has_coherent_ubwc_flag_caches = True,
has_isam_v = True,
)
a7xx_725 = A7XXProps(

View File

@@ -1258,6 +1258,12 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
return false;
break;
case 5:
if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
(!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
return flags == IR3_REG_IMMED;
}
}
/* no flags allowed */
if (flags)
return false;

View File

@@ -224,6 +224,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->has_branch_and_or = true;
compiler->has_predication = true;
compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
compiler->has_isam_v = dev_info->a6xx.has_isam_v;
compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
} else {
compiler->max_const_pipeline = 512;
@@ -237,6 +238,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->max_const_safe = 256;
compiler->has_scalar_alu = false;
compiler->has_isam_v = false;
}
/* This is just a guess for a4xx. */

View File

@@ -208,6 +208,9 @@ struct ir3_compiler {
/* Whether SSBOs have descriptors for sampling with ISAM */
bool has_isam_ssbo;
/* Whether isam.v is supported to sample multiple components from SSBOs */
bool has_isam_v;
/* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
bool storage_16bit;

View File

@@ -1592,9 +1592,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst)
{
/* Note: isam currently can't handle vectorized loads/stores */
/* Note: we can only use isam for vectorized loads/stores if isam.v is
* available.
*/
if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
intr->def.num_components > 1 ||
(intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
!ctx->compiler->has_isam_ssbo) {
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
return;
@@ -1602,13 +1604,27 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
struct ir3_block *b = ctx->block;
struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
struct ir3_instruction *coords = NULL;
unsigned imm_offset = 0;
if (ctx->compiler->has_isam_v) {
coords = offset;
} else {
coords = ir3_collect(b, offset, create_immed(b, 0));
}
struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
unsigned num_components = intr->def.num_components;
assert(num_components == 1 || ctx->compiler->has_isam_v);
struct ir3_instruction *sam =
emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
MASK(num_components), coords, NULL);
MASK(num_components), coords, create_immed(b, imm_offset));
if (ctx->compiler->has_isam_v) {
sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
}
ir3_handle_nonuniform(sam, intr);

View File

@@ -444,6 +444,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
(opc_cat(instr->opc) == 2) ||
(opc_cat(instr->opc) == 6) ||
is_meta(instr) ||
(instr->opc == OPC_ISAM && (n == 1 || n == 2)) ||
(is_mad(instr->opc) && (n == 0)));
if ((opc_cat(instr->opc) == 2) &&

View File

@@ -92,10 +92,11 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
/* Scalarize load_ssbo's that we could otherwise lower to isam,
* as the tex cache benefit outweighs the benefit of vectorizing.
* Don't do this if (vectorized) isam.v is supported.
*/
if ((intrin->intrinsic == nir_intrinsic_load_ssbo) &&
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
compiler->has_isam_ssbo) {
compiler->has_isam_ssbo && !compiler->has_isam_v) {
return true;
}
@@ -112,11 +113,12 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned byte_size = bit_size / 8;
/* Don't vectorize load_ssbo's that we could otherwise lower to isam,
* as the tex cache benefit outweighs the benefit of vectorizing
* as the tex cache benefit outweighs the benefit of vectorizing. If we
* support isam.v, we can vectorize this though.
*/
if ((low->intrinsic == nir_intrinsic_load_ssbo) &&
(nir_intrinsic_access(low) & ACCESS_CAN_REORDER) &&
compiler->has_isam_ssbo) {
compiler->has_isam_ssbo && !compiler->has_isam_v) {
return false;
}