nir/lower_vars_to_scratch: calculate threshold-limited variable size separately

ir3's lowering of variables to scratch memory has to treat 8-bit values as
16-bit ones when comparing such values' sizes against the given threshold,
since those values are handled through 16-bit half-registers. But those
values can still use their natural 8-bit size and alignment when stored in
scratch memory.

nir_lower_vars_to_scratch now accepts two size-and-alignment functions,
one used for calculating the variable size and the other for calculating
the size and alignment needed for storing inside scratch memory. Non-ir3
uses of this pass can just duplicate the currently-used function. ir3
provides a separate variable-size function that special-cases 8-bit types.

Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29875>
This commit is contained in:
Zan Dobersek
2024-07-14 08:59:27 +02:00
committed by Marge Bot
parent f8602612ed
commit 7fd5f76393
10 changed files with 46 additions and 13 deletions

View File

@@ -648,7 +648,7 @@ ac_nir_lower_indirect_derefs(nir_shader *shader,
* scratch to alloca's, assuming LLVM won't generate VGPR indexing.
*/
NIR_PASS(progress, shader, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
glsl_get_natural_size_align_bytes);
glsl_get_natural_size_align_bytes, glsl_get_natural_size_align_bytes);
/* LLVM doesn't support VGPR indexing on GFX9. */
bool llvm_has_working_vgpr_indexing = gfx_level != GFX9;

View File

@@ -3293,7 +3293,7 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
/* Lower large arrays to scratch and small arrays to csel */
NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
glsl_get_natural_size_align_bytes);
glsl_get_natural_size_align_bytes, glsl_get_natural_size_align_bytes);
NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
NIR_PASS(_, nir, nir_split_var_copies);
NIR_PASS(_, nir, nir_lower_global_vars_to_local);

View File

@@ -1752,6 +1752,7 @@ v3d_attempt_compile(struct v3d_compile *c)
NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
nir_var_function_temp,
0,
glsl_get_natural_size_align_bytes,
glsl_get_natural_size_align_bytes);
NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);

View File

@@ -3685,7 +3685,7 @@ glsl_channel_type(const glsl_type *t)
}
}
static void
void
glsl_size_align_handle_array_and_structs(const glsl_type *type,
glsl_type_size_align_func size_align,
unsigned *size, unsigned *align)

View File

@@ -1350,6 +1350,9 @@ glsl_get_explicit_interface_type(const glsl_type *t, bool supports_std430)
}
}
void glsl_size_align_handle_array_and_structs(const glsl_type *type,
glsl_type_size_align_func size_align,
unsigned *size, unsigned *align);
void glsl_get_natural_size_align_bytes(const glsl_type *t, unsigned *size, unsigned *align);
void glsl_get_vec4_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align);

View File

@@ -5434,7 +5434,8 @@ bool nir_lower_io_to_temporaries(nir_shader *shader,
bool nir_lower_vars_to_scratch(nir_shader *shader,
nir_variable_mode modes,
int size_threshold,
glsl_type_size_align_func size_align);
glsl_type_size_align_func variable_size_align,
glsl_type_size_align_func scratch_layout_size_align);
void nir_lower_clip_halfz(nir_shader *shader);

View File

@@ -95,7 +95,8 @@ bool
nir_lower_vars_to_scratch(nir_shader *shader,
nir_variable_mode modes,
int size_threshold,
glsl_type_size_align_func size_align)
glsl_type_size_align_func variable_size_align,
glsl_type_size_align_func scratch_layout_size_align)
{
struct set *set = _mesa_pointer_set_create(NULL);
@@ -131,7 +132,7 @@ nir_lower_vars_to_scratch(nir_shader *shader,
continue;
unsigned var_size, var_align;
size_align(var->type, &var_size, &var_align);
variable_size_align(var->type, &var_size, &var_align);
if (var_size <= size_threshold)
continue;
@@ -207,13 +208,13 @@ nir_lower_vars_to_scratch(nir_shader *shader,
if (var->data.location == INT_MAX) {
unsigned var_size, var_align;
size_align(var->type, &var_size, &var_align);
scratch_layout_size_align(var->type, &var_size, &var_align);
var->data.location = ALIGN_POT(shader->scratch_size, var_align);
shader->scratch_size = var->data.location + var_size;
}
lower_load_store(&build, intrin, size_align);
lower_load_store(&build, intrin, scratch_layout_size_align);
impl_progress = true;
}
}

View File

@@ -181,6 +181,31 @@ ir3_lower_bit_size(const nir_instr *instr, UNUSED void *data)
return 0;
}
/* Size/align callback for nir_lower_vars_to_scratch's threshold comparison.
 * Mirrors glsl_get_natural_size_align_bytes except that 8-bit scalars are
 * counted at twice their natural size, because ir3 stores them in 16-bit
 * half-registers.
 */
static void
ir3_get_variable_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align)
{
   /* Aggregates: recurse so every member is measured with this function. */
   if (type->base_type == GLSL_TYPE_ARRAY ||
       type->base_type == GLSL_TYPE_INTERFACE ||
       type->base_type == GLSL_TYPE_STRUCT) {
      glsl_size_align_handle_array_and_structs(type, ir3_get_variable_size_align_bytes,
                                               size, align);
      return;
   }

   /* 8-bit values are handled through 16-bit half-registers, so the resulting
    * size and alignment value has to be doubled to reflect the actual variable
    * size requirement.
    */
   if (type->base_type == GLSL_TYPE_UINT8 || type->base_type == GLSL_TYPE_INT8) {
      *size = 2 * glsl_get_components(type);
      *align = 2;
      return;
   }

   /* Everything else keeps its natural size and alignment. */
   glsl_get_natural_size_align_bytes(type, size, align);
}
#define OPT(nir, pass, ...) \
({ \
bool this_progress = false; \
@@ -828,7 +853,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
*/
if (so->compiler->has_pvtmem) {
progress |= OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp,
16 * 16 /* bytes */, glsl_get_natural_size_align_bytes);
16 * 16 /* bytes */,
ir3_get_variable_size_align_bytes, glsl_get_natural_size_align_bytes);
}
/* Lower scratch writemasks */

View File

@@ -847,6 +847,7 @@ r600_lower_and_optimize_nir(nir_shader *sh,
nir_lower_vars_to_scratch,
nir_var_function_temp,
40,
r600_get_natural_size_align_bytes,
r600_get_natural_size_align_bytes);
while (optimize_once(sh))

View File

@@ -4937,12 +4937,12 @@ bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
* (currently unconditional for Valhall), we force vec4 alignment for
* scratch access.
*/
bool packed_tls = (gpu_id >= 0x9000);
glsl_type_size_align_func vars_to_scratch_size_align_func =
(gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
: glsl_get_natural_size_align_bytes;
/* Lower large arrays to scratch and small arrays to bcsel */
NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
packed_tls ? glsl_get_vec4_size_align_bytes
: glsl_get_natural_size_align_bytes);
vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
NIR_PASS_V(nir, nir_split_var_copies);