ir3: Use generic const alloc for everything and call it once

With all consts going through generic allocations, it is now possible to call ir3_setup_const_state only once, and to have the lowerings that dynamically lower things to consts just update the max consts being used. The only exception for now is immediates, since they use up whatever space is left over and are allocated much later.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32140>
2024-11-14 12:05:17 +01:00
parent cf73f89ba0
commit 3e5d4d50c5
14 changed files with 211 additions and 185 deletions
--- a/src/freedreno/computerator/a4xx.cc
+++ b/src/freedreno/computerator/a4xx.cc
@@ -207,7 +207,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
   struct ir3_shader_variant *v = ir3_kernel->v;

   const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
   int size = DIV_ROUND_UP(const_state->immediates_count, 4);

   /* truncate size to avoid writing constants that shader
--- a/src/freedreno/computerator/a6xx.cc
+++ b/src/freedreno/computerator/a6xx.cc
@@ -316,7 +316,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
   struct ir3_shader_variant *v = ir3_kernel->v;

   const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
   int size = DIV_ROUND_UP(const_state->immediates_count, 4);

   if (ir3_kernel->info.numwg != INVALID_REG) {
--- a/src/freedreno/ir3/ir3_a4xx.c
+++ b/src/freedreno/ir3/ir3_a4xx.c
@@ -215,7 +215,7 @@ get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
      const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
      assert(const_state->image_dims.mask & (1 << index));

-      cb = regid(const_state->offsets.image_dims, 0) +
+      cb = ir3_const_reg(const_state, IR3_CONST_ALLOC_IMAGE_DIMS, 0) +
         const_state->image_dims.off[index];
   } else {
      index += ctx->s->info.num_ssbos;
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1344,7 +1344,7 @@ emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   struct ir3_builder *b = &ctx->build;
   unsigned offset = nir_intrinsic_base(intr);
-   unsigned p = regid(const_state->offsets.kernel_params, 0);
+   unsigned p = ir3_const_reg(const_state, IR3_CONST_ALLOC_KERNEL_PARAMS, 0);

   struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];

@@ -2600,8 +2600,10 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
   }

   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-   const unsigned primitive_param = const_state->offsets.primitive_param * 4;
-   const unsigned primitive_map = const_state->offsets.primitive_map * 4;
+   const unsigned primitive_param =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4 * 4;
+   const unsigned primitive_map =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4 * 4;

   switch (intr->intrinsic) {
   case nir_intrinsic_decl_reg:
@@ -4732,7 +4734,9 @@ emit_stream_out(struct ir3_context *ctx)
      unsigned stride = strmout->stride[i];
      struct ir3_instruction *base, *off;

-      base = create_uniform(&ctx->build, regid(const_state->offsets.tfbo, i));
+      base = create_uniform(
+         &ctx->build,
+         ir3_const_reg(const_state, IR3_CONST_ALLOC_TFBO, i));

      /* 24-bit should be enough: */
      off = ir3_MUL_U24(&ctx->build, vtxcnt, 0,
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -1110,6 +1110,10 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
   if (so->compiler->load_shader_consts_via_preamble)
      progress |= OPT(s, ir3_nir_lower_driver_params_to_ubo, so);

+   if (!so->binning_pass) {
+      ir3_setup_const_state(s, so, ir3_const_state_mut(so));
+   }
+
   /* Do the preamble before analysing UBO ranges, because it's usually
    * higher-value and because it can result in eliminating some indirect UBO
    * accesses where otherwise we'd have to push the whole range. However we
@@ -1221,13 +1225,6 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
   }

   nir_sweep(s);
-
-   /* Binning pass variants re-use  the const_state of the corresponding
-    * draw pass shader, so that same const emit can be re-used for both
-    * passes:
-    */
-   if (!so->binning_pass)
-      ir3_setup_const_state(s, so, ir3_const_state_mut(so));
 }

 bool
@@ -1299,9 +1296,11 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
   return true;
 }

-static void
-ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, struct ir3_const_state *layout)
+uint32_t
+ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader,
+                           struct ir3_const_image_dims *image_dims)
 {
+   uint32_t num_driver_params = 0;
   nir_foreach_function (function, shader) {
      if (!function->impl)
         continue;
@@ -1314,32 +1313,34 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
            unsigned idx;

-            switch (intr->intrinsic) {
-            case nir_intrinsic_image_atomic:
-            case nir_intrinsic_image_atomic_swap:
-            case nir_intrinsic_image_load:
-            case nir_intrinsic_image_store:
-            case nir_intrinsic_image_size:
-               /* a4xx gets these supplied by the hw directly (maybe CP?) */
-               if (compiler->gen == 5 &&
-                   !(intr->intrinsic == nir_intrinsic_image_load &&
-                     !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) {
-                  idx = nir_src_as_uint(intr->src[0]);
-                  if (layout->image_dims.mask & (1 << idx))
-                     break;
-                  layout->image_dims.mask |= (1 << idx);
-                  layout->image_dims.off[idx] = layout->image_dims.count;
-                  layout->image_dims.count += 3; /* three const per */
+            if (image_dims) {
+               switch (intr->intrinsic) {
+               case nir_intrinsic_image_atomic:
+               case nir_intrinsic_image_atomic_swap:
+               case nir_intrinsic_image_load:
+               case nir_intrinsic_image_store:
+               case nir_intrinsic_image_size:
+                  /* a4xx gets these supplied by the hw directly (maybe CP?) */
+                  if (compiler->gen == 5 &&
+                     !(intr->intrinsic == nir_intrinsic_image_load &&
+                        !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) {
+                     idx = nir_src_as_uint(intr->src[0]);
+                     if (image_dims->mask & (1 << idx))
+                        break;
+                     image_dims->mask |= (1 << idx);
+                     image_dims->off[idx] = image_dims->count;
+                     image_dims->count += 3; /* three const per */
+                  }
+                  break;
+               default:
+                  break;
               }
-               break;
-            default:
-               break;
            }

            struct driver_param_info param_info;
            if (ir3_get_driver_param_info(shader, intr, &param_info)) {
-               layout->num_driver_params =
-                  MAX2(layout->num_driver_params,
+               num_driver_params =
+                  MAX2(num_driver_params,
                       param_info.offset + nir_intrinsic_dest_components(intr));
            }
         }
@@ -1353,9 +1354,11 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
    */
   if (!compiler->has_shared_regfile &&
         shader->info.stage == MESA_SHADER_COMPUTE) {
-      layout->num_driver_params =
-         MAX2(layout->num_driver_params, IR3_DP_CS(workgroup_id_z) + 1);
+      num_driver_params =
+         MAX2(num_driver_params, IR3_DP_CS(workgroup_id_z) + 1);
   }
+
+   return num_driver_params;
 }

 void
@@ -1413,10 +1416,46 @@ ir3_const_alloc_all_reserved_space(struct ir3_const_allocations *const_alloc)
   const_alloc->reserved_vec4 = 0;
 }

-/* Sets up the variant-dependent constant state for the ir3_shader.  Note
- * that it is also used from ir3_nir_analyze_ubo_ranges() to figure out the
- * maximum number of driver params that would eventually be used, to leave
- * space for this function to allocate the driver params.
+void
+ir3_alloc_driver_params(struct ir3_const_allocations *const_alloc,
+                        uint32_t *num_driver_params,
+                        struct ir3_compiler *compiler,
+                        gl_shader_stage shader_stage)
+{
+   if (*num_driver_params == 0)
+      return;
+
+   /* num_driver_params in dwords.  we only need to align to vec4s for the
+    * common case of immediate constant uploads, but for indirect dispatch
+    * the constants may also be indirect and so we have to align the area in
+    * const space to that requirement.
+    */
+   *num_driver_params = align(*num_driver_params, 4);
+   unsigned upload_unit = 1;
+   if (shader_stage == MESA_SHADER_COMPUTE ||
+       (*num_driver_params >= IR3_DP_VS(vtxid_base))) {
+      upload_unit = compiler->const_upload_unit;
+   }
+
+   /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
+   if (shader_stage == MESA_SHADER_VERTEX && compiler->gen >= 6)
+      const_alloc->max_const_offset_vec4 =
+         MAX2(const_alloc->max_const_offset_vec4, 1);
+
+   uint32_t driver_params_size_vec4 =
+      align(*num_driver_params / 4, upload_unit);
+   ir3_const_alloc(const_alloc, IR3_CONST_ALLOC_DRIVER_PARAMS,
+                   driver_params_size_vec4, upload_unit);
+}
+
+/* Sets up the variant-dependent constant state for the ir3_shader.
+ * The consts allocation flow is as follows:
+ * 1) Turnip/Freedreno allocates consts required by corresponding API,
+ *    e.g. push const, inline uniforms, etc. Then passes ir3_const_allocations
+ *    into IR3.
+ * 2) ir3_setup_const_state pre-allocates consts with non-negotiable size.
+ * 3) IR3 lowerings afterwards allocate from the free space left.
+ * 4) Allocate offsets for consts from step 2)
 */
 void
 ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
@@ -1425,9 +1464,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
   struct ir3_compiler *compiler = v->compiler;
   unsigned ptrsz = ir3_pointer_size(compiler);

-   memset(&const_state->offsets, ~0, sizeof(const_state->offsets));
-
-   ir3_nir_scan_driver_consts(compiler, nir, const_state);
+   const_state->num_driver_params =
+      ir3_nir_scan_driver_consts(compiler, nir, &const_state->image_dims);

   if ((compiler->gen < 5) && (v->stream_output.num_outputs > 0)) {
      const_state->num_driver_params =
@@ -1438,92 +1476,56 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,

   assert((const_state->ubo_state.size % 16) == 0);

-   /* IR3_CONST_ALLOC_DRIVER_PARAMS could have been allocated earlier. */
-   if (const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].size_vec4 == 0) {
-      ir3_nir_scan_driver_consts(compiler, nir, const_state);
-      if (const_state->num_driver_params > 0) {
-        /* num_driver_params in dwords.  we only need to align to vec4s for the
-         * common case of immediate constant uploads, but for indirect dispatch
-         * the constants may also be indirect and so we have to align the area in
-         * const space to that requirement.
-         */
-         const_state->num_driver_params = align(const_state->num_driver_params, 4);
-         unsigned upload_unit = 1;
-         if (v->type == MESA_SHADER_COMPUTE ||
-            (const_state->num_driver_params >= IR3_DP_VS(vtxid_base))) {
-            upload_unit = compiler->const_upload_unit;
-         }
-
-         /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
-         if (v->type == MESA_SHADER_VERTEX && compiler->gen >= 6)
-            const_state->allocs.max_const_offset_vec4 =
-               MAX2(const_state->allocs.max_const_offset_vec4, 1);
-
-         uint32_t driver_params_size_vec4 =
-            align(const_state->num_driver_params / 4, upload_unit);
-         ir3_const_alloc(&const_state->allocs, IR3_CONST_ALLOC_DRIVER_PARAMS,
-                         driver_params_size_vec4, upload_unit);
-      }
-   }
-
-   unsigned constoff = const_state->allocs.max_const_offset_vec4;
+   ir3_alloc_driver_params(&const_state->allocs,
+                           &const_state->num_driver_params, compiler,
+                           v->type);

   if (const_state->image_dims.count > 0) {
-      unsigned cnt = const_state->image_dims.count;
-      const_state->offsets.image_dims = constoff;
-      constoff += align(cnt, 4) / 4;
+      ir3_const_reserve_space(&const_state->allocs, IR3_CONST_ALLOC_IMAGE_DIMS,
+                              align(const_state->image_dims.count, 4) / 4, 1);
   }

-   if (v->type == MESA_SHADER_KERNEL) {
-      const_state->offsets.kernel_params = constoff;
-      constoff += align(v->cs.req_input_mem, 4) / 4;
+   if (v->type == MESA_SHADER_KERNEL && v->cs.req_input_mem) {
+      ir3_const_reserve_space(&const_state->allocs,
+                              IR3_CONST_ALLOC_KERNEL_PARAMS,
+                              align(v->cs.req_input_mem, 4) / 4, 1);
   }

   if ((v->type == MESA_SHADER_VERTEX) && (compiler->gen < 5) &&
       v->stream_output.num_outputs > 0) {
-      const_state->offsets.tfbo = constoff;
-      constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+      ir3_const_reserve_space(&const_state->allocs, IR3_CONST_ALLOC_TFBO,
+                              align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4, 1);
   }

   if (!compiler->load_shader_consts_via_preamble) {
      switch (v->type) {
      case MESA_SHADER_TESS_CTRL:
      case MESA_SHADER_TESS_EVAL:
-         const_state->offsets.primitive_param = constoff;
-         constoff += 2;
-
-         const_state->offsets.primitive_map = constoff;
+         ir3_const_reserve_space(&const_state->allocs,
+                                 IR3_CONST_ALLOC_PRIMITIVE_PARAM, 2, 1);
         break;
      case MESA_SHADER_GEOMETRY:
-         const_state->offsets.primitive_param = constoff;
-         constoff += 1;
-
-         const_state->offsets.primitive_map = constoff;
+         ir3_const_reserve_space(&const_state->allocs,
+                                 IR3_CONST_ALLOC_PRIMITIVE_PARAM, 1, 1);
         break;
      default:
         break;
      }
   }

-   switch (v->type) {
-   case MESA_SHADER_VERTEX:
-      const_state->offsets.primitive_param = constoff;
-      constoff += 1;
-      break;
-   case MESA_SHADER_TESS_CTRL:
-   case MESA_SHADER_TESS_EVAL:
-      constoff += DIV_ROUND_UP(v->input_size, 4);
-      break;
-   case MESA_SHADER_GEOMETRY:
-      constoff += DIV_ROUND_UP(v->input_size, 4);
-      break;
-   default:
-      break;
+   if (v->type == MESA_SHADER_VERTEX) {
+      ir3_const_reserve_space(&const_state->allocs,
+                              IR3_CONST_ALLOC_PRIMITIVE_PARAM, 1, 1);
   }

-   const_state->offsets.immediate = constoff;
+   if ((v->type == MESA_SHADER_TESS_CTRL || v->type == MESA_SHADER_TESS_EVAL ||
+        v->type == MESA_SHADER_GEOMETRY)) {
+      ir3_const_reserve_space(&const_state->allocs,
+                              IR3_CONST_ALLOC_PRIMITIVE_MAP,
+                              DIV_ROUND_UP(v->input_size, 4), 1);
+   }

-   assert(constoff <= ir3_max_const(v));
+   assert(const_state->allocs.max_const_offset_vec4 <= ir3_max_const(v));
 }

 uint32_t
@@ -1531,8 +1533,9 @@ ir3_const_state_get_free_space(const struct ir3_shader_variant *v,
                               const struct ir3_const_state *const_state,
                               uint32_t align_vec4)
 {
-   uint32_t free_space_vec4 =
-      ir3_max_const(v) - align(const_state->offsets.immediate, align_vec4) -
-      const_state->allocs.reserved_vec4;
+   uint32_t aligned_offset_vec4 =
+      align(const_state->allocs.max_const_offset_vec4, align_vec4);
+   uint32_t free_space_vec4 = ir3_max_const(v) - aligned_offset_vec4 -
+                              const_state->allocs.reserved_vec4;
   return free_space_vec4;
 }
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -89,6 +89,13 @@ void ir3_const_free_reserved_space(struct ir3_const_allocations *const_alloc,
                                   enum ir3_const_alloc_type type);
 void ir3_const_alloc_all_reserved_space(struct ir3_const_allocations *const_alloc);

+uint32_t ir3_nir_scan_driver_consts(struct ir3_compiler *compiler,
+                                    nir_shader *shader,
+                                    struct ir3_const_image_dims *image_dims);
+void ir3_alloc_driver_params(struct ir3_const_allocations *const_alloc,
+                             uint32_t *num_driver_params,
+                             struct ir3_compiler *compiler,
+                             enum pipe_shader_type shader_type);
 bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -560,8 +560,7 @@ ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v)
      global_offset =
         const_state->allocs.consts[IR3_CONST_ALLOC_GLOBAL].offset_vec4 * 16;
   } else {
-      struct ir3_const_state *const_state = ir3_const_state_mut(v);
-      ir3_setup_const_state(nir, v, const_state);
+      const struct ir3_const_state *const_state = ir3_const_state(v);
      global_offset = const_state->allocs.max_const_offset_vec4 * 16;
      max_upload =
         ir3_const_state_get_free_space(v, const_state, 1) * 16;
@@ -643,10 +642,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
   /* Limit our uploads to the amount of constant buffer space available in
    * the hardware, minus what the shader compiler may need for various
    * driver params.  We do this UBO-to-push-constant before the real
-    * allocation of the driver params' const space, because UBO pointers can
+    * allocation of the UBO pointers' const space, because UBO pointers can
    * be driver params but this pass usually eliminatings them.
    */
-   ir3_setup_const_state(nir, v, const_state);
   const uint32_t max_upload =
      ir3_const_state_get_free_space(v, const_state, align_vec4) * 16;

--- a/src/freedreno/ir3/ir3_nir_opt_preamble.c
+++ b/src/freedreno/ir3/ir3_nir_opt_preamble.c
@@ -287,8 +287,7 @@ ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
      max_size =
         const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
   } else {
-      struct ir3_const_state *const_state = ir3_const_state_mut(v);
-      ir3_setup_const_state(nir, v, const_state);
+      const struct ir3_const_state *const_state = ir3_const_state(v);
      max_size = ir3_const_state_get_free_space(
                    v, const_state, v->compiler->const_upload_unit) * 4;
   }
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -28,7 +28,7 @@
 static uint16_t
 const_imm_index_to_reg(const struct ir3_const_state *const_state, unsigned i)
 {
-   return i + (4 * const_state->offsets.immediate);
+   return i + (4 * const_state->allocs.max_const_offset_vec4);
 }

 uint16_t
@@ -69,7 +69,8 @@ ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm)
   /* Add on a new immediate to be pushed, if we have space left in the
    * constbuf.
    */
-   if (const_state->offsets.immediate + const_state->immediates_count / 4 >=
+   if (const_state->allocs.max_const_offset_vec4 +
+          const_state->immediates_count / 4 >=
       ir3_max_const(v)) {
      return INVALID_CONST_REG;
   }
@@ -776,6 +777,16 @@ ir3_const_alloc_type_to_string(enum ir3_const_alloc_type type)
      return "global";
   case IR3_CONST_ALLOC_UBO_PTRS:
      return "ubo_ptrs";
+   case IR3_CONST_ALLOC_IMAGE_DIMS:
+      return "image_dims";
+   case IR3_CONST_ALLOC_KERNEL_PARAMS:
+      return "kernel_params";
+   case IR3_CONST_ALLOC_TFBO:
+      return "tfbo";
+   case IR3_CONST_ALLOC_PRIMITIVE_PARAM:
+      return "primitive_param";
+   case IR3_CONST_ALLOC_PRIMITIVE_MAP:
+      return "primitive_map";
   default:
      return "unknown";
   }
@@ -800,16 +811,6 @@ dump_const_state(struct ir3_shader_variant *so, FILE *out)
      }
   }

-   if (cs->offsets.image_dims != ~0)
-      fprintf(out, ";   image_dims:       c%u.x\n", cs->offsets.image_dims);
-   if (cs->offsets.kernel_params != ~0)
-      fprintf(out, ";   kernel_params:    c%u.x\n", cs->offsets.kernel_params);
-   if (cs->offsets.tfbo != ~0)
-      fprintf(out, ";   tfbo:             c%u.x\n", cs->offsets.tfbo);
-   if (cs->offsets.primitive_param != ~0)
-      fprintf(out, ";   primitive_params: c%u.x\n", cs->offsets.primitive_param);
-   if (cs->offsets.primitive_map != ~0)
-      fprintf(out, ";   primitive_map:    c%u.x\n", cs->offsets.primitive_map);
   fprintf(out, "; ubo_state:\n");
   fprintf(out, ";   num_enabled:      %u\n", us->num_enabled);
   for (unsigned i = 0; i < us->num_enabled; i++) {
@@ -912,7 +913,8 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)

   const struct ir3_const_state *const_state = ir3_const_state(so);
   for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
-      fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
+      fprintf(out, "@const(c%d.x)\t",
+              const_state->allocs.max_const_offset_vec4 + i);
      fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
              const_state->immediates[i * 4 + 0],
              const_state->immediates[i * 4 + 1],
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -206,7 +206,24 @@ enum ir3_const_alloc_type {
   IR3_CONST_ALLOC_GLOBAL = 6,
   /* OpenGL, pre-a6xx; pointers to UBOs */
   IR3_CONST_ALLOC_UBO_PTRS = 7,
-   IR3_CONST_ALLOC_MAX = 8,
+   /* OpenGL, a5xx only; needed to calculate pixel offset, but only
+    * for images that have image_{load,store,size,atomic*} intrinsics.
+    */
+   IR3_CONST_ALLOC_IMAGE_DIMS = 8,
+   /* OpenCL */
+   IR3_CONST_ALLOC_KERNEL_PARAMS = 9,
+   /* OpenGL, TFBO addresses only for vs on a3xx/a4xx */
+   IR3_CONST_ALLOC_TFBO = 10,
+   /* Common, stage-dependent primitive params:
+    *  vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
+    *  hs, ds: uvec4(primitive_stride, vertex_stride,
+    *                patch_stride, patch_vertices_in)
+    *          uvec4(tess_param_base, tess_factor_base)
+    */
+   IR3_CONST_ALLOC_PRIMITIVE_PARAM = 11,
+   /* Common, mapping from varying location to offset. */
+   IR3_CONST_ALLOC_PRIMITIVE_MAP = 12,
+   IR3_CONST_ALLOC_MAX = 13,
 };

 struct ir3_const_allocation {
@@ -232,30 +249,30 @@ ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
          const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
 }

+struct ir3_const_image_dims {
+   uint32_t mask;  /* bitmask of images that have image_store */
+   uint32_t count; /* number of consts allocated */
+   /* three const allocated per image which has image_store:
+      *  + cpp         (bytes per pixel)
+      *  + pitch       (y pitch)
+      *  + array_pitch (z pitch)
+      */
+   uint32_t off[IR3_MAX_SHADER_IMAGES];
+};
+
 /**
- * Describes the layout of shader consts in the const register file.
+ * Describes the layout of shader consts in the const register file
+ * and additional info about individual allocations.
 *
- * Layout of constant registers, each section aligned to vec4.  Note
- * that pointer size (ubo, etc) changes depending on generation.
+ * Each consts section is aligned to vec4. Note that pointer
+ * size (ubo, etc) changes depending on generation.
 *
- *   + user consts: only used for turnip push consts
- *   + Optional consts: ubo ranges, preamble, global, etc.
- *   + UBO addresses: turnip is bindless and these are wasted
- *   + image dimensions: a5xx only; needed to calculate pixel offset, but only
- *     for images that have image_{load,store,size,atomic*} intrinsics
- *   + kernel params: cl only
- *   + driver params: these are stage-dependent; see ir3_driver_param
- *   + TFBO addresses: only for vs on a3xx/a4xx
- *   + primitive params: these are stage-dependent
- *       vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
- *       hs, ds: uvec4(primitive_stride, vertex_stride,
- *                     patch_stride, patch_vertices_in)
- *               uvec4(tess_param_base, tess_factor_base)
- *   + primitive map
- *   + lowered immediates
- *
- * Immediates go last mostly because they are inserted in the CP pass
- * after the nir -> ir3 frontend.
+ * The consts allocation flow is as follows:
+ * 1) Turnip/Freedreno allocates consts required by corresponding API,
+ *    e.g. push const, inline uniforms, etc. Then passes ir3_const_allocations
+ *    into IR3.
+ * 2) ir3_setup_const_state allocates consts with non-negotiable size.
+ * 3) IR3 lowerings afterwards allocate from the free space left.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
@@ -268,28 +285,9 @@ struct ir3_const_state {
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

-   struct {
-      /* Required consts, cannot negotiate their size */
-      unsigned image_dims;
-      unsigned kernel_params;
-      unsigned tfbo;
-      unsigned primitive_param;
-      unsigned primitive_map;
-      unsigned immediate;
-   } offsets;
-
   struct ir3_const_allocations allocs;

-   struct {
-      uint32_t mask;  /* bitmask of images that have image_store */
-      uint32_t count; /* number of consts allocated */
-      /* three const allocated per image which has image_store:
-       *  + cpp         (bytes per pixel)
-       *  + pitch       (y pitch)
-       *  + array_pitch (z pitch)
-       */
-      uint32_t off[IR3_MAX_SHADER_IMAGES];
-   } image_dims;
+   struct ir3_const_image_dims image_dims;

   unsigned immediates_count;
   unsigned immediates_size;
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -619,10 +619,10 @@ tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
      uint32_t base;
      switch (type) {
      case TU_CONSTS_PRIMITIVE_MAP:
-         base = const_state->offsets.primitive_map;
+         base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
         break;
      case TU_CONSTS_PRIMITIVE_PARAM:
-         base = const_state->offsets.primitive_param;
+         base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
         break;
      default:
         unreachable("bad consts type");
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -1124,7 +1124,7 @@ static uint32_t
 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
 {
   const struct ir3_const_state *const_state = ir3_const_state(xs);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
   int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);

   /* truncate size to avoid writing constants that shader
@@ -1332,7 +1332,7 @@ tu6_emit_xs(struct tu_cs *cs,
   /* emit immediates */

   const struct ir3_const_state *const_state = ir3_const_state(xs);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
   unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);

   if (immediate_size > 0) {
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
@@ -167,8 +167,11 @@ emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_varia
      int base = const_state->primitive_param_ubo.idx;

      fd6_upload_emit_driver_ubo(ctx, ring, v, base, num_params, params);
-   } else {
-      const unsigned regid = const_state->offsets.primitive_param;
+   } else if (ir3_const_can_upload(&const_state->allocs,
+                                   IR3_CONST_ALLOC_PRIMITIVE_PARAM,
+                                   v->constlen)) {
+      const unsigned regid =
+         const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
      int size = MIN2(1 + regid, v->constlen) - regid;
      if (size > 0)
         fd6_emit_const_user(ring, v, regid * 4, num_params, params);
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -245,8 +245,10 @@ ir3_emit_image_dims(struct fd_screen *screen,
                    struct fd_shaderimg_stateobj *si)
 {
   const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.image_dims;
-   if (v->constlen > offset) {
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_IMAGE_DIMS].offset_vec4;
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_IMAGE_DIMS,
+                            v->constlen)) {
      uint32_t dims[align(const_state->image_dims.count, 4)];
      unsigned mask = const_state->image_dims.mask;

@@ -297,7 +299,7 @@ ir3_emit_immediates(const struct ir3_shader_variant *v,
                    struct fd_ringbuffer *ring)
 {
   const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
   int size = DIV_ROUND_UP(const_state->immediates_count, 4);

   /* truncate size to avoid writing constants that shader
@@ -324,7 +326,13 @@ ir3_emit_link_map(const struct ir3_shader_variant *producer,
                  struct fd_ringbuffer *ring)
 {
   const struct ir3_const_state *const_state = ir3_const_state(consumer);
-   uint32_t base = const_state->offsets.primitive_map;
+   if (!ir3_const_can_upload(&const_state->allocs,
+                             IR3_CONST_ALLOC_PRIMITIVE_MAP,
+                             consumer->constlen))
+      return;
+
+   uint32_t base =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
   int size = DIV_ROUND_UP(consumer->input_size, 4);

   /* truncate size to avoid writing constants that shader
@@ -347,8 +355,10 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 {
   /* streamout addresses after driver-params: */
   const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.tfbo;
-   if (v->constlen > offset) {
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_TFBO].offset_vec4;
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_TFBO,
+                            v->constlen)) {
      struct fd_streamout_stateobj *so = &ctx->streamout;
      const struct ir3_stream_output_info *info = &v->stream_output;
      uint32_t params = 4;
@@ -423,8 +433,10 @@ emit_kernel_params(struct fd_context *ctx, const struct ir3_shader_variant *v,
   assert_dt
 {
   const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.kernel_params;
-   if (v->constlen > offset) {
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_KERNEL_PARAMS].offset_vec4;
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_KERNEL_PARAMS,
+                            v->constlen)) {
      ring_wfi(ctx->batch, ring);
      emit_const_user(ring, v, offset * 4,
                      align(v->cs.req_input_mem, 4),