diff --git a/.gitlab-ci/deqp-freedreno-a307-fails.txt b/.gitlab-ci/deqp-freedreno-a307-fails.txt
index 060d10cc869..fa6a12dba16 100644
--- a/.gitlab-ci/deqp-freedreno-a307-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a307-fails.txt
@@ -388,10 +388,6 @@ dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_highp,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail
 dEQP-GLES3.functional.shaders.linkage.varying.rules.differing_interpolation_2,Fail
 dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler2d_vertex,Fail
 dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler3d_vertex,Fail
diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt
index 4d8c2a69960..2a555e22936 100644
--- a/.gitlab-ci/deqp-freedreno-a630-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt
@@ -1,8 +1,4 @@
 
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a1r5g5b5_unorm_pack16.a1r5g5b5_unorm_pack16.optimal_general_nearest,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_nearest,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2r10g10b10_unorm_pack32.a2r10g10b10_unorm_pack32.optimal_optimal_nearest,Fail
diff --git a/src/freedreno/computerator/ir3_asm.c b/src/freedreno/computerator/ir3_asm.c
index e1e845a9a7c..a976bede5e7 100644
--- a/src/freedreno/computerator/ir3_asm.c
+++ b/src/freedreno/computerator/ir3_asm.c
@@ -42,7 +42,7 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
 	kernel->base.num_bufs = kernel->info.num_bufs;
 	memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes));
 
-	unsigned sz = v->info.sizedwords * 4;
+	unsigned sz = v->info.size;
 
 	v->bo = fd_bo_new(c->dev, sz,
 			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 84aa8eb46d9..2f2612d40c6 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -942,15 +942,24 @@ void * ir3_assemble(struct ir3_shader_variant *v)
 	 * doesn't try to decode the following data as instructions (such as the
 	 * next stage's shader in turnip)
 	 */
-	info->sizedwords = MAX2(v->instrlen * compiler->instr_align,
-			instr_count + 4) * sizeof(instr_t) / 4;
+	info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) *
+		sizeof(instr_t);
+	info->sizedwords = info->size / 4;
+
+	if (v->constant_data_size) {
+		/* Make sure that where we're about to place the constant_data is safe
+		 * to indirectly upload from.
+		 */
+		info->constant_data_offset = align(info->size, v->shader->compiler->const_upload_unit * 16);
+		info->size = info->constant_data_offset + v->constant_data_size;
+	}
 
 	/* Pad out the size so that when turnip uploads the shaders in
 	 * sequence, the starting offset of the next one is properly aligned.
 	 */
-	info->sizedwords = align(info->sizedwords, compiler->instr_align * sizeof(instr_t) / 4);
+	info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
 
-	ptr = dwords = rzalloc_size(v, 4 * info->sizedwords);
+	ptr = dwords = rzalloc_size(v, info->size);
 
 	foreach_block (block, &shader->block_list) {
 		unsigned sfu_delay = 0;
@@ -1003,6 +1012,14 @@ void * ir3_assemble(struct ir3_shader_variant *v)
 		}
 	}
 
+	/* Append the NIR constant data after the end of the program.  This lets us
+	 * upload it with an indirect load, while avoiding creating another BO.
+	 */
+	if (v->constant_data_size)
+		memcpy(&ptr[info->constant_data_offset / 4], v->constant_data, v->constant_data_size);
+	ralloc_free(v->constant_data);
+	v->constant_data = NULL;
+
 	return ptr;
 
 fail:
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index cb42636f285..262f2a28dcf 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -45,6 +45,13 @@ struct ir3_block;
 
 struct ir3_info {
 	void *data;              /* used internally in ir3 assembler */
+	/* Size in bytes of the shader binary, including NIR constants and
+	 * padding
+	 */
+	uint32_t size;
+	/* byte offset from start of the shader to the NIR constant data. */
+	uint32_t constant_data_offset;
+	/* Size in dwords of the instructions. */
 	uint16_t sizedwords;
 	uint16_t instrs_count;   /* expanded to account for rpt's */
 	uint16_t nops_count;     /* # of nop instructions, including nopN */
diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c
index 78726710758..29a2c8c2157 100644
--- a/src/freedreno/ir3/ir3_disk_cache.c
+++ b/src/freedreno/ir3/ir3_disk_cache.c
@@ -126,8 +126,8 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
 	 * pointers need special handling:
 	 */
 
-	v->bin = rzalloc_size(v, 4 * v->info.sizedwords);
-	blob_copy_bytes(blob, v->bin, 4 * v->info.sizedwords);
+	v->bin = rzalloc_size(v, v->info.size);
+	blob_copy_bytes(blob, v->bin, v->info.size);
 
 	if (!v->binning_pass) {
 		blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
@@ -147,7 +147,9 @@ store_variant(struct blob *blob, struct ir3_shader_variant *v)
 	 * pointers need special handling:
 	 */
 
-	blob_write_bytes(blob, v->bin, 4 * v->info.sizedwords);
+	blob_write_bytes(blob, v->bin, v->info.size);
+
+	/* No saving constant_data, it's already baked into bin at this point. */
 
 	if (!v->binning_pass) {
 		blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 29ab29691e0..d6d891a9560 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -495,11 +495,25 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
 		progress |= OPT(s, nir_lower_tex, &tex_options);
 	}
 
+	/* Move large constant variables to the constants attached to the NIR
+	 * shader, which we will upload in the immediates range.  This generates
+	 * amuls, so we need to clean those up after.
+	 *
+	 * Passing no size_align, we would get packed values, which if we end up
+	 * having to load with LDC would result in extra reads to unpack from
+	 * straddling loads.  Align everything to vec4 to avoid that, though we
+	 * could theoretically do better.
+	 */
+	OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */);
+	OPT_V(s, ir3_nir_lower_load_constant, so);
+
 	if (!so->binning_pass)
 		OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
 
 	progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
 
+	OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
+
 	/* UBO offset lowering has to come after we've decided what will
 	 * be left as load_ubo
 	 */
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index d716e530493..17dc4aa155c 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -59,6 +59,7 @@ void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
 
 void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
 		struct ir3_const_state *const_state);
+bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_fixup_load_uniform(nir_shader *nir);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 8e7f9aa29d1..a1c06b90819 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -530,3 +530,94 @@ ir3_nir_fixup_load_uniform(nir_shader *nir)
 			fixup_load_uniform_filter, fixup_load_uniform_instr,
 			NULL);
 }
+static nir_ssa_def *
+ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
+{
+	struct ir3_const_state *const_state = data;
+	nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
+
+	/* Rewrites this load_constant as a load_ubo of the constant-data UBO.  On
+	 * the first lowered load, pick the next free UBO index for it, skipping
+	 * UBO 0 since that's reserved for gallium's cb0.
+	 */
+	if (const_state->constant_data_ubo == -1) {
+		if (b->shader->info.num_ubos == 0)
+			b->shader->info.num_ubos++;
+		const_state->constant_data_ubo = b->shader->info.num_ubos++;
+	}
+
+	unsigned num_components = instr->num_components;
+	if (nir_dest_bit_size(instr->dest) == 16) {
+		/* We can't do 16b loads -- either from LDC (32-bit only in any of our
+		 * traces, and the disasm doesn't look like it supports it) or from the
+		 * constant file (where CONSTANT_DEMOTION_ENABLE gives us automatic
+		 * 32b-to-16b conversions).  Instead, load 32b from a UBO and unpack.
+		 */
+		num_components = DIV_ROUND_UP(num_components, 2); /* two 16b per 32b */
+	}
+	unsigned base = nir_intrinsic_base(instr);
+	nir_intrinsic_instr *load =
+		nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+	load->num_components = num_components;
+	nir_ssa_dest_init(&load->instr, &load->dest,
+			load->num_components, 32,
+			instr->dest.ssa.name);
+
+	load->src[0] = nir_src_for_ssa(nir_imm_int(b,
+					const_state->constant_data_ubo)); /* which UBO to read */
+	load->src[1] = nir_src_for_ssa(nir_iadd_imm(b,
+					nir_ssa_for_src(b, instr->src[0], 1), base)); /* offset + base */
+
+	nir_intrinsic_set_align(load,
+			nir_intrinsic_align_mul(instr),
+			nir_intrinsic_align_offset(instr));
+	nir_intrinsic_set_range_base(load, base);
+	nir_intrinsic_set_range(load, nir_intrinsic_range(instr));
+
+	nir_builder_instr_insert(b, &load->instr);
+
+	nir_ssa_def *result = &load->dest.ssa;
+	if (nir_dest_bit_size(instr->dest) == 16) {
+		result = nir_bitcast_vector(b, result, 16);
+		result = nir_channels(b, result, BITSET_MASK(instr->num_components)); /* drop padding */
+	}
+
+	return result;
+}
+
+/* Filter callback: selects only load_constant intrinsics; data is unused. */
+static bool
+ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
+{
+	return (instr->type == nir_instr_type_intrinsic &&
+		nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
+}
+
+/* Lowers load_constant intrinsics to UBO accesses so we can run them through
+ * the general "upload to const file or leave as UBO access" code.  On progress,
+ * stashes a padded copy of nir->constant_data in the variant.  Returns progress.
+ */
+bool
+ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
+{
+	struct ir3_const_state *const_state = ir3_const_state(v);
+
+	const_state->constant_data_ubo = -1; /* no UBO picked for the data yet */
+
+	bool progress = nir_shader_lower_instructions(nir,
+			ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr, const_state);
+
+	if (progress) {
+		struct ir3_compiler *compiler = v->shader->compiler;
+
+		/* Save a zero-padded copy of the NIR constant data to the variant
+		 * for inclusion in the final assembly.
+		 */
+		v->constant_data_size = align(nir->constant_data_size,
+				compiler->const_upload_unit * 4 * sizeof(uint32_t));
+		v->constant_data = rzalloc_size(v, v->constant_data_size);
+		memcpy(v->constant_data, nir->constant_data, nir->constant_data_size);
+	}
+
+	return progress;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 36aba4facc7..bba3c627da3 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -157,6 +157,9 @@ struct ir3_const_state {
 	unsigned num_ubos;
 	unsigned num_driver_params;   /* scalar */
 
+	/* UBO that should be mapped to the NIR shader's constant_data (or -1). */
+	int32_t constant_data_ubo;
+
 	struct {
 		/* user const start at zero */
 		unsigned ubo;
@@ -504,6 +507,12 @@ struct ir3_shader_variant {
 	gl_shader_stage type;
 	struct ir3_shader *shader;
 
+	/* variant's copy of nir->constant_data (since we don't track the NIR in
+	 * the variant, and shader->nir is before the opt pass).  Moves to v->bin
+	 * after assembly.
+	 */
+	void *constant_data;
+
 	/*
 	 * Below here is serialized when written to disk cache:
 	 */
@@ -525,6 +534,8 @@ struct ir3_shader_variant {
 
 	struct ir3_info info;
 
+	uint32_t constant_data_size;
+
 	/* Levels of nesting of flow control:
 	 */
 	unsigned branchstack;
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index dfcaca99f42..41d9c81858b 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -3013,7 +3013,8 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
 {
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
-   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
+   const struct ir3_const_state *const_state = &link->const_state;
+   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
 
    if (link->push_consts.count > 0) {
       unsigned num_units = link->push_consts.count;
@@ -3048,9 +3049,14 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
       debug_assert((offset % 16) == 0);
 
       /* Dig out the descriptor from the descriptor state and read the VA from
-       * it.
+       * it.  All our UBOs are bindless with the exception of the NIR
+       * constant_data, which is uploaded once in the pipeline.
        */
-      assert(state->range[i].ubo.bindless);
+      if (!state->range[i].ubo.bindless) {
+         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
+         continue;
+      }
+
       uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
          descriptors_state->dynamic_descriptors :
          descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index b2f8c636682..dde112391da 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -453,19 +453,61 @@ tu6_emit_xs_config(struct tu_cs *cs,
     */
    size = MIN2(size + base, xs->constlen) - base;
 
-   if (size <= 0)
-      return;
+   if (size > 0) {
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(size));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
-                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                  CP_LOAD_STATE6_0_NUM_UNIT(size));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      tu_cs_emit_array(cs, const_state->immediates, size * 4);
+   }
 
-   tu_cs_emit_array(cs, const_state->immediates, size * 4);
+   if (const_state->constant_data_ubo != -1) {
+      uint64_t iova = binary_iova + xs->info.constant_data_offset;
+
+      /* Upload UBO state for the constant data. */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+      tu_cs_emit(cs,
+                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
+      tu_cs_emit_qw(cs,
+                    iova |
+                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
+
+      /* Upload the constant data to the const file if needed. */
+      const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
+
+      for (int i = 0; i < ubo_state->num_enabled; i++) {
+         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
+             ubo_state->range[i].ubo.bindless) {
+            continue;
+         }
+
+         uint32_t start = ubo_state->range[i].start;
+         uint32_t end = ubo_state->range[i].end;
+         uint32_t size = MIN2(end - start,
+                              (16 * xs->constlen) - ubo_state->range[i].offset);
+
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+         tu_cs_emit(cs,
+                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+         tu_cs_emit_qw(cs, iova + start);
+      }
+   }
 }
 
 static void
@@ -1939,12 +1981,12 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
    if (builder) {
       for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
          if (builder->variants[i])
-            size += builder->variants[i]->info.sizedwords;
+            size += builder->variants[i]->info.size / 4;
       }
 
-      size += builder->binning_variant->info.sizedwords;
+      size += builder->binning_variant->info.size / 4;
    } else {
-      size += compute->info.sizedwords;
+      size += compute->info.size / 4;
    }
 
    tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
@@ -2016,12 +2058,12 @@ tu_upload_variant(struct tu_pipeline *pipeline,
       return 0;
 
    /* this expects to get enough alignment because shaders are allocated first
-    * and sizedwords is always aligned correctly
+    * and total size is always aligned correctly
     * note: an assert in tu6_emit_xs_config validates the alignment
     */
-   tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory);
+   tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
 
-   memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords);
+   memcpy(memory.map, variant->bin, variant->info.size);
    return memory.iova;
 }
 
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
index 020fbf532d2..78b7b05a32c 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
@@ -248,6 +248,16 @@ fd6_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 	OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
 	for (int i = 0; i < num_ubos; i++) {
+		/* NIR constant data is packed into the end of the shader. */
+		if (i == const_state->constant_data_ubo) {
+			int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
+			OUT_RELOC(ring, v->bo,
+					v->info.constant_data_offset,
+					(uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32,
+					0);
+			continue;
+		}
+
 		struct pipe_constant_buffer *cb = &constbuf->cb[i];
 
 		/* If we have user pointers (constbuf 0, aka GL uniforms), upload them
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index 2c9c56041b5..4dc36c47c5d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -106,6 +106,44 @@ ir3_user_consts_size(struct ir3_ubo_analysis_state *state,
 	}
 }
 
+/**
+ * Uploads the referenced subranges of the NIR constant_data (stored in v->bo
+ * at v->info.constant_data_offset) to the hardware's constant buffer.
+ */
+static inline void
+ir3_emit_constant_data(struct fd_screen *screen,
+		const struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	const struct ir3_const_state *const_state = ir3_const_state(v);
+	const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+	for (unsigned i = 0; i < state->num_enabled; i++) {
+		unsigned ubo = state->range[i].ubo.block;
+		if (ubo != const_state->constant_data_ubo)
+			continue; /* not a constant_data range */
+
+		uint32_t size = state->range[i].end - state->range[i].start;
+
+		/* Pre-a6xx, we might have ranges enabled in the shader that aren't
+		 * used in the binning variant.
+		 */
+		if (16 * v->constlen <= state->range[i].offset)
+			continue;
+
+		/* And even if the range starts within the variant's constlen, its end
+		 * may not be, so clamp the size:
+		 */
+		size = MIN2(size, (16 * v->constlen) - state->range[i].offset);
+
+		if (size == 0)
+			continue;
+
+		emit_const_bo(ring, v, state->range[i].offset / 4,
+				v->info.constant_data_offset + state->range[i].start,
+				size / 4, v->bo);
+	}
+}
+
 /**
  * Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access
  * outside of these ranges will be done using full UBO accesses in the
@@ -121,8 +159,10 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *
 	for (unsigned i = 0; i < state->num_enabled; i++) {
 		assert(!state->range[i].ubo.bindless);
 		unsigned ubo = state->range[i].ubo.block;
-		if (!(constbuf->enabled_mask & (1 << ubo)))
+		if (!(constbuf->enabled_mask & (1 << ubo)) ||
+				ubo == const_state->constant_data_ubo) {
 			continue;
+		}
 		struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
 
 		uint32_t size = state->range[i].end - state->range[i].start;
@@ -176,6 +216,12 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_bo *bos[params];
 
 		for (uint32_t i = 0; i < params; i++) {
+			if (i == const_state->constant_data_ubo) {
+				bos[i] = v->bo;
+				offsets[i] = v->info.constant_data_offset;
+				continue;
+			}
+
 			struct pipe_constant_buffer *cb = &constbuf->cb[i];
 
 			/* If we have user pointers (constbuf 0, aka GL uniforms), upload
@@ -299,6 +345,11 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v
 
 	if (size > 0)
 		emit_const_user(ring, v, base, size, const_state->immediates);
+
+	/* NIR constant data has the same lifetime as immediates, so upload it
+	 * now, too.
+	 */
+	ir3_emit_constant_data(screen, v, ring);
 }
 
 static inline void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index cb28ed559cf..5a79a7692cc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -86,9 +86,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
 
 	assert(!v->bo);
 
-	unsigned sz = v->info.sizedwords * 4;
-
-	v->bo = fd_bo_new(compiler->dev, sz,
+	v->bo = fd_bo_new(compiler->dev, v->info.size,
 			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
 			DRM_FREEDRENO_GEM_TYPE_KMEM,
 			"%s:%s", ir3_shader_stage(v), info->name);
@@ -96,7 +94,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
 	/* Always include shaders in kernel crash dumps. */
 	fd_bo_mark_for_dump(v->bo);
 
-	memcpy(fd_bo_map(v->bo), v->bin, sz);
+	memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
 }
 
 struct ir3_shader_variant *