diff --git a/.gitlab-ci/deqp-freedreno-a307-fails.txt b/.gitlab-ci/deqp-freedreno-a307-fails.txt index 060d10cc869..fa6a12dba16 100644 --- a/.gitlab-ci/deqp-freedreno-a307-fails.txt +++ b/.gitlab-ci/deqp-freedreno-a307-fails.txt @@ -388,10 +388,6 @@ dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_highp,Fail dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump,Fail dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp,Fail dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail dEQP-GLES3.functional.shaders.linkage.varying.rules.differing_interpolation_2,Fail dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler2d_vertex,Fail dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler3d_vertex,Fail diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt index 4d8c2a69960..2a555e22936 100644 --- a/.gitlab-ci/deqp-freedreno-a630-fails.txt +++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt @@ -1,8 +1,4 @@ -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a1r5g5b5_unorm_pack16.a1r5g5b5_unorm_pack16.optimal_general_nearest,Fail 
dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_nearest,Fail dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2r10g10b10_unorm_pack32.a2r10g10b10_unorm_pack32.optimal_optimal_nearest,Fail diff --git a/src/freedreno/computerator/ir3_asm.c b/src/freedreno/computerator/ir3_asm.c index e1e845a9a7c..a976bede5e7 100644 --- a/src/freedreno/computerator/ir3_asm.c +++ b/src/freedreno/computerator/ir3_asm.c @@ -42,7 +42,7 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in) kernel->base.num_bufs = kernel->info.num_bufs; memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes)); - unsigned sz = v->info.sizedwords * 4; + unsigned sz = v->info.size; v->bo = fd_bo_new(c->dev, sz, DRM_FREEDRENO_GEM_CACHE_WCOMBINE | diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index 84aa8eb46d9..2f2612d40c6 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -942,15 +942,24 @@ void * ir3_assemble(struct ir3_shader_variant *v) * doesn't try to decode the following data as instructions (such as the * next stage's shader in turnip) */ - info->sizedwords = MAX2(v->instrlen * compiler->instr_align, - instr_count + 4) * sizeof(instr_t) / 4; + info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * + sizeof(instr_t); + info->sizedwords = info->size / 4; + + if (v->constant_data_size) { + /* Make sure that where we're about to place the constant_data is safe + * to indirectly upload from. + */ + info->constant_data_offset = align(info->size, v->shader->compiler->const_upload_unit * 16); + info->size = info->constant_data_offset + v->constant_data_size; + } /* Pad out the size so that when turnip uploads the shaders in * sequence, the starting offset of the next one is properly aligned. 
*/ - info->sizedwords = align(info->sizedwords, compiler->instr_align * sizeof(instr_t) / 4); + info->size = align(info->size, compiler->instr_align * sizeof(instr_t)); - ptr = dwords = rzalloc_size(v, 4 * info->sizedwords); + ptr = dwords = rzalloc_size(v, info->size); foreach_block (block, &shader->block_list) { unsigned sfu_delay = 0; @@ -1003,6 +1012,14 @@ void * ir3_assemble(struct ir3_shader_variant *v) } } + /* Append the NIR constant data after the end of the program. This lets us + * upload it with an indirect load, while avoiding creating another BO. + */ + if (v->constant_data_size) + memcpy(&ptr[info->constant_data_offset / 4], v->constant_data, v->constant_data_size); + ralloc_free(v->constant_data); + v->constant_data = NULL; + return ptr; fail: diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index cb42636f285..262f2a28dcf 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -45,6 +45,13 @@ struct ir3_block; struct ir3_info { void *data; /* used internally in ir3 assembler */ + /* Size in bytes of the shader binary, including NIR constants and + * padding + */ + uint32_t size; + /* byte offset from start of the shader to the NIR constant data. */ + uint32_t constant_data_offset; + /* Size in dwords of the instructions. 
*/ uint16_t sizedwords; uint16_t instrs_count; /* expanded to account for rpt's */ uint16_t nops_count; /* # of nop instructions, including nopN */ diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c index 78726710758..29a2c8c2157 100644 --- a/src/freedreno/ir3/ir3_disk_cache.c +++ b/src/freedreno/ir3/ir3_disk_cache.c @@ -126,8 +126,8 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v) * pointers need special handling: */ - v->bin = rzalloc_size(v, 4 * v->info.sizedwords); - blob_copy_bytes(blob, v->bin, 4 * v->info.sizedwords); + v->bin = rzalloc_size(v, v->info.size); + blob_copy_bytes(blob, v->bin, v->info.size); if (!v->binning_pass) { blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state)); @@ -147,7 +147,9 @@ store_variant(struct blob *blob, struct ir3_shader_variant *v) * pointers need special handling: */ - blob_write_bytes(blob, v->bin, 4 * v->info.sizedwords); + blob_write_bytes(blob, v->bin, v->info.size); + + /* No saving constant_data, it's already baked into bin at this point. */ if (!v->binning_pass) { blob_write_bytes(blob, v->const_state, sizeof(*v->const_state)); diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 29ab29691e0..d6d891a9560 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -495,11 +495,25 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) progress |= OPT(s, nir_lower_tex, &tex_options); } + /* Move large constant variables to the constants attached to the NIR + * shader, which we will upload in the immediates range. This generates + * amuls, so we need to clean those up after. + * + * Passing no size_align, we would get packed values, which if we end up + * having to load with LDC would result in extra reads to unpack from + * straddling loads. Align everything to vec4 to avoid that, though we + * could theoretically do better. 
+ */ + OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */); + OPT_V(s, ir3_nir_lower_load_constant, so); + if (!so->binning_pass) OPT_V(s, ir3_nir_analyze_ubo_ranges, so); progress |= OPT(s, ir3_nir_lower_ubo_loads, so); + OPT_V(s, nir_lower_amul, ir3_glsl_type_size); + /* UBO offset lowering has to come after we've decided what will * be left as load_ubo */ diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index d716e530493..17dc4aa155c 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -59,6 +59,7 @@ void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s); void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, struct ir3_const_state *const_state); +bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v); void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_fixup_load_uniform(nir_shader *nir); diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 8e7f9aa29d1..a1c06b90819 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -530,3 +530,94 @@ ir3_nir_fixup_load_uniform(nir_shader *nir) fixup_load_uniform_filter, fixup_load_uniform_instr, NULL); } +static nir_ssa_def * +ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data) +{ + struct ir3_const_state *const_state = data; + nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr); + + /* Pick a UBO index to use as our constant data. Skip UBO 0 since that's + * reserved for gallium's cb0. 
+ */ + if (const_state->constant_data_ubo == -1) { + if (b->shader->info.num_ubos == 0) + b->shader->info.num_ubos++; + const_state->constant_data_ubo = b->shader->info.num_ubos++; + } + + unsigned num_components = instr->num_components; + if (nir_dest_bit_size(instr->dest) == 16) { + /* We can't do 16b loads -- either from LDC (32-bit only in any of our + * traces, and disasm that doesn't look like it really supports it) or + * from the constant file (where CONSTANT_DEMOTION_ENABLE means we get + * automatic 32b-to-16b conversions when we ask for 16b from it). + * Instead, we'll load 32b from a UBO and unpack from there. + */ + num_components = DIV_ROUND_UP(num_components, 2); + } + unsigned base = nir_intrinsic_base(instr); + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); + load->num_components = num_components; + nir_ssa_dest_init(&load->instr, &load->dest, + load->num_components, 32, + instr->dest.ssa.name); + + load->src[0] = nir_src_for_ssa(nir_imm_int(b, + const_state->constant_data_ubo)); + load->src[1] = nir_src_for_ssa(nir_iadd_imm(b, + nir_ssa_for_src(b, instr->src[0], 1), base)); + + nir_intrinsic_set_align(load, + nir_intrinsic_align_mul(instr), + nir_intrinsic_align_offset(instr)); + nir_intrinsic_set_range_base(load, base); + nir_intrinsic_set_range(load, nir_intrinsic_range(instr)); + + nir_builder_instr_insert(b, &load->instr); + + nir_ssa_def *result = &load->dest.ssa; + if (nir_dest_bit_size(instr->dest) == 16) { + result = nir_bitcast_vector(b, result, 16); + result = nir_channels(b, result, BITSET_MASK(instr->num_components)); + } + + return result; +} + +static bool +ir3_lower_load_const_filter(const nir_instr *instr, const void *data) +{ + return (instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant); +} + +/* Lowers load_constant intrinsics to UBO accesses so we can run them through + * the general "upload to const file or leave as 
UBO access" code. + */ +bool +ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v) +{ + struct ir3_const_state *const_state = ir3_const_state(v); + + const_state->constant_data_ubo = -1; + + bool progress = nir_shader_lower_instructions(nir, + ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr, + const_state); + + if (progress) { + struct ir3_compiler *compiler = v->shader->compiler; + + /* Save a copy of the NIR constant data to the variant for + * inclusion in the final assembly. + */ + v->constant_data_size = align(nir->constant_data_size, + compiler->const_upload_unit * 4 * sizeof(uint32_t)); + v->constant_data = rzalloc_size(v, v->constant_data_size); + memcpy(v->constant_data, nir->constant_data, + nir->constant_data_size); + } + + return progress; +} diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 36aba4facc7..bba3c627da3 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -157,6 +157,9 @@ struct ir3_const_state { unsigned num_ubos; unsigned num_driver_params; /* scalar */ + /* UBO that should be mapped to the NIR shader's constant_data (or -1). */ + int32_t constant_data_ubo; + struct { /* user const start at zero */ unsigned ubo; @@ -504,6 +507,12 @@ struct ir3_shader_variant { gl_shader_stage type; struct ir3_shader *shader; + /* variant's copy of nir->constant_data (since we don't track the NIR in + * the variant, and shader->nir is before the opt pass). Moves to v->bin + * after assembly. 
+ */ + void *constant_data; + /* * Below here is serialized when written to disk cache: */ @@ -525,6 +534,8 @@ struct ir3_shader_variant { struct ir3_info info; + uint32_t constant_data_size; + /* Levels of nesting of flow control: */ unsigned branchstack; diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index dfcaca99f42..41d9c81858b 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -3013,7 +3013,8 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, { const struct tu_program_descriptor_linkage *link = &pipeline->program.link[type]; - const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state; + const struct ir3_const_state *const_state = &link->const_state; + const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; if (link->push_consts.count > 0) { unsigned num_units = link->push_consts.count; @@ -3048,9 +3049,14 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, debug_assert((offset % 16) == 0); /* Dig out the descriptor from the descriptor state and read the VA from - * it. + * it. All our UBOs are bindless with the exception of the NIR + * constant_data, which is uploaded once in the pipeline. */ - assert(state->range[i].ubo.bindless); + if (!state->range[i].ubo.bindless) { + assert(state->range[i].ubo.block == const_state->constant_data_ubo); + continue; + } + uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ? 
descriptors_state->dynamic_descriptors : descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr; diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index b2f8c636682..dde112391da 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -453,19 +453,61 @@ tu6_emit_xs_config(struct tu_cs *cs, */ size = MIN2(size + base, xs->constlen) - base; - if (size <= 0) - return; + if (size > 0) { + tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | + CP_LOAD_STATE6_0_NUM_UNIT(size)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(size)); - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + tu_cs_emit_array(cs, const_state->immediates, size * 4); + } - tu_cs_emit_array(cs, const_state->immediates, size * 4); + if (const_state->constant_data_ubo != -1) { + uint64_t iova = binary_iova + xs->info.constant_data_offset; + + /* Upload UBO state for the constant data. 
*/ + tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5); + tu_cs_emit(cs, + CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16); + tu_cs_emit_qw(cs, + iova | + (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32); + + /* Upload the constant data to the const file if needed. */ + const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; + + for (int i = 0; i < ubo_state->num_enabled; i++) { + if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo || + ubo_state->range[i].ubo.bindless) { + continue; + } + + uint32_t start = ubo_state->range[i].start; + uint32_t end = ubo_state->range[i].end; + uint32_t size = MIN2(end - start, + (16 * xs->constlen) - ubo_state->range[i].offset); + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); + tu_cs_emit(cs, + CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | + CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); + tu_cs_emit_qw(cs, iova + start); + } + } } static void @@ -1939,12 +1981,12 @@ tu_pipeline_allocate_cs(struct tu_device *dev, if (builder) { for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { if (builder->variants[i]) - size += builder->variants[i]->info.sizedwords; + size += builder->variants[i]->info.size / 4; } - size += builder->binning_variant->info.sizedwords; + size += builder->binning_variant->info.size / 4; } else { - size += compute->info.sizedwords; + size += compute->info.size / 4; } tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size); @@ -2016,12 +2058,12 @@ tu_upload_variant(struct 
tu_pipeline *pipeline, return 0; /* this expects to get enough alignment because shaders are allocated first - * and sizedwords is always aligned correctly + * and total size is always aligned correctly * note: an assert in tu6_emit_xs_config validates the alignment */ - tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory); + tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory); - memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords); + memcpy(memory.map, variant->bin, variant->info.size); return memory.iova; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c index 020fbf532d2..78b7b05a32c 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c @@ -248,6 +248,16 @@ fd6_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); for (int i = 0; i < num_ubos; i++) { + /* NIR constant data is packed into the end of the shader. */ + if (i == const_state->constant_data_ubo) { + int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16); + OUT_RELOC(ring, v->bo, + v->info.constant_data_offset, + (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, + 0); + continue; + } + struct pipe_constant_buffer *cb = &constbuf->cb[i]; /* If we have user pointers (constbuf 0, aka GL uniforms), upload them diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index 2c9c56041b5..4dc36c47c5d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -106,6 +106,44 @@ ir3_user_consts_size(struct ir3_ubo_analysis_state *state, } } +/** + * Uploads the referenced subranges of the nir constant_data to the hardware's + * constant buffer. 
+ */ +static inline void +ir3_emit_constant_data(struct fd_screen *screen, + const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + const struct ir3_const_state *const_state = ir3_const_state(v); + const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; + + for (unsigned i = 0; i < state->num_enabled; i++) { + unsigned ubo = state->range[i].ubo.block; + if (ubo != const_state->constant_data_ubo) + continue; + + uint32_t size = state->range[i].end - state->range[i].start; + + /* Pre-a6xx, we might have ranges enabled in the shader that aren't + * used in the binning variant. + */ + if (16 * v->constlen <= state->range[i].offset) + continue; + + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (16 * v->constlen) - state->range[i].offset); + + if (size == 0) + continue; + + emit_const_bo(ring, v, state->range[i].offset / 4, + v->info.constant_data_offset + state->range[i].start, + size / 4, v->bo); + } +} + /** * Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access * outside of these ranges will be done using full UBO accesses in the @@ -121,8 +159,10 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant * for (unsigned i = 0; i < state->num_enabled; i++) { assert(!state->range[i].ubo.bindless); unsigned ubo = state->range[i].ubo.block; - if (!(constbuf->enabled_mask & (1 << ubo))) + if (!(constbuf->enabled_mask & (1 << ubo)) || + ubo == const_state->constant_data_ubo) { continue; + } struct pipe_constant_buffer *cb = &constbuf->cb[ubo]; uint32_t size = state->range[i].end - state->range[i].start; @@ -176,6 +216,12 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, struct fd_bo *bos[params]; for (uint32_t i = 0; i < params; i++) { + if (i == const_state->constant_data_ubo) { + bos[i] = v->bo; + offsets[i] = v->info.constant_data_offset; + continue; + } + struct pipe_constant_buffer *cb = 
&constbuf->cb[i]; /* If we have user pointers (constbuf 0, aka GL uniforms), upload @@ -299,6 +345,11 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v if (size > 0) emit_const_user(ring, v, base, size, const_state->immediates); + + /* NIR constant data has the same lifetime as immediates, so upload it + * now, too. + */ + ir3_emit_constant_data(screen, v, ring); } static inline void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index cb28ed559cf..5a79a7692cc 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -86,9 +86,7 @@ upload_shader_variant(struct ir3_shader_variant *v) assert(!v->bo); - unsigned sz = v->info.sizedwords * 4; - - v->bo = fd_bo_new(compiler->dev, sz, + v->bo = fd_bo_new(compiler->dev, v->info.size, DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM, "%s:%s", ir3_shader_stage(v), info->name); @@ -96,7 +94,7 @@ upload_shader_variant(struct ir3_shader_variant *v) /* Always include shaders in kernel crash dumps. */ fd_bo_mark_for_dump(v->bo); - memcpy(fd_bo_map(v->bo), v->bin, sz); + memcpy(fd_bo_map(v->bo), v->bin, v->info.size); } struct ir3_shader_variant *