From 3e5d4d50c5b4cdae8f1b5492458379a7be7a7794 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev
Date: Thu, 14 Nov 2024 12:05:17 +0100
Subject: [PATCH] ir3: Use generic const alloc for everything and call it once

With all consts going through generic allocations it's now possible to
call ir3_setup_const_state once, and have lowerings that dynamically
lower things to consts just update the maximum number of consts being
used. The only exception for now is immediates, since they eat up the
space that is left and are allocated much later.

Signed-off-by: Danylo Piliaiev
Part-of:
---
 src/freedreno/computerator/a4xx.cc            |   2 +-
 src/freedreno/computerator/a6xx.cc            |   2 +-
 src/freedreno/ir3/ir3_a4xx.c                  |   2 +-
 src/freedreno/ir3/ir3_compiler_nir.c          |  12 +-
 src/freedreno/ir3/ir3_nir.c                   | 209 +++++++++---------
 src/freedreno/ir3/ir3_nir.h                   |   7 +
 .../ir3/ir3_nir_analyze_ubo_ranges.c          |   6 +-
 src/freedreno/ir3/ir3_nir_opt_preamble.c      |   3 +-
 src/freedreno/ir3/ir3_shader.c                |  28 +--
 src/freedreno/ir3/ir3_shader.h                |  82 ++++---
 src/freedreno/vulkan/tu_pipeline.cc           |   4 +-
 src/freedreno/vulkan/tu_shader.cc             |   4 +-
 .../drivers/freedreno/a6xx/fd6_const.cc       |   7 +-
 src/gallium/drivers/freedreno/ir3/ir3_const.h |  28 ++-
 14 files changed, 211 insertions(+), 185 deletions(-)

diff --git a/src/freedreno/computerator/a4xx.cc b/src/freedreno/computerator/a4xx.cc
index 6b8a69f8066..30871ca781f 100644
--- a/src/freedreno/computerator/a4xx.cc
+++ b/src/freedreno/computerator/a4xx.cc
@@ -207,7 +207,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
    struct ir3_shader_variant *v = ir3_kernel->v;
    const struct ir3_const_state *const_state = ir3_const_state(v);
 
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
    int size = DIV_ROUND_UP(const_state->immediates_count, 4);
 
    /* truncate size to avoid writing constants that shader
diff --git a/src/freedreno/computerator/a6xx.cc b/src/freedreno/computerator/a6xx.cc
index f68c0f99dca..755ff5bb95b 100644
--- a/src/freedreno/computerator/a6xx.cc
+++ b/src/freedreno/computerator/a6xx.cc
@@ -316,7 +316,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
    struct ir3_shader_variant *v = ir3_kernel->v;
    const struct ir3_const_state *const_state = ir3_const_state(v);
 
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
    int size = DIV_ROUND_UP(const_state->immediates_count, 4);
 
    if (ir3_kernel->info.numwg != INVALID_REG) {
diff --git a/src/freedreno/ir3/ir3_a4xx.c b/src/freedreno/ir3/ir3_a4xx.c
index 28dfadaaaad..8fa43040ebc 100644
--- a/src/freedreno/ir3/ir3_a4xx.c
+++ b/src/freedreno/ir3/ir3_a4xx.c
@@ -215,7 +215,7 @@ get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
 
       assert(const_state->image_dims.mask & (1 << index));
-      cb = regid(const_state->offsets.image_dims, 0) +
+      cb = ir3_const_reg(const_state, IR3_CONST_ALLOC_IMAGE_DIMS, 0) +
          const_state->image_dims.off[index];
    } else {
       index += ctx->s->info.num_ssbos;
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index a99ebba2a46..09ce0a4c37f 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1344,7 +1344,7 @@ emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
    struct ir3_builder *b = &ctx->build;
    unsigned offset = nir_intrinsic_base(intr);
-   unsigned p = regid(const_state->offsets.kernel_params, 0);
+   unsigned p = ir3_const_reg(const_state, IR3_CONST_ALLOC_KERNEL_PARAMS, 0);
 
    struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];
 
@@ -2600,8 +2600,10 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    }
 
    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-   const unsigned primitive_param = const_state->offsets.primitive_param * 4;
-   const unsigned primitive_map = const_state->offsets.primitive_map * 4;
+   const unsigned primitive_param =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4 * 4;
+   const unsigned primitive_map =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4 * 4;
 
    switch (intr->intrinsic) {
    case nir_intrinsic_decl_reg:
@@ -4732,7 +4734,9 @@ emit_stream_out(struct ir3_context *ctx)
       unsigned stride = strmout->stride[i];
       struct ir3_instruction *base, *off;
 
-      base = create_uniform(&ctx->build, regid(const_state->offsets.tfbo, i));
+      base = create_uniform(
+         &ctx->build,
+         ir3_const_reg(const_state, IR3_CONST_ALLOC_TFBO, i));
 
       /* 24-bit should be enough: */
       off = ir3_MUL_U24(&ctx->build, vtxcnt, 0,
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 5836ee8dfe9..5240e8e61a7 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -1110,6 +1110,10 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
    if (so->compiler->load_shader_consts_via_preamble)
       progress |= OPT(s, ir3_nir_lower_driver_params_to_ubo, so);
 
+   if (!so->binning_pass) {
+      ir3_setup_const_state(s, so, ir3_const_state_mut(so));
+   }
+
    /* Do the preamble before analysing UBO ranges, because it's usually
    * higher-value and because it can result in eliminating some indirect UBO
    * accesses where otherwise we'd have to push the whole range. However we
@@ -1221,13 +1225,6 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
    }
 
    nir_sweep(s);
-
-   /* Binning pass variants re-use the const_state of the corresponding
-    * draw pass shader, so that same const emit can be re-used for both
-    * passes:
-    */
-   if (!so->binning_pass)
-      ir3_setup_const_state(s, so, ir3_const_state_mut(so));
 }
 
 bool
@@ -1299,9 +1296,11 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
    return true;
 }
 
-static void
-ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, struct ir3_const_state *layout)
+uint32_t
+ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader,
+                           struct ir3_const_image_dims *image_dims)
 {
+   uint32_t num_driver_params = 0;
    nir_foreach_function (function, shader) {
       if (!function->impl)
          continue;
@@ -1314,32 +1313,34 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
          unsigned idx;
 
-         switch (intr->intrinsic) {
-         case nir_intrinsic_image_atomic:
-         case nir_intrinsic_image_atomic_swap:
-         case nir_intrinsic_image_load:
-         case nir_intrinsic_image_store:
-         case nir_intrinsic_image_size:
-            /* a4xx gets these supplied by the hw directly (maybe CP?) */
-            if (compiler->gen == 5 &&
-                !(intr->intrinsic == nir_intrinsic_image_load &&
-                  !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) {
-               idx = nir_src_as_uint(intr->src[0]);
-               if (layout->image_dims.mask & (1 << idx))
-                  break;
-               layout->image_dims.mask |= (1 << idx);
-               layout->image_dims.off[idx] = layout->image_dims.count;
-               layout->image_dims.count += 3; /* three const per */
+         if (image_dims) {
+            switch (intr->intrinsic) {
+            case nir_intrinsic_image_atomic:
+            case nir_intrinsic_image_atomic_swap:
+            case nir_intrinsic_image_load:
+            case nir_intrinsic_image_store:
+            case nir_intrinsic_image_size:
+               /* a4xx gets these supplied by the hw directly (maybe CP?) */
+               if (compiler->gen == 5 &&
+                   !(intr->intrinsic == nir_intrinsic_image_load &&
+                     !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) {
+                  idx = nir_src_as_uint(intr->src[0]);
+                  if (image_dims->mask & (1 << idx))
+                     break;
+                  image_dims->mask |= (1 << idx);
+                  image_dims->off[idx] = image_dims->count;
+                  image_dims->count += 3; /* three const per */
+               }
+               break;
+            default:
+               break;
             }
-            break;
-         default:
-            break;
          }
 
          struct driver_param_info param_info;
         if (ir3_get_driver_param_info(shader, intr, &param_info)) {
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params,
+            num_driver_params =
+               MAX2(num_driver_params,
                     param_info.offset + nir_intrinsic_dest_components(intr));
          }
       }
@@ -1353,9 +1354,11 @@
     */
    if (!compiler->has_shared_regfile &&
          shader->info.stage == MESA_SHADER_COMPUTE) {
-      layout->num_driver_params =
-         MAX2(layout->num_driver_params, IR3_DP_CS(workgroup_id_z) + 1);
+      num_driver_params =
+         MAX2(num_driver_params, IR3_DP_CS(workgroup_id_z) + 1);
    }
+
+   return num_driver_params;
 }
 
 void
@@ -1413,10 +1416,46 @@ ir3_const_alloc_all_reserved_space(struct ir3_const_allocations *const_alloc)
    const_alloc->reserved_vec4 = 0;
 }
 
-/* Sets up the variant-dependent constant state for the ir3_shader. Note
- * that it is also used from ir3_nir_analyze_ubo_ranges() to figure out the
- * maximum number of driver params that would eventually be used, to leave
- * space for this function to allocate the driver params.
+void
+ir3_alloc_driver_params(struct ir3_const_allocations *const_alloc,
+                        uint32_t *num_driver_params,
+                        struct ir3_compiler *compiler,
+                        gl_shader_stage shader_stage)
+{
+   if (*num_driver_params == 0)
+      return;
+
+   /* num_driver_params in dwords. we only need to align to vec4s for the
+    * common case of immediate constant uploads, but for indirect dispatch
+    * the constants may also be indirect and so we have to align the area in
+    * const space to that requirement.
+    */
+   *num_driver_params = align(*num_driver_params, 4);
+   unsigned upload_unit = 1;
+   if (shader_stage == MESA_SHADER_COMPUTE ||
+       (*num_driver_params >= IR3_DP_VS(vtxid_base))) {
+      upload_unit = compiler->const_upload_unit;
+   }
+
+   /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
+   if (shader_stage == MESA_SHADER_VERTEX && compiler->gen >= 6)
+      const_alloc->max_const_offset_vec4 =
+         MAX2(const_alloc->max_const_offset_vec4, 1);
+
+   uint32_t driver_params_size_vec4 =
+      align(*num_driver_params / 4, upload_unit);
+   ir3_const_alloc(const_alloc, IR3_CONST_ALLOC_DRIVER_PARAMS,
+                   driver_params_size_vec4, upload_unit);
+}
+
+/* Sets up the variant-dependent constant state for the ir3_shader.
+ * The consts allocation flow is as follows:
+ * 1) Turnip/Freedreno allocates the consts required by the corresponding API,
+ *    e.g. push const, inline uniforms, etc., and then passes
+ *    ir3_const_allocations into IR3.
+ * 2) ir3_setup_const_state pre-allocates consts with non-negotiable size.
+ * 3) IR3 lowerings afterwards allocate from the free space left.
+ * 4) Offsets are then allocated for the consts reserved in step 2).
  */
 void
 ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
@@ -1425,9 +1464,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
    struct ir3_compiler *compiler = v->compiler;
    unsigned ptrsz = ir3_pointer_size(compiler);
 
-   memset(&const_state->offsets, ~0, sizeof(const_state->offsets));
-
-   ir3_nir_scan_driver_consts(compiler, nir, const_state);
+   const_state->num_driver_params =
+      ir3_nir_scan_driver_consts(compiler, nir, &const_state->image_dims);
 
    if ((compiler->gen < 5) && (v->stream_output.num_outputs > 0)) {
       const_state->num_driver_params =
@@ -1438,92 +1476,56 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
 
    assert((const_state->ubo_state.size % 16) == 0);
 
-   /* IR3_CONST_ALLOC_DRIVER_PARAMS could have been allocated earlier. */
-   if (const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].size_vec4 == 0) {
-      ir3_nir_scan_driver_consts(compiler, nir, const_state);
-      if (const_state->num_driver_params > 0) {
-         /* num_driver_params in dwords. we only need to align to vec4s for the
-          * common case of immediate constant uploads, but for indirect dispatch
-          * the constants may also be indirect and so we have to align the area in
-          * const space to that requirement.
-          */
-         const_state->num_driver_params = align(const_state->num_driver_params, 4);
-         unsigned upload_unit = 1;
-         if (v->type == MESA_SHADER_COMPUTE ||
-             (const_state->num_driver_params >= IR3_DP_VS(vtxid_base))) {
-            upload_unit = compiler->const_upload_unit;
-         }
-
-         /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
-         if (v->type == MESA_SHADER_VERTEX && compiler->gen >= 6)
-            const_state->allocs.max_const_offset_vec4 =
-               MAX2(const_state->allocs.max_const_offset_vec4, 1);
-
-         uint32_t driver_params_size_vec4 =
-            align(const_state->num_driver_params / 4, upload_unit);
-         ir3_const_alloc(&const_state->allocs, IR3_CONST_ALLOC_DRIVER_PARAMS,
-                         driver_params_size_vec4, upload_unit);
-      }
-   }
-
-   unsigned constoff = const_state->allocs.max_const_offset_vec4;
+   ir3_alloc_driver_params(&const_state->allocs,
+                           &const_state->num_driver_params, compiler,
+                           v->type);
 
    if (const_state->image_dims.count > 0) {
-      unsigned cnt = const_state->image_dims.count;
-      const_state->offsets.image_dims = constoff;
-      constoff += align(cnt, 4) / 4;
+      ir3_const_reserve_space(&const_state->allocs, IR3_CONST_ALLOC_IMAGE_DIMS,
+                              align(const_state->image_dims.count, 4) / 4, 1);
    }
 
-   if (v->type == MESA_SHADER_KERNEL) {
-      const_state->offsets.kernel_params = constoff;
-      constoff += align(v->cs.req_input_mem, 4) / 4;
+   if (v->type == MESA_SHADER_KERNEL && v->cs.req_input_mem) {
+      ir3_const_reserve_space(&const_state->allocs,
+                              IR3_CONST_ALLOC_KERNEL_PARAMS,
+                              align(v->cs.req_input_mem, 4) / 4, 1);
    }
 
    if ((v->type == MESA_SHADER_VERTEX) && (compiler->gen < 5) &&
       v->stream_output.num_outputs > 0) {
-      const_state->offsets.tfbo = constoff;
-      constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+      ir3_const_reserve_space(&const_state->allocs, IR3_CONST_ALLOC_TFBO,
+                              align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4, 1);
    }
 
    if (!compiler->load_shader_consts_via_preamble) {
      switch (v->type) {
      case MESA_SHADER_TESS_CTRL:
      case MESA_SHADER_TESS_EVAL:
-         const_state->offsets.primitive_param = constoff;
-         constoff += 2;
-
-         const_state->offsets.primitive_map = constoff;
+         ir3_const_reserve_space(&const_state->allocs,
+                                 IR3_CONST_ALLOC_PRIMITIVE_PARAM, 2, 1);
          break;
       case MESA_SHADER_GEOMETRY:
-         const_state->offsets.primitive_param = constoff;
-         constoff += 1;
-
-         const_state->offsets.primitive_map = constoff;
+         ir3_const_reserve_space(&const_state->allocs,
+                                 IR3_CONST_ALLOC_PRIMITIVE_PARAM, 1, 1);
          break;
       default:
          break;
       }
    }
 
-   switch (v->type) {
-   case MESA_SHADER_VERTEX:
-      const_state->offsets.primitive_param = constoff;
-      constoff += 1;
-      break;
-   case MESA_SHADER_TESS_CTRL:
-   case MESA_SHADER_TESS_EVAL:
-      constoff += DIV_ROUND_UP(v->input_size, 4);
-      break;
-   case MESA_SHADER_GEOMETRY:
-      constoff += DIV_ROUND_UP(v->input_size, 4);
-      break;
-   default:
-      break;
+   if (v->type == MESA_SHADER_VERTEX) {
+      ir3_const_reserve_space(&const_state->allocs,
+                              IR3_CONST_ALLOC_PRIMITIVE_PARAM, 1, 1);
    }
 
-   const_state->offsets.immediate = constoff;
+   if ((v->type == MESA_SHADER_TESS_CTRL || v->type == MESA_SHADER_TESS_EVAL ||
+        v->type == MESA_SHADER_GEOMETRY)) {
+      ir3_const_reserve_space(&const_state->allocs,
+                              IR3_CONST_ALLOC_PRIMITIVE_MAP,
+                              DIV_ROUND_UP(v->input_size, 4), 1);
+   }
 
-   assert(constoff <= ir3_max_const(v));
+   assert(const_state->allocs.max_const_offset_vec4 <= ir3_max_const(v));
 }
 
 uint32_t
@@ -1531,8 +1533,9 @@ ir3_const_state_get_free_space(const struct ir3_shader_variant *v,
                                const struct ir3_const_state *const_state,
                                uint32_t align_vec4)
 {
-   uint32_t free_space_vec4 =
-      ir3_max_const(v) - align(const_state->offsets.immediate, align_vec4) -
-      const_state->allocs.reserved_vec4;
+   uint32_t aligned_offset_vec4 =
+      align(const_state->allocs.max_const_offset_vec4, align_vec4);
+   uint32_t free_space_vec4 = ir3_max_const(v) - aligned_offset_vec4 -
+                              const_state->allocs.reserved_vec4;
 
    return free_space_vec4;
 }
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index cf9b1e4546c..2b902dcb3be 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -89,6 +89,13 @@ void ir3_const_free_reserved_space(struct ir3_const_allocations *const_alloc,
                                    enum ir3_const_alloc_type type);
 void ir3_const_alloc_all_reserved_space(struct ir3_const_allocations *const_alloc);
 
+uint32_t ir3_nir_scan_driver_consts(struct ir3_compiler *compiler,
+                                    nir_shader *shader,
+                                    struct ir3_const_image_dims *image_dims);
+void ir3_alloc_driver_params(struct ir3_const_allocations *const_alloc,
+                             uint32_t *num_driver_params,
+                             struct ir3_compiler *compiler,
+                             enum pipe_shader_type shader_type);
 bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index f1f3ffca6bd..aa9812606b1 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -560,8 +560,7 @@ ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v)
       global_offset =
          const_state->allocs.consts[IR3_CONST_ALLOC_GLOBAL].offset_vec4 * 16;
    } else {
-      struct ir3_const_state *const_state = ir3_const_state_mut(v);
-      ir3_setup_const_state(nir, v, const_state);
+      const struct ir3_const_state *const_state = ir3_const_state(v);
 
       global_offset = const_state->allocs.max_const_offset_vec4 * 16;
       max_upload = ir3_const_state_get_free_space(v, const_state, 1) * 16;
@@ -643,10 +642,9 @@
    /* Limit our uploads to the amount of constant buffer space available in
     * the hardware, minus what the shader compiler may need for various
     * driver params. We do this UBO-to-push-constant before the real
-    * allocation of the driver params' const space, because UBO pointers can
+    * allocation of the UBO pointers' const space, because UBO pointers can
     * be driver params but this pass usually eliminates them.
     */
-   ir3_setup_const_state(nir, v, const_state);
    const uint32_t max_upload =
       ir3_const_state_get_free_space(v, const_state, align_vec4) * 16;
 
diff --git a/src/freedreno/ir3/ir3_nir_opt_preamble.c b/src/freedreno/ir3/ir3_nir_opt_preamble.c
index 1d42dd9fee1..e4e27c260a0 100644
--- a/src/freedreno/ir3/ir3_nir_opt_preamble.c
+++ b/src/freedreno/ir3/ir3_nir_opt_preamble.c
@@ -287,8 +287,7 @@ ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
       max_size =
          const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
    } else {
-      struct ir3_const_state *const_state = ir3_const_state_mut(v);
-      ir3_setup_const_state(nir, v, const_state);
+      const struct ir3_const_state *const_state = ir3_const_state(v);
       max_size = ir3_const_state_get_free_space(
                     v, const_state, v->compiler->const_upload_unit) * 4;
    }
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index 804857c5b87..2f777a59ec5 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -28,7 +28,7 @@
 static uint16_t
 const_imm_index_to_reg(const struct ir3_const_state *const_state, unsigned i)
 {
-   return i + (4 * const_state->offsets.immediate);
+   return i + (4 * const_state->allocs.max_const_offset_vec4);
 }
 
 uint16_t
@@ -69,7 +69,8 @@ ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm)
    /* Add on a new immediate to be pushed, if we have space left in the
    * constbuf.
    */
-   if (const_state->offsets.immediate + const_state->immediates_count / 4 >=
+   if (const_state->allocs.max_const_offset_vec4 +
+          const_state->immediates_count / 4 >=
       ir3_max_const(v)) {
       return INVALID_CONST_REG;
    }
@@ -776,6 +777,16 @@ ir3_const_alloc_type_to_string(enum ir3_const_alloc_type type)
       return "global";
    case IR3_CONST_ALLOC_UBO_PTRS:
       return "ubo_ptrs";
+   case IR3_CONST_ALLOC_IMAGE_DIMS:
+      return "image_dims";
+   case IR3_CONST_ALLOC_KERNEL_PARAMS:
+      return "kernel_params";
+   case IR3_CONST_ALLOC_TFBO:
+      return "tfbo";
+   case IR3_CONST_ALLOC_PRIMITIVE_PARAM:
+      return "primitive_param";
+   case IR3_CONST_ALLOC_PRIMITIVE_MAP:
+      return "primitive_map";
    default:
       return "unknown";
    }
@@ -800,16 +811,6 @@ dump_const_state(struct ir3_shader_variant *so, FILE *out)
       }
    }
 
-   if (cs->offsets.image_dims != ~0)
-      fprintf(out, "; image_dims: c%u.x\n", cs->offsets.image_dims);
-   if (cs->offsets.kernel_params != ~0)
-      fprintf(out, "; kernel_params: c%u.x\n", cs->offsets.kernel_params);
-   if (cs->offsets.tfbo != ~0)
-      fprintf(out, "; tfbo: c%u.x\n", cs->offsets.tfbo);
-   if (cs->offsets.primitive_param != ~0)
-      fprintf(out, "; primitive_params: c%u.x\n", cs->offsets.primitive_param);
-   if (cs->offsets.primitive_map != ~0)
-      fprintf(out, "; primitive_map: c%u.x\n", cs->offsets.primitive_map);
    fprintf(out, "; ubo_state:\n");
    fprintf(out, "; num_enabled: %u\n", us->num_enabled);
    for (unsigned i = 0; i < us->num_enabled; i++) {
@@ -912,7 +913,8 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
    const struct ir3_const_state *const_state = ir3_const_state(so);
 
    for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
-      fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
+      fprintf(out, "@const(c%d.x)\t",
+              const_state->allocs.max_const_offset_vec4 + i);
       fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
               const_state->immediates[i * 4 + 0],
               const_state->immediates[i * 4 + 1],
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 013ad807b10..511dd7f5be0 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -206,7 +206,24 @@ enum ir3_const_alloc_type {
    IR3_CONST_ALLOC_GLOBAL = 6,
    /* OpenGL, pre-a6xx; pointers to UBOs */
    IR3_CONST_ALLOC_UBO_PTRS = 7,
-   IR3_CONST_ALLOC_MAX = 8,
+   /* OpenGL, a5xx only; needed to calculate pixel offset, but only
+    * for images that have image_{load,store,size,atomic*} intrinsics.
+    */
+   IR3_CONST_ALLOC_IMAGE_DIMS = 8,
+   /* OpenCL */
+   IR3_CONST_ALLOC_KERNEL_PARAMS = 9,
+   /* OpenGL, TFBO addresses only for vs on a3xx/a4xx */
+   IR3_CONST_ALLOC_TFBO = 10,
+   /* Common, stage-dependent primitive params:
+    *  vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
+    *  hs, ds: uvec4(primitive_stride, vertex_stride,
+    *                patch_stride, patch_vertices_in)
+    *          uvec4(tess_param_base, tess_factor_base)
+    */
+   IR3_CONST_ALLOC_PRIMITIVE_PARAM = 11,
+   /* Common, mapping from varying location to offset. */
+   IR3_CONST_ALLOC_PRIMITIVE_MAP = 12,
+   IR3_CONST_ALLOC_MAX = 13,
 };
 
 struct ir3_const_allocation {
@@ -232,30 +249,30 @@ ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
           const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
 }
 
+struct ir3_const_image_dims {
+   uint32_t mask;  /* bitmask of images that have image_store */
+   uint32_t count; /* number of consts allocated */
+   /* three const allocated per image which has image_store:
+    *  + cpp         (bytes per pixel)
+    *  + pitch       (y pitch)
+    *  + array_pitch (z pitch)
+    */
+   uint32_t off[IR3_MAX_SHADER_IMAGES];
+};
+
 /**
- * Describes the layout of shader consts in the const register file.
+ * Describes the layout of shader consts in the const register file
+ * and additional info about individual allocations.
  *
- * Layout of constant registers, each section aligned to vec4. Note
- * that pointer size (ubo, etc) changes depending on generation.
+ * Each consts section is aligned to vec4. Note that pointer
+ * size (ubo, etc) changes depending on generation.
  *
- * + user consts: only used for turnip push consts
- * + Optional consts: ubo ranges, preamble, global, etc.
- * + UBO addresses: turnip is bindless and these are wasted
- * + image dimensions: a5xx only; needed to calculate pixel offset, but only
- *   for images that have image_{load,store,size,atomic*} intrinsics
- * + kernel params: cl only
- * + driver params: these are stage-dependent; see ir3_driver_param
- * + TFBO addresses: only for vs on a3xx/a4xx
- * + primitive params: these are stage-dependent
- *     vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
- *     hs, ds: uvec4(primitive_stride, vertex_stride,
- *                   patch_stride, patch_vertices_in)
- *             uvec4(tess_param_base, tess_factor_base)
- * + primitive map
- * + lowered immediates
- *
- * Immediates go last mostly because they are inserted in the CP pass
- * after the nir -> ir3 frontend.
+ * The consts allocation flow is as follows:
+ * 1) Turnip/Freedreno allocates the consts required by the corresponding API,
+ *    e.g. push const, inline uniforms, etc., and then passes
+ *    ir3_const_allocations into IR3.
+ * 2) ir3_setup_const_state allocates consts with non-negotiable size.
+ * 3) IR3 lowerings afterwards allocate from the free space left.
  *
  * Note UBO size in bytes should be aligned to vec4
  */
 struct ir3_const_state {
@@ -268,28 +285,9 @@ struct ir3_const_state {
    struct ir3_driver_ubo driver_params_ubo;
    struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;
 
-   struct {
-      /* Required consts, cannot negotiate their size */
-      unsigned image_dims;
-      unsigned kernel_params;
-      unsigned tfbo;
-      unsigned primitive_param;
-      unsigned primitive_map;
-      unsigned immediate;
-   } offsets;
-
    struct ir3_const_allocations allocs;
 
-   struct {
-      uint32_t mask;  /* bitmask of images that have image_store */
-      uint32_t count; /* number of consts allocated */
-      /* three const allocated per image which has image_store:
-       *  + cpp         (bytes per pixel)
-       *  + pitch       (y pitch)
-       *  + array_pitch (z pitch)
-       */
-      uint32_t off[IR3_MAX_SHADER_IMAGES];
-   } image_dims;
+   struct ir3_const_image_dims image_dims;
 
    unsigned immediates_count;
    unsigned immediates_size;
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 9d8383f4a7f..14553e198ac 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -619,10 +619,10 @@ tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
    uint32_t base;
    switch (type) {
    case TU_CONSTS_PRIMITIVE_MAP:
-      base = const_state->offsets.primitive_map;
+      base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
       break;
    case TU_CONSTS_PRIMITIVE_PARAM:
-      base = const_state->offsets.primitive_param;
+      base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
       break;
    default:
       unreachable("bad consts type");
diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc
index a64cb7f063f..ca7a6da65ca 100644
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -1124,7 +1124,7 @@ static uint32_t
 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
 {
    const struct ir3_const_state *const_state = ir3_const_state(xs);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
    int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
 
    /* truncate size to avoid writing constants that shader
@@ -1332,7 +1332,7 @@ tu6_emit_xs(struct tu_cs *cs,
 
    /* emit immediates */
    const struct ir3_const_state *const_state = ir3_const_state(xs);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
 
    if (immediate_size > 0) {
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
index a8436fdbd9b..12fd33db18e 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
@@ -167,8 +167,11 @@ emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_varia
       int base = const_state->primitive_param_ubo.idx;
 
       fd6_upload_emit_driver_ubo(ctx, ring, v, base, num_params, params);
-   } else {
-      const unsigned regid = const_state->offsets.primitive_param;
+   } else if (ir3_const_can_upload(&const_state->allocs,
+                                   IR3_CONST_ALLOC_PRIMITIVE_PARAM,
+                                   v->constlen)) {
+      const unsigned regid =
+         const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
       int size = MIN2(1 + regid, v->constlen) - regid;
       if (size > 0)
          fd6_emit_const_user(ring, v, regid * 4, num_params, params);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index 89baf5b6623..d430a0133ab 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -245,8 +245,10 @@ ir3_emit_image_dims(struct fd_screen *screen,
                     struct fd_shaderimg_stateobj *si)
 {
    const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.image_dims;
-   if (v->constlen > offset) {
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_IMAGE_DIMS].offset_vec4;
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_IMAGE_DIMS,
+                            v->constlen)) {
       uint32_t dims[align(const_state->image_dims.count, 4)];
       unsigned mask = const_state->image_dims.mask;
 
@@ -297,7 +299,7 @@ ir3_emit_immediates(const struct ir3_shader_variant *v,
                     struct fd_ringbuffer *ring)
 {
    const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t base = const_state->offsets.immediate;
+   uint32_t base = const_state->allocs.max_const_offset_vec4;
    int size = DIV_ROUND_UP(const_state->immediates_count, 4);
 
    /* truncate size to avoid writing constants that shader
@@ -324,7 +326,13 @@ ir3_emit_link_map(const struct ir3_shader_variant *producer,
                   struct fd_ringbuffer *ring)
 {
    const struct ir3_const_state *const_state = ir3_const_state(consumer);
-   uint32_t base = const_state->offsets.primitive_map;
+   if (!ir3_const_can_upload(&const_state->allocs,
+                             IR3_CONST_ALLOC_PRIMITIVE_MAP,
+                             consumer->constlen))
+      return;
+
+   uint32_t base =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
    int size = DIV_ROUND_UP(consumer->input_size, 4);
 
    /* truncate size to avoid writing constants that shader
@@ -347,8 +355,10 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 {
    /* streamout addresses after driver-params: */
    const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.tfbo;
-   if (v->constlen > offset) {
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_TFBO].offset_vec4;
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_TFBO,
+                            v->constlen)) {
       struct fd_streamout_stateobj *so = &ctx->streamout;
       const struct ir3_stream_output_info *info = &v->stream_output;
       uint32_t params = 4;
@@ -423,8 +433,10 @@ emit_kernel_params(struct fd_context *ctx, const struct ir3_shader_variant *v,
    assert_dt
 {
    const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.kernel_params;
-   if (v->constlen > offset) {
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_KERNEL_PARAMS].offset_vec4;
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_KERNEL_PARAMS,
+                            v->constlen)) {
      ring_wfi(ctx->batch, ring);
      emit_const_user(ring, v, offset * 4, align(v->cs.req_input_mem, 4),