From 98d6d93a8278fb1b45c9a2fe308482fcb7eccd10 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev
Date: Wed, 20 Dec 2023 16:01:24 +0100
Subject: [PATCH] turnip,ir3/a750: Implement inline uniforms via ldg.k

Inline consts suffer from the same issue as driver params, so they
should also be preloaded via the preamble. There is a special
instruction, ldg.k, to load from global memory into consts.

Co-Authored-By: Connor Abbott
Signed-off-by: Danylo Piliaiev
Part-of:
---
 src/freedreno/ir3/ir3_compiler.c      |  1 +
 src/freedreno/ir3/ir3_compiler.h      |  1 +
 src/freedreno/vulkan/tu_cmd_buffer.cc | 74 ++++++++++++++++++++++++---
 src/freedreno/vulkan/tu_shader.cc     | 31 ++++++++---
 src/freedreno/vulkan/tu_shader.h      |  1 +
 5 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 7fcf1b1ed11..5a254be1fe1 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -211,6 +211,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
       compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
       compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
+      compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 001b6716900..23eb3eeb161 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -251,6 +251,7 @@ struct ir3_compiler {
    bool stsc_duplication_quirk;
 
    bool load_shader_consts_via_preamble;
+   bool load_inline_uniforms_via_preamble_ldgk;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 220cebc86fd..7282e539149 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -4279,6 +4279,7 @@ TU_GENX(tu_CmdNextSubpass2);
 
 static uint32_t
 tu6_user_consts_size(const struct tu_const_state *const_state,
+                     bool ldgk,
                      gl_shader_stage type)
 {
    uint32_t dwords = 0;
@@ -4289,7 +4290,11 @@ tu6_user_consts_size(const struct tu_const_state *const_state,
       assert(num_units > 0);
    }
 
-   dwords += 8 * const_state->num_inline_ubos;
+   if (ldgk) {
+      dwords += 6 + (2 * const_state->num_inline_ubos + 4);
+   } else {
+      dwords += 8 * const_state->num_inline_ubos;
+   }
 
    return dwords;
 }
@@ -4357,6 +4362,60 @@ tu6_emit_inline_ubo(struct tu_cs *cs,
    }
 }
 
+static void
+tu7_emit_inline_ubo(struct tu_cs *cs,
+                    const struct tu_const_state *const_state,
+                    const struct ir3_const_state *ir_const_state,
+                    unsigned constlen,
+                    gl_shader_stage type,
+                    struct tu_descriptor_state *descriptors)
+{
+   uint64_t addresses[7] = {0};
+   unsigned offset = const_state->inline_uniforms_ubo.idx;
+
+   if (offset == -1)
+      return;
+
+   for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
+      const struct tu_inline_ubo *ubo = &const_state->ubos[i];
+
+      uint64_t va = descriptors->set_iova[ubo->base] & ~0x3f;
+      addresses[i] = va + ubo->offset;
+   }
+
+   /* A7XX TODO: Emit data via sub_cs instead of NOP */
+   uint64_t iova = tu_cs_emit_data_nop(cs, (uint32_t *)addresses, const_state->num_inline_ubos * 2, 4);
+
+   tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
+   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                  CP_LOAD_STATE6_0_NUM_UNIT(1));
+   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+   int size_vec4s = DIV_ROUND_UP(const_state->num_inline_ubos * 2, 4);
+   tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+}
+
+static void
+tu_emit_inline_ubo(struct tu_cs *cs,
+                   const struct tu_const_state *const_state,
+                   const struct ir3_const_state *ir_const_state,
+                   unsigned constlen,
+                   gl_shader_stage type,
+                   struct tu_descriptor_state *descriptors)
+{
+   if (!const_state->num_inline_ubos)
+      return;
+
+   if (cs->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk) {
+      tu7_emit_inline_ubo(cs, const_state, ir_const_state, constlen, type, descriptors);
+   } else {
+      tu6_emit_inline_ubo(cs, const_state, constlen, type, descriptors);
+   }
+}
+
 static void
 tu6_emit_shared_consts(struct tu_cs *cs,
                        const struct tu_push_constant_range *shared_consts,
@@ -4410,12 +4469,13 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
       dwords += shared_consts->dwords + 1;
    }
 
+   bool ldgk = cmd->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    if (compute) {
       dwords +=
-         tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, MESA_SHADER_COMPUTE);
+         tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, ldgk, MESA_SHADER_COMPUTE);
    } else {
       for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
-         dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, (gl_shader_stage) type);
+         dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, ldgk, (gl_shader_stage) type);
    }
 
    return dwords;
@@ -4447,8 +4507,9 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
       tu6_emit_per_stage_push_consts(
          &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
          MESA_SHADER_COMPUTE, cmd->push_constants);
-      tu6_emit_inline_ubo(
+      tu_emit_inline_ubo(
          &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
+         cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->const_state,
          cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
          MESA_SHADER_COMPUTE,
          tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE));
@@ -4461,8 +4522,9 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
         tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state,
                                        (gl_shader_stage) type,
                                        cmd->push_constants);
-        tu6_emit_inline_ubo(&cs, &link->tu_const_state, link->constlen,
-                            (gl_shader_stage) type, descriptors);
+        tu_emit_inline_ubo(&cs, &link->tu_const_state,
+                           &link->const_state, link->constlen,
+                           (gl_shader_stage) type, descriptors);
      }
   }
 
diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc
index c138ddf8c40..5b511a1734c 100644
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -678,12 +678,19 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
    unsigned base = UINT_MAX;
    unsigned range;
    bool use_load = false;
+   bool use_ldg_k =
+      params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
+
    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
       if (const_state->ubos[i].base == binding.desc_set &&
           const_state->ubos[i].offset == binding_layout->offset) {
-         base = const_state->ubos[i].const_offset_vec4 * 4;
-         use_load = const_state->ubos[i].push_address;
          range = const_state->ubos[i].size_vec4 * 4;
+         if (use_ldg_k) {
+            base = i * 2;
+         } else {
+            use_load = const_state->ubos[i].push_address;
+            base = const_state->ubos[i].const_offset_vec4 * 4;
+         }
          break;
       }
    }
@@ -703,9 +710,15 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
    b->cursor = nir_before_instr(&intrin->instr);
 
    nir_def *val;
-   if (use_load) {
-      nir_def *base_addr =
-         nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
+   if (use_load || use_ldg_k) {
+      nir_def *base_addr;
+      if (use_ldg_k) {
+         base_addr = ir3_load_driver_ubo(b, 2,
+                                         &params->shader->const_state.inline_uniforms_ubo,
+                                         base);
+      } else {
+         base_addr = nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
+      }
       val = nir_load_global_ir3(b, intrin->num_components,
                                 intrin->def.bit_size,
                                 base_addr, nir_ishr_imm(b, offset, 2),
@@ -847,6 +860,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
    /* Reserve space for inline uniforms, so we can always load them from
     * constants and not setup a UBO descriptor for them.
     */
+   bool use_ldg_k =
+      dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    for (unsigned set = 0; set < layout->num_sets; set++) {
       const struct tu_descriptor_set_layout *desc_layout =
          layout->set[set].layout;
@@ -883,7 +898,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
           * executed. Given the small max size, there shouldn't be much reason
           * to use variable size anyway.
           */
-         bool push_address = desc_layout->has_variable_descriptors &&
+         bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
             b == desc_layout->binding_count - 1;
 
          if (push_address) {
@@ -902,7 +917,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
             .size_vec4 = size_vec4,
          };
 
-         reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
+         if (!use_ldg_k)
+            reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
       }
    }
 
@@ -2259,6 +2275,7 @@ tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
 
    shader->const_state.fdm_ubo.idx = -1;
    shader->const_state.dynamic_offsets_ubo.idx = -1;
+   shader->const_state.inline_uniforms_ubo.idx = -1;
 
    return shader;
 }
diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h
index 029bf992ba4..386b305d299 100644
--- a/src/freedreno/vulkan/tu_shader.h
+++ b/src/freedreno/vulkan/tu_shader.h
@@ -52,6 +52,7 @@ struct tu_const_state
 
    struct ir3_driver_ubo fdm_ubo;
    struct ir3_driver_ubo dynamic_offsets_ubo;
+   struct ir3_driver_ubo inline_uniforms_ubo;
 };
 
 struct tu_shader
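
Review note (not part of the patch): a sketch of where the per-stage dword
budget for the ldg.k path in tu6_user_consts_size() comes from. The helper
name below is hypothetical and only restates the patch's arithmetic; it
assumes tu_cs_emit_data_nop() wraps the address table in a CP_NOP packet
whose payload is padded so it starts 4-dword aligned.

    /* Worst-case command-stream footprint of tu7_emit_inline_ubo():
     *   CP_LOAD_STATE6 header + 5 payload dwords         -> 6 dwords
     *   CP_NOP header + up to 3 dwords of alignment pad  -> 4 dwords
     *   one 64-bit address per inline UBO                -> 2 * n dwords
     */
    static unsigned
    ldgk_user_consts_dwords(unsigned num_inline_ubos)
    {
       return 6 + (2 * num_inline_ubos + 4);
    }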
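Review note (not part of the patch): a C model of the access pattern that
lower_inline_ubo() builds on the ldg.k path; the real code emits NIR.
Entry i of the inline_uniforms_ubo address table, written by
tu7_emit_inline_ubo(), holds the 64-bit base of inline-uniform block i,
and the byte offset is pre-shifted to dwords (the nir_ishr_imm(b, offset, 2)
above). The function name is hypothetical, for illustration only.

    #include <stdint.h>

    static uint32_t
    inline_uniform_read_model(const uint64_t *address_table, unsigned block,
                              uint32_t byte_offset)
    {
       /* address_table[block] = (set_iova[ubo->base] & ~0x3f) + ubo->offset,
        * as filled in by tu7_emit_inline_ubo() */
       const uint32_t *base =
          (const uint32_t *)(uintptr_t)address_table[block];
       /* dword-granular load, mirroring the byte offset shifted right by 2
        * that the patch passes to nir_load_global_ir3 */
       return base[byte_offset >> 2];
    }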