turnip,ir3/a750: Implement inline uniforms via ldg.k

Inline constants suffer from the same issue as driver params, so they
should also be preloaded in the preamble. There is a special
instruction, ldg.k, that loads from global memory directly into the
constant file.

Co-Authored-By: Connor Abbott <cwabbott0@gmail.com>
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26934>
commit 98d6d93a82 (parent 6a744ddebc), committed by Marge Bot
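For orientation, a minimal CPU-side sketch of the scheme this patch wires up; the names and the trailing pseudo-assembly are illustrative assumptions, not code from the Mesa tree. The driver publishes one GPU address per inline uniform block in a small driver-owned UBO, and on a750 the shader preamble walks that table with ldg.k to copy each block from global memory into the constant file:

#include <stdint.h>

#define MAX_INLINE_UBOS 7   /* mirrors addresses[7] in tu7_emit_inline_ubo below */

/* Hypothetical location record: which descriptor set a block lives in
 * and at what byte offset. */
struct inline_ubo_loc {
   unsigned set;
   unsigned offset;
};

/* CPU side: gather the GPU virtual address of every inline uniform
 * block into the table the shader preamble will consume. */
static void
build_inline_ubo_table(uint64_t table[MAX_INLINE_UBOS],
                       const uint64_t *set_iova,
                       const struct inline_ubo_loc *ubos,
                       unsigned count)
{
   for (unsigned i = 0; i < count; i++)
      table[i] = set_iova[ubos[i].set] + ubos[i].offset;
}

/* GPU side (generated by ir3, shown as pseudo-assembly):
 *    for each block i:
 *       read table[i] from the driver UBO
 *       ldg.k c[dst_i], g[table[i]], size_i   ; global mem -> const file
 */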
@@ -211,6 +211,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
       compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
       compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
+      compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
@@ -251,6 +251,7 @@ struct ir3_compiler {
    bool stsc_duplication_quirk;
 
    bool load_shader_consts_via_preamble;
+   bool load_inline_uniforms_via_preamble_ldgk;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
@@ -4279,6 +4279,7 @@ TU_GENX(tu_CmdNextSubpass2);
 
 static uint32_t
 tu6_user_consts_size(const struct tu_const_state *const_state,
+                     bool ldgk,
                      gl_shader_stage type)
 {
    uint32_t dwords = 0;
@@ -4289,7 +4290,11 @@ tu6_user_consts_size(const struct tu_const_state *const_state,
       assert(num_units > 0);
    }
 
-   dwords += 8 * const_state->num_inline_ubos;
+   if (ldgk) {
+      dwords += 6 + (2 * const_state->num_inline_ubos + 4);
+   } else {
+      dwords += 8 * const_state->num_inline_ubos;
+   }
 
    return dwords;
 }
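A worked check of the two branches, under the assumption that the "6" is the CP_LOAD_STATE6 packet emitted by tu7_emit_inline_ubo below (one pkt7 header plus five payload dwords) and the "+ 4" is NOP-packet overhead and alignment slack around the inline address table: with three inline UBOs the ldgk path budgets 6 + (2*3 + 4) = 16 dwords in total, where the legacy path budgets 8 * 3 = 24, one CP_LOAD_STATE6 per UBO.

#include <stdbool.h>
#include <stdint.h>

/* Sketch mirroring the sizing above; see the lead-in for what the
 * magic numbers are assumed to cover. */
static uint32_t
inline_ubo_cmdstream_dwords(uint32_t num_inline_ubos, bool ldgk)
{
   if (ldgk)
      return 6 + (2 * num_inline_ubos + 4); /* one packet + address table */
   return 8 * num_inline_ubos;              /* one packet per inline UBO */
}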
@@ -4357,6 +4362,60 @@ tu6_emit_inline_ubo(struct tu_cs *cs,
    }
 }
 
+static void
+tu7_emit_inline_ubo(struct tu_cs *cs,
+                    const struct tu_const_state *const_state,
+                    const struct ir3_const_state *ir_const_state,
+                    unsigned constlen,
+                    gl_shader_stage type,
+                    struct tu_descriptor_state *descriptors)
+{
+   uint64_t addresses[7] = {0};
+   unsigned offset = const_state->inline_uniforms_ubo.idx;
+
+   if (offset == -1)
+      return;
+
+   for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
+      const struct tu_inline_ubo *ubo = &const_state->ubos[i];
+
+      uint64_t va = descriptors->set_iova[ubo->base] & ~0x3f;
+      addresses[i] = va + ubo->offset;
+   }
+
+   /* A7XX TODO: Emit data via sub_cs instead of NOP */
+   uint64_t iova = tu_cs_emit_data_nop(cs, (uint32_t *)addresses, const_state->num_inline_ubos * 2, 4);
+
+   tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
+   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+              CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+              CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+              CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+              CP_LOAD_STATE6_0_NUM_UNIT(1));
+   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+   int size_vec4s = DIV_ROUND_UP(const_state->num_inline_ubos * 2, 4);
+   tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+}
+
+static void
+tu_emit_inline_ubo(struct tu_cs *cs,
+                   const struct tu_const_state *const_state,
+                   const struct ir3_const_state *ir_const_state,
+                   unsigned constlen,
+                   gl_shader_stage type,
+                   struct tu_descriptor_state *descriptors)
+{
+   if (!const_state->num_inline_ubos)
+      return;
+
+   if (cs->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk) {
+      tu7_emit_inline_ubo(cs, const_state, ir_const_state, constlen, type, descriptors);
+   } else {
+      tu6_emit_inline_ubo(cs, const_state, constlen, type, descriptors);
+   }
+}
+
 static void
 tu6_emit_shared_consts(struct tu_cs *cs,
                        const struct tu_push_constant_range *shared_consts,
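Reading the new emitter: the address table is written straight into the command stream behind a NOP (hence the "A7XX TODO" about moving it to a sub_cs), and the CP_LOAD_STATE6 packet then binds that memory as a single ST6_UBO unit at the reserved inline_uniforms_ubo slot; the `& ~0x3f` presumably strips tag bits from set_iova, descriptor sets being at least 64-byte aligned. A sketch of the payload layout, assuming plain 64-bit addresses padded to a vec4 boundary (matching DIV_ROUND_UP(n * 2, 4)):

#include <stdint.h>
#include <string.h>

/* Pack n 64-bit addresses (2 dwords each) and zero-pad to a 4-dword
 * (vec4) boundary so the blob can be bound as a UBO; returns the
 * number of dwords written. */
static unsigned
pack_inline_ubo_table(uint32_t *dst, const uint64_t *addresses, unsigned n)
{
   memcpy(dst, addresses, n * sizeof(uint64_t));
   unsigned dwords = n * 2;
   while (dwords % 4)
      dst[dwords++] = 0;
   return dwords;
}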
@@ -4410,12 +4469,13 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
       dwords += shared_consts->dwords + 1;
    }
 
+   bool ldgk = cmd->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    if (compute) {
       dwords +=
-         tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, MESA_SHADER_COMPUTE);
+         tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, ldgk, MESA_SHADER_COMPUTE);
    } else {
       for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
-         dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, (gl_shader_stage) type);
+         dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, ldgk, (gl_shader_stage) type);
    }
 
    return dwords;
@@ -4447,8 +4507,9 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
       tu6_emit_per_stage_push_consts(
          &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
          MESA_SHADER_COMPUTE, cmd->push_constants);
-      tu6_emit_inline_ubo(
+      tu_emit_inline_ubo(
          &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
+         cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->const_state,
          cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
          MESA_SHADER_COMPUTE,
         tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE));
@@ -4461,8 +4522,9 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
          tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state,
                                         (gl_shader_stage) type,
                                         cmd->push_constants);
-         tu6_emit_inline_ubo(&cs, &link->tu_const_state, link->constlen,
-                             (gl_shader_stage) type, descriptors);
+         tu_emit_inline_ubo(&cs, &link->tu_const_state,
+                            &link->const_state, link->constlen,
+                            (gl_shader_stage) type, descriptors);
       }
    }
@@ -678,12 +678,19 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
    unsigned base = UINT_MAX;
    unsigned range;
    bool use_load = false;
+   bool use_ldg_k =
+      params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
 
    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
       if (const_state->ubos[i].base == binding.desc_set &&
           const_state->ubos[i].offset == binding_layout->offset) {
-         base = const_state->ubos[i].const_offset_vec4 * 4;
-         use_load = const_state->ubos[i].push_address;
          range = const_state->ubos[i].size_vec4 * 4;
+         if (use_ldg_k) {
+            base = i * 2;
+         } else {
+            use_load = const_state->ubos[i].push_address;
+            base = const_state->ubos[i].const_offset_vec4 * 4;
+         }
          break;
       }
    }
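On the shader side, note that with use_ldg_k the meaning of `base` changes: it is no longer a const-file offset but an index into the address table, in dwords, with two dwords per 64-bit entry. A hypothetical model of that addressing (`table_dwords` stands in for the bound driver UBO):

#include <stdint.h>

/* With use_ldg_k, base = i * 2 selects the dword pair holding inline
 * UBO i's 64-bit GPU address inside the driver UBO. */
static uint64_t
inline_ubo_base_address(const uint32_t *table_dwords, unsigned i)
{
   unsigned base = i * 2;
   return (uint64_t)table_dwords[base] |
          ((uint64_t)table_dwords[base + 1] << 32);
}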
@@ -703,9 +710,15 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
    b->cursor = nir_before_instr(&intrin->instr);
    nir_def *val;
 
-   if (use_load) {
-      nir_def *base_addr =
-         nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
+   if (use_load || use_ldg_k) {
+      nir_def *base_addr;
+      if (use_ldg_k) {
+         base_addr = ir3_load_driver_ubo(b, 2,
+                                         &params->shader->const_state.inline_uniforms_ubo,
+                                         base);
+      } else {
+         base_addr = nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
+      }
       val = nir_load_global_ir3(b, intrin->num_components,
                                 intrin->def.bit_size,
                                 base_addr, nir_ishr_imm(b, offset, 2),
@@ -847,6 +860,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
    /* Reserve space for inline uniforms, so we can always load them from
     * constants and not setup a UBO descriptor for them.
     */
+   bool use_ldg_k =
+      dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    for (unsigned set = 0; set < layout->num_sets; set++) {
       const struct tu_descriptor_set_layout *desc_layout =
          layout->set[set].layout;
@@ -883,7 +898,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
        * executed. Given the small max size, there shouldn't be much reason
        * to use variable size anyway.
        */
-      bool push_address = desc_layout->has_variable_descriptors &&
+      bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
          b == desc_layout->binding_count - 1;
 
       if (push_address) {
@@ -902,7 +917,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
          .size_vec4 = size_vec4,
       };
 
-      reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
+      if (!use_ldg_k)
+         reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
    }
 }
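Note the asymmetry this hunk creates: on the ldg.k path turnip no longer reserves const-file space for the inline uniform words at all. The blocks stay reachable through the inline_uniforms_ubo address table, and placement in the constant file is presumably left to ir3's own UBO-to-const handling via the preamble loads.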
@@ -2259,6 +2275,7 @@ tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
 
    shader->const_state.fdm_ubo.idx = -1;
    shader->const_state.dynamic_offsets_ubo.idx = -1;
+   shader->const_state.inline_uniforms_ubo.idx = -1;
 
    return shader;
 }
@@ -52,6 +52,7 @@ struct tu_const_state
 
    struct ir3_driver_ubo fdm_ubo;
    struct ir3_driver_ubo dynamic_offsets_ubo;
+   struct ir3_driver_ubo inline_uniforms_ubo;
 };
 
 struct tu_shader