turnip,ir3/a750: Implement inline uniforms via ldg.k

Inline consts suffer from the same issue as driver params, so they also
should be preloaded via the preamble. There is a special instruction
(ldg.k) to load from global memory into consts.

Co-Authored-By: Connor Abbott <cwabbott0@gmail.com>
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26934>
This commit is contained in:
Danylo Piliaiev
2023-12-20 16:01:24 +01:00
committed by Marge Bot
parent 6a744ddebc
commit 98d6d93a82
5 changed files with 95 additions and 13 deletions

View File

@@ -211,6 +211,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;

View File

@@ -251,6 +251,7 @@ struct ir3_compiler {
bool stsc_duplication_quirk;
bool load_shader_consts_via_preamble;
bool load_inline_uniforms_via_preamble_ldgk;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);

View File

@@ -4279,6 +4279,7 @@ TU_GENX(tu_CmdNextSubpass2);
static uint32_t
tu6_user_consts_size(const struct tu_const_state *const_state,
bool ldgk,
gl_shader_stage type)
{
uint32_t dwords = 0;
@@ -4289,7 +4290,11 @@ tu6_user_consts_size(const struct tu_const_state *const_state,
assert(num_units > 0);
}
dwords += 8 * const_state->num_inline_ubos;
if (ldgk) {
dwords += 6 + (2 * const_state->num_inline_ubos + 4);
} else {
dwords += 8 * const_state->num_inline_ubos;
}
return dwords;
}
@@ -4357,6 +4362,60 @@ tu6_emit_inline_ubo(struct tu_cs *cs,
}
}
static void
tu7_emit_inline_ubo(struct tu_cs *cs,
const struct tu_const_state *const_state,
const struct ir3_const_state *ir_const_state,
unsigned constlen,
gl_shader_stage type,
struct tu_descriptor_state *descriptors)
{
uint64_t addresses[7] = {0};
unsigned offset = const_state->inline_uniforms_ubo.idx;
if (offset == -1)
return;
for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
const struct tu_inline_ubo *ubo = &const_state->ubos[i];
uint64_t va = descriptors->set_iova[ubo->base] & ~0x3f;
addresses[i] = va + ubo->offset;
}
/* A7XX TODO: Emit data via sub_cs instead of NOP */
uint64_t iova = tu_cs_emit_data_nop(cs, (uint32_t *)addresses, const_state->num_inline_ubos * 2, 4);
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
int size_vec4s = DIV_ROUND_UP(const_state->num_inline_ubos * 2, 4);
tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
}
static void
tu_emit_inline_ubo(struct tu_cs *cs,
const struct tu_const_state *const_state,
const struct ir3_const_state *ir_const_state,
unsigned constlen,
gl_shader_stage type,
struct tu_descriptor_state *descriptors)
{
if (!const_state->num_inline_ubos)
return;
if (cs->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk) {
tu7_emit_inline_ubo(cs, const_state, ir_const_state, constlen, type, descriptors);
} else {
tu6_emit_inline_ubo(cs, const_state, constlen, type, descriptors);
}
}
static void
tu6_emit_shared_consts(struct tu_cs *cs,
const struct tu_push_constant_range *shared_consts,
@@ -4410,12 +4469,13 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
dwords += shared_consts->dwords + 1;
}
bool ldgk = cmd->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
if (compute) {
dwords +=
tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, MESA_SHADER_COMPUTE);
tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, ldgk, MESA_SHADER_COMPUTE);
} else {
for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, (gl_shader_stage) type);
dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, ldgk, (gl_shader_stage) type);
}
return dwords;
@@ -4447,8 +4507,9 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
tu6_emit_per_stage_push_consts(
&cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
MESA_SHADER_COMPUTE, cmd->push_constants);
tu6_emit_inline_ubo(
tu_emit_inline_ubo(
&cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->const_state,
cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
MESA_SHADER_COMPUTE,
tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE));
@@ -4461,8 +4522,9 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state,
(gl_shader_stage) type,
cmd->push_constants);
tu6_emit_inline_ubo(&cs, &link->tu_const_state, link->constlen,
(gl_shader_stage) type, descriptors);
tu_emit_inline_ubo(&cs, &link->tu_const_state,
&link->const_state, link->constlen,
(gl_shader_stage) type, descriptors);
}
}

View File

@@ -678,12 +678,19 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
unsigned base = UINT_MAX;
unsigned range;
bool use_load = false;
bool use_ldg_k =
params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
if (const_state->ubos[i].base == binding.desc_set &&
const_state->ubos[i].offset == binding_layout->offset) {
base = const_state->ubos[i].const_offset_vec4 * 4;
use_load = const_state->ubos[i].push_address;
range = const_state->ubos[i].size_vec4 * 4;
if (use_ldg_k) {
base = i * 2;
} else {
use_load = const_state->ubos[i].push_address;
base = const_state->ubos[i].const_offset_vec4 * 4;
}
break;
}
}
@@ -703,9 +710,15 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
b->cursor = nir_before_instr(&intrin->instr);
nir_def *val;
if (use_load) {
nir_def *base_addr =
nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
if (use_load || use_ldg_k) {
nir_def *base_addr;
if (use_ldg_k) {
base_addr = ir3_load_driver_ubo(b, 2,
&params->shader->const_state.inline_uniforms_ubo,
base);
} else {
base_addr = nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
}
val = nir_load_global_ir3(b, intrin->num_components,
intrin->def.bit_size,
base_addr, nir_ishr_imm(b, offset, 2),
@@ -847,6 +860,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
/* Reserve space for inline uniforms, so we can always load them from
* constants and not setup a UBO descriptor for them.
*/
bool use_ldg_k =
dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
for (unsigned set = 0; set < layout->num_sets; set++) {
const struct tu_descriptor_set_layout *desc_layout =
layout->set[set].layout;
@@ -883,7 +898,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
* executed. Given the small max size, there shouldn't be much reason
* to use variable size anyway.
*/
bool push_address = desc_layout->has_variable_descriptors &&
bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
b == desc_layout->binding_count - 1;
if (push_address) {
@@ -902,7 +917,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
.size_vec4 = size_vec4,
};
reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
if (!use_ldg_k)
reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
}
}
@@ -2259,6 +2275,7 @@ tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
shader->const_state.fdm_ubo.idx = -1;
shader->const_state.dynamic_offsets_ubo.idx = -1;
shader->const_state.inline_uniforms_ubo.idx = -1;
return shader;
}

View File

@@ -52,6 +52,7 @@ struct tu_const_state
struct ir3_driver_ubo fdm_ubo;
struct ir3_driver_ubo dynamic_offsets_ubo;
struct ir3_driver_ubo inline_uniforms_ubo;
};
struct tu_shader