tu: Decouple vertex input state from shader

Emit VFD_DECODE and VFD_DEST separately, similarly to what Gallium does.
This means we emit a few more VFD_DECODE for binning shaders and when
there are unused attributes, but hopefully the overhead won't be too
much. In exchange we lose one draw state, and in the future we can
pre-compute the dynamic vertex state independently of the shader, so
there should be lower CPU overhead with dynamic vertex inputs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17554>
This commit is contained in:
Connor Abbott
2022-07-13 15:48:57 +02:00
committed by Marge Bot
parent 35dc99924a
commit c82af0c43b
4 changed files with 79 additions and 66 deletions

View File

@@ -489,7 +489,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
uint32_t enable_mask;
switch (id) {
case TU_DRAW_STATE_PROGRAM:
case TU_DRAW_STATE_VI:
/* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
* when resources would actually be used in the binning shader.
* Presumably the overhead of prefetching the resources isn't
@@ -500,7 +499,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
CP_SET_DRAW_STATE__0_SYSMEM;
break;
case TU_DRAW_STATE_PROGRAM_BINNING:
case TU_DRAW_STATE_VI_BINNING:
enable_mask = CP_SET_DRAW_STATE__0_BINNING;
break;
case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
@@ -2440,12 +2438,11 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (8 + util_bitcount(mask)));
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem);
@@ -4482,7 +4479,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem);

View File

@@ -26,7 +26,6 @@ enum tu_draw_state_group_id
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
TU_DRAW_STATE_RAST,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,

View File

@@ -714,6 +714,50 @@ tu6_emit_cs_config(struct tu_cs *cs,
}
}
#define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
/* Emit the shader-dependent part of the vertex fetch state for "vs":
 * VFD_CONTROL_0 followed by one VFD_DEST_CNTL_INSTR dword per attribute
 * location.
 *
 * Destination entries are indexed by attribute location
 * (slot - VERT_ATTRIB_GENERIC0), not by the order of vs->inputs[].  Every
 * location up to the highest one the shader actually reads gets an entry;
 * locations in that range which the shader does not read are emitted with
 * an empty writemask and regid(63, 0).
 *
 * cs: command stream the registers are emitted into.
 * vs: shader variant whose inputs[] table is scanned; system-value inputs
 *     and inputs whose regid is regid(63, 0) are skipped.
 *
 * NOTE(review): the worst case here is MAX_VERTEX_ATTRIBS destination
 * dwords plus the packet overhead of the two register writes -- confirm
 * that TU6_EMIT_VFD_DEST_MAX_DWORDS accounts for all of it.
 */
static void
tu6_emit_vfd_dest(struct tu_cs *cs,
                  const struct ir3_shader_variant *vs)
{
   /* For each attribute location, the index into vs->inputs[] that consumes
    * it, or -1 when no shader input reads that location.
    */
   int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
   uint32_t attr_count = 0;

   for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
      input_for_attr[i] = -1;

   for (unsigned i = 0; i < vs->inputs_count; i++) {
      if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
         continue;

      assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
      unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
      /* loc indexes a fixed-size stack array below, so check the upper
       * bound as well as the lower one.
       */
      assert(loc < MAX_VERTEX_ATTRIBS);
      input_for_attr[loc] = i;
      /* The count covers every location up to the highest used one, holes
       * included.
       */
      attr_count = MAX2(attr_count, loc + 1);
   }

   tu_cs_emit_regs(cs,
                   A6XX_VFD_CONTROL_0(
                     .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
                     .decode_cnt = attr_count));

   /* Only open the VFD_DEST_CNTL_INSTR packet when there is a payload. */
   if (attr_count)
      tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);

   for (unsigned i = 0; i < attr_count; i++) {
      if (input_for_attr[i] >= 0) {
         unsigned input_idx = input_for_attr[i];
         tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
                          .writemask = vs->inputs[input_idx].compmask,
                          .regid = vs->inputs[input_idx].regid).value);
      } else {
         /* Hole: location not read by this shader variant. */
         tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
                          .writemask = 0,
                          .regid = regid(63, 0)).value);
      }
   }
}
static void
tu6_emit_vs_system_values(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
@@ -1763,6 +1807,8 @@ tu6_emit_program(struct tu_cs *cs,
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
tu_cs_emit(cs, 0);
tu6_emit_vfd_dest(cs, vs);
tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
tu6_emit_vpc_varying_modes(cs, fs);
@@ -1805,12 +1851,11 @@ tu6_emit_program(struct tu_cs *cs,
}
}
#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 5 + 4)
#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
static void
tu6_emit_vertex_input(struct tu_pipeline *pipeline,
struct tu_draw_state *vi_state,
const struct ir3_shader_variant *vs,
const VkPipelineVertexInputStateCreateInfo *info)
{
uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
@@ -1845,61 +1890,38 @@ tu6_emit_vertex_input(struct tu_pipeline *pipeline,
}
}
int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
uint32_t used_attrs_count = 0;
const VkVertexInputAttributeDescription *attrs[MAX_VERTEX_ATTRIBS] = { };
unsigned attr_count = 0;
for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
const VkVertexInputAttributeDescription *attr =
&info->pVertexAttributeDescriptions[i];
attrs[attr->location] = attr;
attr_count = MAX2(attr_count, attr->location + 1);
}
for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
input_for_attr[attr_idx] = -1;
for (uint32_t input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) ==
info->pVertexAttributeDescriptions[attr_idx].location) {
input_for_attr[attr_idx] = input_idx;
used_attrs_count++;
break;
}
if (attr_count != 0)
tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
for (uint32_t loc = 0; loc < attr_count; loc++) {
const VkVertexInputAttributeDescription *attr = attrs[loc];
if (attr) {
const struct tu_native_format format = tu6_format_vtx(attr->format);
tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0,
.idx = attr->binding,
.offset = attr->offset,
.instanced = binding_instanced & (1 << attr->binding),
.format = format.fmt,
.swap = format.swap,
.unk30 = 1,
._float = !vk_format_is_int(attr->format)).value);
tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value);
} else {
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, 0);
}
}
if (used_attrs_count)
tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), used_attrs_count * 2);
for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
const VkVertexInputAttributeDescription *attr =
&info->pVertexAttributeDescriptions[attr_idx];
if (input_for_attr[attr_idx] == -1)
continue;
const struct tu_native_format format = tu6_format_vtx(attr->format);
tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0,
.idx = attr->binding,
.offset = attr->offset,
.instanced = binding_instanced & (1 << attr->binding),
.format = format.fmt,
.swap = format.swap,
.unk30 = 1,
._float = !vk_format_is_int(attr->format)).value);
tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value);
}
if (used_attrs_count)
tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), used_attrs_count);
for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
int32_t input_idx = input_for_attr[attr_idx];
if (input_idx == -1)
continue;
tu_cs_emit(&cs, A6XX_VFD_DEST_CNTL_INSTR(0,
.writemask = vs->inputs[input_idx].compmask,
.regid = vs->inputs[input_idx].regid).value);
}
tu_cs_emit_regs(&cs,
A6XX_VFD_CONTROL_0(
.fetch_cnt = used_attrs_count, /* decode_cnt for binning pass ? */
.decode_cnt = used_attrs_count));
*vi_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
}
@@ -2336,7 +2358,8 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
/* graphics case: */
if (builder) {
size += 2 * TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS +
2 * TU6_EMIT_VFD_DEST_MAX_DWORDS;
for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
if (builder->shaders->variants[i]) {
@@ -3257,8 +3280,6 @@ tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
{
const VkPipelineVertexInputStateCreateInfo *vi_info =
builder->create_info->pVertexInputState;
const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX];
const struct ir3_shader_variant *bs = builder->binning_variant;
/* Bindings may contain holes */
for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
@@ -3266,9 +3287,7 @@ tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
}
tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vs, vi_info);
if (bs)
tu6_emit_vertex_input(pipeline, &pipeline->vi.binning_state, bs, vi_info);
tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vi_info);
}
static void

View File

@@ -167,7 +167,6 @@ struct tu_pipeline
struct
{
struct tu_draw_state state;
struct tu_draw_state binning_state;
} vi;
struct