From c82af0c43b6e6270835d084984560592e1079af0 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 13 Jul 2022 15:48:57 +0200 Subject: [PATCH] tu: Decouple vertex input state from shader Emit VFD_DECODE and VFD_DEST separately, similarly to what Gallium does. This means we emit a few more VFD_DECODE for binning shaders and when there are unused attributes, but hopefully the overhead won't be too much. In exchange we lose one draw state, and in the future we can pre-compute the dynamic vertex state independently of the shader, so there should be lower CPU overhead with dynamic vertex inputs. Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 6 +- src/freedreno/vulkan/tu_cmd_buffer.h | 1 - src/freedreno/vulkan/tu_pipeline.c | 137 +++++++++++++++------------ src/freedreno/vulkan/tu_pipeline.h | 1 - 4 files changed, 79 insertions(+), 66 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index b6eea06aa07..52d15a4984d 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -489,7 +489,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state) uint32_t enable_mask; switch (id) { case TU_DRAW_STATE_PROGRAM: - case TU_DRAW_STATE_VI: /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even * when resources would actually be used in the binning shader. * Presumably the overhead of prefetching the resources isn't @@ -500,7 +499,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state) CP_SET_DRAW_STATE__0_SYSMEM; break; case TU_DRAW_STATE_PROGRAM_BINNING: - case TU_DRAW_STATE_VI_BINNING: enable_mask = CP_SET_DRAW_STATE__0_BINNING; break; case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM: @@ -2440,12 +2438,11 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT); - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (8 + util_bitcount(mask))); + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask))); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem); @@ -4482,7 +4479,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index d37d007b079..19e842729b0 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -26,7 +26,6 @@ enum tu_draw_state_group_id TU_DRAW_STATE_PROGRAM_BINNING, TU_DRAW_STATE_VB, TU_DRAW_STATE_VI, - TU_DRAW_STATE_VI_BINNING, TU_DRAW_STATE_RAST, TU_DRAW_STATE_CONST, TU_DRAW_STATE_DESC_SETS, diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index e436d037fec..d3f77060b56 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -714,6 +714,50 @@ tu6_emit_cs_config(struct tu_cs *cs, } } +#define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2) + +static void +tu6_emit_vfd_dest(struct tu_cs *cs, + const struct ir3_shader_variant *vs) +{ + int32_t input_for_attr[MAX_VERTEX_ATTRIBS]; + uint32_t attr_count = 0; + + for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++) + input_for_attr[i] = -1; + + for (unsigned i = 0; i < vs->inputs_count; i++) { + if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0)) + continue; + + assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0); + unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0; + input_for_attr[loc] = i; + attr_count = MAX2(attr_count, loc + 1); + } + + tu_cs_emit_regs(cs, + A6XX_VFD_CONTROL_0( + .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */ + .decode_cnt = attr_count)); + + if (attr_count) + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count); + + for (unsigned i = 0; i < attr_count; i++) { + if (input_for_attr[i] >= 0) { + unsigned input_idx = input_for_attr[i]; + tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0, + .writemask = vs->inputs[input_idx].compmask, + .regid = vs->inputs[input_idx].regid).value); + } else { + tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0, + .writemask = 0, + .regid = regid(63, 0)).value); + } + } +} + static void tu6_emit_vs_system_values(struct tu_cs *cs, const struct ir3_shader_variant *vs, @@ -1763,6 +1807,8 @@ tu6_emit_program(struct tu_cs *cs, tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); tu_cs_emit(cs, 0); + tu6_emit_vfd_dest(cs, vs); + tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch); tu6_emit_vpc_varying_modes(cs, fs); @@ -1805,12 +1851,11 @@ tu6_emit_program(struct tu_cs *cs, } } -#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 5 + 4) +#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1) static void tu6_emit_vertex_input(struct tu_pipeline *pipeline, struct tu_draw_state *vi_state, - const struct ir3_shader_variant *vs, const VkPipelineVertexInputStateCreateInfo *info) { uint32_t binding_instanced = 0; /* bitmask of instanced bindings */ @@ -1845,61 +1890,38 @@ tu6_emit_vertex_input(struct tu_pipeline *pipeline, } } - int32_t input_for_attr[MAX_VERTEX_ATTRIBS]; - uint32_t used_attrs_count = 0; + const VkVertexInputAttributeDescription *attrs[MAX_VERTEX_ATTRIBS] = { }; + unsigned attr_count = 0; + for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) { + const VkVertexInputAttributeDescription *attr = + &info->pVertexAttributeDescriptions[i]; + attrs[attr->location] = attr; + attr_count = MAX2(attr_count, attr->location + 1); + } - for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) { - input_for_attr[attr_idx] = -1; - for (uint32_t input_idx = 0; input_idx < vs->inputs_count; input_idx++) { - if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == - info->pVertexAttributeDescriptions[attr_idx].location) { - input_for_attr[attr_idx] = input_idx; - used_attrs_count++; - break; - } + if (attr_count != 0) + tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2); + + for (uint32_t loc = 0; loc < attr_count; loc++) { + const VkVertexInputAttributeDescription *attr = attrs[loc]; + + if (attr) { + const struct tu_native_format format = tu6_format_vtx(attr->format); + tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0, + .idx = attr->binding, + .offset = attr->offset, + .instanced = binding_instanced & (1 << attr->binding), + .format = format.fmt, + .swap = format.swap, + .unk30 = 1, + ._float = !vk_format_is_int(attr->format)).value); + tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value); + } else { + tu_cs_emit(&cs, 0); + tu_cs_emit(&cs, 0); } } - if (used_attrs_count) - tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), used_attrs_count * 2); - - for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) { - const VkVertexInputAttributeDescription *attr = - &info->pVertexAttributeDescriptions[attr_idx]; - - if (input_for_attr[attr_idx] == -1) - continue; - - const struct tu_native_format format = tu6_format_vtx(attr->format); - tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0, - .idx = attr->binding, - .offset = attr->offset, - .instanced = binding_instanced & (1 << attr->binding), - .format = format.fmt, - .swap = format.swap, - .unk30 = 1, - ._float = !vk_format_is_int(attr->format)).value); - tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value); - } - - if (used_attrs_count) - tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), used_attrs_count); - - for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) { - int32_t input_idx = input_for_attr[attr_idx]; - if (input_idx == -1) - continue; - - tu_cs_emit(&cs, A6XX_VFD_DEST_CNTL_INSTR(0, - .writemask = vs->inputs[input_idx].compmask, - .regid = vs->inputs[input_idx].regid).value); - } - - tu_cs_emit_regs(&cs, - A6XX_VFD_CONTROL_0( - .fetch_cnt = used_attrs_count, /* decode_cnt for binning pass ? */ - .decode_cnt = used_attrs_count)); - *vi_state = tu_cs_end_draw_state(&pipeline->cs, &cs); } @@ -2336,7 +2358,8 @@ tu_pipeline_allocate_cs(struct tu_device *dev, /* graphics case: */ if (builder) { - size += 2 * TU6_EMIT_VERTEX_INPUT_MAX_DWORDS; + size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS + + 2 * TU6_EMIT_VFD_DEST_MAX_DWORDS; for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { if (builder->shaders->variants[i]) { @@ -3257,8 +3280,6 @@ tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder, { const VkPipelineVertexInputStateCreateInfo *vi_info = builder->create_info->pVertexInputState; - const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX]; - const struct ir3_shader_variant *bs = builder->binning_variant; /* Bindings may contain holes */ for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { @@ -3266,9 +3287,7 @@ tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder, MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1); } - tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vs, vi_info); - if (bs) - tu6_emit_vertex_input(pipeline, &pipeline->vi.binning_state, bs, vi_info); + tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vi_info); } static void diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h index 4bc758cd16c..6b074b4b71b 100644 --- a/src/freedreno/vulkan/tu_pipeline.h +++ b/src/freedreno/vulkan/tu_pipeline.h @@ -167,7 +167,6 @@ struct tu_pipeline struct { struct tu_draw_state state; - struct tu_draw_state binning_state; } vi; struct