From 743b41a2844678de7c3b8347ca73bf08ef8fc4b3 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sun, 28 Apr 2024 11:52:19 +0200 Subject: [PATCH] panvk: Use IDVS jobs when we can This optimizes things by splitting vertex processing in two (position shading and varying shading), allowing primitives to be discarded before the varying shader is executed. This optimization is even more important if we throw layered rendering into the mix, because layered rendering on Bifrost is implemented with N IDVS/fragment jobs (N being the number of layers), with primitives not targeting a given layer being artificially culled in the vertex shader by issuing a position outside the render area. Signed-off-by: Boris Brezillon Reviewed-by: Mary Guillemard Part-of: --- src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c | 252 +++++++++++++++------ src/panfrost/vulkan/panvk_vX_shader.c | 5 +- 2 files changed, 187 insertions(+), 70 deletions(-) diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c index 1ce87e1ddd1..fce4f2a237f 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c @@ -70,9 +70,14 @@ struct panvk_draw_info { mali_ptr viewport; struct { struct panfrost_ptr vertex_copy_desc; - struct panfrost_ptr vertex; struct panfrost_ptr frag_copy_desc; - struct panfrost_ptr tiler; + union { + struct { + struct panfrost_ptr vertex; + struct panfrost_ptr tiler; + }; + struct panfrost_ptr idvs; + }; } jobs; }; @@ -782,39 +787,14 @@ panvk_draw_prepare_viewport(struct panvk_cmd_buffer *cmdbuf, } static void -panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf, - struct panvk_draw_info *draw) +panvk_emit_vertex_dcd(struct panvk_cmd_buffer *cmdbuf, + const struct panvk_draw_info *draw, void *dcd) { - struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; - struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; - struct 
panvk_batch *batch = cmdbuf->cur_batch; + const struct panvk_shader_desc_state *vs_desc_state = + &cmdbuf->state.gfx.vs.desc; - panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base, - &cmdbuf->state.gfx.desc_state, - vs, vs_desc_state); - - struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)( - dev, &cmdbuf->desc_pool.base, vs, &cmdbuf->state.gfx.desc_state, - vs_desc_state); - - if (ptr.cpu) - util_dynarray_append(&batch->jobs, void *, ptr.cpu); - - draw->jobs.vertex_copy_desc = ptr; - - ptr = pan_pool_alloc_desc(&cmdbuf->desc_pool.base, COMPUTE_JOB); - util_dynarray_append(&batch->jobs, void *, ptr.cpu); - draw->jobs.vertex = ptr; - - memcpy(pan_section_ptr(ptr.cpu, COMPUTE_JOB, INVOCATION), &draw->invocation, - pan_size(INVOCATION)); - - pan_section_pack(ptr.cpu, COMPUTE_JOB, PARAMETERS, cfg) { - cfg.job_task_split = 5; - } - - pan_section_pack(ptr.cpu, COMPUTE_JOB, DRAW, cfg) { + pan_pack(dcd, DRAW, cfg) { cfg.state = panvk_priv_mem_dev_addr(vs->rsd); cfg.attributes = draw->vs.attributes; cfg.attribute_buffers = draw->vs.attribute_bufs; @@ -831,6 +811,28 @@ panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf, } } +static void +panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf, + struct panvk_draw_info *draw) +{ + struct panvk_batch *batch = cmdbuf->cur_batch; + struct panfrost_ptr ptr = + pan_pool_alloc_desc(&cmdbuf->desc_pool.base, COMPUTE_JOB); + + util_dynarray_append(&batch->jobs, void *, ptr.cpu); + draw->jobs.vertex = ptr; + + memcpy(pan_section_ptr(ptr.cpu, COMPUTE_JOB, INVOCATION), &draw->invocation, + pan_size(INVOCATION)); + + pan_section_pack(ptr.cpu, COMPUTE_JOB, PARAMETERS, cfg) { + cfg.job_task_split = 5; + } + + panvk_emit_vertex_dcd(cmdbuf, draw, + pan_section_ptr(ptr.cpu, COMPUTE_JOB, DRAW)); +} + static enum mali_draw_mode translate_prim_topology(VkPrimitiveTopology in) { @@ -867,6 +869,7 @@ panvk_emit_tiler_primitive(struct panvk_cmd_buffer *cmdbuf, bool writes_point_size = 
vs->info.vs.writes_point_size && ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + bool secondary_shader = vs->info.vs.secondary_enable && fs_required(cmdbuf); pan_pack(prim, PRIMITIVE, cfg) { cfg.draw_mode = translate_prim_topology(ia->primitive_topology); @@ -900,6 +903,8 @@ panvk_emit_tiler_primitive(struct panvk_cmd_buffer *cmdbuf, cfg.index_count = draw->vertex_count; cfg.index_type = MALI_INDEX_TYPE_NONE; } + + cfg.secondary_shader = secondary_shader; } } @@ -974,18 +979,8 @@ panvk_draw_prepare_tiler_job(struct panvk_cmd_buffer *cmdbuf, { struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); struct panvk_batch *batch = cmdbuf->cur_batch; - - /* If the vertex job doesn't write the position, we don't need a tiler job. */ - if (!draw->position) - return; - const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; - - panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base, - &cmdbuf->state.gfx.desc_state, - fs, fs_desc_state); - struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)( dev, &cmdbuf->desc_pool.base, fs, &cmdbuf->state.gfx.desc_state, fs_desc_state); @@ -1019,6 +1014,79 @@ panvk_draw_prepare_tiler_job(struct panvk_cmd_buffer *cmdbuf, ; } +static void +panvk_draw_prepare_idvs_job(struct panvk_cmd_buffer *cmdbuf, + struct panvk_draw_info *draw) +{ + struct panvk_batch *batch = cmdbuf->cur_batch; + struct panfrost_ptr ptr = + pan_pool_alloc_desc(&cmdbuf->desc_pool.base, INDEXED_VERTEX_JOB); + + util_dynarray_append(&batch->jobs, void *, ptr.cpu); + draw->jobs.idvs = ptr; + + memcpy(pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, INVOCATION), + &draw->invocation, pan_size(INVOCATION)); + + panvk_emit_tiler_primitive( + cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, PRIMITIVE)); + + panvk_emit_tiler_primitive_size( + cmdbuf, draw, + pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, PRIMITIVE_SIZE)); + + 
pan_section_pack(ptr.cpu, INDEXED_VERTEX_JOB, TILER, cfg) { + cfg.address = draw->tiler_ctx->bifrost; + } + + pan_section_pack(ptr.cpu, INDEXED_VERTEX_JOB, PADDING, _) { + } + + panvk_emit_tiler_dcd( + cmdbuf, draw, + pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, FRAGMENT_DRAW)); + + panvk_emit_vertex_dcd( + cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW)); +} + +static void +panvk_draw_prepare_vs_copy_desc_job(struct panvk_cmd_buffer *cmdbuf, + struct panvk_draw_info *draw) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_batch *batch = cmdbuf->cur_batch; + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + const struct panvk_shader_desc_state *vs_desc_state = + &cmdbuf->state.gfx.vs.desc; + struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)( + dev, &cmdbuf->desc_pool.base, vs, &cmdbuf->state.gfx.desc_state, + vs_desc_state); + + if (ptr.cpu) + util_dynarray_append(&batch->jobs, void *, ptr.cpu); + + draw->jobs.vertex_copy_desc = ptr; +} + +static void +panvk_draw_prepare_fs_copy_desc_job(struct panvk_cmd_buffer *cmdbuf, + struct panvk_draw_info *draw) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; + struct panvk_batch *batch = cmdbuf->cur_batch; + struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)( + dev, &cmdbuf->desc_pool.base, fs, &cmdbuf->state.gfx.desc_state, + fs_desc_state); + + if (ptr.cpu) + util_dynarray_append(&batch->jobs, void *, ptr.cpu); + + draw->jobs.frag_copy_desc = ptr; +} + void panvk_per_arch(cmd_preload_fb_after_batch_split)(struct panvk_cmd_buffer *cmdbuf) { @@ -1063,9 +1131,12 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) struct panvk_batch *batch = cmdbuf->cur_batch; const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; const struct 
panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; + struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state; const struct vk_rasterization_state *rs = &cmdbuf->vk.dynamic_graphics_state.rs; + bool idvs = vs->info.vs.idvs; /* If there's no vertex shader, we can skip the draw. */ if (!panvk_priv_mem_dev_addr(vs->rsd)) @@ -1101,6 +1172,38 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) &cmdbuf->state.gfx.sysvals, sizeof(cmdbuf->state.gfx.sysvals)); } + panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base, + &cmdbuf->state.gfx.desc_state, + vs, vs_desc_state); + panvk_draw_prepare_vs_copy_desc_job(cmdbuf, draw); + + unsigned copy_desc_job_id = + draw->jobs.vertex_copy_desc.gpu + ? pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, + &draw->jobs.vertex_copy_desc, false) + : 0; + + bool vs_writes_pos = + cmdbuf->state.gfx.link.buf_strides[PANVK_VARY_BUF_POSITION] > 0; + bool needs_tiling = !rs->rasterizer_discard_enable && vs_writes_pos; + + /* No need to setup the FS desc tables if the FS is not executed. */ + if (needs_tiling && fs_required(cmdbuf)) { + panvk_per_arch(cmd_prepare_shader_desc_tables)( + &cmdbuf->desc_pool.base, &cmdbuf->state.gfx.desc_state, fs, + fs_desc_state); + panvk_draw_prepare_fs_copy_desc_job(cmdbuf, draw); + + if (draw->jobs.frag_copy_desc.gpu) { + /* We don't need to add frag_copy_desc as a dependency because the + * tiler job doesn't execute the fragment shader, the fragment job + * will, and the tiler/fragment synchronization happens at the batch + * level. 
*/ + pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, + &draw->jobs.frag_copy_desc, false); + } + } + /* TODO: indexed draws */ draw->tls = batch->tls.gpu; draw->fb = batch->fb.desc.gpu; @@ -1115,38 +1218,51 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) panvk_draw_prepare_attributes(cmdbuf, draw); panvk_draw_prepare_viewport(cmdbuf, draw); panvk_draw_prepare_tiler_context(cmdbuf, draw); - panvk_draw_prepare_vertex_job(cmdbuf, draw); - panvk_draw_prepare_tiler_job(cmdbuf, draw); batch->tlsinfo.tls.size = MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, batch->tlsinfo.tls.size); - unsigned copy_desc_job_id = - draw->jobs.vertex_copy_desc.gpu - ? pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, - &draw->jobs.vertex_copy_desc, false) - : 0; + if (idvs) { + panvk_draw_prepare_idvs_job(cmdbuf, draw); + pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, + copy_desc_job_id, &draw->jobs.idvs, false); + } else { + panvk_draw_prepare_vertex_job(cmdbuf, draw); - unsigned vjob_id = - pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_VERTEX, false, false, 0, - copy_desc_job_id, &draw->jobs.vertex, false); + unsigned vjob_id = + pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_VERTEX, false, false, 0, + copy_desc_job_id, &draw->jobs.vertex, false); - if (!rs->rasterizer_discard_enable && draw->position) { - /* We don't need to add frag_copy_desc as a dependency, because the - * tiler job doesn't execute the fragment shader. The fragment job - * will, and the tiler/fragment synchronization happens at the batch - * level. 
*/ - if (draw->jobs.frag_copy_desc.gpu) - pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, - &draw->jobs.frag_copy_desc, false); - - pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_TILER, false, false, vjob_id, 0, - &draw->jobs.tiler, false); + if (needs_tiling) { + panvk_draw_prepare_tiler_job(cmdbuf, draw); + pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_TILER, false, false, vjob_id, + 0, &draw->jobs.tiler, false); + } } /* Clear the dirty flags all at once */ cmdbuf->state.gfx.dirty = 0; } +static unsigned +padded_vertex_count(struct panvk_cmd_buffer *cmdbuf, uint32_t vertex_count, + uint32_t instance_count) +{ + if (instance_count == 1) + return vertex_count; + + bool idvs = cmdbuf->state.gfx.vs.shader->info.vs.idvs; + + /* Index-Driven Vertex Shading requires different instances to + * have different cache lines for position results. Each vertex + * position is 16 bytes and the Mali cache line is 64 bytes, so + * the per-instance vertex count must be aligned to 4 vertices. + */ + if (idvs) + vertex_count = ALIGN_POT(vertex_count, 4); + + return panfrost_padded_vertex_count(vertex_count); +} + VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex, @@ -1163,9 +1279,8 @@ panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount, .vertex_range = vertexCount, .first_instance = firstInstance, .instance_count = instanceCount, - .padded_vertex_count = instanceCount > 1 - ? panfrost_padded_vertex_count(vertexCount) - : vertexCount, + .padded_vertex_count = + padded_vertex_count(cmdbuf, vertexCount, instanceCount), .offset_start = firstVertex, }; @@ -1250,9 +1365,8 @@ panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer, .instance_count = instanceCount, .vertex_range = vertex_range, .vertex_count = indexCount + abs(vertexOffset), - .padded_vertex_count = instanceCount > 1 - ? 
panfrost_padded_vertex_count(vertex_range) - : vertex_range, + .padded_vertex_count = + padded_vertex_count(cmdbuf, vertex_range, instanceCount), .offset_start = min_vertex + vertexOffset, .indices = panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer, cmdbuf->state.gfx.ib.offset) + diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index e9a893aa68f..d7dbc8b0db4 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -427,6 +427,10 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir, VERT_ATTRIB_GENERIC0; shader->info.attribute_count = util_last_bit(gen_attribs); + + /* NULL IDVS shaders are not allowed. */ + if (!bin_size) + shader->info.vs.idvs = false; } /* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table, @@ -515,7 +519,6 @@ panvk_compile_shader(struct panvk_device *dev, struct panfrost_compile_inputs inputs = { .gpu_id = phys_dev->kmod.props.gpu_prod_id, .no_ubo_to_push = true, - .no_idvs = true, /* TODO */ }; panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,