From 9c5e47e66d7c715c6a3587c456076618899bccdc Mon Sep 17 00:00:00 2001
From: Matt Coster
Date: Mon, 28 Nov 2022 16:36:57 +0000
Subject: [PATCH] pvr: Split render job submission for multi-layer framebuffers

Signed-off-by: Matt Coster
Reviewed-by: Karmjit Mahil
Part-of:
---
 src/imagination/vulkan/pvr_cmd_buffer.c |  67 ++++++-
 src/imagination/vulkan/pvr_private.h    |   9 +
 src/imagination/vulkan/pvr_queue.c      | 223 ++++++++++++++++++------
 3 files changed, 239 insertions(+), 60 deletions(-)

diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c
index d2cd33a6a4e..c020b4b2892 100644
--- a/src/imagination/vulkan/pvr_cmd_buffer.c
+++ b/src/imagination/vulkan/pvr_cmd_buffer.c
@@ -91,6 +91,7 @@ static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
    case PVR_SUB_CMD_TYPE_GRAPHICS:
       util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
       pvr_csb_finish(&sub_cmd->gfx.control_stream);
+      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
       pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.depth_bias_bo);
       pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.scissor_bo);
       break;
@@ -343,22 +344,25 @@ err_free_depth_bias_bo:
 }
 
 static VkResult
-pvr_cmd_buffer_emit_ppp_state(struct pvr_cmd_buffer *cmd_buffer,
-                              struct pvr_sub_cmd_gfx *const sub_cmd)
+pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
+                              struct pvr_csb *const csb)
 {
-   struct pvr_framebuffer *framebuffer =
+   const struct pvr_framebuffer *const framebuffer =
       cmd_buffer->state.render_pass_info.framebuffer;
 
-   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE0, state0) {
+   assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
+          csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
+
+   pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
       state0.addrmsb = framebuffer->ppp_state_bo->vma->dev_addr;
       state0.word_count = framebuffer->ppp_state_size;
    }
 
-   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE1, state1) {
+   pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
       state1.addrlsb = framebuffer->ppp_state_bo->vma->dev_addr;
    }
 
-   return VK_SUCCESS;
+   return csb->status;
 }
 
 VkResult pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
@@ -559,6 +563,44 @@ err_free_usc_pixel_program:
    return result;
 }
 
+static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
+   struct pvr_device *const device,
+   const struct pvr_cmd_buffer *const cmd_buffer,
+   struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
+{
+   struct list_head bo_list;
+   struct pvr_csb csb;
+   VkResult result;
+
+   pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
+
+   result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
+   if (result != VK_SUCCESS)
+      goto err_csb_finish;
+
+   result = pvr_csb_emit_terminate(&csb);
+   if (result != VK_SUCCESS)
+      goto err_csb_finish;
+
+   result = pvr_csb_bake(&csb, &bo_list);
+   if (result != VK_SUCCESS)
+      goto err_csb_finish;
+
+   /* This is a trivial control stream, there's no reason it should ever require
+    * more memory than a single bo can provide.
+    */
+   assert(list_is_singular(&bo_list));
+   gfx_sub_cmd->terminate_ctrl_stream =
+      list_first_entry(&bo_list, struct pvr_bo, link);
+
+   return VK_SUCCESS;
+
+err_csb_finish:
+   pvr_csb_finish(&csb);
+
+   return result;
+}
+
 static VkResult
 pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                         const struct pvr_load_op *load_op,
@@ -1535,7 +1577,18 @@ VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
          return result;
       }
 
-      result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, gfx_sub_cmd);
+      if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
+         result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
+                                                              cmd_buffer,
+                                                              gfx_sub_cmd);
+         if (result != VK_SUCCESS) {
+            state->status = result;
+            return result;
+         }
+      }
+
+      result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
+                                             &gfx_sub_cmd->control_stream);
       if (result != VK_SUCCESS) {
          state->status = result;
          return result;
diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h
index 05fc2c0f1ad..b8b345fc77f 100644
--- a/src/imagination/vulkan/pvr_private.h
+++ b/src/imagination/vulkan/pvr_private.h
@@ -659,6 +659,9 @@ struct pvr_sub_cmd_gfx {
    /* Control stream builder object */
    struct pvr_csb control_stream;
 
+   /* Required iff pvr_sub_cmd_gfx_requires_split_submit() returns true. */
+   struct pvr_bo *terminate_ctrl_stream;
+
    uint32_t hw_render_idx;
 
    uint32_t max_tiles_in_flight;
@@ -1548,6 +1551,12 @@ pvr_stage_mask_dst(VkPipelineStageFlags2KHR stage_mask)
    return pvr_stage_mask(stage_mask);
 }
 
+static inline bool pvr_sub_cmd_gfx_requires_split_submit(
+   const struct pvr_sub_cmd_gfx *const sub_cmd)
+{
+   return sub_cmd->job.run_frag && sub_cmd->framebuffer->layers > 1;
+}
+
 VkResult pvr_pds_fragment_program_create_and_upload(
    struct pvr_device *device,
    const VkAllocationCallbacks *allocator,
diff --git a/src/imagination/vulkan/pvr_queue.c b/src/imagination/vulkan/pvr_queue.c
index 9546402b295..700c79a3089 100644
--- a/src/imagination/vulkan/pvr_queue.c
+++ b/src/imagination/vulkan/pvr_queue.c
@@ -205,6 +205,150 @@ VkResult pvr_QueueWaitIdle(VkQueue _queue)
    return VK_SUCCESS;
 }
 
+static VkResult
+pvr_process_graphics_cmd_part(struct pvr_device *const device,
+                              struct pvr_render_ctx *const gfx_ctx,
+                              struct pvr_render_job *const job,
+                              struct vk_sync *const geom_barrier,
+                              struct vk_sync *const frag_barrier,
+                              struct vk_sync **const geom_completion,
+                              struct vk_sync **const frag_completion,
+                              struct vk_sync **const waits,
+                              const uint32_t wait_count,
+                              uint32_t *const stage_flags)
+{
+   struct vk_sync *geom_sync = NULL;
+   struct vk_sync *frag_sync = NULL;
+   VkResult result;
+
+   /* Completion syncs are optional per stage and may only be requested along
+    * with the matching barrier. A sync is created only when one is requested.
+    */
+   assert(geom_barrier || !geom_completion);
+   assert(frag_barrier || !frag_completion);
+
+   if (geom_completion) {
+      result = vk_sync_create(&device->vk,
+                              &device->pdevice->ws->syncobj_type,
+                              0U,
+                              0UL,
+                              &geom_sync);
+      if (result != VK_SUCCESS)
+         goto err_out;
+   }
+
+   if (frag_completion) {
+      result = vk_sync_create(&device->vk,
+                              &device->pdevice->ws->syncobj_type,
+                              0U,
+                              0UL,
+                              &frag_sync);
+      if (result != VK_SUCCESS)
+         goto err_destroy_sync_geom;
+   }
+
+   result = pvr_render_job_submit(gfx_ctx,
+                                  job,
+                                  geom_barrier,
+                                  frag_barrier,
+                                  waits,
+                                  wait_count,
+                                  stage_flags,
+                                  geom_sync,
+                                  frag_sync);
+   if (result != VK_SUCCESS)
+      goto err_destroy_sync_frag;
+
+   /* Replace the completion fences. */
+   if (geom_sync) {
+      if (*geom_completion)
+         vk_sync_destroy(&device->vk, *geom_completion);
+
+      *geom_completion = geom_sync;
+   }
+
+   if (frag_sync) {
+      if (*frag_completion)
+         vk_sync_destroy(&device->vk, *frag_completion);
+
+      *frag_completion = frag_sync;
+   }
+
+   return VK_SUCCESS;
+
+err_destroy_sync_frag:
+   if (frag_sync)
+      vk_sync_destroy(&device->vk, frag_sync);
+
+err_destroy_sync_geom:
+   if (geom_sync)
+      vk_sync_destroy(&device->vk, geom_sync);
+
+err_out:
+   return result;
+}
+
+static VkResult
+pvr_process_split_graphics_cmd(struct pvr_device *const device,
+                               struct pvr_render_ctx *const gfx_ctx,
+                               struct pvr_sub_cmd_gfx *sub_cmd,
+                               struct vk_sync *const geom_barrier,
+                               struct vk_sync *const frag_barrier,
+                               struct vk_sync **const geom_completion,
+                               struct vk_sync **const frag_completion,
+                               struct vk_sync **const waits,
+                               const uint32_t wait_count,
+                               uint32_t *const stage_flags)
+{
+   struct pvr_render_job *const job = &sub_cmd->job;
+   const pvr_dev_addr_t original_ctrl_stream_addr = job->ctrl_stream_addr;
+   const bool original_geometry_terminate = job->geometry_terminate;
+   const bool original_run_frag = job->run_frag;
+   VkResult result;
+
+   /* First submit must not touch fragment work. */
+   job->geometry_terminate = false;
+   job->run_frag = false;
+
+   result = pvr_process_graphics_cmd_part(device,
+                                          gfx_ctx,
+                                          job,
+                                          geom_barrier,
+                                          NULL,
+                                          geom_completion,
+                                          NULL,
+                                          waits,
+                                          wait_count,
+                                          stage_flags);
+
+   job->geometry_terminate = original_geometry_terminate;
+   job->run_frag = original_run_frag;
+
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* Second submit contains only a trivial control stream to terminate the
+    * geometry work.
+    */
+   assert(sub_cmd->terminate_ctrl_stream);
+   job->ctrl_stream_addr = sub_cmd->terminate_ctrl_stream->vma->dev_addr;
+
+   result = pvr_process_graphics_cmd_part(device,
+                                          gfx_ctx,
+                                          job,
+                                          NULL,
+                                          frag_barrier,
+                                          NULL,
+                                          frag_completion,
+                                          waits,
+                                          wait_count,
+                                          stage_flags);
+
+   job->ctrl_stream_addr = original_ctrl_stream_addr;
+
+   return result;
+}
+
 static VkResult
 pvr_process_graphics_cmd(struct pvr_device *device,
                          struct pvr_queue *queue,
@@ -217,66 +361,39 @@ pvr_process_graphics_cmd(struct pvr_device *device,
                          uint32_t *stage_flags,
                          struct vk_sync *completions[static PVR_JOB_TYPE_MAX])
 {
-   const struct pvr_framebuffer *framebuffer = sub_cmd->framebuffer;
-   struct vk_sync *sync_geom;
-   struct vk_sync *sync_frag;
-   VkResult result;
-
-   result = vk_sync_create(&device->vk,
-                           &device->pdevice->ws->syncobj_type,
-                           0U,
-                           0UL,
-                           &sync_geom);
-   if (result != VK_SUCCESS)
-      return result;
-
-   result = vk_sync_create(&device->vk,
-                           &device->pdevice->ws->syncobj_type,
-                           0U,
-                           0UL,
-                           &sync_frag);
-   if (result != VK_SUCCESS) {
-      vk_sync_destroy(&device->vk, sync_geom);
-      return result;
-   }
-
    /* FIXME: DoShadowLoadOrStore() */
 
-   /* FIXME: If the framebuffer being rendered to has multiple layers then we
-    * need to split submissions that run a fragment job into two.
+   /* Perform two render submits when using multiple framebuffer layers. The
+    * first submit contains just geometry, while the second only terminates
+    * (and triggers the fragment render if originally specified). This is needed
+    * because the render target cache gets cleared on terminating submits, which
+    * could result in missing primitives.
     */
-   if (sub_cmd->job.run_frag && framebuffer->layers > 1)
-      pvr_finishme("Split job submission for framebuffers with > 1 layers");
-
-   result = pvr_render_job_submit(queue->gfx_ctx,
-                                  &sub_cmd->job,
-                                  barrier_geom,
-                                  barrier_frag,
-                                  waits,
-                                  wait_count,
-                                  stage_flags,
-                                  sync_geom,
-                                  sync_frag);
-   if (result != VK_SUCCESS) {
-      vk_sync_destroy(&device->vk, sync_geom);
-      vk_sync_destroy(&device->vk, sync_frag);
-      return result;
+   if (pvr_sub_cmd_gfx_requires_split_submit(sub_cmd)) {
+      return pvr_process_split_graphics_cmd(device,
+                                            queue->gfx_ctx,
+                                            sub_cmd,
+                                            barrier_geom,
+                                            barrier_frag,
+                                            &completions[PVR_JOB_TYPE_GEOM],
+                                            &completions[PVR_JOB_TYPE_FRAG],
+                                            waits,
+                                            wait_count,
+                                            stage_flags);
    }
 
-   /* Replace the completion fences. */
-   if (completions[PVR_JOB_TYPE_GEOM])
-      vk_sync_destroy(&device->vk, completions[PVR_JOB_TYPE_GEOM]);
-
-   completions[PVR_JOB_TYPE_GEOM] = sync_geom;
-
-   if (completions[PVR_JOB_TYPE_FRAG])
-      vk_sync_destroy(&device->vk, completions[PVR_JOB_TYPE_FRAG]);
-
-   completions[PVR_JOB_TYPE_FRAG] = sync_frag;
+   return pvr_process_graphics_cmd_part(device,
+                                        queue->gfx_ctx,
+                                        &sub_cmd->job,
+                                        barrier_geom,
+                                        barrier_frag,
+                                        &completions[PVR_JOB_TYPE_GEOM],
+                                        &completions[PVR_JOB_TYPE_FRAG],
+                                        waits,
+                                        wait_count,
+                                        stage_flags);
 
    /* FIXME: DoShadowLoadOrStore() */
-
-   return result;
 }
 
 static VkResult