diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index a1e9a1abae9..d1bd465849a 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -756,7 +756,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
 
 /* Optimization: there is no reason to load gmem if there is no
  * geometry to process. COND_REG_EXEC predicate is set here,
- * but the actual skip happens in tu6_emit_tile_load() and tile_store_cs,
+ * but the actual skip happens in tu_load_gmem_attachment() and tile_store_cs,
  * for each blit separately.
  */
 static void
@@ -958,17 +958,6 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
    }
 }
 
-static void
-tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
-   tu6_emit_blit_scissor(cmd, cs, true);
-
-   const bool cond_exec_allowed = cmd->state.tiling->binning &&
-                                  cmd->state.pass->has_cond_load_store;
-   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
-      tu_load_gmem_attachment(cmd, cs, i, cond_exec_allowed, false);
-}
-
 static void
 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -1466,23 +1455,12 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd)
 {
    struct tu_cs *cs = &cmd->draw_cs;
 
-   if (cmd->state.pass->has_fdm)
-      tu_cs_set_writeable(cs, true);
-
-   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
-
-   tu6_emit_tile_load(cmd, cs);
-
-   tu6_emit_blit_scissor(cmd, cs, false);
-
-   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
-      tu_clear_gmem_attachment(cmd, cs, i);
-
-   tu_cond_exec_end(cs);
-
-   if (cmd->state.pass->has_fdm)
-      tu_cs_set_writeable(cs, false);
-
+   /* Emit sysmem clears, all of which we do in one cond block at the
+    * beginning of the render pass.
+    *
+    * gmem loads and clears happen per-subpass, so we can reuse gmem space
+    * between attachments in separate subpasses.
+    */
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
 
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
@@ -3600,13 +3578,64 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
    tu_flush_for_stage(cache, src_stage, dst_stage);
 }
 
-/* emit mrt/zs/msaa/ubwc state for the subpass that is starting (either at
- * vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
+static void
+tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
+{
+   struct tu_cs *cs = &cmd->draw_cs;
+   uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
+
+   /* If we might choose to bin, then put the loads under a check for geometry
+    * having been binned to this tile. If we don't choose to bin in the end,
+    * then we will have manually set those registers to say geometry is present.
+    *
+    * However, if the draw CS has a write to the condition for some other reason
+    * (perf queries), then we can't do this optimization since the
+    * start-of-the-CS geometry condition will have been overwritten.
+    */
+   bool cond_load_allowed = cmd->state.tiling->binning &&
+                            cmd->state.pass->has_cond_load_store &&
+                            !cmd->state.rp.draw_cs_writes_to_cond_pred;
+
+   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
+
+   /* Emit gmem loads that are first used in this subpass. */
+   bool emitted_scissor = false;
+   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
+      struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
+      if ((att->load || att->load_stencil) && att->first_subpass_idx == subpass_idx) {
+         if (!emitted_scissor) {
+            tu6_emit_blit_scissor(cmd, cs, true);
+            emitted_scissor = true;
+         }
+         tu_load_gmem_attachment(cmd, cs, i, cond_load_allowed, false);
+      }
+   }
+
+   /* Emit gmem clears that are first used in this subpass. */
+   emitted_scissor = false;
+   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
+      struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
+      if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
+         if (!emitted_scissor) {
+            tu6_emit_blit_scissor(cmd, cs, false);
+            emitted_scissor = true;
+         }
+         tu_clear_gmem_attachment(cmd, cs, i);
+      }
+   }
+
+   tu_cond_exec_end(cs); /* CP_COND_EXEC_0_RENDER_MODE_GMEM */
+}
+
+/* emit gmem loads/clears, and mrt/zs/msaa/ubwc state for the subpass that is
+ * starting (either at vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
  */
 static void
 tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
 {
    tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);
+
+   tu_emit_subpass_begin_gmem(cmd);
    tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
diff --git a/src/freedreno/vulkan/tu_pass.cc b/src/freedreno/vulkan/tu_pass.cc
index 38331f848a2..d6fa6fa9806 100644
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@@ -796,11 +796,8 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const
    update_samples(subpass, pCreateInfo->pAttachments[a].samples);
    att->clear_views |= subpass->multiview_mask;
 
-   /* Loads and clears are emitted at vkBeginRenderPass() time. */
-   if (att->clear_mask || att->load || att->load_stencil)
-      att->first_subpass_idx = 0;
-   else
-      att->first_subpass_idx = MIN2(i, att->first_subpass_idx);
+   /* Loads and clears are emitted at the start of the subpass that needs them. */
+   att->first_subpass_idx = MIN2(i, att->first_subpass_idx);
 
    /* Stores are emitted at vkEndRenderPass() time. */
    if (att->store || att->store_stencil)
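
For readers skimming the patch, the core of the change is the first_subpass_idx scheme: each attachment records the earliest subpass that uses it (tu_pass.cc hunk), and its gmem load/clear is emitted when that subpass begins (tu_emit_subpass_begin_gmem) instead of once at render pass begin. Below is a minimal standalone C++ sketch of that idea; Attachment, use_attachment, and emit_subpass_begin are illustrative stand-ins invented for this example, not the turnip API.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

/* Illustrative stand-in for tu_render_pass_attachment (not turnip code). */
struct Attachment {
   bool load = false;                       /* needs a gmem load */
   bool clear = false;                      /* needs a gmem clear */
   uint32_t first_subpass_idx = UINT32_MAX; /* lowered by each use */
};

/* Pass-creation time: every subpass that uses the attachment lowers
 * first_subpass_idx, mirroring att->first_subpass_idx = MIN2(i, ...). */
static void use_attachment(Attachment &att, uint32_t subpass_idx)
{
   att.first_subpass_idx = std::min(subpass_idx, att.first_subpass_idx);
}

/* Record time: only attachments whose first use is this subpass get their
 * load/clear emitted here, so gmem space can be reused across subpasses. */
static void emit_subpass_begin(const std::vector<Attachment> &atts,
                               uint32_t subpass_idx)
{
   for (uint32_t i = 0; i < atts.size(); ++i) {
      const Attachment &att = atts[i];
      if (att.first_subpass_idx != subpass_idx)
         continue;
      if (att.load)
         printf("subpass %u: load attachment %u into gmem\n", subpass_idx, i);
      if (att.clear)
         printf("subpass %u: clear attachment %u in gmem\n", subpass_idx, i);
   }
}

int main()
{
   std::vector<Attachment> atts(2);
   atts[0].clear = true;       /* color target, cleared, used from subpass 0 */
   atts[1].load = true;        /* e.g. a loaded input, first used in subpass 1 */
   use_attachment(atts[0], 0);
   use_attachment(atts[0], 1);
   use_attachment(atts[1], 1);
   for (uint32_t s = 0; s < 2; ++s)
      emit_subpass_begin(atts, s);
   /* Prints:
    *   subpass 0: clear attachment 0 in gmem
    *   subpass 1: load attachment 1 into gmem
    */
}

Emitting at first use rather than unconditionally at render pass begin is what allows gmem space to be shared by attachments whose subpass ranges do not overlap, as the comment added to tu_emit_renderpass_begin notes.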