diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index 4b65f8567b7..06942023b02 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -201,7 +201,7 @@ radv_sqtt_reloc_graphics_shaders(struct radv_device *device,
    }
 
    if (device->shader_use_invisible_vram) {
-      if (!radv_shader_dma_submit(device, submission, NULL))
+      if (!radv_shader_dma_submit(device, submission, &pipeline->base.shader_upload_seq))
         return VK_ERROR_UNKNOWN;
    }
 
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index b9ba19c41e7..cb25f1e60af 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -420,6 +420,7 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
    cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
    cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
    cmd_buffer->ace_internal.sem.va = 0;
+   cmd_buffer->shader_upload_seq = 0;
 
    if (cmd_buffer->upload.upload_bo)
       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
@@ -1848,6 +1849,8 @@ radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader
    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
                             ps_epilog->va, false);
 
+   cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
+
    cmd_buffer->state.emitted_ps_epilog = ps_epilog;
 }
 
@@ -3905,6 +3908,8 @@ radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirt
    emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
    emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
 
+   cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, prolog->upload_seq);
+
    cmd_buffer->state.emitted_vs_prolog = prolog;
 
    if (unlikely(cmd_buffer->device->trace_bo))
@@ -6374,6 +6379,10 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
       assert(!"invalid bind point");
       break;
    }
+
+   if (cmd_buffer->device->shader_use_invisible_vram)
+      cmd_buffer->shader_upload_seq =
+         MAX2(cmd_buffer->shader_upload_seq, pipeline->shader_upload_seq);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -7153,6 +7162,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
       if (secondary->gds_oa_needed)
         primary->gds_oa_needed = true;
 
+      primary->shader_upload_seq = MAX2(primary->shader_upload_seq, secondary->shader_upload_seq);
+
      if (!secondary->state.render.has_image_views && primary->state.render.active &&
          (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
         /* Emit the framebuffer state from primary if secondary
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 14f139e1a19..ff00b2d1a1a 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -4998,6 +4998,13 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv
    pipeline->base.push_constant_size = pipeline_layout.push_constant_size;
    pipeline->base.dynamic_offset_count = pipeline_layout.dynamic_offset_count;
 
+   for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
+      if (pipeline->base.shaders[i]) {
+         pipeline->base.shader_upload_seq = MAX2(pipeline->base.shader_upload_seq,
+                                                 pipeline->base.shaders[i]->upload_seq);
+      }
+   }
+
    if (extra) {
       radv_pipeline_init_extra(pipeline, extra, &blend, &state, &vgt_gs_out_prim_type);
    }
@@ -5264,6 +5271,8 @@ radv_compute_pipeline_init(struct radv_compute_pipeline *pipeline,
    pipeline->base.push_constant_size = layout->push_constant_size;
    pipeline->base.dynamic_offset_count = layout->dynamic_offset_count;
 
+   pipeline->base.shader_upload_seq = pipeline->base.shaders[MESA_SHADER_COMPUTE]->upload_seq;
+
    if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) {
       struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
       unsigned *cs_block_size = compute_shader->info.cs.block_size;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index adb3a1ce7f1..392e0c445ed 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -839,6 +839,8 @@ struct radv_queue {
    struct radv_queue_state state;
    struct radv_queue_state *ace_internal_state;
    struct radeon_winsys_bo *gang_sem_bo;
+
+   uint64_t last_shader_upload_seq;
 };
 
 int radv_queue_init(struct radv_device *device, struct radv_queue *queue, int idx,
@@ -1771,6 +1773,8 @@ struct radv_cmd_buffer {
       struct radv_video_session *vid;
       struct radv_video_session_params *params;
    } video;
+
+   uint64_t shader_upload_seq;
 };
 
 extern const struct vk_command_buffer_ops radv_cmd_buffer_ops;
@@ -2154,6 +2158,8 @@ struct radv_pipeline {
    struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES];
    struct radv_shader *gs_copy_shader;
 
+   uint64_t shader_upload_seq;
+
    struct radeon_cmdbuf cs;
    uint32_t ctx_cs_hash;
    struct radeon_cmdbuf ctx_cs;
diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c
index 68d1537b734..5e7bfb5b741 100644
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -28,6 +28,8 @@
 #include "radv_cs.h"
 #include "radv_debug.h"
 #include "radv_private.h"
+#include "vk_sync.h"
+#include "vk_semaphore.h"
 
 /* The number of IBs per submit isn't infinite, it depends on the IP type
  * (ie. some initial setup needed for a submit) and the number of IBs (4 DW).
@@ -1563,6 +1565,19 @@ radv_create_perf_counter_lock_cs(struct radv_device *device, unsigned pass, bool
    return *cs_ref;
 }
 
+static void
+radv_get_shader_upload_sync_wait(struct radv_device *device, uint64_t shader_upload_seq,
+                                 struct vk_sync_wait *out_sync_wait)
+{
+   struct vk_semaphore *semaphore = vk_semaphore_from_handle(device->shader_upload_sem);
+   struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
+   *out_sync_wait = (struct vk_sync_wait){
+      .sync = sync,
+      .wait_value = shader_upload_seq,
+      .stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+   };
+}
+
 static VkResult
 radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submission)
 {
@@ -1571,6 +1586,9 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
    bool use_ace = false;
    bool use_perf_counters = false;
    VkResult result;
+   uint64_t shader_upload_seq = 0;
+   uint32_t wait_count = submission->wait_count;
+   struct vk_sync_wait *waits = submission->waits;
 
    result = radv_update_preambles(&queue->state, queue->device, submission->command_buffers,
                                   submission->command_buffer_count, &use_perf_counters, &use_ace);
@@ -1600,6 +1618,27 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
    if (queue->device->trace_bo)
       simple_mtx_lock(&queue->device->trace_mtx);
 
+   for (uint32_t j = 0; j < submission->command_buffer_count; j++) {
+      struct radv_cmd_buffer *cmd_buffer = (struct radv_cmd_buffer *)submission->command_buffers[j];
+      shader_upload_seq = MAX2(shader_upload_seq, cmd_buffer->shader_upload_seq);
+   }
+
+   if (shader_upload_seq > queue->last_shader_upload_seq) {
+      /* Patch the wait array to add waiting for referenced shaders to upload. */
+      struct vk_sync_wait *new_waits = malloc(sizeof(struct vk_sync_wait) * (wait_count + 1));
+      if (!new_waits) {
+         result = VK_ERROR_OUT_OF_HOST_MEMORY;
+         goto fail;
+      }
+
+      memcpy(new_waits, submission->waits, sizeof(struct vk_sync_wait) * submission->wait_count);
+      radv_get_shader_upload_sync_wait(queue->device, shader_upload_seq,
+                                       &new_waits[submission->wait_count]);
+
+      waits = new_waits;
+      wait_count += 1;
+   }
+
    struct radeon_cmdbuf *perf_ctr_lock_cs = NULL;
    struct radeon_cmdbuf *perf_ctr_unlock_cs = NULL;
 
@@ -1625,7 +1664,7 @@
    /* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished
     * before starting the next cmdbuffer, so we need to do it here.
     */
-   const bool need_wait = submission->wait_count > 0;
+   const bool need_wait = wait_count > 0;
 
    unsigned num_preambles = 0;
    struct radeon_cmdbuf *preambles[4] = {0};
@@ -1700,7 +1739,7 @@
       submit.preamble_count = submit_ace ? num_preambles : num_1q_preambles;
 
       result = queue->device->ws->cs_submit(
-         ctx, &submit, j == 0 ? submission->wait_count : 0, submission->waits,
+         ctx, &submit, j == 0 ? wait_count : 0, waits,
          last_submit ? submission->signal_count : 0, submission->signals, can_patch);
 
       if (result != VK_SUCCESS)
@@ -1718,8 +1757,13 @@
       preambles[1] = !use_ace ? NULL : queue->ace_internal_state->initial_preamble_cs;
    }
 
+   queue->last_shader_upload_seq =
+      MAX2(queue->last_shader_upload_seq, shader_upload_seq);
+
 fail:
    free(cs_array);
+   if (waits != submission->waits)
+      free(waits);
 
    if (queue->device->trace_bo)
       simple_mtx_unlock(&queue->device->trace_mtx);
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 7c3c8b0c350..a1c356a956f 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2474,7 +2474,7 @@ radv_shader_create(struct radv_device *device, const struct radv_shader_binary *
         return NULL;
      }
 
-      if (!radv_shader_dma_submit(device, submission, NULL))
+      if (!radv_shader_dma_submit(device, submission, &shader->upload_seq))
         return NULL;
   } else {
      void *dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset;
@@ -2535,7 +2535,7 @@ radv_shader_part_binary_upload(struct radv_device *device, struct radv_shader_pa
       ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
 
    if (device->shader_use_invisible_vram) {
-      if (!radv_shader_dma_submit(device, submission, NULL))
+      if (!radv_shader_dma_submit(device, submission, &shader_part->upload_seq))
         return false;
    }
 
@@ -3011,6 +3011,11 @@ radv_shader_destroy(struct radv_device *device, struct radv_shader *shader)
 {
    assert(shader->ref_count == 0);
 
+   if (device->shader_use_invisible_vram) {
+      /* Wait for any pending upload to complete, or we'll be writing into freed shader memory. */
+      radv_shader_wait_for_upload(device, shader->upload_seq);
+   }
+
    radv_free_shader_memory(device, shader->alloc);
 
    free(shader->code);
@@ -3027,6 +3032,11 @@ radv_shader_part_destroy(struct radv_device *device, struct radv_shader_part *sh
 {
    assert(shader_part->ref_count == 0);
 
+   if (device->shader_use_invisible_vram) {
+      /* Wait for any pending upload to complete, or we'll be writing into freed shader memory. */
+      radv_shader_wait_for_upload(device, shader_part->upload_seq);
+   }
+
    if (shader_part->alloc)
       radv_free_shader_memory(device, shader_part->alloc);
    free(shader_part->binary);
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 9b0b91fb716..8f327d307f3 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -492,6 +492,8 @@ struct radv_shader {
    union radv_shader_arena_block *alloc;
    uint64_t va;
 
+   uint64_t upload_seq;
+
    struct ac_shader_config config;
    uint32_t code_size;
    uint32_t exec_size;
@@ -521,6 +523,7 @@ struct radv_shader_part {
    uint8_t num_preserved_sgprs;
    bool nontrivial_divisors;
    uint32_t spi_shader_col_format;
+   uint64_t upload_seq;
 
    struct radv_shader_part_binary *binary;
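
Note: the destroy paths above call radv_shader_wait_for_upload(), which is not defined in this diff. For illustration only, below is a minimal sketch of how such a wait could be built from the same vk_sync helpers the diff already uses in radv_get_shader_upload_sync_wait(). It assumes device->shader_upload_sem is the timeline semaphore that radv_shader_dma_submit() signals, and that an upload_seq of 0 means the shader never went through the DMA upload path; the function name itself is hypothetical, not the in-tree implementation.

#include "radv_private.h"
#include "vk_semaphore.h"
#include "vk_sync.h"

/* Hypothetical sketch of a blocking upload wait, not the in-tree code. */
static VkResult
radv_shader_wait_for_upload_sketch(struct radv_device *device, uint64_t upload_seq)
{
   /* Shaders uploaded through host-visible memory never receive a sequence
    * number, so there is nothing to wait for. */
   if (!upload_seq)
      return VK_SUCCESS;

   /* Block on the CPU until the DMA queue has signaled the device-wide
    * timeline semaphore up to this shader's upload sequence number. */
   struct vk_semaphore *semaphore = vk_semaphore_from_handle(device->shader_upload_sem);
   struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
   return vk_sync_wait(&device->vk, sync, upload_seq, VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
}

The design point worth noting: queue submission waits on the same semaphore asynchronously, by patching a vk_sync_wait entry into the submission's wait array, while destruction blocks on the host so shader memory is never freed while the DMA engine might still be writing it.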