radv: Wait for shader uploads asynchronously.

This introduces tracking of the required semaphore values in pipelines,
which is then propagated to cmd_buffers on bind. Each queue also keeps
track of the maximum count it has waited for, so that we can avoid the
waiting overhead once all the shaders are loaded and referenced.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16271>
This commit is contained in:
Tatsuyuki Ishi
2022-07-12 17:25:00 +09:00
committed by Marge Bot
parent a8c5fd3b1b
commit 0cde42a506
7 changed files with 88 additions and 5 deletions

View File

@@ -201,7 +201,7 @@ radv_sqtt_reloc_graphics_shaders(struct radv_device *device,
} }
if (device->shader_use_invisible_vram) { if (device->shader_use_invisible_vram) {
if (!radv_shader_dma_submit(device, submission, NULL)) if (!radv_shader_dma_submit(device, submission, &pipeline->base.shader_upload_seq))
return VK_ERROR_UNKNOWN; return VK_ERROR_UNKNOWN;
} }

View File

@@ -420,6 +420,7 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
cmd_buffer->ace_internal.sem.gfx2ace_value = 0; cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0; cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
cmd_buffer->ace_internal.sem.va = 0; cmd_buffer->ace_internal.sem.va = 0;
cmd_buffer->shader_upload_seq = 0;
if (cmd_buffer->upload.upload_bo) if (cmd_buffer->upload.upload_bo)
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
@@ -1848,6 +1849,8 @@ radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader
radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
ps_epilog->va, false); ps_epilog->va, false);
cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
cmd_buffer->state.emitted_ps_epilog = ps_epilog; cmd_buffer->state.emitted_ps_epilog = ps_epilog;
} }
@@ -3905,6 +3908,8 @@ radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirt
emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty); emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty); emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, prolog->upload_seq);
cmd_buffer->state.emitted_vs_prolog = prolog; cmd_buffer->state.emitted_vs_prolog = prolog;
if (unlikely(cmd_buffer->device->trace_bo)) if (unlikely(cmd_buffer->device->trace_bo))
@@ -6374,6 +6379,10 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
assert(!"invalid bind point"); assert(!"invalid bind point");
break; break;
} }
if (cmd_buffer->device->shader_use_invisible_vram)
cmd_buffer->shader_upload_seq =
MAX2(cmd_buffer->shader_upload_seq, pipeline->shader_upload_seq);
} }
VKAPI_ATTR void VKAPI_CALL VKAPI_ATTR void VKAPI_CALL
@@ -7153,6 +7162,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
if (secondary->gds_oa_needed) if (secondary->gds_oa_needed)
primary->gds_oa_needed = true; primary->gds_oa_needed = true;
primary->shader_upload_seq = MAX2(primary->shader_upload_seq, secondary->shader_upload_seq);
if (!secondary->state.render.has_image_views && primary->state.render.active && if (!secondary->state.render.has_image_views && primary->state.render.active &&
(primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) { (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
/* Emit the framebuffer state from primary if secondary /* Emit the framebuffer state from primary if secondary

View File

@@ -4998,6 +4998,13 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv
pipeline->base.push_constant_size = pipeline_layout.push_constant_size; pipeline->base.push_constant_size = pipeline_layout.push_constant_size;
pipeline->base.dynamic_offset_count = pipeline_layout.dynamic_offset_count; pipeline->base.dynamic_offset_count = pipeline_layout.dynamic_offset_count;
for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
if (pipeline->base.shaders[i]) {
pipeline->base.shader_upload_seq = MAX2(pipeline->base.shader_upload_seq,
pipeline->base.shaders[i]->upload_seq);
}
}
if (extra) { if (extra) {
radv_pipeline_init_extra(pipeline, extra, &blend, &state, &vgt_gs_out_prim_type); radv_pipeline_init_extra(pipeline, extra, &blend, &state, &vgt_gs_out_prim_type);
} }
@@ -5264,6 +5271,8 @@ radv_compute_pipeline_init(struct radv_compute_pipeline *pipeline,
pipeline->base.push_constant_size = layout->push_constant_size; pipeline->base.push_constant_size = layout->push_constant_size;
pipeline->base.dynamic_offset_count = layout->dynamic_offset_count; pipeline->base.dynamic_offset_count = layout->dynamic_offset_count;
pipeline->base.shader_upload_seq = pipeline->base.shaders[MESA_SHADER_COMPUTE]->upload_seq;
if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) { if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) {
struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE]; struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
unsigned *cs_block_size = compute_shader->info.cs.block_size; unsigned *cs_block_size = compute_shader->info.cs.block_size;

View File

@@ -839,6 +839,8 @@ struct radv_queue {
struct radv_queue_state state; struct radv_queue_state state;
struct radv_queue_state *ace_internal_state; struct radv_queue_state *ace_internal_state;
struct radeon_winsys_bo *gang_sem_bo; struct radeon_winsys_bo *gang_sem_bo;
uint64_t last_shader_upload_seq;
}; };
int radv_queue_init(struct radv_device *device, struct radv_queue *queue, int idx, int radv_queue_init(struct radv_device *device, struct radv_queue *queue, int idx,
@@ -1771,6 +1773,8 @@ struct radv_cmd_buffer {
struct radv_video_session *vid; struct radv_video_session *vid;
struct radv_video_session_params *params; struct radv_video_session_params *params;
} video; } video;
uint64_t shader_upload_seq;
}; };
extern const struct vk_command_buffer_ops radv_cmd_buffer_ops; extern const struct vk_command_buffer_ops radv_cmd_buffer_ops;
@@ -2154,6 +2158,8 @@ struct radv_pipeline {
struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES]; struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES];
struct radv_shader *gs_copy_shader; struct radv_shader *gs_copy_shader;
uint64_t shader_upload_seq;
struct radeon_cmdbuf cs; struct radeon_cmdbuf cs;
uint32_t ctx_cs_hash; uint32_t ctx_cs_hash;
struct radeon_cmdbuf ctx_cs; struct radeon_cmdbuf ctx_cs;

View File

@@ -28,6 +28,8 @@
#include "radv_cs.h" #include "radv_cs.h"
#include "radv_debug.h" #include "radv_debug.h"
#include "radv_private.h" #include "radv_private.h"
#include "vk_sync.h"
#include "vk_semaphore.h"
/* The number of IBs per submit isn't infinite, it depends on the IP type /* The number of IBs per submit isn't infinite, it depends on the IP type
* (ie. some initial setup needed for a submit) and the number of IBs (4 DW). * (ie. some initial setup needed for a submit) and the number of IBs (4 DW).
@@ -1563,6 +1565,19 @@ radv_create_perf_counter_lock_cs(struct radv_device *device, unsigned pass, bool
return *cs_ref; return *cs_ref;
} }
static void
radv_get_shader_upload_sync_wait(struct radv_device *device, uint64_t shader_upload_seq,
struct vk_sync_wait *out_sync_wait)
{
struct vk_semaphore *semaphore = vk_semaphore_from_handle(device->shader_upload_sem);
struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
*out_sync_wait = (struct vk_sync_wait){
.sync = sync,
.wait_value = shader_upload_seq,
.stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
};
}
static VkResult static VkResult
radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submission) radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submission)
{ {
@@ -1571,6 +1586,9 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
bool use_ace = false; bool use_ace = false;
bool use_perf_counters = false; bool use_perf_counters = false;
VkResult result; VkResult result;
uint64_t shader_upload_seq = 0;
uint32_t wait_count = submission->wait_count;
struct vk_sync_wait *waits = submission->waits;
result = radv_update_preambles(&queue->state, queue->device, submission->command_buffers, result = radv_update_preambles(&queue->state, queue->device, submission->command_buffers,
submission->command_buffer_count, &use_perf_counters, &use_ace); submission->command_buffer_count, &use_perf_counters, &use_ace);
@@ -1600,6 +1618,27 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
if (queue->device->trace_bo) if (queue->device->trace_bo)
simple_mtx_lock(&queue->device->trace_mtx); simple_mtx_lock(&queue->device->trace_mtx);
for (uint32_t j = 0; j < submission->command_buffer_count; j++) {
struct radv_cmd_buffer *cmd_buffer = (struct radv_cmd_buffer *)submission->command_buffers[j];
shader_upload_seq = MAX2(shader_upload_seq, cmd_buffer->shader_upload_seq);
}
if (shader_upload_seq > queue->last_shader_upload_seq) {
/* Patch the wait array to add waiting for referenced shaders to upload. */
struct vk_sync_wait *new_waits = malloc(sizeof(struct vk_sync_wait) * (wait_count + 1));
if (!new_waits) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail;
}
memcpy(new_waits, submission->waits, sizeof(struct vk_sync_wait) * submission->wait_count);
radv_get_shader_upload_sync_wait(queue->device, shader_upload_seq,
&new_waits[submission->wait_count]);
waits = new_waits;
wait_count += 1;
}
struct radeon_cmdbuf *perf_ctr_lock_cs = NULL; struct radeon_cmdbuf *perf_ctr_lock_cs = NULL;
struct radeon_cmdbuf *perf_ctr_unlock_cs = NULL; struct radeon_cmdbuf *perf_ctr_unlock_cs = NULL;
@@ -1625,7 +1664,7 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
/* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished /* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished
* before starting the next cmdbuffer, so we need to do it here. * before starting the next cmdbuffer, so we need to do it here.
*/ */
const bool need_wait = submission->wait_count > 0; const bool need_wait = wait_count > 0;
unsigned num_preambles = 0; unsigned num_preambles = 0;
struct radeon_cmdbuf *preambles[4] = {0}; struct radeon_cmdbuf *preambles[4] = {0};
@@ -1700,7 +1739,7 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
submit.preamble_count = submit_ace ? num_preambles : num_1q_preambles; submit.preamble_count = submit_ace ? num_preambles : num_1q_preambles;
result = queue->device->ws->cs_submit( result = queue->device->ws->cs_submit(
ctx, &submit, j == 0 ? submission->wait_count : 0, submission->waits, ctx, &submit, j == 0 ? wait_count : 0, waits,
last_submit ? submission->signal_count : 0, submission->signals, can_patch); last_submit ? submission->signal_count : 0, submission->signals, can_patch);
if (result != VK_SUCCESS) if (result != VK_SUCCESS)
@@ -1718,8 +1757,13 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
preambles[1] = !use_ace ? NULL : queue->ace_internal_state->initial_preamble_cs; preambles[1] = !use_ace ? NULL : queue->ace_internal_state->initial_preamble_cs;
} }
queue->last_shader_upload_seq =
MAX2(queue->last_shader_upload_seq, shader_upload_seq);
fail: fail:
free(cs_array); free(cs_array);
if (waits != submission->waits)
free(waits);
if (queue->device->trace_bo) if (queue->device->trace_bo)
simple_mtx_unlock(&queue->device->trace_mtx); simple_mtx_unlock(&queue->device->trace_mtx);

View File

@@ -2474,7 +2474,7 @@ radv_shader_create(struct radv_device *device, const struct radv_shader_binary *
return NULL; return NULL;
} }
if (!radv_shader_dma_submit(device, submission, NULL)) if (!radv_shader_dma_submit(device, submission, &shader->upload_seq))
return NULL; return NULL;
} else { } else {
void *dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset; void *dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset;
@@ -2535,7 +2535,7 @@ radv_shader_part_binary_upload(struct radv_device *device, struct radv_shader_pa
ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
if (device->shader_use_invisible_vram) { if (device->shader_use_invisible_vram) {
if (!radv_shader_dma_submit(device, submission, NULL)) if (!radv_shader_dma_submit(device, submission, &shader_part->upload_seq))
return false; return false;
} }
@@ -3011,6 +3011,11 @@ radv_shader_destroy(struct radv_device *device, struct radv_shader *shader)
{ {
assert(shader->ref_count == 0); assert(shader->ref_count == 0);
if (device->shader_use_invisible_vram) {
/* Wait for any pending upload to complete, or we'll be writing into freed shader memory. */
radv_shader_wait_for_upload(device, shader->upload_seq);
}
radv_free_shader_memory(device, shader->alloc); radv_free_shader_memory(device, shader->alloc);
free(shader->code); free(shader->code);
@@ -3027,6 +3032,11 @@ radv_shader_part_destroy(struct radv_device *device, struct radv_shader_part *sh
{ {
assert(shader_part->ref_count == 0); assert(shader_part->ref_count == 0);
if (device->shader_use_invisible_vram) {
/* Wait for any pending upload to complete, or we'll be writing into freed shader memory. */
radv_shader_wait_for_upload(device, shader_part->upload_seq);
}
if (shader_part->alloc) if (shader_part->alloc)
radv_free_shader_memory(device, shader_part->alloc); radv_free_shader_memory(device, shader_part->alloc);
free(shader_part->binary); free(shader_part->binary);

View File

@@ -492,6 +492,8 @@ struct radv_shader {
union radv_shader_arena_block *alloc; union radv_shader_arena_block *alloc;
uint64_t va; uint64_t va;
uint64_t upload_seq;
struct ac_shader_config config; struct ac_shader_config config;
uint32_t code_size; uint32_t code_size;
uint32_t exec_size; uint32_t exec_size;
@@ -521,6 +523,7 @@ struct radv_shader_part {
uint8_t num_preserved_sgprs; uint8_t num_preserved_sgprs;
bool nontrivial_divisors; bool nontrivial_divisors;
uint32_t spi_shader_col_format; uint32_t spi_shader_col_format;
uint64_t upload_seq;
struct radv_shader_part_binary *binary; struct radv_shader_part_binary *binary;