radv: Flush in the initial preamble CS.

Signed-off-by: Bas Nieuwenhuizen <basni@google.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
Bas Nieuwenhuizen
2017-02-20 09:26:00 +01:00
parent c121739c47
commit 5241fb0ffb
3 changed files with 156 additions and 98 deletions

View File

@@ -792,8 +792,10 @@ radv_queue_finish(struct radv_queue *queue)
if (queue->hw_ctx) if (queue->hw_ctx)
queue->device->ws->ctx_destroy(queue->hw_ctx); queue->device->ws->ctx_destroy(queue->hw_ctx);
if (queue->preamble_cs) if (queue->initial_preamble_cs)
queue->device->ws->cs_destroy(queue->preamble_cs); queue->device->ws->cs_destroy(queue->initial_preamble_cs);
if (queue->continue_preamble_cs)
queue->device->ws->cs_destroy(queue->continue_preamble_cs);
if (queue->descriptor_bo) if (queue->descriptor_bo)
queue->device->ws->buffer_destroy(queue->descriptor_bo); queue->device->ws->buffer_destroy(queue->descriptor_bo);
if (queue->scratch_bo) if (queue->scratch_bo)
@@ -939,6 +941,21 @@ VkResult radv_CreateDevice(
break; break;
} }
device->ws->cs_finalize(device->empty_cs[family]); device->ws->cs_finalize(device->empty_cs[family]);
device->flush_cs[family] = device->ws->cs_create(device->ws, family);
switch (family) {
case RADV_QUEUE_GENERAL:
case RADV_QUEUE_COMPUTE:
si_cs_emit_cache_flush(device->flush_cs[family],
device->physical_device->rad_info.chip_class,
family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
RADV_CMD_FLAG_INV_ICACHE |
RADV_CMD_FLAG_INV_SMEM_L1 |
RADV_CMD_FLAG_INV_VMEM_L1 |
RADV_CMD_FLAG_INV_GLOBAL_L2);
break;
}
device->ws->cs_finalize(device->flush_cs[family]);
} }
if (getenv("RADV_TRACE_FILE")) { if (getenv("RADV_TRACE_FILE")) {
@@ -995,6 +1012,8 @@ void radv_DestroyDevice(
vk_free(&device->alloc, device->queues[i]); vk_free(&device->alloc, device->queues[i]);
if (device->empty_cs[i]) if (device->empty_cs[i])
device->ws->cs_destroy(device->empty_cs[i]); device->ws->cs_destroy(device->empty_cs[i]);
if (device->flush_cs[i])
device->ws->cs_destroy(device->flush_cs[i]);
} }
radv_device_finish_meta(device); radv_device_finish_meta(device);
@@ -1192,25 +1211,25 @@ radv_get_preamble_cs(struct radv_queue *queue,
uint32_t compute_scratch_size, uint32_t compute_scratch_size,
uint32_t esgs_ring_size, uint32_t esgs_ring_size,
uint32_t gsvs_ring_size, uint32_t gsvs_ring_size,
struct radeon_winsys_cs **preamble_cs) struct radeon_winsys_cs **initial_preamble_cs,
struct radeon_winsys_cs **continue_preamble_cs)
{ {
struct radeon_winsys_bo *scratch_bo = NULL; struct radeon_winsys_bo *scratch_bo = NULL;
struct radeon_winsys_bo *descriptor_bo = NULL; struct radeon_winsys_bo *descriptor_bo = NULL;
struct radeon_winsys_bo *compute_scratch_bo = NULL; struct radeon_winsys_bo *compute_scratch_bo = NULL;
struct radeon_winsys_bo *esgs_ring_bo = NULL; struct radeon_winsys_bo *esgs_ring_bo = NULL;
struct radeon_winsys_bo *gsvs_ring_bo = NULL; struct radeon_winsys_bo *gsvs_ring_bo = NULL;
struct radeon_winsys_cs *cs = NULL; struct radeon_winsys_cs *dest_cs[2] = {0};
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) {
*preamble_cs = NULL;
return VK_SUCCESS;
}
if (scratch_size <= queue->scratch_size && if (scratch_size <= queue->scratch_size &&
compute_scratch_size <= queue->compute_scratch_size && compute_scratch_size <= queue->compute_scratch_size &&
esgs_ring_size <= queue->esgs_ring_size && esgs_ring_size <= queue->esgs_ring_size &&
gsvs_ring_size <= queue->gsvs_ring_size) { gsvs_ring_size <= queue->gsvs_ring_size &&
*preamble_cs = queue->preamble_cs; queue->initial_preamble_cs) {
*initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
*continue_preamble_cs = NULL;
return VK_SUCCESS; return VK_SUCCESS;
} }
@@ -1282,11 +1301,14 @@ radv_get_preamble_cs(struct radv_queue *queue,
} else } else
descriptor_bo = queue->descriptor_bo; descriptor_bo = queue->descriptor_bo;
for(int i = 0; i < 2; ++i) {
struct radeon_winsys_cs *cs = NULL;
cs = queue->device->ws->cs_create(queue->device->ws, cs = queue->device->ws->cs_create(queue->device->ws,
queue->queue_family_index ? RING_COMPUTE : RING_GFX); queue->queue_family_index ? RING_COMPUTE : RING_GFX);
if (!cs) if (!cs)
goto fail; goto fail;
dest_cs[i] = cs;
if (scratch_bo) if (scratch_bo)
queue->device->ws->cs_add_buffer(cs, scratch_bo, 8); queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
@@ -1363,13 +1385,29 @@ radv_get_preamble_cs(struct radv_queue *queue,
radeon_emit(cs, rsrc1); radeon_emit(cs, rsrc1);
} }
if (!i) {
si_cs_emit_cache_flush(cs,
queue->device->physical_device->rad_info.chip_class,
queue->queue_family_index == RING_COMPUTE &&
queue->device->physical_device->rad_info.chip_class >= CIK,
RADV_CMD_FLAG_INV_ICACHE |
RADV_CMD_FLAG_INV_SMEM_L1 |
RADV_CMD_FLAG_INV_VMEM_L1 |
RADV_CMD_FLAG_INV_GLOBAL_L2);
}
if (!queue->device->ws->cs_finalize(cs)) if (!queue->device->ws->cs_finalize(cs))
goto fail; goto fail;
}
if (queue->preamble_cs) if (queue->initial_preamble_cs)
queue->device->ws->cs_destroy(queue->preamble_cs); queue->device->ws->cs_destroy(queue->initial_preamble_cs);
queue->preamble_cs = cs; if (queue->continue_preamble_cs)
queue->device->ws->cs_destroy(queue->continue_preamble_cs);
queue->initial_preamble_cs = dest_cs[0];
queue->continue_preamble_cs = dest_cs[1];
if (scratch_bo != queue->scratch_bo) { if (scratch_bo != queue->scratch_bo) {
if (queue->scratch_bo) if (queue->scratch_bo)
@@ -1406,11 +1444,15 @@ radv_get_preamble_cs(struct radv_queue *queue,
queue->descriptor_bo = descriptor_bo; queue->descriptor_bo = descriptor_bo;
} }
*preamble_cs = cs; *initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
*continue_preamble_cs = NULL;
return VK_SUCCESS; return VK_SUCCESS;
fail: fail:
if (cs) for (int i = 0; i < ARRAY_SIZE(dest_cs); ++i)
queue->device->ws->cs_destroy(cs); if (dest_cs[i])
queue->device->ws->cs_destroy(dest_cs[i]);
if (descriptor_bo && descriptor_bo != queue->descriptor_bo) if (descriptor_bo && descriptor_bo != queue->descriptor_bo)
queue->device->ws->buffer_destroy(descriptor_bo); queue->device->ws->buffer_destroy(descriptor_bo);
if (scratch_bo && scratch_bo != queue->scratch_bo) if (scratch_bo && scratch_bo != queue->scratch_bo)
@@ -1439,7 +1481,7 @@ VkResult radv_QueueSubmit(
uint32_t scratch_size = 0; uint32_t scratch_size = 0;
uint32_t compute_scratch_size = 0; uint32_t compute_scratch_size = 0;
uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
struct radeon_winsys_cs *preamble_cs = NULL; struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
VkResult result; VkResult result;
bool fence_emitted = false; bool fence_emitted = false;
@@ -1458,13 +1500,16 @@ VkResult radv_QueueSubmit(
} }
} }
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs); result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
esgs_ring_size, gsvs_ring_size,
&initial_preamble_cs, &continue_preamble_cs);
if (result != VK_SUCCESS) if (result != VK_SUCCESS)
return result; return result;
for (uint32_t i = 0; i < submitCount; i++) { for (uint32_t i = 0; i < submitCount; i++) {
struct radeon_winsys_cs **cs_array; struct radeon_winsys_cs **cs_array;
bool can_patch = true; bool has_flush = !submitCount;
bool can_patch = !has_flush;
uint32_t advance; uint32_t advance;
if (!pSubmits[i].commandBufferCount) { if (!pSubmits[i].commandBufferCount) {
@@ -1487,29 +1532,32 @@ VkResult radv_QueueSubmit(
} }
cs_array = malloc(sizeof(struct radeon_winsys_cs *) * cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
pSubmits[i].commandBufferCount); (pSubmits[i].commandBufferCount + has_flush));
if(has_flush)
cs_array[0] = queue->device->flush_cs[queue->queue_family_index];
for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
pSubmits[i].pCommandBuffers[j]); pSubmits[i].pCommandBuffers[j]);
assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
cs_array[j] = cmd_buffer->cs; cs_array[j + has_flush] = cmd_buffer->cs;
if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
can_patch = false; can_patch = false;
} }
for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) { for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + has_flush; j += advance) {
advance = MIN2(max_cs_submission, advance = MIN2(max_cs_submission,
pSubmits[i].commandBufferCount - j); pSubmits[i].commandBufferCount + has_flush - j);
bool b = j == 0; bool b = j == 0;
bool e = j + advance == pSubmits[i].commandBufferCount; bool e = j + advance == pSubmits[i].commandBufferCount + has_flush;
if (queue->device->trace_bo) if (queue->device->trace_bo)
*queue->device->trace_id_ptr = 0; *queue->device->trace_id_ptr = 0;
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
advance, preamble_cs, preamble_cs, advance, initial_preamble_cs, continue_preamble_cs,
(struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores, (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
b ? pSubmits[i].waitSemaphoreCount : 0, b ? pSubmits[i].waitSemaphoreCount : 0,
(struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores, (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,

View File

@@ -479,7 +479,8 @@ struct radv_queue {
struct radeon_winsys_bo *compute_scratch_bo; struct radeon_winsys_bo *compute_scratch_bo;
struct radeon_winsys_bo *esgs_ring_bo; struct radeon_winsys_bo *esgs_ring_bo;
struct radeon_winsys_bo *gsvs_ring_bo; struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_cs *preamble_cs; struct radeon_winsys_cs *initial_preamble_cs;
struct radeon_winsys_cs *continue_preamble_cs;
}; };
struct radv_device { struct radv_device {
@@ -495,6 +496,7 @@ struct radv_device {
struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES]; struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
int queue_count[RADV_MAX_QUEUE_FAMILIES]; int queue_count[RADV_MAX_QUEUE_FAMILIES];
struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES]; struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];
struct radeon_winsys_cs *flush_cs[RADV_MAX_QUEUE_FAMILIES];
uint64_t debug_flags; uint64_t debug_flags;
@@ -764,6 +766,14 @@ void si_write_scissors(struct radeon_winsys_cs *cs, int first,
int count, const VkRect2D *scissors); int count, const VkRect2D *scissors);
uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
bool instanced_or_indirect_draw, uint32_t draw_vertex_count); bool instanced_or_indirect_draw, uint32_t draw_vertex_count);
/*
 * Emit the cache flush/invalidate operations selected by flush_bits into cs.
 *
 * Unlike si_emit_cache_flush(), this takes a bare command stream instead of
 * a radv_cmd_buffer.
 *
 * cs          command stream that receives the emitted packets
 * chip_class  chip class of the target GPU
 * is_mec      the callers visible in this change pass true only for a
 *             compute queue on CIK or newer
 * flush_bits  mask of RADV_CMD_FLAG_* bits naming what to flush/invalidate
 */
void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                            enum chip_class chip_class,
                            bool is_mec,
                            enum radv_cmd_flush_bits flush_bits);
void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer); void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
uint64_t src_va, uint64_t dest_va, uint64_t src_va, uint64_t dest_va,

View File

@@ -689,7 +689,7 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
} }
static void void
si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
enum chip_class chip_class, enum chip_class chip_class,
bool is_mec, bool is_mec,