radv: fix indirect dispatches on the compute queue on GFX7

GFX7 CP requires the indirect dispatch VA to be aligned to 32 bytes.

This fixes dEQP-VK.api.command_buffers.many_indirect_disps_on_secondary,
although it is unexpected that this test is the one that uncovered the bug.

Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27148>
This commit is contained in:
Samuel Pitoiset
2024-01-18 17:54:26 +01:00
committed by Marge Bot
parent c3a64f8dd1
commit 5c03cdbd02
3 changed files with 36 additions and 2 deletions

View File

@@ -1255,6 +1255,11 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->has_async_compute_threadgroup_bug = info->family == CHIP_ICELAND ||
info->family == CHIP_TONGA;
/* GFX7 CP requires 32-byte alignment for the indirect buffer arguments on
* the compute queue.
*/
info->has_async_compute_align32_bug = info->gfx_level == GFX7;
/* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the
* feature version wasn't bumped.
*/

View File

@@ -102,6 +102,7 @@ struct radeon_info {
bool has_vgt_flush_ngg_legacy_bug;
bool has_cs_regalloc_hang_bug;
bool has_async_compute_threadgroup_bug;
bool has_async_compute_align32_bug;
bool has_32bit_predication;
bool has_3d_cube_border_color_mipmap;
bool has_image_opcodes;

View File

@@ -9691,11 +9691,39 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
}
if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
uint64_t indirect_va = info->va;
radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
&cmd_buffer->mec_inv_pred_emitted, 4 /* DISPATCH_INDIRECT size */);
if (cmd_buffer->device->physical_device->rad_info.has_async_compute_align32_bug &&
cmd_buffer->qf == RADV_QUEUE_COMPUTE && !radv_is_aligned(indirect_va, 32)) {
const uint64_t unaligned_va = indirect_va;
UNUSED void *ptr;
uint32_t offset;
if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, sizeof(VkDispatchIndirectCommand), 32, &offset, &ptr))
return;
indirect_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
for (uint32_t i = 0; i < 3; i++) {
const uint64_t src_va = unaligned_va + i * 4;
const uint64_t dst_va = indirect_va + i * 4;
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_WR_CONFIRM);
radeon_emit(cs, src_va);
radeon_emit(cs, src_va >> 32);
radeon_emit(cs, dst_va);
radeon_emit(cs, dst_va >> 32);
}
}
radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
radeon_emit(cs, info->va);
radeon_emit(cs, info->va >> 32);
radeon_emit(cs, indirect_va);
radeon_emit(cs, indirect_va >> 32);
radeon_emit(cs, dispatch_initiator);
} else {
radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));