radv: move emitting the strmout buffer to CmdDrawIndirectByteCountEXT()

This doesn't need to be in the generic draw path because only one
draw command uses it.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20299>
This commit is contained in:
Samuel Pitoiset
2022-12-12 16:56:42 +01:00
committed by Marge Bot
parent cb0a17652d
commit 6aaba10c6e

View File

@@ -5058,38 +5058,6 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_d
}
}
if (draw_info->strmout_buffer) {
uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
if (info->gfx_level >= GFX10) {
/* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
* (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+.
*/
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
radeon_emit(cs, 1); /* 1 DWORD */
} else {
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
COPY_DATA_WR_CONFIRM);
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
radeon_emit(cs, 0); /* unused */
}
radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
}
/* RDNA2 is affected by a hardware bug when instance packing is enabled for adjacent primitive
* topologies and instance_count > 1, pipeline stats generated by GE are incorrect. It needs to
* be applied for indexed and non-indexed draws.
@@ -10859,6 +10827,42 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou
radv_set_streamout_enable(cmd_buffer, false);
}
/**
 * Emit the streamout-related state for a draw that uses draw_info->strmout_buffer.
 *
 * Writes draw_info->stride to VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, then emits a
 * packet that fills VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE from memory at
 *   VA(strmout_buffer->bo) + strmout_buffer->offset + strmout_buffer_offset,
 * and finally adds the buffer's BO to the command stream's buffer list.
 *
 * \param cmd_buffer  command buffer whose command stream (cmd_buffer->cs) is written
 * \param draw_info   draw description; strmout_buffer is dereferenced unconditionally,
 *                    so the caller must only call this when it is set
 */
static void
radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;

   /* Address of the value that ends up in the BUFFER_FILLED_SIZE register. */
   const uint64_t filled_size_va =
      radv_buffer_get_va(draw_info->strmout_buffer->bo) +
      (draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset);

   radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);

   if (gfx_level < GFX10) {
      /* Pre-GFX10: copy the value from memory straight into the register. */
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, filled_size_va);
      radeon_emit(cs, filled_size_va >> 32);
      radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
      radeon_emit(cs, 0); /* unused */
   } else {
      /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
       * (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+.
       * Use PFP_SYNC_ME followed by LOAD_CONTEXT_REG_INDEX instead.
       */
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);

      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
      radeon_emit(cs, filled_size_va);
      radeon_emit(cs, filled_size_va >> 32);
      radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
      radeon_emit(cs, 1); /* 1 DWORD */
   }

   /* Add the buffer's BO to this command stream's buffer list. */
   radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
}
VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
uint32_t firstInstance, VkBuffer _counterBuffer,
@@ -10881,6 +10885,7 @@ radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanc
if (!radv_before_draw(cmd_buffer, &info, 1))
return;
struct VkMultiDrawInfoEXT minfo = { 0, 0 };
radv_emit_strmout_buffer(cmd_buffer, &info);
radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
radv_after_draw(cmd_buffer);
}