radv: implement VK_EXT_transform_feedback

This implementation should work and potential bugs can be fixed during
the release candidates window anyway.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
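[For context: the extension these hooks implement is driven from the API side
roughly as follows. A minimal sketch; the command buffer cmd, the buffers
stream0Buffer/counterBuffers, and the counts are placeholders, not part of
this change.]

    /* Bind a capture buffer, record a draw with capture active, then
     * replay the captured vertices without a CPU readback. */
    VkDeviceSize offset = 0, size = VK_WHOLE_SIZE;
    vkCmdBindTransformFeedbackBuffersEXT(cmd, 0, 1, &stream0Buffer, &offset, &size);
    vkCmdBeginTransformFeedbackEXT(cmd, 0, 1, NULL, NULL);
    vkCmdDraw(cmd, vertexCount, 1, 0, 0);
    vkCmdEndTransformFeedbackEXT(cmd, 0, 1, counterBuffers, counterBufferOffsets);

    vkCmdDrawIndirectByteCountEXT(cmd, 1, 0, counterBuffers[0],
                                  counterBufferOffsets[0], 0, vertexStride);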
@@ -119,6 +119,7 @@
 #define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1
 #define STRMOUT_OFFSET_FROM_MEM 2
 #define STRMOUT_OFFSET_NONE 3
+#define STRMOUT_DATA_TYPE(x) (((unsigned)(x) & 0x1) << 7)
 #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8)
 #define PKT3_DRAW_INDEX_OFFSET_2 0x35
 #define PKT3_WRITE_DATA 0x37
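[Note: the STRMOUT_* helpers above compose the control dword of the
STRMOUT_BUFFER_UPDATE packets emitted in Begin/EndTransformFeedback below.
A sketch for buffer i, assuming the usual STRMOUT_OFFSET_SOURCE(x) field
helper for the offset-source bits, which this hunk does not show:]

    /* "Append" mode: offsets expressed in bytes, initial offset
     * loaded from a counter buffer in memory. */
    unsigned control = STRMOUT_SELECT_BUFFER(i) |
                       STRMOUT_DATA_TYPE(1) |
                       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM);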
@@ -196,6 +196,23 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
 	cmd_buffer->state.dirty |= dest_mask;
 }
 
+static void
+radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
+			  struct radv_pipeline *pipeline)
+{
+	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
+	struct radv_shader_info *info;
+
+	if (!pipeline->streamout_shader)
+		return;
+
+	info = &pipeline->streamout_shader->info.info;
+	for (int i = 0; i < MAX_SO_BUFFERS; i++)
+		so->stride_in_dw[i] = info->so.strides[i];
+
+	so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
+}
+
 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
 {
 	return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&

@@ -1875,10 +1892,94 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
 }
 
+static void
+radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
+{
+	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+	struct radv_userdata_info *loc;
+	uint32_t base_reg;
+
+	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
+		if (!radv_get_shader(pipeline, stage))
+			continue;
+
+		loc = radv_lookup_user_sgpr(pipeline, stage,
+					    AC_UD_STREAMOUT_BUFFERS);
+		if (loc->sgpr_idx == -1)
+			continue;
+
+		base_reg = pipeline->user_data_0[stage];
+
+		radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
+					 base_reg + loc->sgpr_idx * 4, va, false);
+	}
+
+	if (pipeline->gs_copy_shader) {
+		loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
+		if (loc->sgpr_idx != -1) {
+			base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
+
+			radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
+						 base_reg + loc->sgpr_idx * 4, va, false);
+		}
+	}
+}
+
+static void
+radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
+{
+	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
+		struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
+		struct radv_streamout_state *so = &cmd_buffer->state.streamout;
+		unsigned so_offset;
+		void *so_ptr;
+		uint64_t va;
+
+		/* Allocate some descriptor state for streamout buffers. */
+		if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
+						  MAX_SO_BUFFERS * 16, 256,
+						  &so_offset, &so_ptr))
+			return;
+
+		for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
+			struct radv_buffer *buffer = sb[i].buffer;
+			uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
+
+			if (!(so->enabled_mask & (1 << i)))
+				continue;
+
+			va = radv_buffer_get_va(buffer->bo) + buffer->offset;
+
+			/* Set the descriptor.
+			 *
+			 * On VI, the format must be non-INVALID, otherwise
+			 * the buffer will be considered not bound and store
+			 * instructions will be no-ops.
+			 */
+			desc[0] = va;
+			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
+			desc[2] = 0xffffffff;
+			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+		}
+
+		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+		va += so_offset;
+
+		radv_emit_streamout_buffers(cmd_buffer, va);
+	}
+
+	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
+}
+
 static void
 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
 {
 	radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
+	radv_flush_streamout_descriptors(cmd_buffer);
 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 }
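[Note: each enabled binding gets a plain 16-byte buffer descriptor in the
upload BO; radv_emit_streamout_buffers() then points the
AC_UD_STREAMOUT_BUFFERS user SGPR at the array. An illustrative view of the
four dwords written above (struct and field names are ours, not the driver's):]

    struct so_buffer_desc {
            uint32_t va_lo;       /* desc[0]: buffer address, bits 31:0 */
            uint32_t va_hi;       /* desc[1]: address bits 47:32, stride left 0 */
            uint32_t num_records; /* desc[2]: 0xffffffff, effectively unbounded */
            uint32_t word3;       /* desc[3]: XYZW dst_sel + BUF_DATA_FORMAT_32,
                                   *          so VI doesn't treat it as unbound */
    };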
@@ -1969,7 +2070,8 @@ static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
 			  VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
 			  VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
 			  VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
-			  VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT)) {
+			  VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
+			  VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
 	}
 }

@@ -1993,6 +2095,8 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
 	for_each_bit(b, src_flags) {
 		switch ((VkAccessFlagBits)(1 << b)) {
 		case VK_ACCESS_SHADER_WRITE_BIT:
+		case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
 			flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
 			break;
 		case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:

@@ -2062,6 +2166,7 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
 		switch ((VkAccessFlagBits)(1 << b)) {
 		case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 		case VK_ACCESS_INDEX_READ_BIT:
+		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
 			break;
 		case VK_ACCESS_UNIFORM_READ_BIT:
 			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;

@@ -2716,6 +2821,7 @@ void radv_CmdBindPipeline(
 		cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
 
 		radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
+		radv_bind_streamout_state(cmd_buffer, pipeline);
 
 		if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
 			cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;

@@ -3138,12 +3244,13 @@ static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned in
 
 static void
 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
-			 uint32_t vertex_count)
+			 uint32_t vertex_count,
+			 bool use_opaque)
 {
 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
 	radeon_emit(cmd_buffer->cs, vertex_count);
 	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
-				    S_0287F0_USE_OPAQUE(0));
+				    S_0287F0_USE_OPAQUE(use_opaque));
 }
 
 static void

@@ -3247,6 +3354,12 @@ struct radv_draw_info {
 	 */
 	struct radv_buffer *count_buffer;
 	uint64_t count_buffer_offset;
+
+	/**
+	 * Stream output parameters resource.
+	 */
+	struct radv_buffer *strmout_buffer;
+	uint64_t strmout_buffer_offset;
 };
 
 static void

@@ -3257,6 +3370,27 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
 	struct radeon_winsys *ws = cmd_buffer->device->ws;
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 
+	if (info->strmout_buffer) {
+		uint64_t va = radv_buffer_get_va(info->strmout_buffer->bo);
+
+		va += info->strmout_buffer->offset +
+		      info->strmout_buffer_offset;
+
+		radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
+				       info->stride);
+
+		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
+				COPY_DATA_DST_SEL(COPY_DATA_REG) |
+				COPY_DATA_WR_CONFIRM);
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
+		radeon_emit(cs, 0); /* unused */
+
+		radv_cs_add_buffer(ws, cs, info->strmout_buffer->bo);
+	}
+
 	if (info->indirect) {
 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
 		uint64_t count_va = 0;

@@ -3341,14 +3475,17 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
 		}
 	} else {
 		if (!state->subpass->view_mask) {
-			radv_cs_emit_draw_packet(cmd_buffer, info->count);
+			radv_cs_emit_draw_packet(cmd_buffer,
+						 info->count,
+						 !!info->strmout_buffer);
 		} else {
 			unsigned i;
 			for_each_bit(i, state->subpass->view_mask) {
 				radv_emit_view_index(cmd_buffer, i);
 
 				radv_cs_emit_draw_packet(cmd_buffer,
-							 info->count);
+							 info->count,
+							 !!info->strmout_buffer);
 			}
 		}
 	}
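[Note: the COPY_DATA into VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE plus the
use_opaque draw above is what backs vkCmdDrawIndirectByteCountEXT without a
CPU round trip. Per the extension's semantics the hardware effectively
computes (sketch; hypothetical helper, byte units assumed):]

    static uint32_t
    opaque_vertex_count(uint64_t filled_size, /* counter buffer value, bytes */
                        uint32_t counter_offset, uint32_t vertex_stride)
    {
            return (uint32_t)((filled_size - counter_offset) / vertex_stride);
    }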
@@ -3442,6 +3579,8 @@ static void
 radv_draw(struct radv_cmd_buffer *cmd_buffer,
 	  const struct radv_draw_info *info)
 {
+	struct radeon_info *rad_info =
+		&cmd_buffer->device->physical_device->rad_info;
 	bool has_prefetch =
 		cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
 	bool pipeline_is_dirty =

@@ -3511,6 +3650,16 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
 		}
 	}
 
+	/* Workaround for a VGT hang when streamout is enabled.
+	 * It must be done after drawing.
+	 */
+	if (cmd_buffer->state.streamout.streamout_enabled &&
+	    (rad_info->family == CHIP_HAWAII ||
+	     rad_info->family == CHIP_TONGA ||
+	     rad_info->family == CHIP_FIJI)) {
+		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
+	}
+
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
 }

@@ -4486,3 +4635,229 @@ void radv_CmdEndConditionalRenderingEXT(
 	cmd_buffer->state.predication_type = -1;
 	cmd_buffer->state.predication_va = 0;
 }
+
+/* VK_EXT_transform_feedback */
+void radv_CmdBindTransformFeedbackBuffersEXT(
+	VkCommandBuffer commandBuffer,
+	uint32_t firstBinding,
+	uint32_t bindingCount,
+	const VkBuffer* pBuffers,
+	const VkDeviceSize* pOffsets,
+	const VkDeviceSize* pSizes)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
+	uint8_t enabled_mask = 0;
+
+	assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
+	for (uint32_t i = 0; i < bindingCount; i++) {
+		uint32_t idx = firstBinding + i;
+
+		sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
+		sb[idx].offset = pOffsets[i];
+		sb[idx].size = pSizes[i];
+
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
+				   sb[idx].buffer->bo);
+
+		enabled_mask |= 1 << idx;
+	}
+
+	cmd_buffer->state.streamout.enabled_mask = enabled_mask;
+
+	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
+}
+
+static void
+radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
+{
+	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+
+	radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
+	radeon_emit(cs,
+		    S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
+		    S_028B94_RAST_STREAM(0) |
+		    S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
+		    S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
+		    S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
+	radeon_emit(cs, so->hw_enabled_mask &
+			so->enabled_stream_buffers_mask);
+}
+
+static void
+radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
+{
+	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
+	bool old_streamout_enabled = so->streamout_enabled;
+	uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
+
+	so->streamout_enabled = enable;
+
+	so->hw_enabled_mask = so->enabled_mask |
+			      (so->enabled_mask << 4) |
+			      (so->enabled_mask << 8) |
+			      (so->enabled_mask << 12);
+
+	if ((old_streamout_enabled != so->streamout_enabled) ||
+	    (old_hw_enabled_mask != so->hw_enabled_mask))
+		radv_emit_streamout_enable(cmd_buffer);
+}
+
+static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
+{
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+	unsigned reg_strmout_cntl;
+
+	/* The register is at different places on different ASICs. */
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
+		reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
+		radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
+	} else {
+		reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
+		radeon_set_config_reg(cs, reg_strmout_cntl, 0);
+	}
+
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
+
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+	radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
+	radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
+	radeon_emit(cs, 0);
+	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
+	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
+	radeon_emit(cs, 4); /* poll interval */
+}
+
+void radv_CmdBeginTransformFeedbackEXT(
+	VkCommandBuffer commandBuffer,
+	uint32_t firstBuffer,
+	uint32_t bufferCount,
+	const VkBuffer* pCounterBuffers,
+	const VkDeviceSize* pCounterBufferOffsets)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
+	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+
+	radv_flush_vgt_streamout(cmd_buffer);
+
+	assert(firstBuffer + bufferCount <= MAX_SO_BUFFERS);
+	for (uint32_t i = firstBuffer; i < bufferCount; i++) {
+		if (!(so->enabled_mask & (1 << i)))
+			continue;
+
+		/* SI binds streamout buffers as shader resources.
+		 * VGT only counts primitives and tells the shader through
+		 * SGPRs what to do.
+		 */
+		radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
+		radeon_emit(cs, (sb[i].offset + sb[i].size) >> 2); /* BUFFER_SIZE (in DW) */
+		radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
+
+		if (pCounterBuffers && pCounterBuffers[i]) {
+			/* The array of counter buffers is optional. */
+			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[i]);
+			uint64_t va = radv_buffer_get_va(buffer->bo);
+
+			va += buffer->offset + pCounterBufferOffsets[i];
+
+			/* Append */
+			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
+					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
+			radeon_emit(cs, 0); /* unused */
+			radeon_emit(cs, 0); /* unused */
+			radeon_emit(cs, va); /* src address lo */
+			radeon_emit(cs, va >> 32); /* src address hi */
+
+			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
+		} else {
+			/* Start from the beginning. */
+			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
+					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
+			radeon_emit(cs, 0); /* unused */
+			radeon_emit(cs, 0); /* unused */
+			radeon_emit(cs, sb[i].offset >> 2); /* buffer offset in DW */
+			radeon_emit(cs, 0); /* unused */
+		}
+	}
+
+	radv_set_streamout_enable(cmd_buffer, true);
+}
+
+void radv_CmdEndTransformFeedbackEXT(
+	VkCommandBuffer commandBuffer,
+	uint32_t firstBuffer,
+	uint32_t bufferCount,
+	const VkBuffer* pCounterBuffers,
+	const VkDeviceSize* pCounterBufferOffsets)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+
+	radv_flush_vgt_streamout(cmd_buffer);
+
+	assert(firstBuffer + bufferCount <= MAX_SO_BUFFERS);
+	for (uint32_t i = firstBuffer; i < bufferCount; i++) {
+		if (!(so->enabled_mask & (1 << i)))
+			continue;
+
+		if (pCounterBuffers && pCounterBuffers[i]) {
+			/* The array of counter buffers is optional. */
+			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[i]);
+			uint64_t va = radv_buffer_get_va(buffer->bo);
+
+			va += buffer->offset + pCounterBufferOffsets[i];
+
+			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
+					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+					STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
+			radeon_emit(cs, va); /* dst address lo */
+			radeon_emit(cs, va >> 32); /* dst address hi */
+			radeon_emit(cs, 0); /* unused */
+			radeon_emit(cs, 0); /* unused */
+
+			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
+		}
+
+		/* Deactivate transform feedback by zeroing the buffer size.
+		 * The counters (primitives generated, primitives emitted) may
+		 * be enabled even if there is no buffer bound. This ensures
+		 * that the primitives-emitted query won't increment.
+		 */
+		radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
+	}
+
+	radv_set_streamout_enable(cmd_buffer, false);
+}
+
+void radv_CmdDrawIndirectByteCountEXT(
+	VkCommandBuffer commandBuffer,
+	uint32_t instanceCount,
+	uint32_t firstInstance,
+	VkBuffer _counterBuffer,
+	VkDeviceSize counterBufferOffset,
+	uint32_t counterOffset,
+	uint32_t vertexStride)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
+	struct radv_draw_info info = {};
+
+	info.instance_count = instanceCount;
+	info.first_instance = firstInstance;
+	info.strmout_buffer = counterBuffer;
+	info.strmout_buffer_offset = counterBufferOffset;
+	info.stride = vertexStride;
+
+	radv_draw(cmd_buffer, &info);
+}
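[Note on radv_set_streamout_enable() above: VGT_STRMOUT_BUFFER_CONFIG holds a
4-bit buffer-enable field per vertex stream, so the bound-buffer mask is
replicated into all four nibbles; radv_emit_streamout_enable() then ANDs it
with enabled_stream_buffers_mask so buffers a stream never writes stay
disabled. A sketch of the expansion:]

    static uint32_t
    so_hw_enabled_mask(uint8_t enabled_mask)
    {
            uint32_t mask = enabled_mask & 0xf; /* MAX_SO_BUFFERS == 4 */
            return mask | (mask << 4) | (mask << 8) | (mask << 12);
    }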
@@ -840,6 +840,13 @@ void radv_GetPhysicalDeviceFeatures2(
 		features->vertexAttributeInstanceRateZeroDivisor = VK_TRUE;
 		break;
 	}
+	case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
+		VkPhysicalDeviceTransformFeedbackFeaturesEXT *features =
+			(VkPhysicalDeviceTransformFeedbackFeaturesEXT*)ext;
+		features->transformFeedback = true;
+		features->geometryStreams = true;
+		break;
+	}
 	default:
 		break;
 	}

@@ -1213,6 +1220,21 @@ void radv_GetPhysicalDeviceProperties2(
 		};
 		break;
 	}
+	case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
+		VkPhysicalDeviceTransformFeedbackPropertiesEXT *properties =
+			(VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext;
+		properties->maxTransformFeedbackStreams = MAX_SO_STREAMS;
+		properties->maxTransformFeedbackBuffers = MAX_SO_BUFFERS;
+		properties->maxTransformFeedbackBufferSize = UINT32_MAX;
+		properties->maxTransformFeedbackStreamDataSize = 512;
+		properties->maxTransformFeedbackBufferDataSize = UINT32_MAX;
+		properties->maxTransformFeedbackBufferDataStride = 512;
+		properties->transformFeedbackQueries = true;
+		properties->transformFeedbackStreamsLinesTriangles = false;
+		properties->transformFeedbackRasterizationStreamSelect = false;
+		properties->transformFeedbackDraw = true;
+		break;
+	}
 	default:
 		break;
 	}
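[Note: applications discover the new feature through the standard pNext
chain; a minimal sketch (physicalDevice assumed valid):]

    VkPhysicalDeviceTransformFeedbackFeaturesEXT xfb = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
    };
    VkPhysicalDeviceFeatures2 features2 = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
            .pNext = &xfb,
    };
    vkGetPhysicalDeviceFeatures2(physicalDevice, &features2);
    /* xfb.transformFeedback and xfb.geometryStreams are VK_TRUE per the
     * hunk above. */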
@@ -109,6 +109,7 @@ EXTENSIONS = [
     Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= CIK'),
     Extension('VK_EXT_shader_viewport_index_layer', 1, True),
     Extension('VK_EXT_shader_stencil_export', 1, True),
+    Extension('VK_EXT_transform_feedback', 1, True),
     Extension('VK_EXT_vertex_attribute_divisor', 3, True),
     Extension('VK_AMD_draw_indirect_count', 1, True),
     Extension('VK_AMD_gcn_shader', 1, True),
@@ -3482,6 +3482,22 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
 	}
 }
 
+static struct radv_shader_variant *
+radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline)
+{
+	int i;
+
+	for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
+		struct radv_shader_variant *shader =
+			radv_get_shader(pipeline, i);
+
+		if (shader && shader->info.info.so.num_outputs > 0)
+			return shader;
+	}
+
+	return NULL;
+}
+
 static VkResult
 radv_pipeline_init(struct radv_pipeline *pipeline,
 		   struct radv_device *device,

@@ -3597,6 +3613,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 		pipeline->graphics.vtx_emit_num = 2;
 	}
 
+	/* Find the last vertex shader stage that eventually uses streamout. */
+	pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
+
 	result = radv_pipeline_scratch_init(device, pipeline);
 	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out);
@@ -843,6 +843,7 @@ enum radv_cmd_dirty_bits {
 	RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 11,
 	RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 12,
 	RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 13,
+	RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 14,
 };
 
 enum radv_cmd_flush_bits {

@@ -868,6 +869,7 @@ enum radv_cmd_flush_bits {
 	/* Pipeline query controls. */
 	RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 13,
 	RADV_CMD_FLAG_STOP_PIPELINE_STATS = 1 << 14,
+	RADV_CMD_FLAG_VGT_STREAMOUT_SYNC = 1 << 15,
 
 	RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER = (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
 					      RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |

@@ -880,6 +882,29 @@ struct radv_vertex_binding {
 	VkDeviceSize offset;
 };
 
+struct radv_streamout_binding {
+	struct radv_buffer *buffer;
+	VkDeviceSize offset;
+	VkDeviceSize size;
+};
+
+struct radv_streamout_state {
+	/* Mask of bound streamout buffers. */
+	uint8_t enabled_mask;
+
+	/* External state that comes from the last vertex stage; it must be
+	 * set explicitly when binding a new graphics pipeline.
+	 */
+	uint16_t stride_in_dw[MAX_SO_BUFFERS];
+	uint32_t enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
+
+	/* State of VGT_STRMOUT_BUFFER_(CONFIG|END) */
+	uint32_t hw_enabled_mask;
+
+	/* State of VGT_STRMOUT_(CONFIG|EN) */
+	bool streamout_enabled;
+};
+
 struct radv_viewport_state {
 	uint32_t count;
 	VkViewport viewports[MAX_VIEWPORTS];

@@ -987,6 +1012,7 @@ struct radv_cmd_state {
 	const struct radv_subpass *subpass;
 	struct radv_dynamic_state dynamic;
 	struct radv_attachment_state *attachments;
+	struct radv_streamout_state streamout;
 	VkRect2D render_area;
 
 	/* Index buffer */

@@ -1056,6 +1082,7 @@ struct radv_cmd_buffer {
 	struct radeon_cmdbuf *cs;
 	struct radv_cmd_state state;
 	struct radv_vertex_binding vertex_bindings[MAX_VBS];
+	struct radv_streamout_binding streamout_bindings[MAX_SO_BUFFERS];
 	uint32_t queue_family_index;
 
 	uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];

@@ -1353,6 +1380,9 @@ struct radv_pipeline {
 
 	unsigned max_waves;
 	unsigned scratch_bytes_per_wave;
+
+	/* Not NULL if graphics pipeline uses streamout. */
+	struct radv_shader_variant *streamout_shader;
 };
 
 static inline bool radv_pipeline_has_gs(const struct radv_pipeline *pipeline)
@@ -789,6 +789,9 @@ VkResult radv_CreateQueryPool(
 	case VK_QUERY_TYPE_TIMESTAMP:
 		pool->stride = 8;
 		break;
+	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+		pool->stride = 32;
+		break;
 	default:
 		unreachable("creating unhandled query type");
 	}

@@ -951,6 +954,44 @@ VkResult radv_GetQueryPoolResults(
 			}
 			break;
 		}
+		case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+			volatile uint64_t const *src64 = (volatile uint64_t const *)src;
+			uint64_t num_primitives_written;
+			uint64_t primitive_storage_needed;
+
+			/* SAMPLE_STREAMOUTSTATS stores this structure:
+			 * {
+			 *	u64 NumPrimitivesWritten;
+			 *	u64 PrimitiveStorageNeeded;
+			 * }
+			 */
+			available = 1;
+			for (int j = 0; j < 4; j++) {
+				if (!(src64[j] & 0x8000000000000000UL))
+					available = 0;
+			}
+
+			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
+				result = VK_NOT_READY;
+				break;
+			}
+
+			num_primitives_written = src64[3] - src64[1];
+			primitive_storage_needed = src64[2] - src64[0];
+
+			if (flags & VK_QUERY_RESULT_64_BIT) {
+				*(uint64_t *)dest = num_primitives_written;
+				dest += 8;
+				*(uint64_t *)dest = primitive_storage_needed;
+				dest += 8;
+			} else {
+				*(uint32_t *)dest = num_primitives_written;
+				dest += 4;
+				*(uint32_t *)dest = primitive_storage_needed;
+				dest += 4;
+			}
+			break;
+		}
 		default:
 			unreachable("trying to get results of unhandled query type");
 		}

@@ -1109,10 +1150,22 @@ void radv_CmdResetQueryPool(
 	}
 }
 
+static unsigned event_type_for_stream(unsigned stream)
+{
+	switch (stream) {
+	default:
+	case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
+	case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
+	case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
+	case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
+	}
+}
+
 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 			     uint64_t va,
 			     VkQueryType query_type,
-			     VkQueryControlFlags flags)
+			     VkQueryControlFlags flags,
+			     uint32_t index)
 {
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	switch (query_type) {

@@ -1161,6 +1214,16 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
 		break;
+	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+		radeon_check_space(cmd_buffer->device->ws, cs, 4);
+
+		assert(index < MAX_SO_STREAMS);
+
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		break;
 	default:
 		unreachable("beginning unhandled query type");
 	}

@@ -1169,7 +1232,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 
 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
 			   uint64_t va, uint64_t avail_va,
-			   VkQueryType query_type)
+			   VkQueryType query_type, uint32_t index)
 {
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	switch (query_type) {

@@ -1215,16 +1278,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
 					   avail_va, 0, 1,
 					   cmd_buffer->gfx9_eop_bug_va);
 		break;
+	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+		radeon_check_space(cmd_buffer->device->ws, cs, 4);
+
+		assert(index < MAX_SO_STREAMS);
+
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
+		radeon_emit(cs, (va + 16));
+		radeon_emit(cs, (va + 16) >> 32);
+		break;
 	default:
 		unreachable("ending unhandled query type");
 	}
 }
 
-void radv_CmdBeginQuery(
+void radv_CmdBeginQueryIndexedEXT(
 	VkCommandBuffer commandBuffer,
 	VkQueryPool queryPool,
 	uint32_t query,
-	VkQueryControlFlags flags)
+	VkQueryControlFlags flags,
+	uint32_t index)
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);

@@ -1247,14 +1321,23 @@ void radv_CmdBeginQuery(
 
 	va += pool->stride * query;
 
-	emit_begin_query(cmd_buffer, va, pool->type, flags);
+	emit_begin_query(cmd_buffer, va, pool->type, flags, index);
 }
 
-void radv_CmdEndQuery(
+void radv_CmdBeginQuery(
 	VkCommandBuffer commandBuffer,
 	VkQueryPool queryPool,
-	uint32_t query)
+	uint32_t query,
+	VkQueryControlFlags flags)
+{
+	radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
+}
+
+void radv_CmdEndQueryIndexedEXT(
+	VkCommandBuffer commandBuffer,
+	VkQueryPool queryPool,
+	uint32_t query,
+	uint32_t index)
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);

@@ -1265,7 +1348,7 @@ void radv_CmdEndQuery(
 	/* Do not need to add the pool BO to the list because the query must
 	 * currently be active, which means the BO is already in the list.
 	 */
-	emit_end_query(cmd_buffer, va, avail_va, pool->type);
+	emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
 
 	/*
 	 * For multiview we have to emit a query for each bit in the mask,

@@ -1282,12 +1365,20 @@ void radv_CmdEndQuery(
 		for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
 			va += pool->stride;
 			avail_va += 4;
-			emit_begin_query(cmd_buffer, va, pool->type, 0);
-			emit_end_query(cmd_buffer, va, avail_va, pool->type);
+			emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
+			emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
 		}
 	}
 }
 
+void radv_CmdEndQuery(
+	VkCommandBuffer commandBuffer,
+	VkQueryPool queryPool,
+	uint32_t query)
+{
+	radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
+}
+
 void radv_CmdWriteTimestamp(
 	VkCommandBuffer commandBuffer,
 	VkPipelineStageFlagBits pipelineStage,
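[Note: each transform-feedback query slot is 32 bytes because
SAMPLE_STREAMOUTSTATS writes a {NumPrimitivesWritten, PrimitiveStorageNeeded}
pair at begin (va) and again at end (va + 16), and bit 63 of each word doubles
as the availability flag. An illustrative view of the layout read back above:]

    struct xfb_query_slot {
            uint64_t begin[2]; /* src64[0..1], written by emit_begin_query() */
            uint64_t end[2];   /* src64[2..3], written by emit_end_query() */
    };
    /* Results are end minus begin, as in the hunk above:
     *   primitives written       = src64[3] - src64[1]
     *   primitive storage needed = src64[2] - src64[0]
     * and the slot is available once bit 63 is set in all four words. */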
@@ -243,6 +243,8 @@ radv_shader_compile_to_nir(struct radv_device *device,
 			.runtime_descriptor_array = true,
 			.stencil_export = true,
 			.storage_16bit = true,
+			.geometry_streams = true,
+			.transform_feedback = true,
 		},
 	};
 	entry_point = spirv_to_nir(spirv, module->size / 4,

@@ -434,7 +436,12 @@ radv_fill_shader_variant(struct radv_device *device,
 	variant->code_size = radv_get_shader_binary_size(binary);
 	variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) |
 			 S_00B12C_USER_SGPR_MSB(variant->info.num_user_sgprs >> 5) |
-			 S_00B12C_SCRATCH_EN(scratch_enabled);
+			 S_00B12C_SCRATCH_EN(scratch_enabled) |
+			 S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) |
+			 S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) |
+			 S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) |
+			 S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) |
+			 S_00B12C_SO_EN(!!info->so.num_outputs);
 
 	variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) |
 			 S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) |
@@ -883,6 +883,12 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
 		radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 	}
 
+	/* VGT streamout state sync */
+	if (flush_bits & RADV_CMD_FLAG_VGT_STREAMOUT_SYNC) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
+	}
+
 	/* Make sure ME is idle (it executes most packets) before continuing.
 	 * This prevents read-after-write hazards between PFP and ME.
 	 */