/*
 * Patch summary: transform feedback stream queries on the NGG streamout path.
 *
 * radv_cmd_buffer.c
 *   radv_flush_ngg_query_state() sets both radv_ngg_query_prim_xfb and
 *   radv_ngg_query_prim_gen in the NGG query state while
 *   active_prims_xfb_gds_queries is non-zero.
 *
 * radv_nir_lower_abi.c
 *   load_prim_xfb_query_enabled_amd is lowered through ngg_query_bool_setting()
 *   with the new radv_ngg_query_prim_xfb bit.
 *   atomic_add_xfb_prim_count_amd used to be a no-op; it now becomes a GDS
 *   atomic add at offset 20 + stream_id * 4, matching the "written primitive
 *   counter" row added to the GDS layout comment.
 *
 * radv_private.h
 *   Adds the radv_ngg_query_prim_xfb enum bit (1 << 2), the
 *   active_prims_xfb_gds_queries counter in struct radv_cmd_state, and the
 *   prototype of radv_emit_write_data_imm().
 *
 * radv_query.c
 *   VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT gets its own
 *   "stride = 32; break;", so it no longer falls through into the
 *   PRIMITIVES_GENERATED case and its pool->uses_gds branch.
 *   emit_begin_query()/emit_end_query(): when
 *   physical_device->use_ngg_streamout is set, the GDS counters at
 *   4 + index * 4 (generated) and 20 + index * 4 (written) are copied to
 *   va / va + 8 on begin and va + 16 / va + 24 on end, 0x80000000 is written
 *   to the dword that follows each copied value, and
 *   active_prims_xfb_gds_queries is incremented on begin / decremented on end.
 *   Otherwise the existing emit_sample_streamout() path is used unchanged.
 *   NOTE(review): 0x80000000 in the upper dword is presumably the
 *   availability bit (bit 63) that result readback waits on - confirm.
 *
 * si_cmd_buffer.c
 *   New radv_emit_write_data_imm(): emits one PKT3_WRITE_DATA packet made of
 *   five dwords (header, control word with DST_SEL = MEM, WR_CONFIRM = 1 and
 *   the given ENGINE_SEL, va, va >> 32, imm).
 *   NOTE(review): the helper does not reserve command-stream space itself;
 *   presumably callers have already done so - confirm.
 *
 * NOTE(review): context-line indentation below assumes the tree's 3-space
 * style - confirm against the target files before applying.
 */
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 4a51481c1aa..52943114b11 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -4180,6 +4180,10 @@ radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->state.active_prims_gen_gds_queries)
       ngg_query_state |= radv_ngg_query_prim_gen;
 
+   if (cmd_buffer->state.active_prims_xfb_gds_queries) {
+      ngg_query_state |= radv_ngg_query_prim_xfb | radv_ngg_query_prim_gen;
+   }
+
    base_reg = pipeline->base.user_data_0[stage];
    assert(loc->sgpr_idx != -1);
 
diff --git a/src/amd/vulkan/radv_nir_lower_abi.c b/src/amd/vulkan/radv_nir_lower_abi.c
index d317022b7b8..6f097a8003b 100644
--- a/src/amd/vulkan/radv_nir_lower_abi.c
+++ b/src/amd/vulkan/radv_nir_lower_abi.c
@@ -201,6 +201,9 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
    case nir_intrinsic_load_prim_gen_query_enabled_amd:
       replacement = ngg_query_bool_setting(b, radv_ngg_query_prim_gen, s);
       break;
+   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
+      replacement = ngg_query_bool_setting(b, radv_ngg_query_prim_xfb, s);
+      break;
    case nir_intrinsic_load_cull_any_enabled_amd:
       replacement = nggc_bool_setting(
          b, radv_nggc_front_face | radv_nggc_back_face | radv_nggc_small_primitives, s);
@@ -338,8 +341,9 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
    }
 
    /* GDS counters:
-    *   offset  0         - pipeline statistics counter for all streams
-    *   offset  4|8|12|16 - generated primitive counter for stream 0|1|2|3
+    *   offset  0           - pipeline statistics counter for all streams
+    *   offset  4| 8|12|16  - generated primitive counter for stream 0|1|2|3
+    *   offset 20|24|28|32  - written primitive counter for stream 0|1|2|3
     */
    case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
       nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa, nir_imm_int(b, 0), nir_imm_int(b, 0x100));
@@ -350,7 +354,9 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
                              nir_imm_int(b, 0x100));
       break;
    case nir_intrinsic_atomic_add_xfb_prim_count_amd:
-      /* No-op for RADV. */
+      nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa,
+                             nir_imm_int(b, 20 + nir_intrinsic_stream_id(intrin) * 4),
+                             nir_imm_int(b, 0x100));
       break;
 
    case nir_intrinsic_load_streamout_config_amd:
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 20145d8a2a8..e95a7b57809 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1218,6 +1218,7 @@ enum radv_ngg_query_state {
    radv_ngg_query_none = 0,
    radv_ngg_query_pipeline_stat = 1 << 0,
    radv_ngg_query_prim_gen = 1 << 1,
+   radv_ngg_query_prim_xfb = 1 << 2,
 };
 
 struct radv_vertex_binding {
@@ -1539,6 +1540,7 @@ struct radv_cmd_state {
    unsigned active_pipeline_gds_queries;
    unsigned active_prims_gen_queries;
    unsigned active_prims_gen_gds_queries;
+   unsigned active_prims_xfb_gds_queries;
    uint32_t trace_id;
    uint32_t last_ia_multi_vgt_param;
    uint32_t last_ge_cntl;
@@ -1792,6 +1794,9 @@ unsigned radv_get_default_max_sample_dist(int log_samples);
 void radv_device_init_msaa(struct radv_device *device);
 VkResult radv_device_init_vrs_state(struct radv_device *device);
 
+void radv_emit_write_data_imm(struct radeon_cmdbuf *cs, unsigned engine_sel, uint64_t va,
+                              uint32_t imm);
+
 void radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
                                    const struct radv_image_view *iview,
                                    VkClearDepthStencilValue ds_clear_value,
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index e72c4da3b01..a1870cced27 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -1116,6 +1116,8 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
       pool->stride = 8;
       break;
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+      pool->stride = 32;
+      break;
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
       pool->stride = 32;
       if (pool->uses_gds) {
@@ -1819,7 +1821,19 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
       break;
    }
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
-      emit_sample_streamout(cmd_buffer, va, index);
+      if (cmd_buffer->device->physical_device->use_ngg_streamout) {
+         /* generated prim counter */
+         gfx10_copy_gds_query(cmd_buffer, 4 + index * 4, va);
+         radv_emit_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
+
+         /* written prim counter */
+         gfx10_copy_gds_query(cmd_buffer, 20 + index * 4, va + 8);
+         radv_emit_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
+
+         cmd_buffer->state.active_prims_xfb_gds_queries++;
+      } else {
+         emit_sample_streamout(cmd_buffer, va, index);
+      }
       break;
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
       if (!cmd_buffer->state.active_prims_gen_queries) {
@@ -1918,7 +1932,19 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
       break;
    }
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
-      emit_sample_streamout(cmd_buffer, va + 16, index);
+      if (cmd_buffer->device->physical_device->use_ngg_streamout) {
+         /* generated prim counter */
+         gfx10_copy_gds_query(cmd_buffer, 4 + index * 4, va + 16);
+         radv_emit_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
+
+         /* written prim counter */
+         gfx10_copy_gds_query(cmd_buffer, 20 + index * 4, va + 24);
+         radv_emit_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
+
+         cmd_buffer->state.active_prims_xfb_gds_queries--;
+      } else {
+         emit_sample_streamout(cmd_buffer, va + 16, index);
+      }
       break;
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
       if (cmd_buffer->state.active_prims_gen_queries == 1) {
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 8fea04c349f..2589c83df3b 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -2062,3 +2062,13 @@ radv_device_init_msaa(struct radv_device *device)
    for (i = 0; i < 8; i++)
       radv_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
 }
+
+void
+radv_emit_write_data_imm(struct radeon_cmdbuf *cs, unsigned engine_sel, uint64_t va, uint32_t imm)
+{
+   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
+   radeon_emit(cs, imm);
+}