radv: implement transform feedback queries with NGG streamout

The control bit is written separately to the upper 32 bits of each result
because GDS counters are only 32 bits wide; this allows the existing query
shader to be re-used.

Tested on GFX10.3.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19325>
This commit is contained in:
Samuel Pitoiset
2022-10-28 16:58:52 +02:00
committed by Marge Bot
parent 7cfd0e8d31
commit 25e311e9d3
5 changed files with 56 additions and 5 deletions

View File

@@ -4180,6 +4180,10 @@ radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer)
if (cmd_buffer->state.active_prims_gen_gds_queries)
ngg_query_state |= radv_ngg_query_prim_gen;
if (cmd_buffer->state.active_prims_xfb_gds_queries) {
ngg_query_state |= radv_ngg_query_prim_xfb | radv_ngg_query_prim_gen;
}
base_reg = pipeline->base.user_data_0[stage];
assert(loc->sgpr_idx != -1);

View File

@@ -201,6 +201,9 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
case nir_intrinsic_load_prim_gen_query_enabled_amd:
replacement = ngg_query_bool_setting(b, radv_ngg_query_prim_gen, s);
break;
case nir_intrinsic_load_prim_xfb_query_enabled_amd:
replacement = ngg_query_bool_setting(b, radv_ngg_query_prim_xfb, s);
break;
case nir_intrinsic_load_cull_any_enabled_amd:
replacement = nggc_bool_setting(
b, radv_nggc_front_face | radv_nggc_back_face | radv_nggc_small_primitives, s);
@@ -338,8 +341,9 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
}
/* GDS counters:
* offset 0 - pipeline statistics counter for all streams
* offset 4|8|12|16 - generated primitive counter for stream 0|1|2|3
* offset 0 - pipeline statistics counter for all streams
* offset 4| 8|12|16 - generated primitive counter for stream 0|1|2|3
* offset 20|24|28|32 - written primitive counter for stream 0|1|2|3
*/
case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa, nir_imm_int(b, 0), nir_imm_int(b, 0x100));
@@ -350,7 +354,9 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
nir_imm_int(b, 0x100));
break;
case nir_intrinsic_atomic_add_xfb_prim_count_amd:
/* No-op for RADV. */
nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa,
nir_imm_int(b, 20 + nir_intrinsic_stream_id(intrin) * 4),
nir_imm_int(b, 0x100));
break;
case nir_intrinsic_load_streamout_config_amd:

View File

@@ -1218,6 +1218,7 @@ enum radv_ngg_query_state {
radv_ngg_query_none = 0,
radv_ngg_query_pipeline_stat = 1 << 0,
radv_ngg_query_prim_gen = 1 << 1,
radv_ngg_query_prim_xfb = 1 << 2,
};
struct radv_vertex_binding {
@@ -1539,6 +1540,7 @@ struct radv_cmd_state {
unsigned active_pipeline_gds_queries;
unsigned active_prims_gen_queries;
unsigned active_prims_gen_gds_queries;
unsigned active_prims_xfb_gds_queries;
uint32_t trace_id;
uint32_t last_ia_multi_vgt_param;
uint32_t last_ge_cntl;
@@ -1792,6 +1794,9 @@ unsigned radv_get_default_max_sample_dist(int log_samples);
void radv_device_init_msaa(struct radv_device *device);
VkResult radv_device_init_vrs_state(struct radv_device *device);
void radv_emit_write_data_imm(struct radeon_cmdbuf *cs, unsigned engine_sel, uint64_t va,
uint32_t imm);
void radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
const struct radv_image_view *iview,
VkClearDepthStencilValue ds_clear_value,

View File

@@ -1116,6 +1116,8 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
pool->stride = 8;
break;
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
pool->stride = 32;
break;
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
pool->stride = 32;
if (pool->uses_gds) {
@@ -1819,7 +1821,19 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
break;
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
emit_sample_streamout(cmd_buffer, va, index);
if (cmd_buffer->device->physical_device->use_ngg_streamout) {
/* generated prim counter */
gfx10_copy_gds_query(cmd_buffer, 4 + index * 4, va);
radv_emit_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
/* written prim counter */
gfx10_copy_gds_query(cmd_buffer, 20 + index * 4, va + 8);
radv_emit_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
cmd_buffer->state.active_prims_xfb_gds_queries++;
} else {
emit_sample_streamout(cmd_buffer, va, index);
}
break;
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
if (!cmd_buffer->state.active_prims_gen_queries) {
@@ -1918,7 +1932,19 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
break;
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
emit_sample_streamout(cmd_buffer, va + 16, index);
if (cmd_buffer->device->physical_device->use_ngg_streamout) {
/* generated prim counter */
gfx10_copy_gds_query(cmd_buffer, 4 + index * 4, va + 16);
radv_emit_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
/* written prim counter */
gfx10_copy_gds_query(cmd_buffer, 20 + index * 4, va + 24);
radv_emit_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
cmd_buffer->state.active_prims_xfb_gds_queries--;
} else {
emit_sample_streamout(cmd_buffer, va + 16, index);
}
break;
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
if (cmd_buffer->state.active_prims_gen_queries == 1) {

View File

@@ -2062,3 +2062,13 @@ radv_device_init_msaa(struct radv_device *device)
for (i = 0; i < 8; i++)
radv_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
}
/**
 * Emit a PKT3_WRITE_DATA packet that carries a single 32-bit immediate.
 *
 * Five dwords are appended to the command stream, in this order: the PKT3
 * header, a control word (destination select V_370_MEM, write confirmation
 * set, engine supplied by the caller), the low half of the address, the high
 * half of the address, and the immediate value.
 *
 * @param cs          Command stream the dwords are appended to.
 * @param engine_sel  Value placed in the ENGINE_SEL field of the control word.
 * @param va          64-bit destination address, emitted low dword first.
 * @param imm         32-bit payload emitted as the last dword.
 */
void
radv_emit_write_data_imm(struct radeon_cmdbuf *cs, unsigned engine_sel, uint64_t va, uint32_t imm)
{
   /* NOTE(review): the locals below assume radeon_emit() consumes one 32-bit
    * dword per call, so the explicit narrowing matches the implicit one the
    * call would perform - confirm against its declaration.
    */
   const uint32_t control =
      S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel);
   const uint32_t va_lo = (uint32_t)va;
   const uint32_t va_hi = (uint32_t)(va >> 32);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, control);
   radeon_emit(cs, va_lo);
   radeon_emit(cs, va_hi);
   radeon_emit(cs, imm);
}