radeonsi: implement GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB in shaders
Statistics only work in non-NGG mode. If screen->use_ngg is true, we can't know if the draw will actually use NGG or not, so this commit switches to a shader-based implementation of this counter. To avoid modifying si_query, the shader implementation behaves like the hw one: it uses the same buffer size and offset. The emulation path activation in the shader is controlled by vs_state_bit[31]. Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15861>
This commit is contained in:
@@ -77,7 +77,6 @@ spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-double-fl
|
|||||||
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec2,Fail
|
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec2,Fail
|
||||||
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail
|
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail
|
||||||
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail
|
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail
|
||||||
spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-geom,Fail
|
|
||||||
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
|
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
|
||||||
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail
|
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail
|
||||||
spec@arb_query_buffer_object@coherency,Fail
|
spec@arb_query_buffer_object@coherency,Fail
|
||||||
@@ -172,7 +171,6 @@ wgl@wgl-sanity,Fail
|
|||||||
|
|
||||||
# glcts failures
|
# glcts failures
|
||||||
KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
|
KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
|
||||||
KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail
|
|
||||||
KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail
|
KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail
|
||||||
KHR-GL46.shader_ballot_tests.ShaderBallotBitmasks,Fail
|
KHR-GL46.shader_ballot_tests.ShaderBallotBitmasks,Fail
|
||||||
KHR-GL46.sparse_texture_tests.SparseTextureCommitment,Fail
|
KHR-GL46.sparse_texture_tests.SparseTextureCommitment,Fail
|
||||||
|
|
@@ -80,7 +80,6 @@ spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec
|
|||||||
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail
|
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail
|
||||||
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail
|
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail
|
||||||
spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-frag,Fail
|
spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-frag,Fail
|
||||||
spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-geom,Fail
|
|
||||||
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
|
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
|
||||||
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail
|
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail
|
||||||
spec@arb_query_buffer_object@coherency,Fail
|
spec@arb_query_buffer_object@coherency,Fail
|
||||||
@@ -186,7 +185,6 @@ wgl@wgl-multi-window-single-context,Fail
|
|||||||
wgl@wgl-sanity,Fail
|
wgl@wgl-sanity,Fail
|
||||||
|
|
||||||
# glcts failures
|
# glcts failures
|
||||||
KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail
|
|
||||||
KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail
|
KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail
|
||||||
KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail
|
KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail
|
||||||
KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail
|
KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail
|
||||||
|
|
@@ -262,7 +262,6 @@ wgl@wgl-sanity,Fail
|
|||||||
|
|
||||||
# glcts failures
|
# glcts failures
|
||||||
KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
|
KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
|
||||||
KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail
|
|
||||||
KHR-GL46.packed_pixels.pbo_rectangle.r16_snorm,Fail
|
KHR-GL46.packed_pixels.pbo_rectangle.r16_snorm,Fail
|
||||||
KHR-GL46.packed_pixels.pbo_rectangle.r8_snorm,Fail
|
KHR-GL46.packed_pixels.pbo_rectangle.r8_snorm,Fail
|
||||||
KHR-GL46.packed_pixels.pbo_rectangle.rg16_snorm,Fail
|
KHR-GL46.packed_pixels.pbo_rectangle.rg16_snorm,Fail
|
||||||
|
|
@@ -23,6 +23,7 @@
|
|||||||
|
|
||||||
#include "ac_llvm_cull.h"
|
#include "ac_llvm_cull.h"
|
||||||
#include "si_pipe.h"
|
#include "si_pipe.h"
|
||||||
|
#include "si_query.h"
|
||||||
#include "si_shader_internal.h"
|
#include "si_shader_internal.h"
|
||||||
#include "sid.h"
|
#include "sid.h"
|
||||||
#include "util/u_memory.h"
|
#include "util/u_memory.h"
|
||||||
@@ -70,6 +71,14 @@ static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
|
|||||||
LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_BUF, false));
|
LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_BUF, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context *ctx)
|
||||||
|
{
|
||||||
|
LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
|
||||||
|
|
||||||
|
return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
|
||||||
|
LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_EMULATED_COUNTERS_BUF, false));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the number of vertices as a constant in \p num_vertices,
|
* Return the number of vertices as a constant in \p num_vertices,
|
||||||
* and return a more precise value as LLVMValueRef from the function.
|
* and return a more precise value as LLVMValueRef from the function.
|
||||||
@@ -2129,6 +2138,27 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
|
|||||||
}
|
}
|
||||||
|
|
||||||
ac_build_export_prim(&ctx->ac, &prim);
|
ac_build_export_prim(&ctx->ac, &prim);
|
||||||
|
|
||||||
|
tmp = si_unpack_param(ctx, ctx->vs_state_bits, 31, 1);
|
||||||
|
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
|
||||||
|
ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */
|
||||||
|
ac_build_ifcc(&ctx->ac, LLVMBuildNot(builder, prim.isnull, ""), 5237);
|
||||||
|
{
|
||||||
|
LLVMValueRef args[] = {
|
||||||
|
ctx->ac.i32_1,
|
||||||
|
ngg_get_emulated_counters_buf(ctx),
|
||||||
|
LLVMConstInt(ctx->ac.i32,
|
||||||
|
(si_hw_query_dw_offset(PIPE_STAT_QUERY_GS_PRIMITIVES) +
|
||||||
|
SI_QUERY_STATS_END_OFFSET_DW) * 4,
|
||||||
|
false),
|
||||||
|
ctx->ac.i32_0, /* soffset */
|
||||||
|
ctx->ac.i32_0, /* cachepolicy */
|
||||||
|
};
|
||||||
|
|
||||||
|
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, 0);
|
||||||
|
}
|
||||||
|
ac_build_endif(&ctx->ac, 5237);
|
||||||
|
ac_build_endif(&ctx->ac, 5229);
|
||||||
}
|
}
|
||||||
ac_build_endif(&ctx->ac, 5140);
|
ac_build_endif(&ctx->ac, 5140);
|
||||||
|
|
||||||
|
@@ -1159,6 +1159,8 @@ struct si_context {
|
|||||||
unsigned last_gs_out_prim;
|
unsigned last_gs_out_prim;
|
||||||
unsigned current_vs_state;
|
unsigned current_vs_state;
|
||||||
unsigned last_vs_state;
|
unsigned last_vs_state;
|
||||||
|
bool current_gs_stats_counter_emul;
|
||||||
|
bool last_gs_stats_counter_emul;
|
||||||
enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */
|
enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */
|
||||||
|
|
||||||
struct si_small_prim_cull_info last_small_prim_cull_info;
|
struct si_small_prim_cull_info last_small_prim_cull_info;
|
||||||
@@ -1263,6 +1265,7 @@ struct si_context {
|
|||||||
int num_occlusion_queries;
|
int num_occlusion_queries;
|
||||||
int num_perfect_occlusion_queries;
|
int num_perfect_occlusion_queries;
|
||||||
int num_pipeline_stat_queries;
|
int num_pipeline_stat_queries;
|
||||||
|
int num_pipeline_stat_emulated_queries;
|
||||||
struct list_head active_queries;
|
struct list_head active_queries;
|
||||||
unsigned num_cs_dw_queries_suspend;
|
unsigned num_cs_dw_queries_suspend;
|
||||||
|
|
||||||
|
@@ -730,6 +730,9 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned
|
|||||||
query->result_size += 8; /* for the fence + alignment */
|
query->result_size += 8; /* for the fence + alignment */
|
||||||
query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
|
query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
|
||||||
query->index = index;
|
query->index = index;
|
||||||
|
if (index == PIPE_STAT_QUERY_GS_PRIMITIVES &&
|
||||||
|
sscreen->use_ngg && (sscreen->info.chip_class >= GFX10 && sscreen->info.chip_class <= GFX10_3))
|
||||||
|
query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
@@ -836,12 +839,44 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h
|
|||||||
EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
|
EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
|
||||||
break;
|
break;
|
||||||
case PIPE_QUERY_PIPELINE_STATISTICS: {
|
case PIPE_QUERY_PIPELINE_STATISTICS: {
|
||||||
|
if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
|
||||||
|
/* The hw GS primitive counter doesn't work when ngg is active.
|
||||||
|
* So if use_ngg is true, we don't use the hw version but instead
|
||||||
|
* emulate it in the GS shader.
|
||||||
|
* The value is written at the same position, so we don't need to
|
||||||
|
* change anything else.
|
||||||
|
* If ngg is enabled for the draw, the primitive count is written in
|
||||||
|
* gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
|
||||||
|
* vertices is stored in gs_emitted_vertices and the number of prim
|
||||||
|
* is computed based on the output prim type in emit_gs_epilogue.
|
||||||
|
*/
|
||||||
|
struct pipe_shader_buffer sbuf;
|
||||||
|
sbuf.buffer = &buffer->b.b;
|
||||||
|
sbuf.buffer_offset = query->buffer.results_end;
|
||||||
|
sbuf.buffer_size = buffer->bo_size;
|
||||||
|
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
|
||||||
|
sctx->current_gs_stats_counter_emul = true;
|
||||||
|
|
||||||
|
const uint32_t zero = 0;
|
||||||
|
radeon_begin(cs);
|
||||||
|
/* Clear the emulated counter end value. We don't clear start because it's unused. */
|
||||||
|
va += (si_hw_query_dw_offset(query->index) + SI_QUERY_STATS_END_OFFSET_DW) * 4;
|
||||||
|
radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
|
||||||
|
radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
|
||||||
|
radeon_emit(va);
|
||||||
|
radeon_emit(va >> 32);
|
||||||
|
radeon_emit(zero);
|
||||||
|
radeon_end();
|
||||||
|
|
||||||
|
sctx->num_pipeline_stat_emulated_queries++;
|
||||||
|
} else {
|
||||||
radeon_begin(cs);
|
radeon_begin(cs);
|
||||||
radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
|
radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
|
||||||
radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
|
radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
|
||||||
radeon_emit(va);
|
radeon_emit(va);
|
||||||
radeon_emit(va >> 32);
|
radeon_emit(va >> 32);
|
||||||
radeon_end();
|
radeon_end();
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
@@ -918,11 +953,22 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw
|
|||||||
unsigned sample_size = (query->result_size - 8) / 2;
|
unsigned sample_size = (query->result_size - 8) / 2;
|
||||||
|
|
||||||
va += sample_size;
|
va += sample_size;
|
||||||
|
|
||||||
radeon_begin(cs);
|
radeon_begin(cs);
|
||||||
|
if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
|
||||||
|
radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||||
|
radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
||||||
|
|
||||||
|
if (--sctx->num_pipeline_stat_emulated_queries == 0) {
|
||||||
|
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
|
||||||
|
sctx->current_gs_stats_counter_emul = false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
|
radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
|
||||||
radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
|
radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
|
||||||
radeon_emit(va);
|
radeon_emit(va);
|
||||||
radeon_emit(va >> 32);
|
radeon_emit(va >> 32);
|
||||||
|
}
|
||||||
radeon_end();
|
radeon_end();
|
||||||
|
|
||||||
fence_va = va + sample_size;
|
fence_va = va + sample_size;
|
||||||
|
@@ -170,6 +170,10 @@ enum
|
|||||||
/* gap */
|
/* gap */
|
||||||
/* whether begin_query doesn't clear the result */
|
/* whether begin_query doesn't clear the result */
|
||||||
SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
|
SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
|
||||||
|
/* whether GS invocations and emitted primitives counters are emulated
|
||||||
|
* using atomic adds.
|
||||||
|
*/
|
||||||
|
SI_QUERY_EMULATE_GS_COUNTERS = (1 << 3),
|
||||||
};
|
};
|
||||||
|
|
||||||
struct si_query_hw_ops {
|
struct si_query_hw_ops {
|
||||||
|
@@ -273,6 +273,8 @@ enum
|
|||||||
#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF
|
#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF
|
||||||
#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24)
|
#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24)
|
||||||
#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF
|
#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF
|
||||||
|
#define S_VS_STATE_GS_PIPELINE_STATS_EMU(x) (((unsigned)(x)&0x1) << 31)
|
||||||
|
#define C_VS_STATE_GS_PIPELINE_STATS_EMU 0x7FFFFFFF
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
|
@@ -152,6 +152,8 @@ struct si_shader_context {
|
|||||||
LLVMValueRef gs_ngg_emit;
|
LLVMValueRef gs_ngg_emit;
|
||||||
LLVMValueRef gs_ngg_scratch;
|
LLVMValueRef gs_ngg_scratch;
|
||||||
LLVMValueRef return_value;
|
LLVMValueRef return_value;
|
||||||
|
|
||||||
|
LLVMValueRef gs_emitted_vertices;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi)
|
static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi)
|
||||||
|
@@ -919,6 +919,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||||||
ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
|
ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
|
||||||
LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
|
LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
|
||||||
LLVMSetAlignment(ctx->gs_ngg_emit, 4);
|
LLVMSetAlignment(ctx->gs_ngg_emit, 4);
|
||||||
|
} else {
|
||||||
|
ctx->gs_emitted_vertices = LLVMConstInt(ctx->ac.i32, 0, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
#include "ac_nir.h"
|
#include "ac_nir.h"
|
||||||
#include "si_pipe.h"
|
#include "si_pipe.h"
|
||||||
#include "si_shader_internal.h"
|
#include "si_shader_internal.h"
|
||||||
|
#include "si_query.h"
|
||||||
#include "sid.h"
|
#include "sid.h"
|
||||||
#include "util/u_memory.h"
|
#include "util/u_memory.h"
|
||||||
|
|
||||||
@@ -200,6 +201,14 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
|
|||||||
return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
|
return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context *ctx)
|
||||||
|
{
|
||||||
|
LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
|
||||||
|
|
||||||
|
return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
|
||||||
|
LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_EMULATED_COUNTERS_BUF, false));
|
||||||
|
}
|
||||||
|
|
||||||
static void emit_gs_epilogue(struct si_shader_context *ctx)
|
static void emit_gs_epilogue(struct si_shader_context *ctx)
|
||||||
{
|
{
|
||||||
if (ctx->shader->key.ge.as_ngg) {
|
if (ctx->shader->key.ge.as_ngg) {
|
||||||
@@ -210,6 +219,46 @@ static void emit_gs_epilogue(struct si_shader_context *ctx)
|
|||||||
if (ctx->screen->info.chip_class >= GFX10)
|
if (ctx->screen->info.chip_class >= GFX10)
|
||||||
LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
|
LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
|
||||||
|
|
||||||
|
if (ctx->screen->use_ngg) {
|
||||||
|
/* Implement PIPE_STAT_QUERY_GS_PRIMITIVES for non-ngg draws because we can't
|
||||||
|
* use pipeline statistics (they would be correct but when screen->use_ngg, we
|
||||||
|
* can't know when the query is started if the next draw(s) will use ngg or not).
|
||||||
|
*/
|
||||||
|
LLVMValueRef tmp = si_unpack_param(ctx, ctx->vs_state_bits, 31, 1);
|
||||||
|
tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
|
||||||
|
ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */
|
||||||
|
{
|
||||||
|
LLVMValueRef prim = ctx->ac.i32_0;
|
||||||
|
switch (ctx->shader->selector->info.base.gs.output_primitive) {
|
||||||
|
case SHADER_PRIM_POINTS:
|
||||||
|
prim = ctx->gs_emitted_vertices;
|
||||||
|
break;
|
||||||
|
case SHADER_PRIM_LINE_STRIP:
|
||||||
|
prim = LLVMBuildSub(ctx->ac.builder, ctx->gs_emitted_vertices, ctx->ac.i32_1, "");
|
||||||
|
prim = ac_build_imax(&ctx->ac, prim, ctx->ac.i32_0);
|
||||||
|
break;
|
||||||
|
case SHADER_PRIM_TRIANGLE_STRIP:
|
||||||
|
prim = LLVMBuildSub(ctx->ac.builder, ctx->gs_emitted_vertices, LLVMConstInt(ctx->ac.i32, 2, 0), "");
|
||||||
|
prim = ac_build_imax(&ctx->ac, prim, ctx->ac.i32_0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
LLVMValueRef args[] = {
|
||||||
|
prim,
|
||||||
|
ngg_get_emulated_counters_buf(ctx),
|
||||||
|
LLVMConstInt(ctx->ac.i32,
|
||||||
|
(si_hw_query_dw_offset(PIPE_STAT_QUERY_GS_PRIMITIVES) +
|
||||||
|
SI_QUERY_STATS_END_OFFSET_DW) * 4,
|
||||||
|
false),
|
||||||
|
ctx->ac.i32_0, /* soffset */
|
||||||
|
ctx->ac.i32_0, /* cachepolicy */
|
||||||
|
};
|
||||||
|
|
||||||
|
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, 0);
|
||||||
|
}
|
||||||
|
ac_build_endif(&ctx->ac, 5229);
|
||||||
|
}
|
||||||
|
|
||||||
ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
|
ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
|
||||||
|
|
||||||
if (ctx->screen->info.chip_class >= GFX9)
|
if (ctx->screen->info.chip_class >= GFX9)
|
||||||
@@ -295,6 +344,9 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM
|
|||||||
if (offset) {
|
if (offset) {
|
||||||
ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
|
ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
|
||||||
si_get_gs_wave_id(ctx));
|
si_get_gs_wave_id(ctx));
|
||||||
|
|
||||||
|
ctx->gs_emitted_vertices = LLVMBuildAdd(ctx->ac.builder, ctx->gs_emitted_vertices,
|
||||||
|
ctx->ac.i32_1, "vert");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!use_kill)
|
if (!use_kill)
|
||||||
|
@@ -369,6 +369,7 @@ enum
|
|||||||
|
|
||||||
SI_RING_ESGS, /* gfx6-8 */
|
SI_RING_ESGS, /* gfx6-8 */
|
||||||
SI_RING_GSVS, /* gfx6-10 */
|
SI_RING_GSVS, /* gfx6-10 */
|
||||||
|
SI_GS_QUERY_EMULATED_COUNTERS_BUF, /* gfx10+ */
|
||||||
|
|
||||||
SI_NUM_INTERNAL_BINDINGS,
|
SI_NUM_INTERNAL_BINDINGS,
|
||||||
|
|
||||||
|
@@ -1229,15 +1229,31 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
|
|||||||
sctx->current_vs_state |= S_VS_STATE_INDEXED(!!index_size);
|
sctx->current_vs_state |= S_VS_STATE_INDEXED(!!index_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sctx->current_vs_state != sctx->last_vs_state) {
|
bool gs_counters_emu = (GFX_VERSION >= GFX10 && GFX_VERSION <= GFX10_3) && HAS_GS;
|
||||||
|
|
||||||
|
if (sctx->current_vs_state != sctx->last_vs_state ||
|
||||||
|
(gs_counters_emu && sctx->current_gs_stats_counter_emul != sctx->last_gs_stats_counter_emul)) {
|
||||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||||
|
|
||||||
/* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
|
/* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
|
||||||
unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
|
unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
|
||||||
PIPE_SHADER_VERTEX);
|
PIPE_SHADER_VERTEX);
|
||||||
|
|
||||||
|
unsigned vs_state = sctx->current_vs_state;
|
||||||
|
unsigned gs_state = vs_state;
|
||||||
|
if (gs_counters_emu) {
|
||||||
|
/* Remove HS/LS state and add GS-specific state to control
|
||||||
|
* counters emulation.
|
||||||
|
*/
|
||||||
|
gs_state = vs_state & C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE;
|
||||||
|
gs_state |= S_VS_STATE_GS_PIPELINE_STATS_EMU(sctx->current_gs_stats_counter_emul);
|
||||||
|
sctx->last_gs_stats_counter_emul = sctx->current_gs_stats_counter_emul;
|
||||||
|
}
|
||||||
|
|
||||||
radeon_begin(cs);
|
radeon_begin(cs);
|
||||||
radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4,
|
radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4,
|
||||||
sctx->current_vs_state);
|
(gs_counters_emu && vs_base == R_00B230_SPI_SHADER_USER_DATA_GS_0) ?
|
||||||
|
gs_state : vs_state);
|
||||||
|
|
||||||
/* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
|
/* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
|
||||||
* before the rasterizer.
|
* before the rasterizer.
|
||||||
@@ -1246,14 +1262,13 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
|
|||||||
*/
|
*/
|
||||||
if (GFX_VERSION <= GFX10_3 && vs_base != R_00B130_SPI_SHADER_USER_DATA_VS_0) {
|
if (GFX_VERSION <= GFX10_3 && vs_base != R_00B130_SPI_SHADER_USER_DATA_VS_0) {
|
||||||
radeon_set_sh_reg(R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4,
|
radeon_set_sh_reg(R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4,
|
||||||
sctx->current_vs_state);
|
vs_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* For NGG: */
|
/* For NGG: */
|
||||||
if (GFX_VERSION >= GFX10 && vs_base != R_00B230_SPI_SHADER_USER_DATA_GS_0) {
|
if (GFX_VERSION >= GFX10 && vs_base != R_00B230_SPI_SHADER_USER_DATA_GS_0)
|
||||||
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4,
|
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4,
|
||||||
sctx->current_vs_state);
|
gs_state);
|
||||||
}
|
|
||||||
radeon_end();
|
radeon_end();
|
||||||
|
|
||||||
sctx->last_vs_state = sctx->current_vs_state;
|
sctx->last_vs_state = sctx->current_vs_state;
|
||||||
|
Reference in New Issue
Block a user