freedreno/a6xx: GL_ARB_pipeline_statistics_query

Handle the other pipeline stats counters in order to implement
GL_ARB_pipeline_statistics_query.  Note that this does away with
collecting *all* the counters if DEBUG_COUNTERS is enabled; keeping
that would have made things over-complicated.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23301>
This commit is contained in:
Rob Clark
2023-05-28 12:03:51 -07:00
committed by Marge Bot
parent 8192498530
commit c32c5a7749
8 changed files with 181 additions and 40 deletions

View File

@@ -229,7 +229,7 @@ GL 4.6, GLSL 4.60 -- all DONE: radeonsi, virgl, zink
GL_ARB_gl_spirv DONE (freedreno, i965/gen7+, llvmpipe)
GL_ARB_indirect_parameters DONE (freedreno/a6xx+, i965/gen7+, nvc0, llvmpipe, virgl, d3d12)
GL_ARB_pipeline_statistics_query DONE (i965, nvc0, r600, llvmpipe, softpipe)
GL_ARB_pipeline_statistics_query DONE (freedreno/a6xx+, i965, nvc0, r600, llvmpipe, softpipe)
GL_ARB_polygon_offset_clamp DONE (freedreno, i965, nv50, nvc0, r600, llvmpipe, v3d, panfrost)
GL_ARB_shader_atomic_counter_ops DONE (freedreno/a5xx+, i965/gen7+, nvc0, r600, llvmpipe, softpipe, v3d)
GL_ARB_shader_draw_parameters DONE (freedreno/a6xx+, i965, llvmpipe, nvc0, d3d12)

View File

@@ -324,6 +324,31 @@ spec@arb_query_buffer_object@qbo@query-GL_TIME_ELAPSED-SYNC_CPU_READ_AFTER_CACHE
spec@arb_query_buffer_object@qbo@query-GL_TIME_ELAPSED-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TIME_ELAPSED-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT64_ARB,Fail
# These seem to fail with zink as well (and on various other drivers,
# from what I can tell from expectations files), so maybe a test issue?
spec@arb_query_buffer_object@coherency,Fail
spec@arb_query_buffer_object@coherency@index-buffer-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-dispatch-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_COMPUTE_SHADER_INVOCATIONS,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_GEOMETRY_SHADER_INVOCATIONS,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TESS_EVALUATION_SHADER_INVOCATIONS,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_AFTER-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_AFTER-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_AFTER-GL_UNSIGNED_INT64_ARB,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_BEFORE-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_BEFORE-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_BEFORE-GL_UNSIGNED_INT64_ARB,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC-GL_UNSIGNED_INT64_ARB,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT64_ARB,Fail
spill-dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast_bool,Fail
spill-dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast_bool_requiredsubgroupsize128,Fail

View File

@@ -330,6 +330,31 @@ spec@arb_query_buffer_object@qbo@query-GL_TIME_ELAPSED-SYNC_CPU_READ_AFTER_CACHE
spec@arb_query_buffer_object@qbo@query-GL_TIME_ELAPSED-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TIME_ELAPSED-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT64_ARB,Fail
# These seem to fail with zink as well (and on various other drivers,
# from what I can tell from expectations files), so maybe a test issue?
spec@arb_query_buffer_object@coherency,Fail
spec@arb_query_buffer_object@coherency@index-buffer-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-dispatch-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_COMPUTE_SHADER_INVOCATIONS,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_GEOMETRY_SHADER_INVOCATIONS,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TESS_CONTROL_SHADER_PATCHES,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TESS_EVALUATION_SHADER_INVOCATIONS,Fail
spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_AFTER-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_AFTER-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_AFTER-GL_UNSIGNED_INT64_ARB,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_BEFORE-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_BEFORE-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-ASYNC_CPU_READ_BEFORE-GL_UNSIGNED_INT64_ARB,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC-GL_UNSIGNED_INT64_ARB,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT,Fail
spec@arb_query_buffer_object@qbo@query-GL_TESS_CONTROL_SHADER_PATCHES-SYNC_CPU_READ_AFTER_CACHE_TEST-GL_UNSIGNED_INT64_ARB,Fail
# Excerpt:
# Image comparison failed: reference = -0.000488281, expected = 0:0:0:0, result = 0:0:0:3
# Image comparison failed: reference = 0, expected = 0:0:0:0, result = 0:0:0:3

View File

@@ -347,7 +347,7 @@ static const struct fd_acc_sample_provider timestamp = {
struct PACKED fd6_pipeline_stats_sample {
struct fd_acc_query_sample base;
uint64_t start[16], stop[16], result;
uint64_t start, stop, result;
};
DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample);
@@ -355,34 +355,98 @@ DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample);
OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \
__offsetof(struct fd6_pipeline_stats_sample, field), 0, 0);
#ifdef DEBUG_COUNTERS
static const unsigned counter_count = 10;
static const unsigned counter_base = REG_A6XX_RBBM_PRIMCTR_0_LO;
#else
static const unsigned counter_count = 1;
static const unsigned counter_base = REG_A6XX_RBBM_PRIMCTR_7_LO;
#endif
/* Mapping of counters to pipeline stats:
*
* Gallium (PIPE_STAT_QUERY_x) | Vulkan (VK_QUERY_PIPELINE_STATISTIC_x_BIT) | hw counter
* ----------------------------+--------------------------------------------+----------------
* IA_VERTICES | INPUT_ASSEMBLY_VERTICES | RBBM_PRIMCTR_0
* IA_PRIMITIVES | INPUT_ASSEMBLY_PRIMITIVES | RBBM_PRIMCTR_1
* VS_INVOCATIONS | VERTEX_SHADER_INVOCATIONS | RBBM_PRIMCTR_0
* GS_INVOCATIONS | GEOMETRY_SHADER_INVOCATIONS | RBBM_PRIMCTR_5
* GS_PRIMITIVES | GEOMETRY_SHADER_PRIMITIVES | RBBM_PRIMCTR_6
* C_INVOCATIONS | CLIPPING_INVOCATIONS | RBBM_PRIMCTR_7
* C_PRIMITIVES | CLIPPING_PRIMITIVES | RBBM_PRIMCTR_8
* PS_INVOCATIONS | FRAGMENT_SHADER_INVOCATIONS | RBBM_PRIMCTR_9
* HS_INVOCATIONS | TESSELLATION_CONTROL_SHADER_PATCHES | RBBM_PRIMCTR_2
* DS_INVOCATIONS | TESSELLATION_EVALUATION_SHADER_INVOCATIONS | RBBM_PRIMCTR_4
* CS_INVOCATIONS | COMPUTE_SHADER_INVOCATIONS | RBBM_PRIMCTR_10
*
* Note that "Vertices corresponding to incomplete primitives may contribute to the count.",
* in our case they do not, so IA_VERTICES and VS_INVOCATIONS are the same thing.
*/
/* Counter groups, one per start/stop event pair.  The values are used as
 * array indices (into stats_counter_events[] and the per-batch
 * pipeline_stats_queries_active[] counts), so they are spelled out.
 */
enum stats_type {
   STATS_PRIMITIVE = 0,
   STATS_FRAGMENT = 1,
   STATS_COMPUTE = 2,
};
/* Per counter group (indexed by enum stats_type): the event written by
 * pipeline_stats_resume() to start the counters, and the one written by
 * pipeline_stats_pause() to stop them.
 */
static const struct {
   enum vgt_event_type start;
   enum vgt_event_type stop;
} stats_counter_events[] = {
   [STATS_PRIMITIVE] = {
      .start = START_PRIMITIVE_CTRS,
      .stop = STOP_PRIMITIVE_CTRS,
   },
   [STATS_FRAGMENT] = {
      .start = START_FRAGMENT_CTRS,
      .stop = STOP_FRAGMENT_CTRS,
   },
   [STATS_COMPUTE] = {
      .start = START_COMPUTE_CTRS,
      .stop = STOP_COMPUTE_CTRS,
   },
};
/* Pick the counter group (and therefore the start/stop event pair) that a
 * query belongs to.
 *
 * PRIMITIVES_GENERATED queries always use the primitive counters, and
 * their index is not consulted.  For every other query the index selects
 * the statistic: PS invocations use the fragment counters, CS invocations
 * the compute counters, and everything else the primitive counters.
 */
static enum stats_type
get_stats_type(struct fd_acc_query *aq)
{
   if (aq->provider->query_type != PIPE_QUERY_PRIMITIVES_GENERATED) {
      if (aq->base.index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         return STATS_FRAGMENT;

      if (aq->base.index == PIPE_STAT_QUERY_CS_INVOCATIONS)
         return STATS_COMPUTE;
   }

   return STATS_PRIMITIVE;
}
/* Return N such that RBBM_PRIMCTR_N is the hardware counter backing the
 * query (callers turn this into a register offset from
 * REG_A6XX_RBBM_PRIMCTR_0_LO).
 *
 * PRIMITIVES_GENERATED queries read counter 7 without looking at the
 * index.  Otherwise the index names the statistic; any statistic not
 * listed below falls back to counter 0.
 */
static unsigned
stats_counter_index(struct fd_acc_query *aq)
{
   unsigned ctr;

   if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
      return 7;

   switch (aq->base.index) {
   /* IA_VERTICES and VS_INVOCATIONS share the same counter: */
   case PIPE_STAT_QUERY_IA_VERTICES:
   case PIPE_STAT_QUERY_VS_INVOCATIONS:
      ctr = 0;
      break;
   case PIPE_STAT_QUERY_IA_PRIMITIVES:
      ctr = 1;
      break;
   case PIPE_STAT_QUERY_HS_INVOCATIONS:
      ctr = 2;
      break;
   case PIPE_STAT_QUERY_DS_INVOCATIONS:
      ctr = 4;
      break;
   case PIPE_STAT_QUERY_GS_INVOCATIONS:
      ctr = 5;
      break;
   case PIPE_STAT_QUERY_GS_PRIMITIVES:
      ctr = 6;
      break;
   case PIPE_STAT_QUERY_C_INVOCATIONS:
      ctr = 7;
      break;
   case PIPE_STAT_QUERY_C_PRIMITIVES:
      ctr = 8;
      break;
   case PIPE_STAT_QUERY_PS_INVOCATIONS:
      ctr = 9;
      break;
   case PIPE_STAT_QUERY_CS_INVOCATIONS:
      ctr = 10;
      break;
   default:
      ctr = 0;
      break;
   }

   return ctr;
}
static void
log_pipeline_stats(struct fd6_pipeline_stats_sample *ps)
log_pipeline_stats(struct fd6_pipeline_stats_sample *ps, unsigned idx)
{
#ifdef DEBUG_COUNTERS
const char *labels[] = {
"vs_vertices_in", "vs_primitives_out",
"hs_vertices_in", "hs_patches_out",
"ds_vertices_in", "ds_primitives_out",
"gs_primitives_in", "gs_primitives_out",
"ras_primitives_in", "x",
"VS_INVOCATIONS",
"IA_PRIMITIVES",
"HS_INVOCATIONS",
"??",
"DS_INVOCATIONS",
"GS_INVOCATIONS",
"GS_PRIMITIVES",
"C_INVOCATIONS",
"C_PRIMITIVES",
"PS_INVOCATIONS",
"CS_INVOCATIONS",
};
mesa_logd(" counter\t\tstart\t\t\tstop\t\t\tdiff");
for (int i = 0; i < ARRAY_SIZE(labels); i++) {
int register_idx = i + (counter_base - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2;
mesa_logd(" RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64
"\t%s",
register_idx, ps->start[i], ps->stop[i],
ps->stop[i] - ps->start[i], labels[register_idx]);
}
mesa_logd(" RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64 "\t%s",
idx, ps->start, ps->stop, ps->stop - ps->start, labels[idx]);
#endif
}
@@ -391,17 +455,23 @@ pipeline_stats_resume(struct fd_acc_query *aq, struct fd_batch *batch)
assert_dt
{
struct fd_ringbuffer *ring = batch->draw;
enum stats_type type = get_stats_type(aq);
unsigned idx = stats_counter_index(aq);
unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);
OUT_WFI5(ring);
OUT_PKT7(ring, CP_REG_TO_MEM, 3);
OUT_RING(ring, CP_REG_TO_MEM_0_64B | CP_REG_TO_MEM_0_CNT(counter_count * 2) |
CP_REG_TO_MEM_0_REG(counter_base));
OUT_RING(ring, CP_REG_TO_MEM_0_64B |
CP_REG_TO_MEM_0_CNT(2) |
CP_REG_TO_MEM_0_REG(reg));
stats_reloc(ring, aq, start);
if (!batch->pipeline_stats_queries_active)
fd6_event_write(batch, ring, START_PRIMITIVE_CTRS, false);
batch->pipeline_stats_queries_active++;
assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));
if (!batch->pipeline_stats_queries_active[type])
fd6_event_write(batch, ring, stats_counter_events[type].start, false);
batch->pipeline_stats_queries_active[type]++;
}
static void
@@ -409,27 +479,33 @@ pipeline_stats_pause(struct fd_acc_query *aq, struct fd_batch *batch)
assert_dt
{
struct fd_ringbuffer *ring = batch->draw;
enum stats_type type = get_stats_type(aq);
unsigned idx = stats_counter_index(aq);
unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);
OUT_WFI5(ring);
/* snapshot the end values: */
OUT_PKT7(ring, CP_REG_TO_MEM, 3);
OUT_RING(ring, CP_REG_TO_MEM_0_64B | CP_REG_TO_MEM_0_CNT(counter_count * 2) |
CP_REG_TO_MEM_0_REG(counter_base));
OUT_RING(ring, CP_REG_TO_MEM_0_64B |
CP_REG_TO_MEM_0_CNT(2) |
CP_REG_TO_MEM_0_REG(reg));
stats_reloc(ring, aq, stop);
assert(batch->pipeline_stats_queries_active > 0);
batch->pipeline_stats_queries_active--;
if (batch->pipeline_stats_queries_active)
fd6_event_write(batch, ring, STOP_PRIMITIVE_CTRS, false);
assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));
assert(batch->pipeline_stats_queries_active[type] > 0);
batch->pipeline_stats_queries_active[type]--;
/* Mirror of pipeline_stats_resume(): the counters are started by the
 * first active query of this type, so they must only be stopped once the
 * last such query pauses (count drops to zero).  Emitting the stop event
 * while the count is still non-zero would halt the counters underneath an
 * enclosing query that is still running, and would never stop them for
 * the outermost one.
 */
if (!batch->pipeline_stats_queries_active[type])
   fd6_event_write(batch, ring, stats_counter_events[type].stop, false);
/* result += stop - start: */
OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000);
stats_reloc(ring, aq, result);
stats_reloc(ring, aq, result);
stats_reloc(ring, aq, stop[(REG_A6XX_RBBM_PRIMCTR_7_LO - counter_base) / 2])
stats_reloc(ring, aq, start[(REG_A6XX_RBBM_PRIMCTR_7_LO - counter_base) / 2]);
stats_reloc(ring, aq, stop)
stats_reloc(ring, aq, start);
}
static void
@@ -439,7 +515,7 @@ pipeline_stats_result(struct fd_acc_query *aq,
{
struct fd6_pipeline_stats_sample *ps = fd6_pipeline_stats_sample(s);
log_pipeline_stats(ps);
log_pipeline_stats(ps, stats_counter_index(aq));
result->u64 = ps->result;
}
@@ -464,6 +540,15 @@ static const struct fd_acc_sample_provider primitives_generated = {
.result_resource = pipeline_stats_result_resource,
};
/* Sample provider for PIPE_QUERY_PIPELINE_STATISTICS_SINGLE queries.
 *
 * Each sample is a struct fd6_pipeline_stats_sample.  The resume/pause
 * callbacks snapshot one hardware counter, chosen from the query via
 * stats_counter_index(), and the result callback returns the value
 * accumulated in the sample's result field.
 */
static const struct fd_acc_sample_provider pipeline_statistics_single = {
.query_type = PIPE_QUERY_PIPELINE_STATISTICS_SINGLE,
.size = sizeof(struct fd6_pipeline_stats_sample),
.resume = pipeline_stats_resume,
.pause = pipeline_stats_pause,
.result = pipeline_stats_result,
.result_resource = pipeline_stats_result_resource,
};
struct PACKED fd6_primitives_sample {
struct fd_acc_query_sample base;
@@ -885,6 +970,8 @@ fd6_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
fd_acc_query_register_provider(pctx, &timestamp);
fd_acc_query_register_provider(pctx, &primitives_generated);
fd_acc_query_register_provider(pctx, &pipeline_statistics_single);
fd_acc_query_register_provider(pctx, &primitives_emitted);
fd_acc_query_register_provider(pctx, &so_overflow_any_predicate);
fd_acc_query_register_provider(pctx, &so_overflow_predicate);

View File

@@ -282,10 +282,11 @@ struct fd_batch {
uint32_t next_sample_offset;
/* The # of pipeline-stats queries running. In case of nested
* queries using START/STOP_PRIMITIVE_CNTRS, we need to start
* only on the first one and stop only on the last one.
* queries using {START,STOP}_{PRIMITIVE,FRAGMENT,COMPUTE}_CTRS,
* we need to start only on the first one and stop only on the
* last one (tracked separately per counter type).
*/
uint32_t pipeline_stats_queries_active;
uint8_t pipeline_stats_queries_active[3];
/* cached samples (in case multiple queries need to reference
* the same sample snapshot)

View File

@@ -265,7 +265,7 @@ enum fd_buffer_mask {
FD_BUFFER_LRZ = BIT(15),
};
#define MAX_HW_SAMPLE_PROVIDERS 9
#define MAX_HW_SAMPLE_PROVIDERS 10
struct fd_hw_sample_provider;
struct fd_hw_sample;

View File

@@ -135,6 +135,8 @@ pidx(unsigned query_type)
return 7;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
return 8;
case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
return 9;
default:
return -1;

View File

@@ -544,6 +544,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
(is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen));
case PIPE_CAP_QUERY_BUFFER_OBJECT:
case PIPE_CAP_QUERY_SO_OVERFLOW:
case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
return is_a6xx(screen);
case PIPE_CAP_VENDOR_ID: