diff --git a/src/gallium/drivers/iris/iris_perf.c b/src/gallium/drivers/iris/iris_perf.c
index 7c0378aacee..1e5ec8140dc 100644
--- a/src/gallium/drivers/iris/iris_perf.c
+++ b/src/gallium/drivers/iris/iris_perf.c
@@ -31,18 +31,11 @@ iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
 }
 
 static void
-iris_perf_emit_mi_flush(struct iris_context *ice)
+iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice)
 {
-   const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                     PIPE_CONTROL_CONST_CACHE_INVALIDATE |
-                     PIPE_CONTROL_DATA_CACHE_FLUSH |
-                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                     PIPE_CONTROL_CS_STALL;
-   iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
-                                "OA metrics", flags);
+   iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER],
+                              "OA metrics",
+                              PIPE_CONTROL_STALL_AT_SCOREBOARD);
 }
 
 static void
@@ -106,7 +99,8 @@ iris_perf_init_vtbl(struct gen_perf_config *perf_cfg)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_perf_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard;
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)iris_perf_emit_mi_report_perf_count;
 
diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c
index daa092c88c9..9e987d599d7 100644
--- a/src/intel/perf/gen_perf.c
+++ b/src/intel/perf/gen_perf.c
@@ -1716,15 +1716,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * end snapshot - otherwise the results won't be a complete representation
     * of the work.
     *
-    * Theoretically there could be opportunities to minimize how much of the
-    * GPU pipeline is drained, or that we stall for, when we know what specific
-    * units the performance counters being queried relate to but we don't
-    * currently attempt to be clever here.
-    *
-    * Note: with our current simple approach here then for back-to-back queries
-    * we will redundantly emit duplicate commands to synchronize the command
-    * streamer with the rest of the GPU pipeline, but we assume that in HW the
-    * second synchronization is effectively a NOOP.
+    * To achieve this, we stall the pipeline at the pixel scoreboard
+    * (preventing any additional work from being processed by the pipeline
+    * until all pixels of the previous draws have been completed).
     *
     * N.B. The final results are based on deltas of counters between (inside)
     * Begin/End markers so even though the total wall clock time of the
@@ -1738,7 +1732,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * This is our Begin synchronization point to drain current work on the
     * GPU before we capture our first counter snapshot...
     */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
@@ -1920,7 +1914,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
     * For more details see comment in brw_begin_perf_query for
     * corresponding flush.
     */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (query->queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h
index 46d37e07c25..2cd246a1dca 100644
--- a/src/intel/perf/gen_perf.h
+++ b/src/intel/perf/gen_perf.h
@@ -219,7 +219,7 @@ struct gen_perf_config {
    bool (*batch_references)(void *batch, void *bo);
    void (*bo_wait_rendering)(void *bo);
    int (*bo_busy)(void *bo);
-   void (*emit_mi_flush)(void *ctx);
+   void (*emit_stall_at_pixel_scoreboard)(void *ctx);
    void (*emit_mi_report_perf_count)(void *ctx,
                                      void *bo,
                                      uint32_t offset_in_bytes,
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 0e5459e5e5e..cfd3efe374e 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -459,6 +459,13 @@ brw_oa_batchbuffer_flush(void *c, const char *file, int line)
    _intel_batchbuffer_flush_fence(ctx, -1, NULL, file, line);
 }
 
+static void
+brw_oa_emit_stall_at_pixel_scoreboard(void *c)
+{
+   struct brw_context *brw = c;
+   brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD);
+}
+
 typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t );
 typedef void (*store_register_mem64_t)(void *ctx, void *bo,
                                        uint32_t reg, uint32_t offset);
@@ -487,7 +494,8 @@ brw_init_perf_query_info(struct gl_context *ctx)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard;
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)brw_oa_emit_mi_report_perf_count;
    perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;
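As a rough illustration of the new contract, a backend implementing the renamed hook would look something like the sketch below. The xyz_* names and xyz_emit_end_of_pipe_sync() are hypothetical placeholders for whatever context type and end-of-pipe-sync helper a backend already has; the real implementations in this patch are brw_oa_emit_stall_at_pixel_scoreboard() (i965) and iris_perf_emit_stall_at_pixel_scoreboard() (iris).

/* Hypothetical backend sketch: stall at the pixel scoreboard without
 * flushing or invalidating any caches, then register the callback in the
 * perf vtbl.  struct xyz_context and xyz_emit_end_of_pipe_sync() are
 * assumed stand-ins, not real Mesa symbols. */
static void
xyz_perf_emit_stall_at_pixel_scoreboard(void *c)
{
   struct xyz_context *ctx = c;

   /* Only STALL_AT_SCOREBOARD: block new work until outstanding pixels
    * retire, but leave all caches alone. */
   xyz_emit_end_of_pipe_sync(ctx, PIPE_CONTROL_STALL_AT_SCOREBOARD);
}

static void
xyz_perf_init_vtbl(struct gen_perf_config *perf_cfg)
{
   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
      (emit_mi_flush_t)xyz_perf_emit_stall_at_pixel_scoreboard;
}

gen_perf_begin_query() and gen_perf_end_query() then invoke this hook around each counter snapshot instead of the old full cache-flush callback.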