anv: reset query pools using blorp
Previously we used PIPE_CONTROL (PC) writes to set query data to 0 during CmdResetQueryPool, which was slow when clearing large query pools. Switching to blorp to clear the pools is faster for large query pools. Red Dead Redemption 2: +1.5% speedup. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22178>
This commit is contained in:
@@ -76,6 +76,7 @@ static const driOptionDescription anv_dri_options[] = {
|
|||||||
DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
|
DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
|
||||||
DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
|
DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
|
||||||
DRI_CONF_NO_16BIT(false)
|
DRI_CONF_NO_16BIT(false)
|
||||||
|
DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(6)
|
||||||
DRI_CONF_SECTION_END
|
DRI_CONF_SECTION_END
|
||||||
|
|
||||||
DRI_CONF_SECTION_DEBUG
|
DRI_CONF_SECTION_DEBUG
|
||||||
@@ -1113,6 +1114,8 @@ anv_init_dri_options(struct anv_instance *instance)
|
|||||||
driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
|
driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
|
||||||
instance->generated_indirect_threshold =
|
instance->generated_indirect_threshold =
|
||||||
driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
|
driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
|
||||||
|
instance->query_clear_with_blorp_threshold =
|
||||||
|
driQueryOptioni(&instance->dri_options, "query_clear_with_blorp_threshold");
|
||||||
}
|
}
|
||||||
|
|
||||||
VkResult anv_CreateInstance(
|
VkResult anv_CreateInstance(
|
||||||
|
@@ -1029,6 +1029,7 @@ struct anv_instance {
|
|||||||
bool fp64_workaround_enabled;
|
bool fp64_workaround_enabled;
|
||||||
float lower_depth_range_rate;
|
float lower_depth_range_rate;
|
||||||
unsigned generated_indirect_threshold;
|
unsigned generated_indirect_threshold;
|
||||||
|
unsigned query_clear_with_blorp_threshold;
|
||||||
|
|
||||||
/* HW workarounds */
|
/* HW workarounds */
|
||||||
bool no_16bit;
|
bool no_16bit;
|
||||||
@@ -2085,6 +2086,11 @@ enum anv_pipe_bits {
|
|||||||
* implement a workaround for Gfx9.
|
* implement a workaround for Gfx9.
|
||||||
*/
|
*/
|
||||||
ANV_PIPE_POST_SYNC_BIT = (1 << 25),
|
ANV_PIPE_POST_SYNC_BIT = (1 << 25),
|
||||||
|
|
||||||
|
/* This bit does not exist directly in PIPE_CONTROL. It means that render
|
||||||
|
* target operations related to clearing of queries are ongoing.
|
||||||
|
*/
|
||||||
|
ANV_PIPE_QUERY_CLEARS_BIT = (1 << 26),
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ANV_PIPE_FLUSH_BITS ( \
|
#define ANV_PIPE_FLUSH_BITS ( \
|
||||||
@@ -2127,6 +2133,20 @@ enum anv_pipe_bits {
|
|||||||
#define ANV_PIPE_GPGPU_BITS ( \
|
#define ANV_PIPE_GPGPU_BITS ( \
|
||||||
(GFX_VERx10 >= 125 ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0))
|
(GFX_VERx10 >= 125 ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0))
|
||||||
|
|
||||||
|
/* Things we need to flush before accessing query data using the command
|
||||||
|
* streamer.
|
||||||
|
*
|
||||||
|
* Experiments show that, prior to DG2 (GFX_VERx10 < 125), the command streamer
|
||||||
|
* is not coherent with the tile cache, so we need to flush it to make any data visible to CS.
|
||||||
|
*
|
||||||
|
* On every platform we also flush the RT cache, which is where blorp writes, either
|
||||||
|
* for clearing the query buffer or for clearing the destination buffer in
|
||||||
|
* vkCopyQueryPoolResults().
|
||||||
|
*/
|
||||||
|
#define ANV_PIPE_QUERY_FLUSH_BITS ( \
|
||||||
|
(GFX_VERx10 < 125 ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \
|
||||||
|
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
|
||||||
|
|
||||||
enum intel_ds_stall_flag
|
enum intel_ds_stall_flag
|
||||||
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits);
|
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits);
|
||||||
|
|
||||||
|
@@ -1752,7 +1752,16 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
|
|||||||
* saying that render target writes are ongoing.
|
* saying that render target writes are ongoing.
|
||||||
*/
|
*/
|
||||||
if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
|
if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
|
||||||
bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
|
bits &= ~ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
|
||||||
|
|
||||||
|
/* If the conditions for flushing the query clears are met, we can
|
||||||
|
* toggle the bit off.
|
||||||
|
*/
|
||||||
|
if ((bits & ANV_PIPE_QUERY_FLUSH_BITS) == ANV_PIPE_QUERY_FLUSH_BITS &&
|
||||||
|
(bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT |
|
||||||
|
ANV_PIPE_CS_STALL_BIT))) {
|
||||||
|
bits &= ~ANV_PIPE_QUERY_CLEARS_BIT;
|
||||||
|
}
|
||||||
|
|
||||||
bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
|
bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
|
||||||
ANV_PIPE_END_OF_PIPE_SYNC_BIT);
|
ANV_PIPE_END_OF_PIPE_SYNC_BIT);
|
||||||
@@ -3803,6 +3812,16 @@ genX(EndCommandBuffer)(
|
|||||||
return VK_SUCCESS;
|
return VK_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Flush query clears using blorp so that secondary query writes do not
|
||||||
|
* race with the clear.
|
||||||
|
*/
|
||||||
|
if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) {
|
||||||
|
anv_add_pending_pipe_bits(cmd_buffer,
|
||||||
|
ANV_PIPE_QUERY_FLUSH_BITS |
|
||||||
|
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT,
|
||||||
|
"query clear flush prior command buffer end");
|
||||||
|
}
|
||||||
|
|
||||||
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
|
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
|
||||||
|
|
||||||
/* Turn on object level preemption if it is disabled to have it in known
|
/* Turn on object level preemption if it is disabled to have it in known
|
||||||
@@ -3878,6 +3897,16 @@ genX(CmdExecuteCommands)(
|
|||||||
*/
|
*/
|
||||||
genX(apply_task_urb_workaround)(primary);
|
genX(apply_task_urb_workaround)(primary);
|
||||||
|
|
||||||
|
/* Flush query clears using blorp so that secondary query writes do not
|
||||||
|
* race with the clear.
|
||||||
|
*/
|
||||||
|
if (primary->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) {
|
||||||
|
anv_add_pending_pipe_bits(primary,
|
||||||
|
ANV_PIPE_QUERY_FLUSH_BITS |
|
||||||
|
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT,
|
||||||
|
"query clear flush prior to secondary buffer");
|
||||||
|
}
|
||||||
|
|
||||||
/* The secondary command buffer doesn't know which textures etc. have been
|
/* The secondary command buffer doesn't know which textures etc. have been
|
||||||
* flushed prior to their execution. Apply those flushes now.
|
* flushed prior to their execution. Apply those flushes now.
|
||||||
*/
|
*/
|
||||||
@@ -6535,6 +6564,21 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
|
|||||||
cmd_buffer->state.compute.pipeline_dirty = true;
|
cmd_buffer->state.compute.pipeline_dirty = true;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if GFX_VERx10 < 125
|
||||||
|
/* We apparently cannot flush the tile cache (color/depth) from the GPGPU
|
||||||
|
* pipeline. That means query clears will not be visible to query
|
||||||
|
* copy/write. So we need to flush it before going to GPGPU mode.
|
||||||
|
*/
|
||||||
|
if (cmd_buffer->state.current_pipeline == _3D &&
|
||||||
|
(cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT)) {
|
||||||
|
anv_add_pending_pipe_bits(cmd_buffer,
|
||||||
|
ANV_PIPE_QUERY_FLUSH_BITS |
|
||||||
|
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
|
||||||
|
"query clear flush prior to GPGPU");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if GFX_VER >= 12
|
#if GFX_VER >= 12
|
||||||
/* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
|
/* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
|
||||||
*
|
*
|
||||||
|
@@ -181,7 +181,6 @@ VkResult genX(CreateQueryPool)(
|
|||||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
|
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
|
||||||
uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */;
|
uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */;
|
||||||
break;
|
break;
|
||||||
break;
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
|
case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
|
||||||
@@ -783,6 +782,19 @@ void genX(CmdResetQueryPool)(
|
|||||||
{
|
{
|
||||||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||||
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
||||||
|
struct anv_physical_device *pdevice = cmd_buffer->device->physical;
|
||||||
|
|
||||||
|
if (queryCount >= pdevice->instance->query_clear_with_blorp_threshold) {
|
||||||
|
anv_cmd_buffer_fill_area(cmd_buffer,
|
||||||
|
anv_query_address(pool, firstQuery),
|
||||||
|
queryCount * pool->stride,
|
||||||
|
0);
|
||||||
|
|
||||||
|
anv_add_pending_pipe_bits(cmd_buffer,
|
||||||
|
ANV_PIPE_QUERY_CLEARS_BIT,
|
||||||
|
"vkCmdResetQueryPool of timestamps");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
switch (pool->type) {
|
switch (pool->type) {
|
||||||
case VK_QUERY_TYPE_OCCLUSION:
|
case VK_QUERY_TYPE_OCCLUSION:
|
||||||
@@ -963,6 +975,22 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
|
||||||
|
struct anv_query_pool *pool,
|
||||||
|
const char *reason)
|
||||||
|
{
|
||||||
|
if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
anv_add_pending_pipe_bits(cmd_buffer,
|
||||||
|
ANV_PIPE_QUERY_FLUSH_BITS |
|
||||||
|
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
|
||||||
|
reason);
|
||||||
|
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void genX(CmdBeginQuery)(
|
void genX(CmdBeginQuery)(
|
||||||
VkCommandBuffer commandBuffer,
|
VkCommandBuffer commandBuffer,
|
||||||
VkQueryPool queryPool,
|
VkQueryPool queryPool,
|
||||||
@@ -983,6 +1011,8 @@ void genX(CmdBeginQueryIndexedEXT)(
|
|||||||
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
||||||
struct anv_address query_addr = anv_query_address(pool, query);
|
struct anv_address query_addr = anv_query_address(pool, query);
|
||||||
|
|
||||||
|
emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears");
|
||||||
|
|
||||||
struct mi_builder b;
|
struct mi_builder b;
|
||||||
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
||||||
|
|
||||||
@@ -1352,6 +1382,9 @@ void genX(CmdWriteTimestamp2)(
|
|||||||
|
|
||||||
assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
|
assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
|
||||||
|
|
||||||
|
emit_query_clear_flush(cmd_buffer, pool,
|
||||||
|
"CmdWriteTimestamp flush query clears");
|
||||||
|
|
||||||
struct mi_builder b;
|
struct mi_builder b;
|
||||||
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
||||||
|
|
||||||
@@ -1472,17 +1505,22 @@ void genX(CmdCopyQueryPoolResults)(
|
|||||||
* to ensure proper ordering of the commands from the 3d pipe and the
|
* to ensure proper ordering of the commands from the 3d pipe and the
|
||||||
* command streamer.
|
* command streamer.
|
||||||
*/
|
*/
|
||||||
if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
|
const bool need_flushes =
|
||||||
|
(cmd_buffer->state.pending_pipe_bits &
|
||||||
|
(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES |
|
||||||
|
ANV_PIPE_QUERY_CLEARS_BIT));
|
||||||
|
|
||||||
|
if (need_flushes) {
|
||||||
anv_add_pending_pipe_bits(cmd_buffer,
|
anv_add_pending_pipe_bits(cmd_buffer,
|
||||||
ANV_PIPE_TILE_CACHE_FLUSH_BIT |
|
ANV_PIPE_QUERY_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT,
|
||||||
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
|
|
||||||
"CopyQueryPoolResults");
|
"CopyQueryPoolResults");
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
|
bool need_cs_stall =
|
||||||
|
(cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
|
||||||
/* Occlusion & timestamp queries are written using a PIPE_CONTROL and
|
/* Occlusion & timestamp queries are written using a PIPE_CONTROL and
|
||||||
* because we're about to copy values from MI commands, we need to
|
* because we're about to copy values from MI commands, we need to stall
|
||||||
* stall the command streamer to make sure the PIPE_CONTROL values have
|
* the command streamer to make sure the PIPE_CONTROL values have
|
||||||
* landed, otherwise we could see inconsistent values & availability.
|
* landed, otherwise we could see inconsistent values & availability.
|
||||||
*
|
*
|
||||||
* From the vulkan spec:
|
* From the vulkan spec:
|
||||||
@@ -1492,13 +1530,17 @@ void genX(CmdCopyQueryPoolResults)(
|
|||||||
* any additional synchronization."
|
* any additional synchronization."
|
||||||
*/
|
*/
|
||||||
pool->type == VK_QUERY_TYPE_OCCLUSION ||
|
pool->type == VK_QUERY_TYPE_OCCLUSION ||
|
||||||
pool->type == VK_QUERY_TYPE_TIMESTAMP) {
|
pool->type == VK_QUERY_TYPE_TIMESTAMP;
|
||||||
|
|
||||||
|
if (need_cs_stall) {
|
||||||
anv_add_pending_pipe_bits(cmd_buffer,
|
anv_add_pending_pipe_bits(cmd_buffer,
|
||||||
ANV_PIPE_CS_STALL_BIT,
|
ANV_PIPE_CS_STALL_BIT,
|
||||||
"CopyQueryPoolResults");
|
"CopyQueryPoolResults stall");
|
||||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (need_cs_stall || need_flushes)
|
||||||
|
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||||
|
|
||||||
struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
|
struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
|
||||||
for (uint32_t i = 0; i < queryCount; i++) {
|
for (uint32_t i = 0; i < queryCount; i++) {
|
||||||
struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
|
struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
|
||||||
|
@@ -654,6 +654,10 @@
|
|||||||
DRI_CONF_OPT_I(generated_indirect_threshold, def, 0, INT32_MAX, \
|
DRI_CONF_OPT_I(generated_indirect_threshold, def, 0, INT32_MAX, \
|
||||||
"Indirect threshold count above which we start generating commands")
|
"Indirect threshold count above which we start generating commands")
|
||||||
|
|
||||||
|
/* driconf option: query count at or above which vkCmdResetQueryPool() clears
 * the pool range with blorp instead of taking the per-query-type reset path.
 * The description previously duplicated the generated_indirect_threshold text.
 */
#define DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(def) \
   DRI_CONF_OPT_I(query_clear_with_blorp_threshold, def, 0, INT32_MAX, \
                  "Query count at or above which blorp is used to clear query pools")
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief DZN specific configuration options
|
* \brief DZN specific configuration options
|
||||||
*/
|
*/
|
||||||
|
Reference in New Issue
Block a user