anv: More carefully dirty state in BindPipeline
Instead of blindly dirtying descriptors and push constants the moment we
see a pipeline change, check to see if it actually changes the bind
layout or push constant layout. This doubles the runtime performance of
one CPU-limited example running with the Dawn WebGPU implementation when
running on my laptop.
NOTE: This effectively reverts beca63c6c0. While it was a nice
optimization, it was based on prog_data and we can't do that anymore
once we start allowing the same binding table to be used with multiple
different pipelines.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
This commit is contained in:
@@ -380,6 +380,34 @@ anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
|
||||
cmd_buffer);
|
||||
}
|
||||
|
||||
static bool
|
||||
mem_update(void *dst, const void *src, size_t size)
|
||||
{
|
||||
if (memcmp(dst, src, size) == 0)
|
||||
return false;
|
||||
|
||||
memcpy(dst, src, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
|
||||
gl_shader_stage stage,
|
||||
const struct anv_pipeline_bind_map *map)
|
||||
{
|
||||
if (mem_update(cmd_buffer->state.surface_sha1s[stage],
|
||||
map->surface_sha1, sizeof(map->surface_sha1)))
|
||||
cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage);
|
||||
|
||||
if (mem_update(cmd_buffer->state.sampler_sha1s[stage],
|
||||
map->sampler_sha1, sizeof(map->sampler_sha1)))
|
||||
cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage);
|
||||
|
||||
if (mem_update(cmd_buffer->state.push_sha1s[stage],
|
||||
map->push_sha1, sizeof(map->push_sha1)))
|
||||
cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage);
|
||||
}
|
||||
|
||||
void anv_CmdBindPipeline(
|
||||
VkCommandBuffer commandBuffer,
|
||||
VkPipelineBindPoint pipelineBindPoint,
|
||||
@@ -389,19 +417,30 @@ void anv_CmdBindPipeline(
|
||||
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
|
||||
|
||||
switch (pipelineBindPoint) {
|
||||
case VK_PIPELINE_BIND_POINT_COMPUTE:
|
||||
case VK_PIPELINE_BIND_POINT_COMPUTE: {
|
||||
if (cmd_buffer->state.compute.base.pipeline == pipeline)
|
||||
return;
|
||||
|
||||
cmd_buffer->state.compute.base.pipeline = pipeline;
|
||||
cmd_buffer->state.compute.pipeline_dirty = true;
|
||||
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
const struct anv_pipeline_bind_map *bind_map =
|
||||
&pipeline->shaders[MESA_SHADER_COMPUTE]->bind_map;
|
||||
set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, bind_map);
|
||||
break;
|
||||
}
|
||||
|
||||
case VK_PIPELINE_BIND_POINT_GRAPHICS:
|
||||
if (cmd_buffer->state.gfx.base.pipeline == pipeline)
|
||||
return;
|
||||
|
||||
cmd_buffer->state.gfx.base.pipeline = pipeline;
|
||||
cmd_buffer->state.gfx.vb_dirty |= pipeline->vb_used;
|
||||
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
|
||||
cmd_buffer->state.push_constants_dirty |= pipeline->active_stages;
|
||||
cmd_buffer->state.descriptors_dirty |= pipeline->active_stages;
|
||||
|
||||
anv_foreach_stage(stage, pipeline->active_stages) {
|
||||
set_dirty_for_bind_map(cmd_buffer, stage,
|
||||
&pipeline->shaders[stage]->bind_map);
|
||||
}
|
||||
|
||||
/* Apply the dynamic state from the pipeline */
|
||||
cmd_buffer->state.gfx.dirty |=
|
||||
|
@@ -25,6 +25,7 @@
|
||||
#include "program/prog_parameter.h"
|
||||
#include "nir/nir_builder.h"
|
||||
#include "compiler/brw_nir.h"
|
||||
#include "util/mesa-sha1.h"
|
||||
#include "util/set.h"
|
||||
|
||||
/* Sampler tables don't actually have a maximum size but we pick one just so
|
||||
@@ -1318,6 +1319,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
|
||||
dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
|
||||
pipe_binding[i].input_attachment_index = var->data.index + i;
|
||||
|
||||
/* NOTE: This is a uint8_t so we really do need to != 0 here */
|
||||
pipe_binding[i].write_only =
|
||||
(var->data.image.access & ACCESS_NON_READABLE) != 0;
|
||||
}
|
||||
@@ -1367,4 +1369,15 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
|
||||
}
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
|
||||
/* Now that we're done computing the surface and sampler portions of the
|
||||
* bind map, hash them. This lets us quickly determine if the actual
|
||||
* mapping has changed and not just a no-op pipeline change.
|
||||
*/
|
||||
_mesa_sha1_compute(map->surface_to_descriptor,
|
||||
map->surface_count * sizeof(struct anv_pipeline_binding),
|
||||
map->surface_sha1);
|
||||
_mesa_sha1_compute(map->sampler_to_descriptor,
|
||||
map->sampler_count * sizeof(struct anv_pipeline_binding),
|
||||
map->sampler_sha1);
|
||||
}
|
||||
|
@@ -23,6 +23,7 @@
|
||||
|
||||
#include "anv_nir.h"
|
||||
#include "compiler/brw_nir.h"
|
||||
#include "util/mesa-sha1.h"
|
||||
|
||||
void
|
||||
anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
|
||||
@@ -173,6 +174,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
|
||||
*/
|
||||
map->push_ranges[0] = push_constant_range;
|
||||
}
|
||||
|
||||
/* Now that we're done computing the push constant portion of the
|
||||
* bind map, hash it. This lets us quickly determine if the actual
|
||||
* mapping has changed and not just a no-op pipeline change.
|
||||
*/
|
||||
_mesa_sha1_compute(map->push_ranges,
|
||||
sizeof(map->push_ranges),
|
||||
map->push_sha1);
|
||||
}
|
||||
|
||||
void
|
||||
|
@@ -1550,6 +1550,12 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
|
||||
|
||||
anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map);
|
||||
|
||||
if (!stage.prog_data.cs.uses_num_work_groups) {
|
||||
assert(stage.bind_map.surface_to_descriptor[0].set ==
|
||||
ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS);
|
||||
stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL;
|
||||
}
|
||||
|
||||
const unsigned code_size = stage.prog_data.base.program_size;
|
||||
bin = anv_device_upload_kernel(pipeline->device, cache,
|
||||
&stage.cache_key, sizeof(stage.cache_key),
|
||||
|
@@ -161,6 +161,12 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
|
||||
blob_write_uint32(blob, 0);
|
||||
}
|
||||
|
||||
blob_write_bytes(blob, shader->bind_map.surface_sha1,
|
||||
sizeof(shader->bind_map.surface_sha1));
|
||||
blob_write_bytes(blob, shader->bind_map.sampler_sha1,
|
||||
sizeof(shader->bind_map.sampler_sha1));
|
||||
blob_write_bytes(blob, shader->bind_map.push_sha1,
|
||||
sizeof(shader->bind_map.push_sha1));
|
||||
blob_write_uint32(blob, shader->bind_map.surface_count);
|
||||
blob_write_uint32(blob, shader->bind_map.sampler_count);
|
||||
blob_write_bytes(blob, shader->bind_map.surface_to_descriptor,
|
||||
@@ -206,6 +212,9 @@ anv_shader_bin_create_from_blob(struct anv_device *device,
|
||||
xfb_info = blob_read_bytes(blob, xfb_size);
|
||||
|
||||
struct anv_pipeline_bind_map bind_map;
|
||||
blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1));
|
||||
blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1));
|
||||
blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1));
|
||||
bind_map.surface_count = blob_read_uint32(blob);
|
||||
bind_map.sampler_count = blob_read_uint32(blob);
|
||||
bind_map.surface_to_descriptor = (void *)
|
||||
|
@@ -2009,6 +2009,7 @@ anv_descriptor_set_destroy(struct anv_device *device,
|
||||
struct anv_descriptor_pool *pool,
|
||||
struct anv_descriptor_set *set);
|
||||
|
||||
#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5)
|
||||
#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4)
|
||||
#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3)
|
||||
#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2)
|
||||
@@ -2042,7 +2043,12 @@ struct anv_pipeline_binding {
|
||||
};
|
||||
|
||||
/** For a storage image, whether it is write-only */
|
||||
bool write_only;
|
||||
uint8_t write_only;
|
||||
|
||||
/** Pad to 64 bits so that there are no holes and we can safely memcmp
|
||||
* assuming POD zero-initialization.
|
||||
*/
|
||||
uint8_t pad;
|
||||
};
|
||||
|
||||
struct anv_push_range {
|
||||
@@ -2575,6 +2581,10 @@ struct anv_cmd_state {
|
||||
struct anv_state binding_tables[MESA_SHADER_STAGES];
|
||||
struct anv_state samplers[MESA_SHADER_STAGES];
|
||||
|
||||
unsigned char sampler_sha1s[MESA_SHADER_STAGES][20];
|
||||
unsigned char surface_sha1s[MESA_SHADER_STAGES][20];
|
||||
unsigned char push_sha1s[MESA_SHADER_STAGES][20];
|
||||
|
||||
/**
|
||||
* Whether or not the gen8 PMA fix is enabled. We ensure that, at the top
|
||||
* of any command buffer it is disabled by disabling it in EndCommandBuffer
|
||||
@@ -2936,6 +2946,10 @@ mesa_to_vk_shader_stage(gl_shader_stage mesa_stage)
|
||||
__tmp &= ~(1 << (stage)))
|
||||
|
||||
struct anv_pipeline_bind_map {
|
||||
unsigned char surface_sha1[20];
|
||||
unsigned char sampler_sha1[20];
|
||||
unsigned char push_sha1[20];
|
||||
|
||||
uint32_t surface_count;
|
||||
uint32_t sampler_count;
|
||||
|
||||
|
@@ -2122,8 +2122,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
struct anv_shader_bin *bin = pipeline->shaders[stage];
|
||||
struct anv_pipeline_bind_map *map = &bin->bind_map;
|
||||
struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
|
||||
if (map->surface_count == 0) {
|
||||
*bt_state = (struct anv_state) { 0, };
|
||||
return VK_SUCCESS;
|
||||
@@ -2149,6 +2148,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
|
||||
struct anv_state surface_state;
|
||||
|
||||
switch (binding->set) {
|
||||
case ANV_DESCRIPTOR_SET_NULL:
|
||||
bt_map[s] = 0;
|
||||
break;
|
||||
|
||||
case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
|
||||
/* Color attachment binding */
|
||||
assert(stage == MESA_SHADER_FRAGMENT);
|
||||
@@ -2199,8 +2202,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
|
||||
case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
|
||||
/* This is always the first binding for compute shaders */
|
||||
assert(stage == MESA_SHADER_COMPUTE && s == 0);
|
||||
if (!get_cs_prog_data(pipeline)->uses_num_work_groups)
|
||||
break;
|
||||
|
||||
struct anv_state surface_state =
|
||||
anv_cmd_buffer_alloc_surface_state(cmd_buffer);
|
||||
@@ -2305,16 +2306,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
|
||||
break;
|
||||
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
|
||||
/* If the shader never does any UBO pulls (this is a fairly common
|
||||
* case) then we don't need to fill out those binding table entries.
|
||||
* The real cost savings here is that we don't have to build the
|
||||
* surface state for them which is surprisingly expensive when it's
|
||||
* on the hot-path.
|
||||
*/
|
||||
if (!bin->prog_data->has_ubo_pull)
|
||||
continue;
|
||||
/* Fall through */
|
||||
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
|
||||
/* Compute the offset within the buffer */
|
||||
struct anv_push_constants *push =
|
||||
@@ -2730,11 +2721,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
|
||||
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
|
||||
|
||||
/* The exact descriptor layout is pulled from the pipeline, so we need
|
||||
* to re-emit binding tables on every pipeline change.
|
||||
*/
|
||||
cmd_buffer->state.descriptors_dirty |= pipeline->active_stages;
|
||||
|
||||
/* If the pipeline changed, we may need to re-allocate push constant
|
||||
* space in the URB.
|
||||
*/
|
||||
|
Reference in New Issue
Block a user