diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index b86db4e4e20..2bb2c6f3fef 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -64,15 +64,6 @@ prepare_driver_set(struct panvk_cmd_buffer *cmdbuf) return VK_SUCCESS; } -static VkResult -prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf) -{ - cmdbuf->state.compute.push_uniforms = panvk_per_arch( - cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE); - return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS - : VK_ERROR_OUT_OF_DEVICE_MEMORY; -} - static void calculate_task_axis_and_increment(const struct panvk_shader *shader, struct panvk_physical_device *phys_dev, @@ -238,7 +229,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) if (result != VK_SUCCESS) return; - result = prepare_push_uniforms(cmdbuf); + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, cmdbuf->state.compute.shader); if (result != VK_SUCCESS) return; @@ -268,11 +260,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table); if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { - uint32_t push_size = - SYSVALS_PUSH_CONST_BASE + sizeof(struct panvk_compute_sysvals); - uint64_t fau_count = DIV_ROUND_UP(push_size, 8); - mali_ptr fau_ptr = - cmdbuf->state.compute.push_uniforms | (fau_count << 56); + mali_ptr fau_ptr = cmdbuf->state.compute.push_uniforms | + ((uint64_t)shader->fau.total_count << 56); cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr); } @@ -294,11 +283,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) } cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]); cs_move32_to(b, cs_sr_reg32(b, 34), - info->direct.wg_base.x * shader->local_size.x); + info->wg_base.x * shader->local_size.x); cs_move32_to(b, cs_sr_reg32(b, 35), - 
info->direct.wg_base.y * shader->local_size.y); + info->wg_base.y * shader->local_size.y); cs_move32_to(b, cs_sr_reg32(b, 36), - info->direct.wg_base.z * shader->local_size.z); + info->wg_base.z * shader->local_size.z); if (indirect) { /* Load parameters from indirect buffer and update workgroup count * registers and sysvals */ @@ -309,10 +298,25 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.compute.push_uniforms); cs_wait_slot(b, SB_ID(LS), false); - cs_store(b, cs_sr_reg_tuple(b, 37, 3), cs_scratch_reg64(b, 0), - BITFIELD_MASK(3), - SYSVALS_PUSH_CONST_BASE + - offsetof(struct panvk_compute_sysvals, num_work_groups)); + + if (shader_uses_sysval(shader, compute, num_work_groups.x)) { + cs_store32(b, cs_sr_reg32(b, 37), cs_scratch_reg64(b, 0), + shader_remapped_sysval_offset( + shader, sysval_offset(compute, num_work_groups.x))); + } + + if (shader_uses_sysval(shader, compute, num_work_groups.y)) { + cs_store32(b, cs_sr_reg32(b, 38), cs_scratch_reg64(b, 0), + shader_remapped_sysval_offset( + shader, sysval_offset(compute, num_work_groups.y))); + } + + if (shader_uses_sysval(shader, compute, num_work_groups.z)) { + cs_store32(b, cs_sr_reg32(b, 39), cs_scratch_reg64(b, 0), + shader_remapped_sysval_offset( + shader, sysval_offset(compute, num_work_groups.z))); + } + cs_wait_slot(b, SB_ID(LS), false); } else { cs_move32_to(b, cs_sr_reg32(b, 37), info->direct.wg_count.x); @@ -385,10 +389,8 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer, { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); struct panvk_dispatch_info info = { - .direct = { - .wg_base = {baseGroupX, baseGroupY, baseGroupZ}, - .wg_count = {groupCountX, groupCountY, groupCountZ}, - } + .wg_base = {baseGroupX, baseGroupY, baseGroupZ}, + .direct.wg_count = {groupCountX, groupCountY, groupCountZ}, }; cmd_dispatch(cmdbuf, &info); } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c 
b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index db9be1104da..8fd0dad8dd6 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -404,7 +404,9 @@ prepare_blend(struct panvk_cmd_buffer *cmdbuf) panvk_per_arch(blend_emit_descs)(cmdbuf, bds); - cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count); + cs_update_vt_ctx(b) + cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count); + return VK_SUCCESS; } @@ -1200,24 +1202,38 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf) { struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + const struct panvk_shader *fs = get_fs(cmdbuf); + VkResult result; - if (gfx_state_dirty(cmdbuf, PUSH_UNIFORMS)) { - cmdbuf->state.gfx.push_uniforms = panvk_per_arch( - cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS); - if (!cmdbuf->state.gfx.push_uniforms) - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - - uint32_t push_size = - SYSVALS_PUSH_CONST_BASE + sizeof(struct panvk_graphics_sysvals); - uint64_t fau_count = DIV_ROUND_UP(push_size, 8); - mali_ptr fau_ptr = cmdbuf->state.gfx.push_uniforms | (fau_count << 56); + if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs); + if (result != VK_SUCCESS) + return result; cs_update_vt_ctx(b) { - cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr); - cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr); + cs_move64_to(b, cs_sr_reg64(b, 8), + cmdbuf->state.gfx.vs.push_uniforms | + ((uint64_t)vs->fau.total_count << 56)); } } + if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) { + mali_ptr fau_ptr = 0; + + if (fs) { + result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, fs); + if (result != VK_SUCCESS) + return result; + + fau_ptr = cmdbuf->state.gfx.fs.push_uniforms | + ((uint64_t)fs->fau.total_count << 56); + } + + cs_update_vt_ctx(b) + cs_move64_to(b, 
cs_sr_reg64(b, 12), fau_ptr); + } + return VK_SUCCESS; } @@ -1535,6 +1551,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) return result; } + result = prepare_blend(cmdbuf); + if (result != VK_SUCCESS) + return result; + panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw); result = prepare_push_uniforms(cmdbuf); @@ -1570,10 +1590,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) cs_move32_to(b, cs_sr_reg32(b, 48), varying_size); - result = prepare_blend(cmdbuf); - if (result != VK_SUCCESS) - return result; - result = prepare_ds(cmdbuf); if (result != VK_SUCCESS) return result; @@ -1789,7 +1805,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, assert(draw->indirect.draw_count == 1); /* Force a new push uniform block to be allocated */ - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS); result = prepare_draw(cmdbuf, draw); if (result != VK_SUCCESS) @@ -1808,17 +1824,27 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, /* Wait for the SR33-37 indirect buffer load. */ cs_wait_slot(b, SB_ID(LS), false); - struct cs_index fau_block_addr = cs_scratch_reg64(b, 2); - cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.push_uniforms); - cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr, - SYSVALS_PUSH_CONST_BASE + - offsetof(struct panvk_graphics_sysvals, vs.first_vertex)); - cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr, - SYSVALS_PUSH_CONST_BASE + - offsetof(struct panvk_graphics_sysvals, vs.base_instance)); + if (shader_uses_sysval(vs, graphics, vs.first_vertex) || + shader_uses_sysval(vs, graphics, vs.base_instance)) { + struct cs_index fau_block_addr = cs_scratch_reg64(b, 2); + cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms); - /* Wait for the store using SR-37 as src to finish, so we can overwrite it. 
*/ - cs_wait_slot(b, SB_ID(LS), false); + if (shader_uses_sysval(vs, graphics, vs.first_vertex)) { + cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr, + shader_remapped_sysval_offset( + vs, sysval_offset(graphics, vs.first_vertex))); + } + + if (shader_uses_sysval(vs, graphics, vs.base_instance)) { + cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr, + shader_remapped_sysval_offset( + vs, sysval_offset(graphics, vs.base_instance))); + } + + /* Wait for the store using SR-37 as src to finish, so we can overwrite + * it. */ + cs_wait_slot(b, SB_ID(LS), false); + } /* NIR expects zero-based instance ID, but even if it did have an intrinsic to * load the absolute instance ID, we'd want to keep it zero-based to work around diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c index 2e5128e8180..a5ee126b346 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c @@ -44,10 +44,8 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer, return; struct panvk_dispatch_info info = { - .direct = { - .wg_base = {baseGroupX, baseGroupY, baseGroupZ}, - .wg_count = {groupCountX, groupCountY, groupCountZ}, - }, + .wg_base = {baseGroupX, baseGroupY, baseGroupZ}, + .direct.wg_count = {groupCountX, groupCountY, groupCountZ}, }; struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); struct panvk_physical_device *phys_dev = @@ -80,14 +78,10 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer, panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, &info); - if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { - cmdbuf->state.compute.push_uniforms = panvk_per_arch( - cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE); - if (!cmdbuf->state.compute.push_uniforms) - return; - } - - mali_ptr push_uniforms = cmdbuf->state.compute.push_uniforms; + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, cmdbuf->state.compute.shader); + 
if (result != VK_SUCCESS) + return; struct panfrost_ptr copy_desc_job = {0}; @@ -130,7 +124,7 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer, cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_IMG]; cfg.thread_storage = tsd; cfg.uniform_buffers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO]; - cfg.push_uniforms = push_uniforms; + cfg.push_uniforms = cmdbuf->state.compute.push_uniforms; cfg.textures = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]; cfg.samplers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]; } diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c index fbe76dfa649..97ad076addc 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c @@ -50,7 +50,6 @@ struct panvk_draw_data { mali_ptr rsd; mali_ptr varyings; } fs; - mali_ptr push_uniforms; mali_ptr varying_bufs; mali_ptr position; mali_ptr indices; @@ -722,7 +721,7 @@ panvk_emit_vertex_dcd(struct panvk_cmd_buffer *cmdbuf, cfg.instance_size = draw->info.instance.count > 1 ? draw->padded_vertex_count : 1; cfg.uniform_buffers = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO]; - cfg.push_uniforms = draw->push_uniforms; + cfg.push_uniforms = cmdbuf->state.gfx.vs.push_uniforms; cfg.textures = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]; cfg.samplers = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]; } @@ -900,7 +899,7 @@ panvk_emit_tiler_dcd(struct panvk_cmd_buffer *cmdbuf, cfg.instance_size = draw->info.instance.count > 1 ? 
draw->padded_vertex_count : 1; cfg.uniform_buffers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO]; - cfg.push_uniforms = draw->push_uniforms; + cfg.push_uniforms = cmdbuf->state.gfx.fs.push_uniforms; cfg.textures = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]; cfg.samplers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]; @@ -1261,12 +1260,18 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, &draw->info); - cmdbuf->state.gfx.push_uniforms = panvk_per_arch( - cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS); - if (!cmdbuf->state.gfx.push_uniforms) + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, cmdbuf->state.gfx.vs.shader); + if (result != VK_SUCCESS) return; - draw->push_uniforms = cmdbuf->state.gfx.push_uniforms; + if (fs) { + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, cmdbuf->state.gfx.fs.shader); + if (result != VK_SUCCESS) + return; + } + result = panvk_draw_prepare_tiler_context(cmdbuf, draw); if (result != VK_SUCCESS) return; diff --git a/src/panfrost/vulkan/panvk_cmd_dispatch.h b/src/panfrost/vulkan/panvk_cmd_dispatch.h index b21f37a8ccd..09545e32279 100644 --- a/src/panfrost/vulkan/panvk_cmd_dispatch.h +++ b/src/panfrost/vulkan/panvk_cmd_dispatch.h @@ -43,11 +43,26 @@ struct panvk_cmd_compute_state { compute_state_clear_all_dirty(__cmdbuf); \ } while (0) +/* Store __val in the compute sysval __name and, if the stored value changed, + * flag every FAU word covered by that field in the __dirty bitset. */ +#define set_compute_sysval(__cmdbuf, __dirty, __name, __val) \ + do { \ + struct panvk_compute_sysvals __new_sysval; \ + __new_sysval.__name = (__val); \ + if (memcmp(&(__cmdbuf)->state.compute.sysvals.__name, \ + &__new_sysval.__name, sizeof(__new_sysval.__name))) { \ + (__cmdbuf)->state.compute.sysvals.__name = __new_sysval.__name; \ + BITSET_SET_RANGE(__dirty, sysval_fau_start(compute, __name), \ + sysval_fau_end(compute, __name)); \ + } \ + } while (0) + struct panvk_dispatch_info { struct { - struct { - uint32_t x, y, z; - } wg_base; + 
uint32_t x, y, z; + } wg_base; + + struct { struct { uint32_t x, y, z; } wg_count; diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 34bba81361b..fe5d70307f4 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -87,7 +87,8 @@ enum panvk_cmd_graphics_dirty_state { PANVK_CMD_GRAPHICS_DIRTY_OQ, PANVK_CMD_GRAPHICS_DIRTY_DESC_STATE, PANVK_CMD_GRAPHICS_DIRTY_RENDER_STATE, - PANVK_CMD_GRAPHICS_DIRTY_PUSH_UNIFORMS, + PANVK_CMD_GRAPHICS_DIRTY_VS_PUSH_UNIFORMS, + PANVK_CMD_GRAPHICS_DIRTY_FS_PUSH_UNIFORMS, PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT, }; @@ -109,6 +110,7 @@ struct panvk_cmd_graphics_state { struct { const struct panvk_shader *shader; struct panvk_shader_desc_state desc; + mali_ptr push_uniforms; bool required; #if PAN_ARCH <= 7 mali_ptr rsd; @@ -118,6 +120,7 @@ struct panvk_cmd_graphics_state { struct { const struct panvk_shader *shader; struct panvk_shader_desc_state desc; + mali_ptr push_uniforms; #if PAN_ARCH <= 7 mali_ptr attribs; mali_ptr attrib_bufs; @@ -142,8 +145,6 @@ struct panvk_cmd_graphics_state { struct panvk_rendering_state render; - mali_ptr push_uniforms; - #if PAN_ARCH <= 7 mali_ptr vpd; #endif @@ -171,6 +172,18 @@ struct panvk_cmd_graphics_state { #define gfx_state_set_all_dirty(__cmdbuf) \ BITSET_ONES((__cmdbuf)->state.gfx.dirty) +#define set_gfx_sysval(__cmdbuf, __dirty, __name, __val) \ + do { \ + struct panvk_graphics_sysvals __new_sysval; \ + __new_sysval.__name = __val; \ + if (memcmp(&(__cmdbuf)->state.gfx.sysvals.__name, &__new_sysval.__name, \ + sizeof(__new_sysval.__name))) { \ + (__cmdbuf)->state.gfx.sysvals.__name = __new_sysval.__name; \ + BITSET_SET_RANGE(__dirty, sysval_fau_start(graphics, __name), \ + sysval_fau_end(graphics, __name)); \ + } \ + } while (0) + static inline uint32_t panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev, const struct panvk_cmd_graphics_state *state) @@ -278,11 +291,15 @@ 
cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state, do { \ bool __set_fs_dirty = \ (__cmdbuf)->state.gfx.fs.shader != get_fs(__cmdbuf); \ + bool __set_fs_push_dirty = \ + __set_fs_dirty && gfx_state_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \ vk_dynamic_graphics_state_clear_dirty( \ &(__cmdbuf)->vk.dynamic_graphics_state); \ gfx_state_clear_all_dirty(__cmdbuf); \ if (__set_fs_dirty) \ gfx_state_set_dirty(__cmdbuf, FS); \ + if (__set_fs_push_dirty) \ + gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \ } while (0) void diff --git a/src/panfrost/vulkan/panvk_cmd_push_constant.h b/src/panfrost/vulkan/panvk_cmd_push_constant.h index a48ca7cf12c..5c709e09f37 100644 --- a/src/panfrost/vulkan/panvk_cmd_push_constant.h +++ b/src/panfrost/vulkan/panvk_cmd_push_constant.h @@ -11,15 +11,16 @@ #include "genxml/gen_macros.h" struct panvk_cmd_buffer; +struct panvk_shader; #define MAX_PUSH_CONSTANTS_SIZE 128 struct panvk_push_constant_state { - uint8_t data[MAX_PUSH_CONSTANTS_SIZE]; + uint64_t data[MAX_PUSH_CONSTANTS_SIZE / sizeof(uint64_t)]; }; -mali_ptr +VkResult panvk_per_arch(cmd_prepare_push_uniforms)(struct panvk_cmd_buffer *cmdbuf, - VkPipelineBindPoint ptype); + const struct panvk_shader *shader); #endif diff --git a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h index 968b3469518..211ad95e900 100644 --- a/src/panfrost/vulkan/panvk_shader.h +++ b/src/panfrost/vulkan/panvk_shader.h @@ -52,16 +52,16 @@ enum panvk_desc_table_id { #endif struct panvk_graphics_sysvals { + struct { + float constants[4]; + } blend; + struct { struct { float x, y, z; } scale, offset; } viewport; - struct { - float constants[4]; - } blend; - struct { #if PAN_ARCH <= 7 int32_t raw_vertex_offset; @@ -105,32 +105,94 @@ struct panvk_compute_sysvals { #endif }; +/* This is not the final offset in the push constant buffer (AKA FAU), but + * just a magic offset we use before packing push constants so we can easily + * identify the type of push constant (driver 
sysvals vs user push constants). + */ #define SYSVALS_PUSH_CONST_BASE MAX_PUSH_CONSTANTS_SIZE +#define FAU_WORD_SIZE sizeof(uint64_t) + +static_assert((sizeof(struct panvk_compute_sysvals) % FAU_WORD_SIZE) == 0, + "struct panvk_compute_sysvals must be 8-byte aligned"); +static_assert((sizeof(struct panvk_graphics_sysvals) % FAU_WORD_SIZE) == 0, + "struct panvk_graphics_sysvals must be 8-byte aligned"); + +#define sysval_size(__ptype, __name) \ + sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name) + +#define sysval_offset(__ptype, __name) \ + offsetof(struct panvk_##__ptype##_sysvals, __name) + +#define sysval_entry_size(__ptype, __name) \ + sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0]) + +#define sysval_entry_offset(__ptype, __name, __idx) \ + (sysval_offset(__ptype, __name) + \ + (sysval_entry_size(__ptype, __name) * (__idx))) + +#define sysval_fau_start(__ptype, __name) \ + (sysval_offset(__ptype, __name) / FAU_WORD_SIZE) + +#define sysval_fau_end(__ptype, __name) \ + ((sysval_offset(__ptype, __name) + sysval_size(__ptype, __name) - 1) / \ + FAU_WORD_SIZE) + +#define sysval_fau_entry_start(__ptype, __name, __idx) \ + (sysval_entry_offset(__ptype, __name, __idx) / FAU_WORD_SIZE) + +#define sysval_fau_entry_end(__ptype, __name, __idx) \ + ((sysval_entry_offset(__ptype, __name, (__idx) + 1) - 1) / FAU_WORD_SIZE) + +#define shader_remapped_fau_offset(__shader, __kind, __offset) \ + ((FAU_WORD_SIZE * BITSET_PREFIX_SUM((__shader)->fau.used_##__kind, \ + (__offset) / FAU_WORD_SIZE)) + \ + ((__offset) % FAU_WORD_SIZE)) + +#define shader_remapped_sysval_offset(__shader, __offset) \ + shader_remapped_fau_offset(__shader, sysvals, __offset) + +#define shader_remapped_push_const_offset(__shader, __offset) \ + (((__shader)->fau.sysval_count * FAU_WORD_SIZE) + \ + shader_remapped_fau_offset(__shader, push_consts, __offset)) + +#define shader_use_sysval(__shader, __ptype, __name) \ + BITSET_SET_RANGE((__shader)->fau.used_sysvals, \ + sysval_fau_start(__ptype, 
__name), \ + sysval_fau_end(__ptype, __name)) + +#define shader_uses_sysval(__shader, __ptype, __name) \ + BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \ + sysval_fau_start(__ptype, __name), \ + sysval_fau_end(__ptype, __name)) + +#define shader_uses_sysval_entry(__shader, __ptype, __name, __idx) \ + BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \ + sysval_fau_entry_start(__ptype, __name, __idx), \ + sysval_fau_entry_end(__ptype, __name, __idx)) + +#define shader_use_sysval_range(__shader, __base, __range) \ + BITSET_SET_RANGE((__shader)->fau.used_sysvals, (__base) / FAU_WORD_SIZE, \ + ((__base) + (__range) - 1) / FAU_WORD_SIZE) + +#define shader_use_push_const_range(__shader, __base, __range) \ + BITSET_SET_RANGE((__shader)->fau.used_push_consts, \ + (__base) / FAU_WORD_SIZE, \ + ((__base) + (__range) - 1) / FAU_WORD_SIZE) #define load_sysval(__b, __ptype, __bitsz, __name) \ nir_load_push_constant( \ - __b, \ - sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name) / \ - ((__bitsz) / 8), \ - __bitsz, \ - nir_imm_int(__b, offsetof(struct panvk_##__ptype##_sysvals, __name)), \ - .base = SYSVALS_PUSH_CONST_BASE, \ - .range = sizeof(struct panvk_##__ptype##_sysvals)) + __b, sysval_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \ + nir_imm_int(__b, sysval_offset(__ptype, __name)), \ + .base = SYSVALS_PUSH_CONST_BASE) #define load_sysval_entry(__b, __ptype, __bitsz, __name, __dyn_idx) \ nir_load_push_constant( \ - __b, \ - sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0]) / \ - ((__bitsz) / 8), \ - __bitsz, \ + __b, sysval_entry_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \ nir_iadd_imm( \ __b, \ - nir_imul_imm( \ - __b, __dyn_idx, \ - sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0])), \ - offsetof(struct panvk_##__ptype##_sysvals, __name)), \ - .base = SYSVALS_PUSH_CONST_BASE, \ - .range = sizeof(struct panvk_##__ptype##_sysvals)) + nir_imul_imm(__b, __dyn_idx, sysval_entry_size(__ptype, __name)), \ + sysval_offset(__ptype, 
__name)), \ + .base = SYSVALS_PUSH_CONST_BASE) #if PAN_ARCH <= 7 enum panvk_bifrost_desc_table_type { @@ -155,6 +217,20 @@ enum panvk_bifrost_desc_table_type { #define COPY_DESC_HANDLE_EXTRACT_INDEX(handle) ((handle) & BITFIELD_MASK(28)) #define COPY_DESC_HANDLE_EXTRACT_TABLE(handle) ((handle) >> 28) +#define MAX_COMPUTE_SYSVAL_FAUS \ + (sizeof(struct panvk_compute_sysvals) / FAU_WORD_SIZE) +#define MAX_GFX_SYSVAL_FAUS \ + (sizeof(struct panvk_graphics_sysvals) / FAU_WORD_SIZE) +#define MAX_SYSVAL_FAUS MAX2(MAX_COMPUTE_SYSVAL_FAUS, MAX_GFX_SYSVAL_FAUS) +#define MAX_PUSH_CONST_FAUS (MAX_PUSH_CONSTANTS_SIZE / FAU_WORD_SIZE) + +struct panvk_shader_fau_info { + BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS); + BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS); + uint32_t sysval_count; + uint32_t total_count; +}; + struct panvk_shader { struct vk_shader vk; struct pan_shader_info info; @@ -184,6 +260,8 @@ struct panvk_shader { #endif } desc_info; + struct panvk_shader_fau_info fau; + const void *bin_ptr; uint32_t bin_size; diff --git a/src/panfrost/vulkan/panvk_vX_blend.c b/src/panfrost/vulkan/panvk_vX_blend.c index 8db95f9bc0c..0b3730156d2 100644 --- a/src/panfrost/vulkan/panvk_vX_blend.c +++ b/src/panfrost/vulkan/panvk_vX_blend.c @@ -37,11 +37,9 @@ lower_load_blend_const(nir_builder *b, nir_instr *instr, UNUSED void *data) b->cursor = nir_before_instr(instr); - unsigned offset = offsetof(struct panvk_graphics_sysvals, blend.constants); + /* Blend constants are always passed through FAU words 0:3. */ nir_def *blend_consts = nir_load_push_constant( - b, intr->def.num_components, intr->def.bit_size, - /* Push constants are placed first, and then come the sysvals. 
*/ - nir_imm_int(b, SYSVALS_PUSH_CONST_BASE + offset)); + b, intr->def.num_components, intr->def.bit_size, nir_imm_int(b, 0)); nir_def_rewrite_uses(&intr->def, blend_consts); return true; @@ -412,7 +410,7 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf, } if (blend_info->shader_loads_blend_const) - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); return VK_SUCCESS; } diff --git a/src/panfrost/vulkan/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/panvk_vX_cmd_dispatch.c index 6d0ed14e4a0..2ecbb874e4c 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_dispatch.c @@ -12,24 +12,34 @@ void panvk_per_arch(cmd_prepare_dispatch_sysvals)( struct panvk_cmd_buffer *cmdbuf, const struct panvk_dispatch_info *info) { - struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals; const struct panvk_shader *shader = cmdbuf->state.compute.shader; + BITSET_DECLARE(dirty_sysvals, MAX_SYSVAL_FAUS) = {0}; /* In indirect case, some sysvals are read from the indirect dispatch * buffer. 
*/ if (info->indirect.buffer_dev_addr == 0) { - sysvals->base.x = info->direct.wg_base.x; - sysvals->base.y = info->direct.wg_base.y; - sysvals->base.z = info->direct.wg_base.z; - sysvals->num_work_groups.x = info->direct.wg_count.x; - sysvals->num_work_groups.y = info->direct.wg_count.y; - sysvals->num_work_groups.z = info->direct.wg_count.z; + set_compute_sysval(cmdbuf, dirty_sysvals, num_work_groups.x, + info->direct.wg_count.x); + set_compute_sysval(cmdbuf, dirty_sysvals, num_work_groups.y, + info->direct.wg_count.y); + set_compute_sysval(cmdbuf, dirty_sysvals, num_work_groups.z, + info->direct.wg_count.z); + } else { + BITSET_SET_RANGE(dirty_sysvals, + sysval_fau_start(compute, num_work_groups), + sysval_fau_end(compute, num_work_groups)); } - sysvals->local_group_size.x = shader->local_size.x; - sysvals->local_group_size.y = shader->local_size.y; - sysvals->local_group_size.z = shader->local_size.z; + set_compute_sysval(cmdbuf, dirty_sysvals, base.x, info->wg_base.x); + set_compute_sysval(cmdbuf, dirty_sysvals, base.y, info->wg_base.y); + set_compute_sysval(cmdbuf, dirty_sysvals, base.z, info->wg_base.z); + set_compute_sysval(cmdbuf, dirty_sysvals, local_group_size.x, + shader->local_size.x); + set_compute_sysval(cmdbuf, dirty_sysvals, local_group_size.y, + shader->local_size.y); + set_compute_sysval(cmdbuf, dirty_sysvals, local_group_size.z, + shader->local_size.z); #if PAN_ARCH <= 7 struct panvk_descriptor_state *desc_state = @@ -39,16 +49,21 @@ panvk_per_arch(cmd_prepare_dispatch_sysvals)( if (compute_state_dirty(cmdbuf, CS) || compute_state_dirty(cmdbuf, DESC_STATE)) { - sysvals->desc.sets[PANVK_DESC_TABLE_CS_DYN_SSBOS] = - cs_desc_state->dyn_ssbos; + set_compute_sysval(cmdbuf, dirty_sysvals, + desc.sets[PANVK_DESC_TABLE_CS_DYN_SSBOS], + cs_desc_state->dyn_ssbos); } for (uint32_t i = 0; i < MAX_SETS; i++) { - if (shader->desc_info.used_set_mask & BITFIELD_BIT(i)) - sysvals->desc.sets[i] = desc_state->sets[i]->descs.dev; + if 
(shader->desc_info.used_set_mask & BITFIELD_BIT(i)) { + set_compute_sysval(cmdbuf, dirty_sysvals, desc.sets[i], + desc_state->sets[i]->descs.dev); + } } #endif - /* We unconditionally update the sysvals, so push_uniforms is always dirty. */ - compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + /* Dirty push_uniforms if the used_sysvals/dirty_sysvals overlap. */ + BITSET_AND(dirty_sysvals, dirty_sysvals, shader->fau.used_sysvals); + if (!BITSET_IS_EMPTY(dirty_sysvals)) + compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); } diff --git a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c index 885cf3b83c3..7d3cf3eb8c4 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c @@ -548,43 +548,27 @@ void panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, const struct panvk_draw_info *info) { - struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals; struct vk_color_blend_state *cb = &cmdbuf->vk.dynamic_graphics_state.cb; - const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + const struct panvk_shader *fs = get_fs(cmdbuf); uint32_t noperspective_varyings = fs ? 
fs->info.varyings.noperspective : 0; + BITSET_DECLARE(dirty_sysvals, MAX_SYSVAL_FAUS) = {0}; - if (sysvals->vs.noperspective_varyings != noperspective_varyings) { - sysvals->vs.noperspective_varyings = noperspective_varyings; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } - - if (sysvals->vs.first_vertex != info->vertex.base) { - sysvals->vs.first_vertex = info->vertex.base; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } - - if (sysvals->vs.base_instance != info->instance.base) { - sysvals->vs.base_instance = info->instance.base; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.noperspective_varyings, + noperspective_varyings); + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.first_vertex, info->vertex.base); + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.base_instance, info->instance.base); #if PAN_ARCH <= 7 - if (sysvals->vs.raw_vertex_offset != info->vertex.raw_offset) { - sysvals->vs.raw_vertex_offset = info->vertex.raw_offset; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } - - if (sysvals->layer_id != info->layer_id) { - sysvals->layer_id = info->layer_id; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset, + info->vertex.raw_offset); + set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id); #endif if (dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS)) { - for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++) - sysvals->blend.constants[i] = - CLAMP(cb->blend_constants[i], 0.0f, 1.0f); - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++) { + set_gfx_sysval(cmdbuf, dirty_sysvals, blend.constants[i], + CLAMP(cb->blend_constants[i], 0.0f, 1.0f)); + } } if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) || @@ -600,9 +584,12 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, * py = height * pz = maxDepth - minDepth */ - sysvals->viewport.scale.x = 0.5f * viewport->width; 
- sysvals->viewport.scale.y = 0.5f * viewport->height; - sysvals->viewport.scale.z = (viewport->maxDepth - viewport->minDepth); + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.x, + 0.5f * viewport->width); + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.y, + 0.5f * viewport->height); + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.z, + (viewport->maxDepth - viewport->minDepth)); /* Upload the viewport offset. Defined as (ox, oy, oz) at the start of * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the @@ -612,9 +599,12 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, * oy = y + height/2 * oz = minDepth */ - sysvals->viewport.offset.x = (0.5f * viewport->width) + viewport->x; - sysvals->viewport.offset.y = (0.5f * viewport->height) + viewport->y; - sysvals->viewport.offset.z = viewport->minDepth; + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.x, + (0.5f * viewport->width) + viewport->x); + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.y, + (0.5f * viewport->height) + viewport->y); + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.z, + viewport->minDepth); #if PAN_ARCH >= 9 /* Doing the viewport transform in the vertex shader and then depth @@ -628,6 +618,7 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, * doesn't help with the precision loss, but at least clipping isn't * completely broken. */ + const struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals; const struct vk_rasterization_state *rs = &cmdbuf->vk.dynamic_graphics_state.rs; @@ -637,7 +628,8 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, float z_max = viewport->maxDepth; float z_sign = z_min <= z_max ? 
1.0f : -1.0f; - sysvals->viewport.scale.z = z_sign * MIN_DEPTH_CLIP_RANGE; + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.z, + z_sign * MIN_DEPTH_CLIP_RANGE); /* Middle of the user range is * z_range_center = z_min + (z_max - z_min) * 0.5f, @@ -648,40 +640,61 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, */ float z_offset = (z_max + z_min - sysvals->viewport.scale.z) * 0.5f; /* Bump offset off-center if necessary, to not go out of range */ - sysvals->viewport.offset.z = CLAMP(z_offset, 0.0f, 1.0f); + set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.z, + CLAMP(z_offset, 0.0f, 1.0f)); } #endif - - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); } -#if PAN_ARCH <= 7 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + +#if PAN_ARCH <= 7 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state; struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) { - sysvals->desc.sets[PANVK_DESC_TABLE_VS_DYN_SSBOS] = - vs_desc_state->dyn_ssbos; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + set_gfx_sysval(cmdbuf, dirty_sysvals, + desc.sets[PANVK_DESC_TABLE_VS_DYN_SSBOS], + vs_desc_state->dyn_ssbos); } if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, FS)) { - sysvals->desc.sets[PANVK_DESC_TABLE_FS_DYN_SSBOS] = - fs_desc_state->dyn_ssbos; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + set_gfx_sysval(cmdbuf, dirty_sysvals, + desc.sets[PANVK_DESC_TABLE_FS_DYN_SSBOS], + fs_desc_state->dyn_ssbos); } for (uint32_t i = 0; i < MAX_SETS; i++) { uint32_t used_set_mask = vs->desc_info.used_set_mask | (fs ? 
fs->desc_info.used_set_mask : 0); - if (used_set_mask & BITFIELD_BIT(i)) - sysvals->desc.sets[i] = desc_state->sets[i]->descs.dev; - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + if (used_set_mask & BITFIELD_BIT(i)) { + set_gfx_sysval(cmdbuf, dirty_sysvals, desc.sets[i], + desc_state->sets[i]->descs.dev); + } } #endif + + /* We mask the dirty sysvals by the shader usage, and only flag + * the push uniforms dirty if those intersect. */ + BITSET_DECLARE(dirty_shader_sysvals, MAX_SYSVAL_FAUS); + BITSET_AND(dirty_shader_sysvals, dirty_sysvals, vs->fau.used_sysvals); + if (!BITSET_IS_EMPTY(dirty_shader_sysvals)) + gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS); + + if (fs) { + BITSET_AND(dirty_shader_sysvals, dirty_sysvals, fs->fau.used_sysvals); + + /* If blend constants are not read by the blend shader, we can consider + * they are not read at all, so clear the dirty bits to avoid re-emitting + * FAUs when we can. */ + if (!cmdbuf->state.gfx.cb.info.shader_loads_blend_const) + BITSET_CLEAR_RANGE(dirty_shader_sysvals, 0, 3); + + if (!BITSET_IS_EMPTY(dirty_shader_sysvals)) + gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); + } } VKAPI_ATTR void VKAPI_CALL diff --git a/src/panfrost/vulkan/panvk_vX_cmd_meta.c b/src/panfrost/vulkan/panvk_vX_cmd_meta.c index ee31336118a..337de1743f9 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_meta.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_meta.c @@ -62,12 +62,8 @@ panvk_per_arch(cmd_meta_compute_end)( push_set0->desc_count = save_ctx->push_set0.desc_count; } - if (memcmp(cmdbuf->state.push_constants.data, save_ctx->push_constants.data, - sizeof(cmdbuf->state.push_constants.data))) { - cmdbuf->state.push_constants = save_ctx->push_constants; - compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } + cmdbuf->state.push_constants = save_ctx->push_constants; + compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); cmdbuf->state.compute.shader = save_ctx->cs.shader; cmdbuf->state.compute.cs.desc = 
save_ctx->cs.desc; @@ -127,12 +123,9 @@ panvk_per_arch(cmd_meta_gfx_end)( push_set0->desc_count = save_ctx->push_set0.desc_count; } - if (memcmp(cmdbuf->state.push_constants.data, save_ctx->push_constants.data, - sizeof(cmdbuf->state.push_constants.data))) { - cmdbuf->state.push_constants = save_ctx->push_constants; - compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - } + cmdbuf->state.push_constants = save_ctx->push_constants; + gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS); + gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); cmdbuf->state.gfx.fs.shader = save_ctx->fs.shader; cmdbuf->state.gfx.fs.desc = save_ctx->fs.desc; diff --git a/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c b/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c index a0e7e642117..7cecf52bee1 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c @@ -7,35 +7,70 @@ #include "panvk_cmd_buffer.h" #include "panvk_entrypoints.h" -mali_ptr +VkResult panvk_per_arch(cmd_prepare_push_uniforms)(struct panvk_cmd_buffer *cmdbuf, - VkPipelineBindPoint ptype) + const struct panvk_shader *shader) { - uint32_t sysvals_sz = ptype == VK_PIPELINE_BIND_POINT_GRAPHICS - ? sizeof(struct panvk_graphics_sysvals) - : sizeof(struct panvk_compute_sysvals); - const void *sysvals = ptype == VK_PIPELINE_BIND_POINT_GRAPHICS - ? (void *)&cmdbuf->state.gfx.sysvals - : (void *)&cmdbuf->state.compute.sysvals; - struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem( - cmdbuf, desc, SYSVALS_PUSH_CONST_BASE + sysvals_sz, 16); + mali_ptr *push_ptr; - if (push_uniforms.gpu) { - if (ptype == VK_PIPELINE_BIND_POINT_GRAPHICS) - cmdbuf->state.gfx.sysvals.push_consts = push_uniforms.gpu; - else - cmdbuf->state.compute.sysvals.push_consts = push_uniforms.gpu; - - /* The first half is used for push constants. 
*/ - memcpy(push_uniforms.cpu, cmdbuf->state.push_constants.data, - sizeof(cmdbuf->state.push_constants.data)); - - /* The second half is used for sysvals. */ - memcpy((uint8_t *)push_uniforms.cpu + SYSVALS_PUSH_CONST_BASE, sysvals, - sysvals_sz); + switch (shader->vk.stage) { + case MESA_SHADER_COMPUTE: + if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) + return VK_SUCCESS; + push_ptr = &cmdbuf->state.compute.push_uniforms; + break; + case MESA_SHADER_VERTEX: + if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) + return VK_SUCCESS; + push_ptr = &cmdbuf->state.gfx.vs.push_uniforms; + break; + case MESA_SHADER_FRAGMENT: + if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) + return VK_SUCCESS; + push_ptr = &cmdbuf->state.gfx.fs.push_uniforms; + break; + default: + assert(!"Invalid stage"); + return VK_SUCCESS; } - return push_uniforms.gpu; + if (!shader->fau.total_count) { + *push_ptr = 0; + return VK_SUCCESS; + } + + struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, shader->fau.total_count * sizeof(uint64_t), + sizeof(uint64_t)); + + if (!push_uniforms.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + if (shader->vk.stage == MESA_SHADER_COMPUTE) { + cmdbuf->state.compute.sysvals.push_consts = + push_uniforms.gpu + (shader->fau.sysval_count * FAU_WORD_SIZE); + } else { + cmdbuf->state.gfx.sysvals.push_consts = + push_uniforms.gpu + (shader->fau.sysval_count * FAU_WORD_SIZE); + } + + uint64_t *sysvals = shader->vk.stage == MESA_SHADER_COMPUTE + ? (uint64_t *)&cmdbuf->state.compute.sysvals + : (uint64_t *)&cmdbuf->state.gfx.sysvals; + uint64_t *push_consts = cmdbuf->state.push_constants.data; + uint64_t *faus = push_uniforms.cpu; + uint32_t w, fau = 0; + + /* After packing, the sysvals come first, followed by the user push constants. + * The ordering is encoded shader side, so don't re-order these loops. 
*/ + BITSET_FOREACH_SET(w, shader->fau.used_sysvals, MAX_SYSVAL_FAUS) + faus[fau++] = sysvals[w]; + + BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS) + faus[fau++] = push_consts[w]; + + *push_ptr = push_uniforms.gpu; + return VK_SUCCESS; } VKAPI_ATTR void VKAPI_CALL @@ -45,12 +80,17 @@ panvk_per_arch(CmdPushConstants2KHR)( { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); - if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) - gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS); + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_VERTEX_BIT) + gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_FRAGMENT_BIT) + gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); - memcpy(cmdbuf->state.push_constants.data + pPushConstantsInfo->offset, - pPushConstantsInfo->pValues, pPushConstantsInfo->size); + uint8_t *data = + (uint8_t *)cmdbuf->state.push_constants.data + pPushConstantsInfo->offset; + + memcpy(data, pPushConstantsInfo->pValues, pPushConstantsInfo->size); } diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 6496c375b26..d746b39eb9b 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -460,54 +460,166 @@ valhall_lower_get_ssbo_size(struct nir_builder *b, } static bool -lower_load_push_consts(nir_builder *b, nir_intrinsic_instr *intr, - UNUSED void *data) +collect_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr, + void *data) { if (intr->intrinsic != nir_intrinsic_load_push_constant) return false; - unsigned base = nir_intrinsic_base(intr); + struct panvk_shader *shader = data; + uint32_t base = nir_intrinsic_base(intr); + bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE; + uint32_t offset, size; - /* We always set the range to zero, to make sure no pass is using 
it after - * that point. */ - nir_intrinsic_set_range(intr, 0); + /* Sysvals should have a constant offset. */ + assert(!is_sysval || nir_src_is_const(intr->src[0])); + + if (is_sysval) + base -= SYSVALS_PUSH_CONST_BASE; + + /* If the offset is dynamic, we need to flag [base:base+range] as used, to + * allow global mem access. */ + if (!nir_src_is_const(intr->src[0])) { + offset = base; + size = nir_intrinsic_range(intr); + + /* Flag the push_consts sysval as needed if we have an indirect offset. */ + if (b->shader->info.stage == MESA_SHADER_COMPUTE) + shader_use_sysval(shader, compute, push_consts); + else + shader_use_sysval(shader, graphics, push_consts); + } else { + offset = base + nir_src_as_uint(intr->src[0]); + size = (intr->def.bit_size / 8) * intr->def.num_components; + } + + if (is_sysval) + shader_use_sysval_range(shader, offset, size); + else + shader_use_push_const_range(shader, offset, size); + + return true; +} + +static bool +move_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_push_constant) + return false; + + struct panvk_shader *shader = data; + unsigned base = nir_intrinsic_base(intr); + bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE; + + if (is_sysval) + base -= SYSVALS_PUSH_CONST_BASE; + + /* Sysvals should have a constant offset. */ + assert(!is_sysval || nir_src_is_const(intr->src[0])); b->cursor = nir_before_instr(&intr->instr); - /* Offset is constant, we just propagate base to the offset if it's not - * already zero. */ if (nir_src_is_const(intr->src[0])) { - if (base == 0) - return true; + unsigned offset = base + nir_src_as_uint(intr->src[0]); - nir_src_rewrite(&intr->src[0], - nir_imm_int(b, nir_src_as_uint(intr->src[0]) + base)); + /* We place the sysvals first, and then comes the user push constants. + * We do that so we always have the blend constants at offset 0 for + * blend shaders. 
*/ + if (is_sysval) + offset = shader_remapped_sysval_offset(shader, offset); + else + offset = shader_remapped_push_const_offset(shader, offset); + + nir_src_rewrite(&intr->src[0], nir_imm_int(b, offset)); + + /* We always set the range/base to zero, to make sure no pass is using it + * after that point. */ nir_intrinsic_set_base(intr, 0); - return true; + nir_intrinsic_set_range(intr, 0); + } else { + /* We don't use load_sysval() on purpose, because it would set + * .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of + * zero in this pass. */ + unsigned push_const_buf_offset = shader_remapped_sysval_offset( + shader, b->shader->info.stage == MESA_SHADER_COMPUTE + ? sysval_offset(compute, push_consts) + : sysval_offset(graphics, push_consts)); + nir_def *push_const_buf = nir_load_push_constant( + b, 1, 64, nir_imm_int(b, push_const_buf_offset)); + unsigned push_const_offset = + shader_remapped_fau_offset(shader, push_consts, base); + nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, push_const_offset); + unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr), + nir_intrinsic_align_offset(intr)); + + /* We assume an alignment of 64-bit max for packed push-constants. */ + align = MIN2(align, FAU_WORD_SIZE); + nir_def *value = + nir_load_global(b, nir_iadd(b, push_const_buf, nir_u2u64(b, offset)), + align, intr->def.num_components, intr->def.bit_size); + + nir_def_replace(&intr->def, value); } - /* We don't use load_sysval() on purpose, because it would set - * .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of - * zero in this pass. */ - unsigned push_const_addr_offset = - SYSVALS_PUSH_CONST_BASE + - (b->shader->info.stage == MESA_SHADER_COMPUTE - ? 
offsetof(struct panvk_compute_sysvals, push_consts) - : offsetof(struct panvk_graphics_sysvals, push_consts)); - nir_def *push_const_buf = - nir_load_push_constant(b, 1, 64, nir_imm_int(b, push_const_addr_offset)); - - nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, base); - unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr), - nir_intrinsic_align_offset(intr)); - nir_def *value = - nir_load_global(b, nir_iadd(b, push_const_buf, nir_u2u64(b, offset)), - align, intr->def.num_components, intr->def.bit_size); - - nir_def_replace(&intr->def, value); return true; } +static void +lower_load_push_consts(nir_shader *nir, struct panvk_shader *shader) +{ + /* Before we lower load_push_constant()s with a dynamic offset to global + * loads, we want to run a few optimization passes to get rid of offset + * calculation involving only constant values. */ + bool progress = false; + do { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + } while (progress); + + /* We always reserve the 4 blend constant words for fragment shaders, + * because we don't know the blend configuration at this point, and + * we might end up with a blend shader reading those blend constants. */ + if (shader->vk.stage == MESA_SHADER_FRAGMENT) { + /* We rely on blend constants being placed first and covering 4 words. 
*/ + STATIC_ASSERT( + offsetof(struct panvk_graphics_sysvals, blend.constants) == 0 && + sizeof(((struct panvk_graphics_sysvals *)NULL)->blend.constants) == + 16); + + shader_use_sysval(shader, graphics, blend.constants); + } + + progress = false; + NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant, + nir_metadata_all, shader); + + /* Some load_push_constant instructions might be eliminated after + * scalarization+dead-code-elimination. Since these passes happen in + * bifrost_compile(), we can't run the push_constant packing after the + * optimization took place, so let's just have our own FAU count instead + * of using info.push.count to make it consistent with the + * used_{sysvals,push_consts} bitmaps, even if it sometimes implies loading + * more than we really need. Doing that also takes into account the fact that + * blend constants are never loaded from the fragment shader, but might be + * needed in the blend shader. */ + shader->fau.sysval_count = BITSET_COUNT(shader->fau.used_sysvals); + shader->fau.total_count = + shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts); + + if (!progress) + return; + + NIR_PASS(_, nir, nir_shader_intrinsics_pass, move_push_constant, + nir_metadata_control_flow, shader); +} + static void panvk_lower_nir(struct panvk_device *dev, nir_shader *nir, uint32_t set_layout_count, @@ -632,24 +744,7 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir, NIR_PASS(_, nir, nir_shader_instructions_pass, panvk_lower_sysvals, nir_metadata_control_flow, NULL); - /* Before we lower load_push_constant()s with a dynamic offset to global - * loads, we want to run a few optimization passes to get rid of offset - * calculation involving only constant values.
*/ - bool progress = false; - do { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - } while (progress); - - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_push_consts, - nir_metadata_control_flow, NULL); + lower_load_push_consts(nir, shader); } static VkResult @@ -955,6 +1050,10 @@ panvk_compile_shader(struct panvk_device *dev, result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader); + /* We need to update info.push.count because it's used to initialize the + * RSD in pan_shader_prepare_rsd(). */ + shader->info.push.count = shader->fau.total_count * 2; + if (result != VK_SUCCESS) { panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator); return result; @@ -1091,6 +1190,9 @@ panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob, struct pan_shader_info info; blob_copy_bytes(blob, &info, sizeof(info)); + struct panvk_shader_fau_info fau; + blob_copy_bytes(blob, &fau, sizeof(fau)); + struct pan_compute_dim local_size; blob_copy_bytes(blob, &local_size, sizeof(local_size)); @@ -1105,6 +1207,7 @@ panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob, return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); shader->info = info; + shader->fau = fau; shader->local_size = local_size; shader->bin_size = bin_size; @@ -1187,6 +1290,7 @@ panvk_shader_serialize(struct vk_device *vk_dev, return false; blob_write_bytes(blob, &shader->info, sizeof(shader->info)); + blob_write_bytes(blob, &shader->fau, sizeof(shader->fau)); blob_write_bytes(blob, &shader->local_size, sizeof(shader->local_size)); blob_write_uint32(blob, shader->bin_size); blob_write_bytes(blob, 
shader->bin_ptr, shader->bin_size); @@ -1546,18 +1650,21 @@ panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage, if (cmd->state.compute.shader != shader) { cmd->state.compute.shader = shader; compute_state_set_dirty(cmd, CS); + compute_state_set_dirty(cmd, PUSH_UNIFORMS); } break; case MESA_SHADER_VERTEX: if (cmd->state.gfx.vs.shader != shader) { cmd->state.gfx.vs.shader = shader; gfx_state_set_dirty(cmd, VS); + gfx_state_set_dirty(cmd, VS_PUSH_UNIFORMS); } break; case MESA_SHADER_FRAGMENT: if (cmd->state.gfx.fs.shader != shader) { cmd->state.gfx.fs.shader = shader; gfx_state_set_dirty(cmd, FS); + gfx_state_set_dirty(cmd, FS_PUSH_UNIFORMS); } break; default: