panvk: Pack push constants

We're about to add more sysvals, and the more we add, the bigger the
sysvals region gets, which increases the amount of memory we have to
allocate when push_uniforms are dirty.

Instead of allocating FAUs for all sysvals/push_constants, track FAU
usage per-shader, and pack those. This implies emitting an FAU buffer
per stage instead of trying to share it, but that's an acceptable
trade-off.

While at it, automate the sysval dirty tracking a bit.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Chia-I Wu <olvaffe@gmail.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32415>
This commit is contained in:
Boris Brezillon
2024-12-06 19:00:18 +01:00
committed by Marge Bot
parent aa78fe7683
commit ae76a6a045
14 changed files with 566 additions and 264 deletions

View File

@@ -64,15 +64,6 @@ prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
return VK_SUCCESS;
}
static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
cmdbuf->state.compute.push_uniforms = panvk_per_arch(
cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE);
return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS
: VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
static void
calculate_task_axis_and_increment(const struct panvk_shader *shader,
struct panvk_physical_device *phys_dev,
@@ -238,7 +229,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
if (result != VK_SUCCESS)
return;
result = prepare_push_uniforms(cmdbuf);
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, cmdbuf->state.compute.shader);
if (result != VK_SUCCESS)
return;
@@ -268,11 +260,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
uint32_t push_size =
SYSVALS_PUSH_CONST_BASE + sizeof(struct panvk_compute_sysvals);
uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
mali_ptr fau_ptr =
cmdbuf->state.compute.push_uniforms | (fau_count << 56);
mali_ptr fau_ptr = cmdbuf->state.compute.push_uniforms |
((uint64_t)shader->fau.total_count << 56);
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
}
@@ -294,11 +283,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
}
cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, 34),
info->direct.wg_base.x * shader->local_size.x);
info->wg_base.x * shader->local_size.x);
cs_move32_to(b, cs_sr_reg32(b, 35),
info->direct.wg_base.y * shader->local_size.y);
info->wg_base.y * shader->local_size.y);
cs_move32_to(b, cs_sr_reg32(b, 36),
info->direct.wg_base.z * shader->local_size.z);
info->wg_base.z * shader->local_size.z);
if (indirect) {
/* Load parameters from indirect buffer and update workgroup count
* registers and sysvals */
@@ -309,10 +298,25 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_move64_to(b, cs_scratch_reg64(b, 0),
cmdbuf->state.compute.push_uniforms);
cs_wait_slot(b, SB_ID(LS), false);
cs_store(b, cs_sr_reg_tuple(b, 37, 3), cs_scratch_reg64(b, 0),
BITFIELD_MASK(3),
SYSVALS_PUSH_CONST_BASE +
offsetof(struct panvk_compute_sysvals, num_work_groups));
if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
cs_store32(b, cs_sr_reg32(b, 37), cs_scratch_reg64(b, 0),
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.x)));
}
if (shader_uses_sysval(shader, compute, num_work_groups.y)) {
cs_store32(b, cs_sr_reg32(b, 38), cs_scratch_reg64(b, 0),
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.y)));
}
if (shader_uses_sysval(shader, compute, num_work_groups.z)) {
cs_store32(b, cs_sr_reg32(b, 39), cs_scratch_reg64(b, 0),
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.z)));
}
cs_wait_slot(b, SB_ID(LS), false);
} else {
cs_move32_to(b, cs_sr_reg32(b, 37), info->direct.wg_count.x);
@@ -385,10 +389,8 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_dispatch_info info = {
.direct = {
.wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.wg_count = {groupCountX, groupCountY, groupCountZ},
}
.wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.direct.wg_count = {groupCountX, groupCountY, groupCountZ},
};
cmd_dispatch(cmdbuf, &info);
}

View File

@@ -404,7 +404,9 @@ prepare_blend(struct panvk_cmd_buffer *cmdbuf)
panvk_per_arch(blend_emit_descs)(cmdbuf, bds);
cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
cs_update_vt_ctx(b)
cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
return VK_SUCCESS;
}
@@ -1200,24 +1202,38 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
const struct panvk_shader *fs = get_fs(cmdbuf);
VkResult result;
if (gfx_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
cmdbuf->state.gfx.push_uniforms = panvk_per_arch(
cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS);
if (!cmdbuf->state.gfx.push_uniforms)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
uint32_t push_size =
SYSVALS_PUSH_CONST_BASE + sizeof(struct panvk_graphics_sysvals);
uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
mali_ptr fau_ptr = cmdbuf->state.gfx.push_uniforms | (fau_count << 56);
if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs);
if (result != VK_SUCCESS)
return result;
cs_update_vt_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, 8),
cmdbuf->state.gfx.vs.push_uniforms |
((uint64_t)vs->fau.total_count << 56));
}
}
if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
mali_ptr fau_ptr = 0;
if (fs) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, fs);
if (result != VK_SUCCESS)
return result;
fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
((uint64_t)fs->fau.total_count << 56);
}
cs_update_vt_ctx(b)
cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
}
return VK_SUCCESS;
}
@@ -1535,6 +1551,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
return result;
}
result = prepare_blend(cmdbuf);
if (result != VK_SUCCESS)
return result;
panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
result = prepare_push_uniforms(cmdbuf);
@@ -1570,10 +1590,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
cs_move32_to(b, cs_sr_reg32(b, 48), varying_size);
result = prepare_blend(cmdbuf);
if (result != VK_SUCCESS)
return result;
result = prepare_ds(cmdbuf);
if (result != VK_SUCCESS)
return result;
@@ -1789,7 +1805,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
assert(draw->indirect.draw_count == 1);
/* Force a new push uniform block to be allocated */
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
result = prepare_draw(cmdbuf, draw);
if (result != VK_SUCCESS)
@@ -1808,17 +1824,27 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
/* Wait for the SR33-37 indirect buffer load. */
cs_wait_slot(b, SB_ID(LS), false);
struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.push_uniforms);
cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr,
SYSVALS_PUSH_CONST_BASE +
offsetof(struct panvk_graphics_sysvals, vs.first_vertex));
cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr,
SYSVALS_PUSH_CONST_BASE +
offsetof(struct panvk_graphics_sysvals, vs.base_instance));
if (shader_uses_sysval(vs, graphics, vs.first_vertex) ||
shader_uses_sysval(vs, graphics, vs.base_instance)) {
struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms);
/* Wait for the store using SR-37 as src to finish, so we can overwrite it. */
cs_wait_slot(b, SB_ID(LS), false);
if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr,
shader_remapped_sysval_offset(
vs, sysval_offset(graphics, vs.first_vertex)));
}
if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr,
shader_remapped_sysval_offset(
vs, sysval_offset(graphics, vs.base_instance)));
}
/* Wait for the store using SR-37 as src to finish, so we can overwrite
* it. */
cs_wait_slot(b, SB_ID(LS), false);
}
/* NIR expects zero-based instance ID, but even if it did have an intrinsic to
* load the absolute instance ID, we'd want to keep it zero-based to work around

View File

@@ -44,10 +44,8 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
return;
struct panvk_dispatch_info info = {
.direct = {
.wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.wg_count = {groupCountX, groupCountY, groupCountZ},
},
.wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.direct.wg_count = {groupCountX, groupCountY, groupCountZ},
};
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_physical_device *phys_dev =
@@ -80,14 +78,10 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, &info);
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
cmdbuf->state.compute.push_uniforms = panvk_per_arch(
cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE);
if (!cmdbuf->state.compute.push_uniforms)
return;
}
mali_ptr push_uniforms = cmdbuf->state.compute.push_uniforms;
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, cmdbuf->state.compute.shader);
if (result != VK_SUCCESS)
return;
struct panfrost_ptr copy_desc_job = {0};
@@ -130,7 +124,7 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_IMG];
cfg.thread_storage = tsd;
cfg.uniform_buffers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO];
cfg.push_uniforms = push_uniforms;
cfg.push_uniforms = cmdbuf->state.compute.push_uniforms;
cfg.textures = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE];
cfg.samplers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER];
}

View File

@@ -50,7 +50,6 @@ struct panvk_draw_data {
mali_ptr rsd;
mali_ptr varyings;
} fs;
mali_ptr push_uniforms;
mali_ptr varying_bufs;
mali_ptr position;
mali_ptr indices;
@@ -722,7 +721,7 @@ panvk_emit_vertex_dcd(struct panvk_cmd_buffer *cmdbuf,
cfg.instance_size =
draw->info.instance.count > 1 ? draw->padded_vertex_count : 1;
cfg.uniform_buffers = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO];
cfg.push_uniforms = draw->push_uniforms;
cfg.push_uniforms = cmdbuf->state.gfx.vs.push_uniforms;
cfg.textures = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE];
cfg.samplers = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER];
}
@@ -900,7 +899,7 @@ panvk_emit_tiler_dcd(struct panvk_cmd_buffer *cmdbuf,
cfg.instance_size =
draw->info.instance.count > 1 ? draw->padded_vertex_count : 1;
cfg.uniform_buffers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO];
cfg.push_uniforms = draw->push_uniforms;
cfg.push_uniforms = cmdbuf->state.gfx.fs.push_uniforms;
cfg.textures = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE];
cfg.samplers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER];
@@ -1261,12 +1260,18 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw)
panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, &draw->info);
cmdbuf->state.gfx.push_uniforms = panvk_per_arch(
cmd_prepare_push_uniforms)(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS);
if (!cmdbuf->state.gfx.push_uniforms)
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, cmdbuf->state.gfx.vs.shader);
if (result != VK_SUCCESS)
return;
draw->push_uniforms = cmdbuf->state.gfx.push_uniforms;
if (fs) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, cmdbuf->state.gfx.fs.shader);
if (result != VK_SUCCESS)
return;
}
result = panvk_draw_prepare_tiler_context(cmdbuf, draw);
if (result != VK_SUCCESS)
return;

View File

@@ -43,11 +43,24 @@ struct panvk_cmd_compute_state {
compute_state_clear_all_dirty(__cmdbuf); \
} while (0)
#define set_compute_sysval(__cmdbuf, __dirty, __name, __val) \
do { \
struct panvk_compute_sysvals __new_sysval; \
__new_sysval.__name = (__val); \
if (memcmp(&(__cmdbuf)->state.compute.sysvals.__name, \
&__new_sysval.__name, sizeof(__new_sysval.__name))) { \
(__cmdbuf)->state.compute.sysvals.__name = __new_sysval.__name; \
BITSET_SET_RANGE(__dirty, sysval_fau_start(compute, __name), \
sysval_fau_start(compute, __name)); \
} \
} while (0)
struct panvk_dispatch_info {
struct {
struct {
uint32_t x, y, z;
} wg_base;
uint32_t x, y, z;
} wg_base;
struct {
struct {
uint32_t x, y, z;
} wg_count;

View File

@@ -87,7 +87,8 @@ enum panvk_cmd_graphics_dirty_state {
PANVK_CMD_GRAPHICS_DIRTY_OQ,
PANVK_CMD_GRAPHICS_DIRTY_DESC_STATE,
PANVK_CMD_GRAPHICS_DIRTY_RENDER_STATE,
PANVK_CMD_GRAPHICS_DIRTY_PUSH_UNIFORMS,
PANVK_CMD_GRAPHICS_DIRTY_VS_PUSH_UNIFORMS,
PANVK_CMD_GRAPHICS_DIRTY_FS_PUSH_UNIFORMS,
PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT,
};
@@ -109,6 +110,7 @@ struct panvk_cmd_graphics_state {
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
mali_ptr push_uniforms;
bool required;
#if PAN_ARCH <= 7
mali_ptr rsd;
@@ -118,6 +120,7 @@ struct panvk_cmd_graphics_state {
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
mali_ptr push_uniforms;
#if PAN_ARCH <= 7
mali_ptr attribs;
mali_ptr attrib_bufs;
@@ -142,8 +145,6 @@ struct panvk_cmd_graphics_state {
struct panvk_rendering_state render;
mali_ptr push_uniforms;
#if PAN_ARCH <= 7
mali_ptr vpd;
#endif
@@ -171,6 +172,18 @@ struct panvk_cmd_graphics_state {
#define gfx_state_set_all_dirty(__cmdbuf) \
BITSET_ONES((__cmdbuf)->state.gfx.dirty)
#define set_gfx_sysval(__cmdbuf, __dirty, __name, __val) \
do { \
struct panvk_graphics_sysvals __new_sysval; \
__new_sysval.__name = __val; \
if (memcmp(&(__cmdbuf)->state.gfx.sysvals.__name, &__new_sysval.__name, \
sizeof(__new_sysval.__name))) { \
(__cmdbuf)->state.gfx.sysvals.__name = __new_sysval.__name; \
BITSET_SET_RANGE(__dirty, sysval_fau_start(graphics, __name), \
sysval_fau_end(graphics, __name)); \
} \
} while (0)
static inline uint32_t
panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev,
const struct panvk_cmd_graphics_state *state)
@@ -278,11 +291,15 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
do { \
bool __set_fs_dirty = \
(__cmdbuf)->state.gfx.fs.shader != get_fs(__cmdbuf); \
bool __set_fs_push_dirty = \
__set_fs_dirty && gfx_state_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
vk_dynamic_graphics_state_clear_dirty( \
&(__cmdbuf)->vk.dynamic_graphics_state); \
gfx_state_clear_all_dirty(__cmdbuf); \
if (__set_fs_dirty) \
gfx_state_set_dirty(__cmdbuf, FS); \
if (__set_fs_push_dirty) \
gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
} while (0)
void

View File

@@ -11,15 +11,16 @@
#include "genxml/gen_macros.h"
struct panvk_cmd_buffer;
struct panvk_shader;
#define MAX_PUSH_CONSTANTS_SIZE 128
struct panvk_push_constant_state {
uint8_t data[MAX_PUSH_CONSTANTS_SIZE];
uint64_t data[MAX_PUSH_CONSTANTS_SIZE / sizeof(uint64_t)];
};
mali_ptr
VkResult
panvk_per_arch(cmd_prepare_push_uniforms)(struct panvk_cmd_buffer *cmdbuf,
VkPipelineBindPoint ptype);
const struct panvk_shader *shader);
#endif

View File

@@ -52,16 +52,16 @@ enum panvk_desc_table_id {
#endif
struct panvk_graphics_sysvals {
struct {
float constants[4];
} blend;
struct {
struct {
float x, y, z;
} scale, offset;
} viewport;
struct {
float constants[4];
} blend;
struct {
#if PAN_ARCH <= 7
int32_t raw_vertex_offset;
@@ -105,32 +105,94 @@ struct panvk_compute_sysvals {
#endif
};
/* This is not the final offset in the push constant buffer (AKA FAU), but
* just a magic offset we use before packing push constants so we can easily
* identify the type of push constant (driver sysvals vs user push constants).
*/
#define SYSVALS_PUSH_CONST_BASE MAX_PUSH_CONSTANTS_SIZE
#define FAU_WORD_SIZE sizeof(uint64_t)
static_assert((sizeof(struct panvk_compute_sysvals) % FAU_WORD_SIZE) == 0,
"struct panvk_compute_sysvals must be 8-byte aligned");
static_assert((sizeof(struct panvk_graphics_sysvals) % FAU_WORD_SIZE) == 0,
"struct panvk_graphics_sysvals must be 8-byte aligned");
#define sysval_size(__ptype, __name) \
sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name)
#define sysval_offset(__ptype, __name) \
offsetof(struct panvk_##__ptype##_sysvals, __name)
#define sysval_entry_size(__ptype, __name) \
sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0])
#define sysval_entry_offset(__ptype, __name, __idx) \
(sysval_offset(__ptype, __name) + \
(sysval_entry_size(__ptype, __name) * __idx))
#define sysval_fau_start(__ptype, __name) \
(sysval_offset(__ptype, __name) / FAU_WORD_SIZE)
#define sysval_fau_end(__ptype, __name) \
((sysval_offset(__ptype, __name) + sysval_size(__ptype, __name) - 1) / \
FAU_WORD_SIZE)
#define sysval_fau_entry_start(__ptype, __name, __idx) \
(sysval_entry_offset(__ptype, __name, __idx) / FAU_WORD_SIZE)
#define sysval_fau_entry_end(__ptype, __name, __idx) \
((sysval_entry_offset(__ptype, __name, __idx + 1) - 1) / FAU_WORD_SIZE)
#define shader_remapped_fau_offset(__shader, __kind, __offset) \
((FAU_WORD_SIZE * BITSET_PREFIX_SUM((__shader)->fau.used_##__kind, \
(__offset) / FAU_WORD_SIZE)) + \
((__offset) % FAU_WORD_SIZE))
#define shader_remapped_sysval_offset(__shader, __offset) \
shader_remapped_fau_offset(__shader, sysvals, __offset)
#define shader_remapped_push_const_offset(__shader, __offset) \
(((__shader)->fau.sysval_count * FAU_WORD_SIZE) + \
shader_remapped_fau_offset(__shader, push_consts, __offset))
#define shader_use_sysval(__shader, __ptype, __name) \
BITSET_SET_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_start(__ptype, __name), \
sysval_fau_end(__ptype, __name))
#define shader_uses_sysval(__shader, __ptype, __name) \
BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_start(__ptype, __name), \
sysval_fau_end(__ptype, __name))
#define shader_uses_sysval_entry(__shader, __ptype, __name, __idx) \
BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_entry_start(__ptype, __name, __idx), \
sysval_fau_entry_end(__ptype, __name, __idx))
#define shader_use_sysval_range(__shader, __base, __range) \
BITSET_SET_RANGE((__shader)->fau.used_sysvals, (__base) / FAU_WORD_SIZE, \
((__base) + (__range) - 1) / FAU_WORD_SIZE)
#define shader_use_push_const_range(__shader, __base, __range) \
BITSET_SET_RANGE((__shader)->fau.used_push_consts, \
(__base) / FAU_WORD_SIZE, \
((__base) + (__range) - 1) / FAU_WORD_SIZE)
#define load_sysval(__b, __ptype, __bitsz, __name) \
nir_load_push_constant( \
__b, \
sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name) / \
((__bitsz) / 8), \
__bitsz, \
nir_imm_int(__b, offsetof(struct panvk_##__ptype##_sysvals, __name)), \
.base = SYSVALS_PUSH_CONST_BASE, \
.range = sizeof(struct panvk_##__ptype##_sysvals))
__b, sysval_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \
nir_imm_int(__b, sysval_offset(__ptype, __name)), \
.base = SYSVALS_PUSH_CONST_BASE)
#define load_sysval_entry(__b, __ptype, __bitsz, __name, __dyn_idx) \
nir_load_push_constant( \
__b, \
sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0]) / \
((__bitsz) / 8), \
__bitsz, \
__b, sysval_entry_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \
nir_iadd_imm( \
__b, \
nir_imul_imm( \
__b, __dyn_idx, \
sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0])), \
offsetof(struct panvk_##__ptype##_sysvals, __name)), \
.base = SYSVALS_PUSH_CONST_BASE, \
.range = sizeof(struct panvk_##__ptype##_sysvals))
nir_imul_imm(__b, __dyn_idx, sysval_entry_size(__ptype, __name)), \
sysval_offset(__ptype, __name)), \
.base = SYSVALS_PUSH_CONST_BASE)
#if PAN_ARCH <= 7
enum panvk_bifrost_desc_table_type {
@@ -155,6 +217,20 @@ enum panvk_bifrost_desc_table_type {
#define COPY_DESC_HANDLE_EXTRACT_INDEX(handle) ((handle) & BITFIELD_MASK(28))
#define COPY_DESC_HANDLE_EXTRACT_TABLE(handle) ((handle) >> 28)
#define MAX_COMPUTE_SYSVAL_FAUS \
(sizeof(struct panvk_compute_sysvals) / FAU_WORD_SIZE)
#define MAX_GFX_SYSVAL_FAUS \
(sizeof(struct panvk_graphics_sysvals) / FAU_WORD_SIZE)
#define MAX_SYSVAL_FAUS MAX2(MAX_COMPUTE_SYSVAL_FAUS, MAX_GFX_SYSVAL_FAUS)
#define MAX_PUSH_CONST_FAUS (MAX_PUSH_CONSTANTS_SIZE / FAU_WORD_SIZE)
struct panvk_shader_fau_info {
BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
uint32_t sysval_count;
uint32_t total_count;
};
struct panvk_shader {
struct vk_shader vk;
struct pan_shader_info info;
@@ -184,6 +260,8 @@ struct panvk_shader {
#endif
} desc_info;
struct panvk_shader_fau_info fau;
const void *bin_ptr;
uint32_t bin_size;

View File

@@ -37,11 +37,9 @@ lower_load_blend_const(nir_builder *b, nir_instr *instr, UNUSED void *data)
b->cursor = nir_before_instr(instr);
unsigned offset = offsetof(struct panvk_graphics_sysvals, blend.constants);
/* Blend constants are always passed through FAU words 0:3. */
nir_def *blend_consts = nir_load_push_constant(
b, intr->def.num_components, intr->def.bit_size,
/* Push constants are placed first, and then come the sysvals. */
nir_imm_int(b, SYSVALS_PUSH_CONST_BASE + offset));
b, intr->def.num_components, intr->def.bit_size, nir_imm_int(b, 0));
nir_def_rewrite_uses(&intr->def, blend_consts);
return true;
@@ -412,7 +410,7 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf,
}
if (blend_info->shader_loads_blend_const)
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
return VK_SUCCESS;
}

View File

@@ -12,24 +12,34 @@ void
panvk_per_arch(cmd_prepare_dispatch_sysvals)(
struct panvk_cmd_buffer *cmdbuf, const struct panvk_dispatch_info *info)
{
struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals;
const struct panvk_shader *shader = cmdbuf->state.compute.shader;
BITSET_DECLARE(dirty_sysvals, MAX_SYSVAL_FAUS) = {0};
/* In indirect case, some sysvals are read from the indirect dispatch
* buffer.
*/
if (info->indirect.buffer_dev_addr == 0) {
sysvals->base.x = info->direct.wg_base.x;
sysvals->base.y = info->direct.wg_base.y;
sysvals->base.z = info->direct.wg_base.z;
sysvals->num_work_groups.x = info->direct.wg_count.x;
sysvals->num_work_groups.y = info->direct.wg_count.y;
sysvals->num_work_groups.z = info->direct.wg_count.z;
set_compute_sysval(cmdbuf, dirty_sysvals, num_work_groups.x,
info->direct.wg_count.x);
set_compute_sysval(cmdbuf, dirty_sysvals, num_work_groups.y,
info->direct.wg_count.y);
set_compute_sysval(cmdbuf, dirty_sysvals, num_work_groups.z,
info->direct.wg_count.z);
} else {
BITSET_SET_RANGE(dirty_sysvals,
sysval_fau_start(compute, num_work_groups),
sysval_fau_end(compute, num_work_groups));
}
sysvals->local_group_size.x = shader->local_size.x;
sysvals->local_group_size.y = shader->local_size.y;
sysvals->local_group_size.z = shader->local_size.z;
set_compute_sysval(cmdbuf, dirty_sysvals, base.x, info->wg_base.x);
set_compute_sysval(cmdbuf, dirty_sysvals, base.y, info->wg_base.y);
set_compute_sysval(cmdbuf, dirty_sysvals, base.z, info->wg_base.z);
set_compute_sysval(cmdbuf, dirty_sysvals, local_group_size.x,
shader->local_size.x);
set_compute_sysval(cmdbuf, dirty_sysvals, local_group_size.y,
shader->local_size.y);
set_compute_sysval(cmdbuf, dirty_sysvals, local_group_size.z,
shader->local_size.z);
#if PAN_ARCH <= 7
struct panvk_descriptor_state *desc_state =
@@ -39,16 +49,21 @@ panvk_per_arch(cmd_prepare_dispatch_sysvals)(
if (compute_state_dirty(cmdbuf, CS) ||
compute_state_dirty(cmdbuf, DESC_STATE)) {
sysvals->desc.sets[PANVK_DESC_TABLE_CS_DYN_SSBOS] =
cs_desc_state->dyn_ssbos;
set_compute_sysval(cmdbuf, dirty_sysvals,
desc.sets[PANVK_DESC_TABLE_CS_DYN_SSBOS],
cs_desc_state->dyn_ssbos);
}
for (uint32_t i = 0; i < MAX_SETS; i++) {
if (shader->desc_info.used_set_mask & BITFIELD_BIT(i))
sysvals->desc.sets[i] = desc_state->sets[i]->descs.dev;
if (shader->desc_info.used_set_mask & BITFIELD_BIT(i)) {
set_compute_sysval(cmdbuf, dirty_sysvals, desc.sets[i],
desc_state->sets[i]->descs.dev);
}
}
#endif
/* We unconditionally update the sysvals, so push_uniforms is always dirty. */
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
/* Dirty push_uniforms if the used_sysvals/dirty_sysvals overlap. */
BITSET_AND(dirty_sysvals, dirty_sysvals, shader->fau.used_sysvals);
if (!BITSET_IS_EMPTY(dirty_sysvals))
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}

View File

@@ -548,43 +548,27 @@ void
panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_draw_info *info)
{
struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
struct vk_color_blend_state *cb = &cmdbuf->vk.dynamic_graphics_state.cb;
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
const struct panvk_shader *fs = get_fs(cmdbuf);
uint32_t noperspective_varyings = fs ? fs->info.varyings.noperspective : 0;
BITSET_DECLARE(dirty_sysvals, MAX_SYSVAL_FAUS) = {0};
if (sysvals->vs.noperspective_varyings != noperspective_varyings) {
sysvals->vs.noperspective_varyings = noperspective_varyings;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
if (sysvals->vs.first_vertex != info->vertex.base) {
sysvals->vs.first_vertex = info->vertex.base;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
if (sysvals->vs.base_instance != info->instance.base) {
sysvals->vs.base_instance = info->instance.base;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.noperspective_varyings,
noperspective_varyings);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.first_vertex, info->vertex.base);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.base_instance, info->instance.base);
#if PAN_ARCH <= 7
if (sysvals->vs.raw_vertex_offset != info->vertex.raw_offset) {
sysvals->vs.raw_vertex_offset = info->vertex.raw_offset;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
if (sysvals->layer_id != info->layer_id) {
sysvals->layer_id = info->layer_id;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
info->vertex.raw_offset);
set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
#endif
if (dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS)) {
for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++)
sysvals->blend.constants[i] =
CLAMP(cb->blend_constants[i], 0.0f, 1.0f);
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++) {
set_gfx_sysval(cmdbuf, dirty_sysvals, blend.constants[i],
CLAMP(cb->blend_constants[i], 0.0f, 1.0f));
}
}
if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
@@ -600,9 +584,12 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
* py = height
* pz = maxDepth - minDepth
*/
sysvals->viewport.scale.x = 0.5f * viewport->width;
sysvals->viewport.scale.y = 0.5f * viewport->height;
sysvals->viewport.scale.z = (viewport->maxDepth - viewport->minDepth);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.x,
0.5f * viewport->width);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.y,
0.5f * viewport->height);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.z,
(viewport->maxDepth - viewport->minDepth));
/* Upload the viewport offset. Defined as (ox, oy, oz) at the start of
* section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
@@ -612,9 +599,12 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
* oy = y + height/2
* oz = minDepth
*/
sysvals->viewport.offset.x = (0.5f * viewport->width) + viewport->x;
sysvals->viewport.offset.y = (0.5f * viewport->height) + viewport->y;
sysvals->viewport.offset.z = viewport->minDepth;
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.x,
(0.5f * viewport->width) + viewport->x);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.y,
(0.5f * viewport->height) + viewport->y);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.z,
viewport->minDepth);
#if PAN_ARCH >= 9
/* Doing the viewport transform in the vertex shader and then depth
@@ -628,6 +618,7 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
* doesn't help with the precision loss, but at least clipping isn't
* completely broken.
*/
const struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
const struct vk_rasterization_state *rs =
&cmdbuf->vk.dynamic_graphics_state.rs;
@@ -637,7 +628,8 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
float z_max = viewport->maxDepth;
float z_sign = z_min <= z_max ? 1.0f : -1.0f;
sysvals->viewport.scale.z = z_sign * MIN_DEPTH_CLIP_RANGE;
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.z,
z_sign * MIN_DEPTH_CLIP_RANGE);
/* Middle of the user range is
* z_range_center = z_min + (z_max - z_min) * 0.5f,
@@ -648,40 +640,61 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
*/
float z_offset = (z_max + z_min - sysvals->viewport.scale.z) * 0.5f;
/* Bump offset off-center if necessary, to not go out of range */
sysvals->viewport.offset.z = CLAMP(z_offset, 0.0f, 1.0f);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.z,
CLAMP(z_offset, 0.0f, 1.0f));
}
#endif
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
#if PAN_ARCH <= 7
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
#if PAN_ARCH <= 7
struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) {
sysvals->desc.sets[PANVK_DESC_TABLE_VS_DYN_SSBOS] =
vs_desc_state->dyn_ssbos;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
set_gfx_sysval(cmdbuf, dirty_sysvals,
desc.sets[PANVK_DESC_TABLE_VS_DYN_SSBOS],
vs_desc_state->dyn_ssbos);
}
if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, FS)) {
sysvals->desc.sets[PANVK_DESC_TABLE_FS_DYN_SSBOS] =
fs_desc_state->dyn_ssbos;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
set_gfx_sysval(cmdbuf, dirty_sysvals,
desc.sets[PANVK_DESC_TABLE_FS_DYN_SSBOS],
fs_desc_state->dyn_ssbos);
}
for (uint32_t i = 0; i < MAX_SETS; i++) {
uint32_t used_set_mask =
vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
if (used_set_mask & BITFIELD_BIT(i))
sysvals->desc.sets[i] = desc_state->sets[i]->descs.dev;
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
if (used_set_mask & BITFIELD_BIT(i)) {
set_gfx_sysval(cmdbuf, dirty_sysvals, desc.sets[i],
desc_state->sets[i]->descs.dev);
}
}
#endif
/* We mask the dirty sysvals by the shader usage, and only flag
* the push uniforms dirty if those intersect. */
BITSET_DECLARE(dirty_shader_sysvals, MAX_SYSVAL_FAUS);
BITSET_AND(dirty_shader_sysvals, dirty_sysvals, vs->fau.used_sysvals);
if (!BITSET_IS_EMPTY(dirty_shader_sysvals))
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
if (fs) {
BITSET_AND(dirty_shader_sysvals, dirty_sysvals, fs->fau.used_sysvals);
/* If blend constants are not read by the blend shader, we can consider
* they are not read at all, so clear the dirty bits to avoid re-emitting
* FAUs when we can. */
if (!cmdbuf->state.gfx.cb.info.shader_loads_blend_const)
BITSET_CLEAR_RANGE(dirty_shader_sysvals, 0, 3);
if (!BITSET_IS_EMPTY(dirty_shader_sysvals))
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
}
}
VKAPI_ATTR void VKAPI_CALL

View File

@@ -62,12 +62,8 @@ panvk_per_arch(cmd_meta_compute_end)(
push_set0->desc_count = save_ctx->push_set0.desc_count;
}
if (memcmp(cmdbuf->state.push_constants.data, save_ctx->push_constants.data,
sizeof(cmdbuf->state.push_constants.data))) {
cmdbuf->state.push_constants = save_ctx->push_constants;
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
cmdbuf->state.push_constants = save_ctx->push_constants;
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
cmdbuf->state.compute.shader = save_ctx->cs.shader;
cmdbuf->state.compute.cs.desc = save_ctx->cs.desc;
@@ -127,12 +123,9 @@ panvk_per_arch(cmd_meta_gfx_end)(
push_set0->desc_count = save_ctx->push_set0.desc_count;
}
if (memcmp(cmdbuf->state.push_constants.data, save_ctx->push_constants.data,
sizeof(cmdbuf->state.push_constants.data))) {
cmdbuf->state.push_constants = save_ctx->push_constants;
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
}
cmdbuf->state.push_constants = save_ctx->push_constants;
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
cmdbuf->state.gfx.fs.shader = save_ctx->fs.shader;
cmdbuf->state.gfx.fs.desc = save_ctx->fs.desc;

View File

@@ -7,35 +7,70 @@
#include "panvk_cmd_buffer.h"
#include "panvk_entrypoints.h"
mali_ptr
VkResult
panvk_per_arch(cmd_prepare_push_uniforms)(struct panvk_cmd_buffer *cmdbuf,
VkPipelineBindPoint ptype)
const struct panvk_shader *shader)
{
uint32_t sysvals_sz = ptype == VK_PIPELINE_BIND_POINT_GRAPHICS
? sizeof(struct panvk_graphics_sysvals)
: sizeof(struct panvk_compute_sysvals);
const void *sysvals = ptype == VK_PIPELINE_BIND_POINT_GRAPHICS
? (void *)&cmdbuf->state.gfx.sysvals
: (void *)&cmdbuf->state.compute.sysvals;
struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, SYSVALS_PUSH_CONST_BASE + sysvals_sz, 16);
mali_ptr *push_ptr;
if (push_uniforms.gpu) {
if (ptype == VK_PIPELINE_BIND_POINT_GRAPHICS)
cmdbuf->state.gfx.sysvals.push_consts = push_uniforms.gpu;
else
cmdbuf->state.compute.sysvals.push_consts = push_uniforms.gpu;
/* The first half is used for push constants. */
memcpy(push_uniforms.cpu, cmdbuf->state.push_constants.data,
sizeof(cmdbuf->state.push_constants.data));
/* The second half is used for sysvals. */
memcpy((uint8_t *)push_uniforms.cpu + SYSVALS_PUSH_CONST_BASE, sysvals,
sysvals_sz);
switch (shader->vk.stage) {
case MESA_SHADER_COMPUTE:
if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.compute.push_uniforms;
break;
case MESA_SHADER_VERTEX:
if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.gfx.vs.push_uniforms;
break;
case MESA_SHADER_FRAGMENT:
if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.gfx.fs.push_uniforms;
break;
default:
assert(!"Invalid stage");
return VK_SUCCESS;
}
return push_uniforms.gpu;
if (!shader->fau.total_count) {
*push_ptr = 0;
return VK_SUCCESS;
}
struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, shader->fau.total_count * sizeof(uint64_t),
sizeof(uint64_t));
if (!push_uniforms.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
if (shader->vk.stage == MESA_SHADER_COMPUTE) {
cmdbuf->state.compute.sysvals.push_consts =
push_uniforms.gpu + (shader->fau.sysval_count * FAU_WORD_SIZE);
} else {
cmdbuf->state.gfx.sysvals.push_consts =
push_uniforms.gpu + (shader->fau.sysval_count * FAU_WORD_SIZE);
}
uint64_t *sysvals = shader->vk.stage == MESA_SHADER_COMPUTE
? (uint64_t *)&cmdbuf->state.compute.sysvals
: (uint64_t *)&cmdbuf->state.gfx.sysvals;
uint64_t *push_consts = cmdbuf->state.push_constants.data;
uint64_t *faus = push_uniforms.cpu;
uint32_t w, fau = 0;
/* After packing, the sysvals come first, followed by the user push constants.
* The ordering is encoded shader side, so don't re-order these loops. */
BITSET_FOREACH_SET(w, shader->fau.used_sysvals, MAX_SYSVAL_FAUS)
faus[fau++] = sysvals[w];
BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS)
faus[fau++] = push_consts[w];
*push_ptr = push_uniforms.gpu;
return VK_SUCCESS;
}
VKAPI_ATTR void VKAPI_CALL
@@ -45,12 +80,17 @@ panvk_per_arch(CmdPushConstants2KHR)(
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
gfx_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_VERTEX_BIT)
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_FRAGMENT_BIT)
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
memcpy(cmdbuf->state.push_constants.data + pPushConstantsInfo->offset,
pPushConstantsInfo->pValues, pPushConstantsInfo->size);
uint8_t *data =
(uint8_t *)cmdbuf->state.push_constants.data + pPushConstantsInfo->offset;
memcpy(data, pPushConstantsInfo->pValues, pPushConstantsInfo->size);
}

View File

@@ -460,54 +460,166 @@ valhall_lower_get_ssbo_size(struct nir_builder *b,
}
static bool
lower_load_push_consts(nir_builder *b, nir_intrinsic_instr *intr,
UNUSED void *data)
collect_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (intr->intrinsic != nir_intrinsic_load_push_constant)
return false;
unsigned base = nir_intrinsic_base(intr);
struct panvk_shader *shader = data;
uint32_t base = nir_intrinsic_base(intr);
bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
uint32_t offset, size;
/* We always set the range to zero, to make sure no pass is using it after
* that point. */
nir_intrinsic_set_range(intr, 0);
/* Sysvals should have a constant offset. */
assert(!is_sysval || nir_src_is_const(intr->src[0]));
if (is_sysval)
base -= SYSVALS_PUSH_CONST_BASE;
/* If the offset is dynamic, we need to flag [base:base+range] as used, to
* allow global mem access. */
if (!nir_src_is_const(intr->src[0])) {
offset = base;
size = nir_intrinsic_range(intr);
/* Flag the push_consts sysval as needed if we have an indirect offset. */
if (b->shader->info.stage == MESA_SHADER_COMPUTE)
shader_use_sysval(shader, compute, push_consts);
else
shader_use_sysval(shader, graphics, push_consts);
} else {
offset = base + nir_src_as_uint(intr->src[0]);
size = (intr->def.bit_size / 8) * intr->def.num_components;
}
if (is_sysval)
shader_use_sysval_range(shader, offset, size);
else
shader_use_push_const_range(shader, offset, size);
return true;
}
static bool
move_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_load_push_constant)
return false;
struct panvk_shader *shader = data;
unsigned base = nir_intrinsic_base(intr);
bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
if (is_sysval)
base -= SYSVALS_PUSH_CONST_BASE;
/* Sysvals should have a constant offset. */
assert(!is_sysval || nir_src_is_const(intr->src[0]));
b->cursor = nir_before_instr(&intr->instr);
/* Offset is constant, we just propagate base to the offset if it's not
* already zero. */
if (nir_src_is_const(intr->src[0])) {
if (base == 0)
return true;
unsigned offset = base + nir_src_as_uint(intr->src[0]);
nir_src_rewrite(&intr->src[0],
nir_imm_int(b, nir_src_as_uint(intr->src[0]) + base));
/* We place the sysvals first, and then comes the user push constants.
* We do that so we always have the blend constants at offset 0 for
* blend shaders. */
if (is_sysval)
offset = shader_remapped_sysval_offset(shader, offset);
else
offset = shader_remapped_push_const_offset(shader, offset);
nir_src_rewrite(&intr->src[0], nir_imm_int(b, offset));
/* We always set the range/base to zero, to make sure no pass is using it
* after that point. */
nir_intrinsic_set_base(intr, 0);
return true;
nir_intrinsic_set_range(intr, 0);
} else {
/* We don't use load_sysval() on purpose, because it would set
* .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of
* zero in this pass. */
unsigned push_const_buf_offset = shader_remapped_sysval_offset(
shader, b->shader->info.stage == MESA_SHADER_COMPUTE
? sysval_offset(compute, push_consts)
: sysval_offset(graphics, push_consts));
nir_def *push_const_buf = nir_load_push_constant(
b, 1, 64, nir_imm_int(b, push_const_buf_offset));
unsigned push_const_offset =
shader_remapped_fau_offset(shader, push_consts, base);
nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, push_const_offset);
unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr),
nir_intrinsic_align_offset(intr));
/* We assume an alignment of 64-bit max for packed push-constants. */
align = MIN2(align, FAU_WORD_SIZE);
nir_def *value =
nir_load_global(b, nir_iadd(b, push_const_buf, nir_u2u64(b, offset)),
align, intr->def.num_components, intr->def.bit_size);
nir_def_replace(&intr->def, value);
}
/* We don't use load_sysval() on purpose, because it would set
* .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of
* zero in this pass. */
unsigned push_const_addr_offset =
SYSVALS_PUSH_CONST_BASE +
(b->shader->info.stage == MESA_SHADER_COMPUTE
? offsetof(struct panvk_compute_sysvals, push_consts)
: offsetof(struct panvk_graphics_sysvals, push_consts));
nir_def *push_const_buf =
nir_load_push_constant(b, 1, 64, nir_imm_int(b, push_const_addr_offset));
nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, base);
unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr),
nir_intrinsic_align_offset(intr));
nir_def *value =
nir_load_global(b, nir_iadd(b, push_const_buf, nir_u2u64(b, offset)),
align, intr->def.num_components, intr->def.bit_size);
nir_def_replace(&intr->def, value);
return true;
}
/* Pack sysvals and user push constants into the per-shader FAU
 * (Fast Access Uniform) table and lower load_push_constant accordingly.
 *
 * Two sub-passes run on @nir:
 *   1. collect_push_constant records which sysval/push-constant FAU words
 *      the shader actually reads (shader->fau.used_{sysvals,push_consts}).
 *   2. move_push_constant rewrites the intrinsics to the packed layout
 *      (sysvals first, then user push constants).
 * The collected usage also fixes shader->fau.{sysval,total}_count, which
 * the caller uses to size the FAU buffer at draw/dispatch time.
 */
static void
lower_load_push_consts(nir_shader *nir, struct panvk_shader *shader)
{
   /* Before we lower load_push_constant()s with a dynamic offset to global
    * loads, we want to run a few optimization passes to get rid of offset
    * calculation involving only constant values. */
   bool progress = false;

   do {
      progress = false;
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
   } while (progress);

   /* We always reserve the 4 blend constant words for fragment shaders,
    * because we don't know the blend configuration at this point, and
    * we might end up with a blend shader reading those blend constants. */
   if (shader->vk.stage == MESA_SHADER_FRAGMENT) {
      /* We rely on blend constants being placed first and covering 4 words. */
      STATIC_ASSERT(
         offsetof(struct panvk_graphics_sysvals, blend.constants) == 0 &&
         sizeof(((struct panvk_graphics_sysvals *)NULL)->blend.constants) ==
            16);
      shader_use_sysval(shader, graphics, blend.constants);
   }

   /* First sub-pass: only gathers usage bitmaps, never modifies the IR
    * (nir_metadata_all), so progress here means "found at least one
    * load_push_constant to rewrite". */
   progress = false;
   NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
            nir_metadata_all, shader);

   /* Some load_push_constant instructions might be eliminated after
    * scalarization+dead-code-elimination. Since these pass happen in
    * bifrost_compile(), we can't run the push_constant packing after the
    * optimization took place, so let's just have our own FAU count instead
    * of using info.push.count to make it consistent with the
    * used_{sysvals,push_consts} bitmaps, even if it sometimes implies loading
    * more than we really need. Doing that also takes into account the fact
    * blend constants are never loaded from the fragment shader, but might be
    * needed in the blend shader. */
   shader->fau.sysval_count = BITSET_COUNT(shader->fau.used_sysvals);
   shader->fau.total_count =
      shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);

   /* No load_push_constant found: nothing to remap. */
   if (!progress)
      return;

   NIR_PASS(_, nir, nir_shader_intrinsics_pass, move_push_constant,
            nir_metadata_control_flow, shader);
}
static void
panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
uint32_t set_layout_count,
@@ -632,24 +744,7 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
NIR_PASS(_, nir, nir_shader_instructions_pass, panvk_lower_sysvals,
nir_metadata_control_flow, NULL);
/* Before we lower load_push_constant()s with a dynamic offset to global
* loads, we want to run a few optimization passes to get rid of offset
* calculation involving only constant values. */
bool progress = false;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_remove_phis);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_constant_folding);
} while (progress);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_push_consts,
nir_metadata_control_flow, NULL);
lower_load_push_consts(nir, shader);
}
static VkResult
@@ -955,6 +1050,10 @@ panvk_compile_shader(struct panvk_device *dev,
result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader);
/* We need to update info.push.count because it's used to initialize the
* RSD in pan_shader_prepare_rsd(). */
shader->info.push.count = shader->fau.total_count * 2;
if (result != VK_SUCCESS) {
panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return result;
@@ -1091,6 +1190,9 @@ panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
struct pan_shader_info info;
blob_copy_bytes(blob, &info, sizeof(info));
struct panvk_shader_fau_info fau;
blob_copy_bytes(blob, &fau, sizeof(fau));
struct pan_compute_dim local_size;
blob_copy_bytes(blob, &local_size, sizeof(local_size));
@@ -1105,6 +1207,7 @@ panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
shader->info = info;
shader->fau = fau;
shader->local_size = local_size;
shader->bin_size = bin_size;
@@ -1187,6 +1290,7 @@ panvk_shader_serialize(struct vk_device *vk_dev,
return false;
blob_write_bytes(blob, &shader->info, sizeof(shader->info));
blob_write_bytes(blob, &shader->fau, sizeof(shader->fau));
blob_write_bytes(blob, &shader->local_size, sizeof(shader->local_size));
blob_write_uint32(blob, shader->bin_size);
blob_write_bytes(blob, shader->bin_ptr, shader->bin_size);
@@ -1546,18 +1650,21 @@ panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage,
if (cmd->state.compute.shader != shader) {
cmd->state.compute.shader = shader;
compute_state_set_dirty(cmd, CS);
compute_state_set_dirty(cmd, PUSH_UNIFORMS);
}
break;
case MESA_SHADER_VERTEX:
if (cmd->state.gfx.vs.shader != shader) {
cmd->state.gfx.vs.shader = shader;
gfx_state_set_dirty(cmd, VS);
gfx_state_set_dirty(cmd, VS_PUSH_UNIFORMS);
}
break;
case MESA_SHADER_FRAGMENT:
if (cmd->state.gfx.fs.shader != shader) {
cmd->state.gfx.fs.shader = shader;
gfx_state_set_dirty(cmd, FS);
gfx_state_set_dirty(cmd, FS_PUSH_UNIFORMS);
}
break;
default: