panvk: Use IDVS jobs when we can
This optimizes things by splitting the position and vertex processing in two, allowing primitives to be discarded before the varying shader is executed. This optimization is even more important if we throw layered rendering into the mix, because layered rendering on Bifrost is implemented with N IDVS/fragment jobs (N being the number of layers), with primitives not targetting a given layer being artificially culled in the vertex shader by issuing a position outside the render area. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29450>
This commit is contained in:

committed by
Marge Bot

parent
8293376f7c
commit
743b41a284
@@ -70,9 +70,14 @@ struct panvk_draw_info {
|
||||
mali_ptr viewport;
|
||||
struct {
|
||||
struct panfrost_ptr vertex_copy_desc;
|
||||
struct panfrost_ptr vertex;
|
||||
struct panfrost_ptr frag_copy_desc;
|
||||
struct panfrost_ptr tiler;
|
||||
union {
|
||||
struct {
|
||||
struct panfrost_ptr vertex;
|
||||
struct panfrost_ptr tiler;
|
||||
};
|
||||
struct panfrost_ptr idvs;
|
||||
};
|
||||
} jobs;
|
||||
};
|
||||
|
||||
@@ -782,39 +787,14 @@ panvk_draw_prepare_viewport(struct panvk_cmd_buffer *cmdbuf,
|
||||
}
|
||||
|
||||
static void
|
||||
panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_draw_info *draw)
|
||||
panvk_emit_vertex_dcd(struct panvk_cmd_buffer *cmdbuf,
|
||||
const struct panvk_draw_info *draw, void *dcd)
|
||||
{
|
||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
|
||||
struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
const struct panvk_shader_desc_state *vs_desc_state =
|
||||
&cmdbuf->state.gfx.vs.desc;
|
||||
|
||||
panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base,
|
||||
&cmdbuf->state.gfx.desc_state,
|
||||
vs, vs_desc_state);
|
||||
|
||||
struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)(
|
||||
dev, &cmdbuf->desc_pool.base, vs, &cmdbuf->state.gfx.desc_state,
|
||||
vs_desc_state);
|
||||
|
||||
if (ptr.cpu)
|
||||
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
|
||||
|
||||
draw->jobs.vertex_copy_desc = ptr;
|
||||
|
||||
ptr = pan_pool_alloc_desc(&cmdbuf->desc_pool.base, COMPUTE_JOB);
|
||||
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
|
||||
draw->jobs.vertex = ptr;
|
||||
|
||||
memcpy(pan_section_ptr(ptr.cpu, COMPUTE_JOB, INVOCATION), &draw->invocation,
|
||||
pan_size(INVOCATION));
|
||||
|
||||
pan_section_pack(ptr.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
|
||||
cfg.job_task_split = 5;
|
||||
}
|
||||
|
||||
pan_section_pack(ptr.cpu, COMPUTE_JOB, DRAW, cfg) {
|
||||
pan_pack(dcd, DRAW, cfg) {
|
||||
cfg.state = panvk_priv_mem_dev_addr(vs->rsd);
|
||||
cfg.attributes = draw->vs.attributes;
|
||||
cfg.attribute_buffers = draw->vs.attribute_bufs;
|
||||
@@ -831,6 +811,28 @@ panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_draw_info *draw)
|
||||
{
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
struct panfrost_ptr ptr =
|
||||
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, COMPUTE_JOB);
|
||||
|
||||
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
|
||||
draw->jobs.vertex = ptr;
|
||||
|
||||
memcpy(pan_section_ptr(ptr.cpu, COMPUTE_JOB, INVOCATION), &draw->invocation,
|
||||
pan_size(INVOCATION));
|
||||
|
||||
pan_section_pack(ptr.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
|
||||
cfg.job_task_split = 5;
|
||||
}
|
||||
|
||||
panvk_emit_vertex_dcd(cmdbuf, draw,
|
||||
pan_section_ptr(ptr.cpu, COMPUTE_JOB, DRAW));
|
||||
}
|
||||
|
||||
static enum mali_draw_mode
|
||||
translate_prim_topology(VkPrimitiveTopology in)
|
||||
{
|
||||
@@ -867,6 +869,7 @@ panvk_emit_tiler_primitive(struct panvk_cmd_buffer *cmdbuf,
|
||||
bool writes_point_size =
|
||||
vs->info.vs.writes_point_size &&
|
||||
ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
|
||||
bool secondary_shader = vs->info.vs.secondary_enable && fs_required(cmdbuf);
|
||||
|
||||
pan_pack(prim, PRIMITIVE, cfg) {
|
||||
cfg.draw_mode = translate_prim_topology(ia->primitive_topology);
|
||||
@@ -900,6 +903,8 @@ panvk_emit_tiler_primitive(struct panvk_cmd_buffer *cmdbuf,
|
||||
cfg.index_count = draw->vertex_count;
|
||||
cfg.index_type = MALI_INDEX_TYPE_NONE;
|
||||
}
|
||||
|
||||
cfg.secondary_shader = secondary_shader;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -974,18 +979,8 @@ panvk_draw_prepare_tiler_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
{
|
||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
|
||||
/* If the vertex job doesn't write the position, we don't need a tiler job. */
|
||||
if (!draw->position)
|
||||
return;
|
||||
|
||||
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
|
||||
struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
|
||||
|
||||
panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base,
|
||||
&cmdbuf->state.gfx.desc_state,
|
||||
fs, fs_desc_state);
|
||||
|
||||
struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)(
|
||||
dev, &cmdbuf->desc_pool.base, fs, &cmdbuf->state.gfx.desc_state,
|
||||
fs_desc_state);
|
||||
@@ -1019,6 +1014,79 @@ panvk_draw_prepare_tiler_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
;
|
||||
}
|
||||
|
||||
static void
|
||||
panvk_draw_prepare_idvs_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_draw_info *draw)
|
||||
{
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
struct panfrost_ptr ptr =
|
||||
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, INDEXED_VERTEX_JOB);
|
||||
|
||||
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
|
||||
draw->jobs.idvs = ptr;
|
||||
|
||||
memcpy(pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, INVOCATION),
|
||||
&draw->invocation, pan_size(INVOCATION));
|
||||
|
||||
panvk_emit_tiler_primitive(
|
||||
cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, PRIMITIVE));
|
||||
|
||||
panvk_emit_tiler_primitive_size(
|
||||
cmdbuf, draw,
|
||||
pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, PRIMITIVE_SIZE));
|
||||
|
||||
pan_section_pack(ptr.cpu, INDEXED_VERTEX_JOB, TILER, cfg) {
|
||||
cfg.address = draw->tiler_ctx->bifrost;
|
||||
}
|
||||
|
||||
pan_section_pack(ptr.cpu, INDEXED_VERTEX_JOB, PADDING, _) {
|
||||
}
|
||||
|
||||
panvk_emit_tiler_dcd(
|
||||
cmdbuf, draw,
|
||||
pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, FRAGMENT_DRAW));
|
||||
|
||||
panvk_emit_vertex_dcd(
|
||||
cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));
|
||||
}
|
||||
|
||||
static void
|
||||
panvk_draw_prepare_vs_copy_desc_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_draw_info *draw)
|
||||
{
|
||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
|
||||
const struct panvk_shader_desc_state *vs_desc_state =
|
||||
&cmdbuf->state.gfx.vs.desc;
|
||||
struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)(
|
||||
dev, &cmdbuf->desc_pool.base, vs, &cmdbuf->state.gfx.desc_state,
|
||||
vs_desc_state);
|
||||
|
||||
if (ptr.cpu)
|
||||
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
|
||||
|
||||
draw->jobs.vertex_copy_desc = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
panvk_draw_prepare_fs_copy_desc_job(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_draw_info *draw)
|
||||
{
|
||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
|
||||
struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
struct panfrost_ptr ptr = panvk_per_arch(meta_get_copy_desc_job)(
|
||||
dev, &cmdbuf->desc_pool.base, fs, &cmdbuf->state.gfx.desc_state,
|
||||
fs_desc_state);
|
||||
|
||||
if (ptr.cpu)
|
||||
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
|
||||
|
||||
draw->jobs.frag_copy_desc = ptr;
|
||||
}
|
||||
|
||||
void
|
||||
panvk_per_arch(cmd_preload_fb_after_batch_split)(struct panvk_cmd_buffer *cmdbuf)
|
||||
{
|
||||
@@ -1063,9 +1131,12 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
|
||||
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
|
||||
struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
|
||||
struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
|
||||
struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
|
||||
const struct vk_rasterization_state *rs =
|
||||
&cmdbuf->vk.dynamic_graphics_state.rs;
|
||||
bool idvs = vs->info.vs.idvs;
|
||||
|
||||
/* If there's no vertex shader, we can skip the draw. */
|
||||
if (!panvk_priv_mem_dev_addr(vs->rsd))
|
||||
@@ -1101,6 +1172,38 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
|
||||
&cmdbuf->state.gfx.sysvals, sizeof(cmdbuf->state.gfx.sysvals));
|
||||
}
|
||||
|
||||
panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base,
|
||||
&cmdbuf->state.gfx.desc_state,
|
||||
vs, vs_desc_state);
|
||||
panvk_draw_prepare_vs_copy_desc_job(cmdbuf, draw);
|
||||
|
||||
unsigned copy_desc_job_id =
|
||||
draw->jobs.vertex_copy_desc.gpu
|
||||
? pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
|
||||
&draw->jobs.vertex_copy_desc, false)
|
||||
: 0;
|
||||
|
||||
bool vs_writes_pos =
|
||||
cmdbuf->state.gfx.link.buf_strides[PANVK_VARY_BUF_POSITION] > 0;
|
||||
bool needs_tiling = !rs->rasterizer_discard_enable && vs_writes_pos;
|
||||
|
||||
/* No need to setup the FS desc tables if the FS is not executed. */
|
||||
if (needs_tiling && fs_required(cmdbuf)) {
|
||||
panvk_per_arch(cmd_prepare_shader_desc_tables)(
|
||||
&cmdbuf->desc_pool.base, &cmdbuf->state.gfx.desc_state, fs,
|
||||
fs_desc_state);
|
||||
panvk_draw_prepare_fs_copy_desc_job(cmdbuf, draw);
|
||||
|
||||
if (draw->jobs.frag_copy_desc.gpu) {
|
||||
/* We don't need to add frag_copy_desc as a dependency because the
|
||||
* tiler job doesn't execute the fragment shader, the fragment job
|
||||
* will, and the tiler/fragment synchronization happens at the batch
|
||||
* level. */
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
|
||||
&draw->jobs.frag_copy_desc, false);
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: indexed draws */
|
||||
draw->tls = batch->tls.gpu;
|
||||
draw->fb = batch->fb.desc.gpu;
|
||||
@@ -1115,38 +1218,51 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
|
||||
panvk_draw_prepare_attributes(cmdbuf, draw);
|
||||
panvk_draw_prepare_viewport(cmdbuf, draw);
|
||||
panvk_draw_prepare_tiler_context(cmdbuf, draw);
|
||||
panvk_draw_prepare_vertex_job(cmdbuf, draw);
|
||||
panvk_draw_prepare_tiler_job(cmdbuf, draw);
|
||||
batch->tlsinfo.tls.size = MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0,
|
||||
batch->tlsinfo.tls.size);
|
||||
|
||||
unsigned copy_desc_job_id =
|
||||
draw->jobs.vertex_copy_desc.gpu
|
||||
? pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
|
||||
&draw->jobs.vertex_copy_desc, false)
|
||||
: 0;
|
||||
if (idvs) {
|
||||
panvk_draw_prepare_idvs_job(cmdbuf, draw);
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0,
|
||||
copy_desc_job_id, &draw->jobs.idvs, false);
|
||||
} else {
|
||||
panvk_draw_prepare_vertex_job(cmdbuf, draw);
|
||||
|
||||
unsigned vjob_id =
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_VERTEX, false, false, 0,
|
||||
copy_desc_job_id, &draw->jobs.vertex, false);
|
||||
unsigned vjob_id =
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_VERTEX, false, false, 0,
|
||||
copy_desc_job_id, &draw->jobs.vertex, false);
|
||||
|
||||
if (!rs->rasterizer_discard_enable && draw->position) {
|
||||
/* We don't need to add frag_copy_desc as a dependency, because the
|
||||
* tiler job doesn't execute the fragment shader. The fragment job
|
||||
* will, and the tiler/fragment synchronization happens at the batch
|
||||
* level. */
|
||||
if (draw->jobs.frag_copy_desc.gpu)
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
|
||||
&draw->jobs.frag_copy_desc, false);
|
||||
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_TILER, false, false, vjob_id, 0,
|
||||
&draw->jobs.tiler, false);
|
||||
if (needs_tiling) {
|
||||
panvk_draw_prepare_tiler_job(cmdbuf, draw);
|
||||
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_TILER, false, false, vjob_id,
|
||||
0, &draw->jobs.tiler, false);
|
||||
}
|
||||
}
|
||||
|
||||
/* Clear the dirty flags all at once */
|
||||
cmdbuf->state.gfx.dirty = 0;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
padded_vertex_count(struct panvk_cmd_buffer *cmdbuf, uint32_t vertex_count,
|
||||
uint32_t instance_count)
|
||||
{
|
||||
if (instance_count == 1)
|
||||
return vertex_count;
|
||||
|
||||
bool idvs = cmdbuf->state.gfx.vs.shader->info.vs.idvs;
|
||||
|
||||
/* Index-Driven Vertex Shading requires different instances to
|
||||
* have different cache lines for position results. Each vertex
|
||||
* position is 16 bytes and the Mali cache line is 64 bytes, so
|
||||
* the instance count must be aligned to 4 vertices.
|
||||
*/
|
||||
if (idvs)
|
||||
vertex_count = ALIGN_POT(vertex_count, 4);
|
||||
|
||||
return panfrost_padded_vertex_count(vertex_count);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
|
||||
uint32_t instanceCount, uint32_t firstVertex,
|
||||
@@ -1163,9 +1279,8 @@ panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
|
||||
.vertex_range = vertexCount,
|
||||
.first_instance = firstInstance,
|
||||
.instance_count = instanceCount,
|
||||
.padded_vertex_count = instanceCount > 1
|
||||
? panfrost_padded_vertex_count(vertexCount)
|
||||
: vertexCount,
|
||||
.padded_vertex_count =
|
||||
padded_vertex_count(cmdbuf, vertexCount, instanceCount),
|
||||
.offset_start = firstVertex,
|
||||
};
|
||||
|
||||
@@ -1250,9 +1365,8 @@ panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
|
||||
.instance_count = instanceCount,
|
||||
.vertex_range = vertex_range,
|
||||
.vertex_count = indexCount + abs(vertexOffset),
|
||||
.padded_vertex_count = instanceCount > 1
|
||||
? panfrost_padded_vertex_count(vertex_range)
|
||||
: vertex_range,
|
||||
.padded_vertex_count =
|
||||
padded_vertex_count(cmdbuf, vertex_range, instanceCount),
|
||||
.offset_start = min_vertex + vertexOffset,
|
||||
.indices = panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer,
|
||||
cmdbuf->state.gfx.ib.offset) +
|
||||
|
@@ -427,6 +427,10 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
|
||||
VERT_ATTRIB_GENERIC0;
|
||||
|
||||
shader->info.attribute_count = util_last_bit(gen_attribs);
|
||||
|
||||
/* NULL IDVS shaders are not allowed. */
|
||||
if (!bin_size)
|
||||
shader->info.vs.idvs = false;
|
||||
}
|
||||
|
||||
/* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table,
|
||||
@@ -515,7 +519,6 @@ panvk_compile_shader(struct panvk_device *dev,
|
||||
struct panfrost_compile_inputs inputs = {
|
||||
.gpu_id = phys_dev->kmod.props.gpu_prod_id,
|
||||
.no_ubo_to_push = true,
|
||||
.no_idvs = true, /* TODO */
|
||||
};
|
||||
|
||||
panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
|
||||
|
Reference in New Issue
Block a user