panvk: Add support for layered rendering

This is needed if we want to use vk_meta_blit.
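
On Bifrost, gl_Layer support is implemented by issuing one draw per layer
and filtering primitives at the VS level: the draw path now loops over
render.layer_count, re-emits the push uniforms with a new layer_id sysval,
and queues one <vertex,tiler> (or IDVS) job pair per layer. Framebuffer and
tiler-context descriptors become per-layer arrays (batch->fb.desc_stride is
the FBD stride, batch->tiler.ctx_descs holds one TILER_CONTEXT per layer),
and closing a batch now emits one fragment job per layer FBD. The per-batch
job chain is split accordingly into vtc_jc (vertex/tiler/compute) and
frag_jc (fragment). gl_Layer writes are lowered away in the vertex shader:
gl_Position is forced to infinity whenever the written layer doesn't match
the layer_id of the current draw, so primitives belonging to other layers
are discarded, and load_layer_id in the fragment shader reads the sysval
back from the push constants.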

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29450>
Author:    Boris Brezillon
Date:      2024-04-28 18:45:30 +02:00
Committed: Marge Bot
Parent:    743b41a284
Commit:    0e74b6eda9

11 changed files with 272 additions and 134 deletions


@@ -37,23 +37,25 @@ struct panvk_batch {
struct list_head node;
struct util_dynarray jobs;
struct util_dynarray event_ops;
struct pan_jc jc;
struct pan_jc vtc_jc;
struct pan_jc frag_jc;
struct {
struct panfrost_ptr desc;
uint32_t desc_stride;
uint32_t bo_count;
/* One slot per color, two more slots for the depth/stencil buffers. */
struct pan_kmod_bo *bos[MAX_RTS + 2];
uint32_t layer_count;
} fb;
struct {
struct pan_kmod_bo *src, *dst;
} blit;
struct panfrost_ptr tls;
mali_ptr fragment_job;
struct {
struct pan_tiler_context ctx;
struct panfrost_ptr heap_desc;
struct panfrost_ptr ctx_desc;
struct panfrost_ptr ctx_descs;
struct mali_tiler_heap_packed heap_templ;
struct mali_tiler_context_packed ctx_templ;
} tiler;
@@ -125,6 +127,7 @@ struct panvk_cmd_graphics_state {
struct {
VkRenderingFlags flags;
uint32_t layer_count;
enum vk_rp_attachment_flags bound_attachments;
struct {
@@ -206,7 +209,8 @@ void panvk_per_arch(cmd_alloc_fb_desc)(struct panvk_cmd_buffer *cmdbuf);
void panvk_per_arch(cmd_alloc_tls_desc)(struct panvk_cmd_buffer *cmdbuf,
bool gfx);
void panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf);
void panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf,
uint32_t layer_idx);
void panvk_per_arch(cmd_preload_fb_after_batch_split)(
struct panvk_cmd_buffer *cmdbuf);


@@ -62,22 +62,22 @@ panvk_debug_adjust_bo_flags(const struct panvk_device *device,
}
static void
panvk_cmd_prepare_fragment_job(struct panvk_cmd_buffer *cmdbuf)
panvk_cmd_prepare_fragment_job(struct panvk_cmd_buffer *cmdbuf, mali_ptr fbd)
{
const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct panvk_batch *batch = cmdbuf->cur_batch;
struct panfrost_ptr job_ptr =
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, FRAGMENT_JOB);
GENX(pan_emit_fragment_job_payload)
(fbinfo, batch->fb.desc.gpu, job_ptr.cpu);
GENX(pan_emit_fragment_job_payload)(fbinfo, fbd, job_ptr.cpu);
pan_section_pack(job_ptr.cpu, FRAGMENT_JOB, HEADER, header) {
header.type = MALI_JOB_TYPE_FRAGMENT;
header.index = 1;
}
batch->fragment_job = job_ptr.gpu;
pan_jc_add_job(&batch->frag_jc, MALI_JOB_TYPE_FRAGMENT, false, false, 0, 0,
&job_ptr, false);
util_dynarray_append(&batch->jobs, void *, job_ptr.cpu);
}
@@ -93,7 +93,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
assert(batch);
if (!batch->fb.desc.gpu && !batch->jc.first_job) {
if (!batch->fb.desc.gpu && !batch->vtc_jc.first_job) {
if (util_dynarray_num_elements(&batch->event_ops,
struct panvk_cmd_event_op) == 0) {
/* Content-less batch, let's drop it */
@@ -105,7 +105,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
struct panfrost_ptr ptr =
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, JOB_HEADER);
util_dynarray_append(&batch->jobs, void *, ptr.cpu);
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_NULL, false, false, 0, 0,
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_NULL, false, false, 0, 0,
&ptr, false);
list_addtail(&batch->node, &cmdbuf->batches);
}
@@ -119,14 +119,6 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
list_addtail(&batch->node, &cmdbuf->batches);
if (batch->jc.first_tiler) {
ASSERTED unsigned num_preload_jobs = GENX(pan_preload_fb)(
&dev->meta.blitter.cache, &cmdbuf->desc_pool.base,
&cmdbuf->state.gfx.render.fb.info, 0, batch->tls.gpu, NULL);
assert(num_preload_jobs == 0);
}
if (batch->tlsinfo.tls.size) {
unsigned thread_tls_alloc =
panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
@@ -156,11 +148,29 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
panfrost_sample_positions_offset(
pan_sample_pattern(fbinfo->nr_samples));
batch->fb.desc.gpu |= GENX(pan_emit_fbd)(
&cmdbuf->state.gfx.render.fb.info, 0, &batch->tlsinfo,
&batch->tiler.ctx, batch->fb.desc.cpu);
for (uint32_t i = 0; i < batch->fb.layer_count; i++) {
mali_ptr fbd = batch->fb.desc.gpu + (batch->fb.desc_stride * i);
if (batch->vtc_jc.first_tiler) {
cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds.gpu = 0;
panvk_cmd_prepare_fragment_job(cmdbuf);
ASSERTED unsigned num_preload_jobs = GENX(pan_preload_fb)(
&dev->meta.blitter.cache, &cmdbuf->desc_pool.base,
&cmdbuf->state.gfx.render.fb.info, i, batch->tls.gpu, NULL);
/* Bifrost GPUs use pre-frame DCDs to preload the FB content. We
* thus expect num_preload_jobs to be zero.
*/
assert(!num_preload_jobs);
}
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, i);
fbd |= GENX(pan_emit_fbd)(
&cmdbuf->state.gfx.render.fb.info, i, &batch->tlsinfo,
&batch->tiler.ctx,
batch->fb.desc.cpu + (batch->fb.desc_stride * i));
panvk_cmd_prepare_fragment_job(cmdbuf, fbd);
}
}
cmdbuf->cur_batch = NULL;
@@ -176,14 +186,24 @@ panvk_per_arch(cmd_alloc_fb_desc)(struct panvk_cmd_buffer *cmdbuf)
const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
bool has_zs_ext = fbinfo->zs.view.zs || fbinfo->zs.view.s;
batch->fb.layer_count = cmdbuf->state.gfx.render.layer_count;
unsigned fbd_size = pan_size(FRAMEBUFFER);
if (has_zs_ext)
fbd_size = ALIGN_POT(fbd_size, pan_alignment(ZS_CRC_EXTENSION)) +
pan_size(ZS_CRC_EXTENSION);
fbd_size = ALIGN_POT(fbd_size, pan_alignment(RENDER_TARGET)) +
(MAX2(fbinfo->rt_count, 1) * pan_size(RENDER_TARGET));
batch->fb.bo_count = cmdbuf->state.gfx.render.fb.bo_count;
memcpy(batch->fb.bos, cmdbuf->state.gfx.render.fb.bos,
batch->fb.bo_count * sizeof(batch->fb.bos[0]));
batch->fb.desc = pan_pool_alloc_desc_aggregate(
&cmdbuf->desc_pool.base, PAN_DESC(FRAMEBUFFER),
PAN_DESC_ARRAY(has_zs_ext ? 1 : 0, ZS_CRC_EXTENSION),
PAN_DESC_ARRAY(MAX2(fbinfo->rt_count, 1), RENDER_TARGET));
batch->fb.desc = pan_pool_alloc_aligned(&cmdbuf->desc_pool.base,
fbd_size * batch->fb.layer_count,
pan_alignment(FRAMEBUFFER));
batch->fb.desc_stride = fbd_size;
memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));
@@ -201,19 +221,22 @@ panvk_per_arch(cmd_alloc_tls_desc)(struct panvk_cmd_buffer *cmdbuf, bool gfx)
}
void
panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf)
panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf,
uint32_t layer_idx)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct panvk_batch *batch = cmdbuf->cur_batch;
if (batch->tiler.ctx_desc.cpu)
return;
if (batch->tiler.ctx_descs.cpu)
goto out_set_layer_ctx;
const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
uint32_t layer_count = cmdbuf->state.gfx.render.layer_count;
batch->tiler.heap_desc =
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, TILER_HEAP);
batch->tiler.ctx_desc =
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, TILER_CONTEXT);
batch->tiler.ctx_descs = pan_pool_alloc_desc_array(
&cmdbuf->desc_pool.base, layer_count, TILER_CONTEXT);
pan_pack(&batch->tiler.heap_templ, TILER_HEAP, cfg) {
cfg.size = pan_kmod_bo_size(dev->tiler_heap->bo);
@@ -232,9 +255,20 @@ panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf)
memcpy(batch->tiler.heap_desc.cpu, &batch->tiler.heap_templ,
sizeof(batch->tiler.heap_templ));
memcpy(batch->tiler.ctx_desc.cpu, &batch->tiler.ctx_templ,
sizeof(batch->tiler.ctx_templ));
batch->tiler.ctx.bifrost = batch->tiler.ctx_desc.gpu;
struct mali_tiler_context_packed *ctxs = batch->tiler.ctx_descs.cpu;
assert(layer_count > 0);
for (uint32_t i = 0; i < layer_count; i++) {
STATIC_ASSERT(
!(pan_size(TILER_CONTEXT) & (pan_alignment(TILER_CONTEXT) - 1)));
memcpy(&ctxs[i], &batch->tiler.ctx_templ, sizeof(*ctxs));
}
out_set_layer_ctx:
batch->tiler.ctx.bifrost =
batch->tiler.ctx_descs.gpu + (pan_size(TILER_CONTEXT) * layer_idx);
}
struct panvk_batch *


@@ -136,11 +136,11 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
unsigned copy_desc_dep =
copy_desc_job.gpu
? pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
&copy_desc_job, false)
? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false,
0, 0, &copy_desc_job, false)
: 0;
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
copy_desc_dep, &job, false);
batch->tlsinfo.tls.size = shader->info.tls_size;


@@ -46,6 +46,7 @@ struct panvk_draw_info {
unsigned instance_count;
int vertex_offset;
unsigned offset_start;
uint32_t layer_id;
struct mali_invocation_packed invocation;
struct {
mali_ptr varyings;
@@ -105,10 +106,12 @@ panvk_cmd_prepare_draw_sysvals(struct panvk_cmd_buffer *cmdbuf,
unsigned base_vertex = draw->index_size ? draw->vertex_offset : 0;
if (sysvals->vs.first_vertex != draw->offset_start ||
sysvals->vs.base_vertex != base_vertex ||
sysvals->vs.base_instance != draw->first_instance) {
sysvals->vs.base_instance != draw->first_instance ||
sysvals->layer_id != draw->layer_id) {
sysvals->vs.first_vertex = draw->offset_start;
sysvals->vs.base_vertex = base_vertex;
sysvals->vs.base_instance = draw->first_instance;
sysvals->layer_id = draw->layer_id;
cmdbuf->state.gfx.push_uniforms = 0;
}
@@ -468,7 +471,7 @@ panvk_draw_prepare_tiler_context(struct panvk_cmd_buffer *cmdbuf,
{
struct panvk_batch *batch = cmdbuf->cur_batch;
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, draw->layer_id);
draw->tiler_ctx = &batch->tiler.ctx;
}
@@ -1134,6 +1137,7 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
uint32_t layer_count = cmdbuf->state.gfx.render.layer_count;
const struct vk_rasterization_state *rs =
&cmdbuf->vk.dynamic_graphics_state.rs;
bool idvs = vs->info.vs.idvs;
@@ -1146,7 +1150,7 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
* pilot shader dealing with descriptor copies, and we need one
* <vertex,tiler> pair per draw.
*/
if (batch->jc.job_index >= (UINT16_MAX - 4)) {
if (batch->vtc_jc.job_index + (4 * layer_count) >= UINT16_MAX) {
panvk_per_arch(cmd_close_batch)(cmdbuf);
panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf);
batch = panvk_per_arch(cmd_open_batch)(cmdbuf);
@@ -1164,13 +1168,6 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
panvk_per_arch(cmd_prepare_push_descs)(&cmdbuf->desc_pool.base, desc_state,
used_set_mask);
panvk_cmd_prepare_draw_sysvals(cmdbuf, draw);
if (!cmdbuf->state.gfx.push_uniforms) {
cmdbuf->state.gfx.push_uniforms = panvk_cmd_prepare_push_uniforms(
&cmdbuf->desc_pool.base, &cmdbuf->state.push_constants,
&cmdbuf->state.gfx.sysvals, sizeof(cmdbuf->state.gfx.sysvals));
}
panvk_per_arch(cmd_prepare_shader_desc_tables)(&cmdbuf->desc_pool.base,
&cmdbuf->state.gfx.desc_state,
@@ -1179,8 +1176,8 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
unsigned copy_desc_job_id =
draw->jobs.vertex_copy_desc.gpu
? pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
&draw->jobs.vertex_copy_desc, false)
? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false,
0, 0, &draw->jobs.vertex_copy_desc, false)
: 0;
bool vs_writes_pos =
@@ -1199,43 +1196,51 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
* tiler job doesn't execute the fragment shader, the fragment job
* will, and the tiler/fragment synchronization happens at the batch
* level. */
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
&draw->jobs.frag_copy_desc, false);
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
0, &draw->jobs.frag_copy_desc, false);
}
}
/* TODO: indexed draws */
draw->tls = batch->tls.gpu;
draw->fb = batch->fb.desc.gpu;
draw->push_uniforms = cmdbuf->state.gfx.push_uniforms;
panfrost_pack_work_groups_compute(&draw->invocation, 1, draw->vertex_range,
draw->instance_count, 1, 1, 1, true,
false);
panvk_draw_prepare_fs_rsd(cmdbuf, draw);
panvk_draw_prepare_varyings(cmdbuf, draw);
panvk_draw_prepare_attributes(cmdbuf, draw);
panvk_draw_prepare_viewport(cmdbuf, draw);
panvk_draw_prepare_tiler_context(cmdbuf, draw);
batch->tlsinfo.tls.size = MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0,
batch->tlsinfo.tls.size);
if (idvs) {
panvk_draw_prepare_idvs_job(cmdbuf, draw);
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0,
copy_desc_job_id, &draw->jobs.idvs, false);
} else {
panvk_draw_prepare_vertex_job(cmdbuf, draw);
for (uint32_t i = 0; i < layer_count; i++) {
draw->layer_id = i;
panvk_draw_prepare_varyings(cmdbuf, draw);
panvk_cmd_prepare_draw_sysvals(cmdbuf, draw);
cmdbuf->state.gfx.push_uniforms = panvk_cmd_prepare_push_uniforms(
&cmdbuf->desc_pool.base, &cmdbuf->state.push_constants,
&cmdbuf->state.gfx.sysvals, sizeof(cmdbuf->state.gfx.sysvals));
draw->push_uniforms = cmdbuf->state.gfx.push_uniforms;
panvk_draw_prepare_tiler_context(cmdbuf, draw);
unsigned vjob_id =
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_VERTEX, false, false, 0,
copy_desc_job_id, &draw->jobs.vertex, false);
if (idvs) {
panvk_draw_prepare_idvs_job(cmdbuf, draw);
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX, false,
false, 0, copy_desc_job_id, &draw->jobs.idvs, false);
} else {
panvk_draw_prepare_vertex_job(cmdbuf, draw);
if (needs_tiling) {
panvk_draw_prepare_tiler_job(cmdbuf, draw);
pan_jc_add_job(&batch->jc, MALI_JOB_TYPE_TILER, false, false, vjob_id,
0, &draw->jobs.tiler, false);
unsigned vjob_id =
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
0, copy_desc_job_id, &draw->jobs.vertex, false);
if (needs_tiling) {
panvk_draw_prepare_tiler_job(cmdbuf, draw);
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_TILER, false, false,
vjob_id, 0, &draw->jobs.tiler, false);
}
}
}
@@ -1417,6 +1422,7 @@ panvk_cmd_begin_rendering_init_state(struct panvk_cmd_buffer *cmdbuf,
sizeof(cmdbuf->state.gfx.render.color_attachments));
cmdbuf->state.gfx.render.bound_attachments = 0;
cmdbuf->state.gfx.render.layer_count = pRenderingInfo->layerCount;
*fbinfo = (struct pan_fb_info){
.tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
.nr_samples = 1,


@@ -63,7 +63,8 @@ panvk_add_wait_event_operation(struct panvk_cmd_buffer *cmdbuf,
/* Let's close the current batch so any future commands wait on the
* event signal operation.
*/
if (cmdbuf->cur_batch->fragment_job || cmdbuf->cur_batch->jc.first_job) {
if (cmdbuf->cur_batch->frag_jc.first_job ||
cmdbuf->cur_batch->vtc_jc.first_job) {
panvk_per_arch(cmd_close_batch)(cmdbuf);
panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf);
panvk_per_arch(cmd_open_batch)(cmdbuf);


@@ -120,6 +120,7 @@ panvk_meta_blit(struct panvk_cmd_buffer *cmdbuf,
}
panvk_per_arch(cmd_close_batch)(cmdbuf);
cmdbuf->state.gfx.render.layer_count = 1;
GENX(pan_blit_ctx_init)
(&dev->meta.blitter.cache, blitinfo, &cmdbuf->desc_pool.base, &ctx);
@@ -136,13 +137,13 @@ panvk_meta_blit(struct panvk_cmd_buffer *cmdbuf,
batch->blit.dst = dst_img->bo;
panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, 0);
tsd = batch->tls.gpu;
tiler = batch->tiler.ctx_desc.gpu;
tiler = batch->tiler.ctx_descs.gpu;
struct panfrost_ptr job =
GENX(pan_blit)(&ctx, &cmdbuf->desc_pool.base, &batch->jc, tsd, tiler);
struct panfrost_ptr job = GENX(pan_blit)(&ctx, &cmdbuf->desc_pool.base,
&batch->vtc_jc, tsd, tiler);
util_dynarray_append(&batch->jobs, void *, job.cpu);
panvk_per_arch(cmd_close_batch)(cmdbuf);
} while (pan_blit_next_surface(&ctx));


@@ -274,7 +274,8 @@ panvk_meta_clear_attachment(struct panvk_cmd_buffer *cmdbuf, unsigned rt,
panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
cmdbuf->state.gfx.render.layer_count = 1;
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, 0);
mali_ptr vpd = panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
minx, miny, maxx, maxy);
@@ -288,7 +289,7 @@ panvk_meta_clear_attachment(struct panvk_cmd_buffer *cmdbuf, unsigned rt,
enum glsl_base_type base_type = panvk_meta_get_format_type(pfmt);
mali_ptr tiler = batch->tiler.ctx_desc.gpu;
mali_ptr tiler = batch->tiler.ctx_descs.gpu;
mali_ptr tsd = batch->tls.gpu;
mali_ptr pushconsts = 0, rsd = 0;
@@ -311,8 +312,8 @@ panvk_meta_clear_attachment(struct panvk_cmd_buffer *cmdbuf, unsigned rt,
struct panfrost_ptr job;
job = panvk_meta_clear_attachment_emit_tiler_job(
&cmdbuf->desc_pool.base, &batch->jc, coordinates, pushconsts, vpd, rsd,
tsd, tiler);
&cmdbuf->desc_pool.base, &batch->vtc_jc, coordinates, pushconsts, vpd,
rsd, tsd, tiler);
util_dynarray_append(&batch->jobs, void *, job.cpu);
}
@@ -336,6 +337,7 @@ panvk_meta_clear_color_img(struct panvk_cmd_buffer *cmdbuf,
PIPE_SWIZZLE_W},
};
cmdbuf->state.gfx.render.layer_count = 1;
cmdbuf->state.gfx.render.fb.crc_valid[0] = false;
*fbinfo = (struct pan_fb_info){
.tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),


@@ -629,6 +629,7 @@ panvk_meta_copy_img2img(struct panvk_cmd_buffer *cmdbuf,
u_minify(dst->pimage.layout.width, region->dstSubresource.mipLevel);
unsigned height =
u_minify(dst->pimage.layout.height, region->dstSubresource.mipLevel);
cmdbuf->state.gfx.render.layer_count = 1;
cmdbuf->state.gfx.render.fb.crc_valid[0] = false;
*fbinfo = (struct pan_fb_info){
.tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
@@ -679,18 +680,18 @@ panvk_meta_copy_img2img(struct panvk_cmd_buffer *cmdbuf,
batch->blit.dst = dst->bo;
panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, 0);
mali_ptr tsd, tiler;
tsd = batch->tls.gpu;
tiler = batch->tiler.ctx_desc.gpu;
tiler = batch->tiler.ctx_descs.gpu;
struct panfrost_ptr job;
job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base, &batch->jc,
src_coords, dst_coords, texture,
sampler, 0, vpd, rsd, tsd, tiler);
job = panvk_meta_copy_emit_tiler_job(
&cmdbuf->desc_pool.base, &batch->vtc_jc, src_coords, dst_coords,
texture, sampler, 0, vpd, rsd, tsd, tiler);
util_dynarray_append(&batch->jobs, void *, job.cpu);
panvk_per_arch(cmd_close_batch)(cmdbuf);
@@ -1062,6 +1063,7 @@ panvk_meta_copy_buf2img(struct panvk_cmd_buffer *cmdbuf,
};
/* TODO: don't force preloads of dst resources if unneeded */
cmdbuf->state.gfx.render.layer_count = 1;
cmdbuf->state.gfx.render.fb.crc_valid[0] = false;
*fbinfo = (struct pan_fb_info){
.tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
@@ -1119,18 +1121,18 @@ panvk_meta_copy_buf2img(struct panvk_cmd_buffer *cmdbuf,
batch->blit.dst = img->bo;
panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, 0);
mali_ptr tsd, tiler;
tsd = batch->tls.gpu;
tiler = batch->tiler.ctx_desc.gpu;
tiler = batch->tiler.ctx_descs.gpu;
struct panfrost_ptr job;
job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base, &batch->jc,
src_coords, dst_coords, 0, 0,
pushconsts, vpd, rsd, tsd, tiler);
job = panvk_meta_copy_emit_tiler_job(
&cmdbuf->desc_pool.base, &batch->vtc_jc, src_coords, dst_coords, 0, 0,
pushconsts, vpd, rsd, tsd, tiler);
util_dynarray_append(&batch->jobs, void *, job.cpu);
panvk_per_arch(cmd_close_batch)(cmdbuf);
@@ -1560,8 +1562,8 @@ panvk_meta_copy_img2buf(struct panvk_cmd_buffer *cmdbuf,
};
struct panfrost_ptr job = panvk_meta_copy_emit_compute_job(
&cmdbuf->desc_pool.base, &batch->jc, &num_wg, &wg_sz, texture, sampler,
pushconsts, rsd, tsd);
&cmdbuf->desc_pool.base, &batch->vtc_jc, &num_wg, &wg_sz, texture,
sampler, pushconsts, rsd, tsd);
util_dynarray_append(&batch->jobs, void *, job.cpu);
@@ -1727,8 +1729,8 @@ panvk_meta_copy_buf2buf(struct panvk_cmd_buffer *cmdbuf,
struct pan_compute_dim num_wg = {nblocks, 1, 1};
struct pan_compute_dim wg_sz = {1, 1, 1};
struct panfrost_ptr job = panvk_meta_copy_emit_compute_job(
&cmdbuf->desc_pool.base, &batch->jc, &num_wg, &wg_sz, 0, 0, pushconsts,
rsd, tsd);
&cmdbuf->desc_pool.base, &batch->vtc_jc, &num_wg, &wg_sz, 0, 0,
pushconsts, rsd, tsd);
util_dynarray_append(&batch->jobs, void *, job.cpu);
@@ -1875,8 +1877,8 @@ panvk_meta_fill_buf(struct panvk_cmd_buffer *cmdbuf,
struct pan_compute_dim num_wg = {nwords, 1, 1};
struct pan_compute_dim wg_sz = {1, 1, 1};
struct panfrost_ptr job = panvk_meta_copy_emit_compute_job(
&cmdbuf->desc_pool.base, &batch->jc, &num_wg, &wg_sz, 0, 0, pushconsts,
rsd, tsd);
&cmdbuf->desc_pool.base, &batch->vtc_jc, &num_wg, &wg_sz, 0, 0,
pushconsts, rsd, tsd);
util_dynarray_append(&batch->jobs, void *, job.cpu);
@@ -1925,8 +1927,8 @@ panvk_meta_update_buf(struct panvk_cmd_buffer *cmdbuf,
struct pan_compute_dim num_wg = {nblocks, 1, 1};
struct pan_compute_dim wg_sz = {1, 1, 1};
struct panfrost_ptr job = panvk_meta_copy_emit_compute_job(
&cmdbuf->desc_pool.base, &batch->jc, &num_wg, &wg_sz, 0, 0, pushconsts,
rsd, tsd);
&cmdbuf->desc_pool.base, &batch->vtc_jc, &num_wg, &wg_sz, 0, 0,
pushconsts, rsd, tsd);
util_dynarray_append(&batch->jobs, void *, job.cpu);


@@ -48,22 +48,25 @@ panvk_queue_submit_batch(struct panvk_queue *queue, struct panvk_batch *batch,
memset((*job), 0, 4 * 4);
/* Reset the tiler before re-issuing the batch */
if (batch->tiler.ctx_desc.cpu) {
if (batch->tiler.ctx_descs.cpu) {
memcpy(batch->tiler.heap_desc.cpu, &batch->tiler.heap_templ,
sizeof(batch->tiler.heap_templ));
memcpy(batch->tiler.ctx_desc.cpu, &batch->tiler.ctx_templ,
sizeof(batch->tiler.ctx_templ));
struct mali_tiler_context_packed *ctxs = batch->tiler.ctx_descs.cpu;
for (uint32_t i = 0; i < batch->fb.layer_count; i++)
memcpy(&ctxs[i], &batch->tiler.ctx_templ, sizeof(*ctxs));
}
}
if (batch->jc.first_job) {
if (batch->vtc_jc.first_job) {
struct drm_panfrost_submit submit = {
.bo_handles = (uintptr_t)bos,
.bo_handle_count = nr_bos,
.in_syncs = (uintptr_t)in_fences,
.in_sync_count = nr_in_fences,
.out_sync = queue->sync,
.jc = batch->jc.first_job,
.jc = batch->vtc_jc.first_job,
};
ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);
@@ -76,7 +79,7 @@ panvk_queue_submit_batch(struct panvk_queue *queue, struct panvk_batch *batch,
}
if (debug & PANVK_DEBUG_TRACE) {
pandecode_jc(dev->debug.decode_ctx, batch->jc.first_job,
pandecode_jc(dev->debug.decode_ctx, batch->vtc_jc.first_job,
phys_dev->kmod.props.gpu_prod_id);
}
@@ -88,16 +91,16 @@ panvk_queue_submit_batch(struct panvk_queue *queue, struct panvk_batch *batch,
phys_dev->kmod.props.gpu_prod_id);
}
if (batch->fragment_job) {
if (batch->frag_jc.first_job) {
struct drm_panfrost_submit submit = {
.bo_handles = (uintptr_t)bos,
.bo_handle_count = nr_bos,
.out_sync = queue->sync,
.jc = batch->fragment_job,
.jc = batch->frag_jc.first_job,
.requirements = PANFROST_JD_REQ_FS,
};
if (batch->jc.first_job) {
if (batch->vtc_jc.first_job) {
submit.in_syncs = (uintptr_t)(&queue->sync);
submit.in_sync_count = 1;
} else {
@@ -114,7 +117,7 @@ panvk_queue_submit_batch(struct panvk_queue *queue, struct panvk_batch *batch,
}
if (debug & PANVK_DEBUG_TRACE)
pandecode_jc(dev->debug.decode_ctx, batch->fragment_job,
pandecode_jc(dev->debug.decode_ctx, batch->frag_jc.first_job,
phys_dev->kmod.props.gpu_prod_id);
if (debug & PANVK_DEBUG_DUMP)
@@ -236,7 +239,7 @@ panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
panvk_pool_num_bos(&cmdbuf->tls_pool) +
batch->fb.bo_count + (batch->blit.src ? 1 : 0) +
(batch->blit.dst ? 1 : 0) +
(batch->jc.first_tiler ? 1 : 0) + 1;
(batch->vtc_jc.first_tiler ? 1 : 0) + 1;
unsigned bo_idx = 0;
uint32_t bos[nr_bos];
@@ -258,7 +261,7 @@ panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
if (batch->blit.dst)
bos[bo_idx++] = pan_kmod_bo_handle(batch->blit.dst);
if (batch->jc.first_tiler)
if (batch->vtc_jc.first_tiler)
bos[bo_idx++] = pan_kmod_bo_handle(dev->tiler_heap->bo);
bos[bo_idx++] = pan_kmod_bo_handle(dev->sample_positions->bo);


@@ -57,6 +57,11 @@ struct panvk_graphics_sysvals {
} vs;
#if PAN_ARCH <= 7
/* gl_Layer on Bifrost is a bit of a hack. We have to issue one draw per
* layer, and filter primitives at the VS level.
*/
int32_t layer_id;
struct {
uint64_t sets[MAX_SETS];
uint64_t vs_dyn_ssbos;


@@ -58,14 +58,13 @@
#include "vk_util.h"
static nir_def *
load_sysval_from_push_const(nir_builder *b, nir_intrinsic_instr *intr,
unsigned offset)
load_sysval_from_push_const(nir_builder *b, unsigned offset, unsigned bit_size,
unsigned num_comps)
{
return nir_load_push_constant(
b, intr->def.num_components, intr->def.bit_size, nir_imm_int(b, 0),
b, num_comps, bit_size, nir_imm_int(b, 0),
/* Push constants are placed first, and then come the sysvals. */
.base = offset + 256,
.range = intr->def.num_components * intr->def.bit_size / 8);
.base = offset + 256, .range = num_comps * bit_size / 8);
}
static bool
@@ -75,53 +74,54 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
unsigned num_comps = intr->def.num_components;
unsigned bit_size = intr->def.bit_size;
nir_def *val = NULL;
b->cursor = nir_before_instr(instr);
#define SYSVAL(ptype, name) offsetof(struct panvk_##ptype##_sysvals, name)
switch (intr->intrinsic) {
case nir_intrinsic_load_base_workgroup_id:
val =
load_sysval_from_push_const(b, intr, SYSVAL(compute, base));
val = load_sysval_from_push_const(b, SYSVAL(compute, base), bit_size,
num_comps);
break;
case nir_intrinsic_load_num_workgroups:
val =
load_sysval_from_push_const(b, intr, SYSVAL(compute, num_work_groups));
val = load_sysval_from_push_const(b, SYSVAL(compute, num_work_groups),
bit_size, num_comps);
break;
case nir_intrinsic_load_workgroup_size:
val = load_sysval_from_push_const(b, intr,
SYSVAL(compute, local_group_size));
val = load_sysval_from_push_const(b, SYSVAL(compute, local_group_size),
bit_size, num_comps);
break;
case nir_intrinsic_load_viewport_scale:
val =
load_sysval_from_push_const(b, intr, SYSVAL(graphics, viewport.scale));
val = load_sysval_from_push_const(b, SYSVAL(graphics, viewport.scale),
bit_size, num_comps);
break;
case nir_intrinsic_load_viewport_offset:
val = load_sysval_from_push_const(b, intr,
SYSVAL(graphics, viewport.offset));
val = load_sysval_from_push_const(b, SYSVAL(graphics, viewport.offset),
bit_size, num_comps);
break;
case nir_intrinsic_load_first_vertex:
val = load_sysval_from_push_const(b, intr,
SYSVAL(graphics, vs.first_vertex));
val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.first_vertex),
bit_size, num_comps);
break;
case nir_intrinsic_load_base_vertex:
val =
load_sysval_from_push_const(b, intr, SYSVAL(graphics, vs.base_vertex));
val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.base_vertex),
bit_size, num_comps);
break;
case nir_intrinsic_load_base_instance:
val = load_sysval_from_push_const(b, intr,
SYSVAL(graphics, vs.base_instance));
val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.base_instance),
bit_size, num_comps);
break;
case nir_intrinsic_load_blend_const_color_rgba:
val = load_sysval_from_push_const(b, intr,
SYSVAL(graphics, blend.constants));
val = load_sysval_from_push_const(b, SYSVAL(graphics, blend.constants),
bit_size, num_comps);
break;
case nir_intrinsic_load_layer_id:
/* We don't support layered rendering yet, so force the layer_id to
* zero for now.
*/
val = nir_imm_int(b, 0);
assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
val = load_sysval_from_push_const(b, SYSVAL(graphics, layer_id), bit_size,
num_comps);
break;
default:
@@ -134,6 +134,82 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
return true;
}
static bool
lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_copy_deref)
return false;
nir_variable *dst_var = nir_intrinsic_get_var(intr, 0);
nir_variable *src_var = nir_intrinsic_get_var(intr, 1);
if (!dst_var || dst_var->data.mode != nir_var_shader_out || !src_var ||
src_var->data.mode != nir_var_shader_temp)
return false;
if (dst_var->data.location == VARYING_SLOT_LAYER) {
/* We don't really write the layer; we just make sure primitives are
* discarded if gl_Layer doesn't match the layer passed to the draw.
*/
b->cursor = nir_instr_remove(instr);
return true;
}
if (dst_var->data.location == VARYING_SLOT_POS) {
nir_variable *temp_layer_var = data;
nir_variable *temp_pos_var = src_var;
b->cursor = nir_before_instr(instr);
nir_def *layer = nir_load_var(b, temp_layer_var);
nir_def *pos = nir_load_var(b, temp_pos_var);
nir_def *inf_pos = nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 1.0f);
nir_def *ref_layer = load_sysval_from_push_const(
b, offsetof(struct panvk_graphics_sysvals, layer_id), 32, 1);
nir_store_var(b, temp_pos_var,
nir_bcsel(b, nir_ieq(b, layer, ref_layer), pos, inf_pos),
0xf);
return true;
}
return false;
}
static bool
lower_layer_writes(nir_shader *nir)
{
if (nir->info.stage == MESA_SHADER_FRAGMENT)
return false;
nir_variable *temp_layer_var = NULL;
bool has_layer_var = false;
nir_foreach_variable_with_modes(var, nir,
nir_var_shader_out | nir_var_shader_temp) {
if (var->data.mode == nir_var_shader_out &&
var->data.location == VARYING_SLOT_LAYER)
has_layer_var = true;
if (var->data.mode == nir_var_shader_temp &&
var->data.location == VARYING_SLOT_LAYER)
temp_layer_var = var;
}
if (!has_layer_var)
return false;
assert(temp_layer_var);
return nir_shader_instructions_pass(
nir, lower_gl_pos_layer_writes,
nir_metadata_block_index | nir_metadata_dominance, temp_layer_var);
}
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
@@ -202,6 +278,10 @@ panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir)
NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
true, true);
/* This needs to be done just after the io_to_temporaries pass, because we
* rely on in/out temporaries to collect the final layer_id value. */
NIR_PASS_V(nir, lower_layer_writes);
NIR_PASS_V(nir, nir_lower_indirect_derefs,
nir_var_shader_in | nir_var_shader_out, UINT32_MAX);