From 0bc3502ca36b7b5f244e5de34813f46ce2ff4a1b Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 24 Sep 2024 15:30:36 +0200 Subject: [PATCH] panvk: Implement a custom FB preload logic This has several advantages over using pan_blitter for that: - we can catch allocation failures and flag the command buffer invalid - we can re-use the vk_meta_device object list to keep track of our preload shaders - we can re-use surface descriptors instead of re-emitting them every time a preload is done Signed-off-by: Boris Brezillon Acked-by: Eric R. Smith Reviewed-by: Mary Guillemard Part-of: --- src/panfrost/ci/panfrost-g52-fails.txt | 3 - src/panfrost/lib/pan_desc.c | 10 + src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c | 1 - src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 28 +- src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c | 20 +- src/panfrost/vulkan/meson.build | 1 + src/panfrost/vulkan/panvk_cmd_fb_preload.h | 13 + src/panfrost/vulkan/panvk_device.h | 17 - src/panfrost/vulkan/panvk_meta.h | 1 + src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c | 750 ++++++++++++++++++ src/panfrost/vulkan/panvk_vX_device.c | 51 +- 11 files changed, 792 insertions(+), 103 deletions(-) create mode 100644 src/panfrost/vulkan/panvk_cmd_fb_preload.h create mode 100644 src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c diff --git a/src/panfrost/ci/panfrost-g52-fails.txt b/src/panfrost/ci/panfrost-g52-fails.txt index 54f66d610aa..85104e1c3ba 100644 --- a/src/panfrost/ci/panfrost-g52-fails.txt +++ b/src/panfrost/ci/panfrost-g52-fails.txt @@ -2621,11 +2621,8 @@ dEQP-VK.pipeline.pipeline_library.depth.nocolor.format.x8_d24_unorm_pack32.compa spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount-multithread,Crash # physical device and device needs more robustness in allocation handling -dEQP-VK.api.object_management.alloc_callback_fail.device,Crash -dEQP-VK.api.object_management.alloc_callback_fail.device_group,Crash dEQP-VK.api.object_management.max_concurrent.device,Fail dEQP-VK.api.object_management.max_concurrent.device_group,Fail -dEQP-VK.api.device_init.create_instance_device_intentional_alloc_fail.basic,Crash # query pool not supported yet dEQP-VK.api.null_handle.destroy_query_pool,Crash diff --git a/src/panfrost/lib/pan_desc.c b/src/panfrost/lib/pan_desc.c index bac5186b171..81b6580ecbb 100644 --- a/src/panfrost/lib/pan_desc.c +++ b/src/panfrost/lib/pan_desc.c @@ -750,7 +750,17 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, force_clean_write); cfg.post_frame = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[2], force_clean_write); +#if PAN_ARCH <= 7 + /* On Bifrost, the layer_id is passed through a push_uniform, which forces + * us to have one pre/post DCD array per layer. */ + cfg.frame_shader_dcds = + fb->bifrost.pre_post.dcds.gpu + (layer_idx * 3 * pan_size(DRAW)); +#else + /* On Valhall, layer_id is passed through the framebuffer frame_arg, which + * is preloaded in r62, so we can use the same pre/post DCD array for all + * layers. */ cfg.frame_shader_dcds = fb->bifrost.pre_post.dcds.gpu; +#endif cfg.tiler = PAN_ARCH >= 9 ? 
tiler_ctx->valhall.desc : tiler_ctx->bifrost.desc; #endif diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c index 7979d0a4770..497b417cbe0 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c @@ -40,7 +40,6 @@ #include "panvk_physical_device.h" #include "panvk_priv_bo.h" -#include "pan_blitter.h" #include "pan_desc.h" #include "pan_encoder.h" #include "pan_props.h" diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 05e8624ca01..5bd8d37f4fd 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -16,6 +16,7 @@ #include "panvk_cmd_alloc.h" #include "panvk_cmd_buffer.h" #include "panvk_cmd_desc_state.h" +#include "panvk_cmd_fb_preload.h" #include "panvk_cmd_meta.h" #include "panvk_device.h" #include "panvk_entrypoints.h" @@ -1977,23 +1978,6 @@ resolve_attachments(struct panvk_cmd_buffer *cmdbuf) static uint8_t prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd) { - struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); - - memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0, - sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds)); - - if (cmdbuf->state.tls.desc.gpu) { - ASSERTED unsigned num_preload_jobs = - GENX(pan_preload_fb)(&dev->blitter.cache, &cmdbuf->desc_pool.base, - &cmdbuf->state.gfx.render.fb.info, layer, - cmdbuf->state.tls.desc.gpu, NULL); - - /* Valhall GPUs use pre frame DCDs to preload the FB content. We - * thus expect num_preload_jobs to be zero. - */ - assert(!num_preload_jobs); - } - struct pan_tiler_context tiler_ctx = { .valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC), }; @@ -2092,11 +2076,11 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf) vt_sync_addr); } -static void +static VkResult issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) { if (!cmdbuf->state.gfx.render.fbds.gpu) - return; + return VK_SUCCESS; struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; @@ -2132,6 +2116,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds; uint8_t fbd_flags = 0; + VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf); + if (result != VK_SUCCESS) + return result; + /* We prepare all FB descriptors upfront. 
*/ for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) { uint32_t new_fbd_flags = @@ -2280,6 +2268,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) memset(&cmdbuf->state.gfx.render.fbds, 0, sizeof(cmdbuf->state.gfx.render.fbds)); cmdbuf->state.gfx.render.tiler = 0; + + return VK_SUCCESS; } void diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c index d9e9d997ea7..9a30a4d5f63 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c @@ -32,6 +32,7 @@ #include "panvk_cmd_alloc.h" #include "panvk_cmd_buffer.h" #include "panvk_cmd_desc_state.h" +#include "panvk_cmd_fb_preload.h" #include "panvk_cmd_pool.h" #include "panvk_cmd_push_constant.h" #include "panvk_device.h" @@ -40,7 +41,6 @@ #include "panvk_physical_device.h" #include "panvk_priv_bo.h" -#include "pan_blitter.h" #include "pan_desc.h" #include "pan_encoder.h" #include "pan_props.h" @@ -140,22 +140,16 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf) panfrost_sample_positions_offset( pan_sample_pattern(fbinfo->nr_samples)); + if (batch->vtc_jc.first_tiler) { + VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf); + if (result != VK_SUCCESS) + return; + } + for (uint32_t i = 0; i < batch->fb.layer_count; i++) { VkResult result; mali_ptr fbd = batch->fb.desc.gpu + (batch->fb.desc_stride * i); - if (batch->vtc_jc.first_tiler) { - cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds.gpu = 0; - - ASSERTED unsigned num_preload_jobs = GENX(pan_preload_fb)( - &dev->blitter.cache, &cmdbuf->desc_pool.base, - &cmdbuf->state.gfx.render.fb.info, i, batch->tls.gpu, NULL); - - /* Bifrost GPUs use pre frame DCDs to preload the FB content. We - * thus expect num_preload_jobs to be zero. - */ - assert(!num_preload_jobs); - } result = panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, i); if (result != VK_SUCCESS) diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build index 6afbd57948b..10ff45a20c7 100644 --- a/src/panfrost/vulkan/meson.build +++ b/src/panfrost/vulkan/meson.build @@ -72,6 +72,7 @@ common_per_arch_files = [ panvk_entrypoints[0], 'panvk_vX_blend.c', 'panvk_vX_buffer_view.c', + 'panvk_vX_cmd_fb_preload.c', 'panvk_vX_cmd_desc_state.c', 'panvk_vX_cmd_meta.c', 'panvk_vX_cmd_push_constant.c', diff --git a/src/panfrost/vulkan/panvk_cmd_fb_preload.h b/src/panfrost/vulkan/panvk_cmd_fb_preload.h new file mode 100644 index 00000000000..4ff37edea0e --- /dev/null +++ b/src/panfrost/vulkan/panvk_cmd_fb_preload.h @@ -0,0 +1,13 @@ +/* + * Copyright © 2021 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +#ifndef PANVK_FB_PRELOAD_H +#define PANVK_FB_PRELOAD_H + +#include "panvk_cmd_buffer.h" + +VkResult panvk_per_arch(cmd_fb_preload)(struct panvk_cmd_buffer *cmdbuf); + +#endif diff --git a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h index 53798e7d791..052e62578d5 100644 --- a/src/panfrost/vulkan/panvk_device.h +++ b/src/panfrost/vulkan/panvk_device.h @@ -20,7 +20,6 @@ #include "kmod/pan_kmod.h" #include "util/pan_ir.h" -#include "pan_blitter.h" #include "util/vma.h" @@ -43,22 +42,6 @@ struct panvk_device { struct panvk_priv_bo *tiler_heap; struct panvk_priv_bo *sample_positions; - /* Access to the blitter pools are protected by the blitter - * shader/rsd locks. They can't be merged with other binary/desc - * pools unless we patch pan_blitter.c to support external pool locks. - * - * FIXME: The blitter infrastructure is only needed for FB preload. 
- * We should probably consider getting rid of the dependency we have - * on pan_desc.c and implement preload ourselves so we don't have - * to duplicate caches. - */ - struct { - struct panvk_pool bin_pool; - struct panvk_pool desc_pool; - struct pan_blitter_cache cache; - struct pan_blend_shader_cache blend_shader_cache; - } blitter; - struct vk_meta_device meta; struct { diff --git a/src/panfrost/vulkan/panvk_meta.h b/src/panfrost/vulkan/panvk_meta.h index 12a9e3414cc..3dd28ab1fc4 100644 --- a/src/panfrost/vulkan/panvk_meta.h +++ b/src/panfrost/vulkan/panvk_meta.h @@ -15,6 +15,7 @@ enum panvk_meta_object_key_type { PANVK_META_OBJECT_KEY_BLEND_SHADER = VK_META_OBJECT_KEY_DRIVER_OFFSET, PANVK_META_OBJECT_KEY_COPY_DESC_SHADER, + PANVK_META_OBJECT_KEY_FB_PRELOAD_SHADER, }; static inline VkFormat diff --git a/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c b/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c new file mode 100644 index 00000000000..b9fd55017d9 --- /dev/null +++ b/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c @@ -0,0 +1,750 @@ +/* + * Copyright © 2021 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "panvk_cmd_alloc.h" +#include "panvk_cmd_fb_preload.h" +#include "panvk_image_view.h" +#include "panvk_meta.h" +#include "panvk_shader.h" + +#include "nir_builder.h" + +#include "pan_shader.h" + +struct panvk_fb_preload_shader_key { + enum panvk_meta_object_key_type type; + VkImageViewType view_type; + VkSampleCountFlagBits samples; + VkImageAspectFlags aspects; + bool needs_layer_id; + struct { + nir_alu_type type; + } color[8]; +}; + +static nir_def * +texel_fetch(nir_builder *b, VkImageViewType view_type, + nir_alu_type reg_type, unsigned tex_idx, + nir_def *sample_id, nir_def *coords) +{ + nir_tex_instr *tex = nir_tex_instr_create(b->shader, sample_id ? 3 : 2); + + tex->op = sample_id ? nir_texop_txf_ms : nir_texop_txf; + tex->dest_type = reg_type; + tex->is_array = vk_image_view_type_is_array(view_type); + tex->sampler_dim = sample_id ? 
GLSL_SAMPLER_DIM_MS + : vk_image_view_type_to_sampler_dim(view_type); + tex->coord_components = coords->num_components; + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coords); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(b, 0)); + + if (sample_id) + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_id); + +#if PAN_ARCH <= 7 + tex->sampler_index = 0; + tex->texture_index = tex_idx; +#else + tex->sampler_index = pan_res_handle(0, 0); + tex->texture_index = pan_res_handle(0, tex_idx + 1); +#endif + + nir_def_init(&tex->instr, &tex->def, 4, 32); + nir_builder_instr_insert(b, &tex->instr); + + return &tex->def; +} + +static nir_variable * +color_output_var(nir_builder *b, VkImageViewType view_type, + VkImageAspectFlags aspect, VkSampleCountFlagBits samples, + nir_alu_type fmt_type, unsigned rt) +{ + enum glsl_base_type base_type = + nir_get_glsl_base_type_for_nir_type(fmt_type); + const struct glsl_type *var_type = glsl_vector_type(base_type, 4); + static const char *var_names[] = { + "gl_FragData[0]", "gl_FragData[1]", "gl_FragData[2]", "gl_FragData[3]", + "gl_FragData[4]", "gl_FragData[5]", "gl_FragData[6]", "gl_FragData[7]", + }; + + assert(rt < ARRAY_SIZE(var_names)); + + nir_variable *var = nir_variable_create(b->shader, nir_var_shader_out, + var_type, var_names[rt]); + var->data.location = FRAG_RESULT_DATA0 + rt; + + return var; +} + +static nir_def * +get_layer_id(nir_builder *b) +{ +#if PAN_ARCH <= 7 + return nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 0, + .range = 4); +#else + return nir_load_layer_id(b); +#endif +} + +static nir_shader * +get_preload_nir_shader(const struct panvk_fb_preload_shader_key *key) +{ + nir_builder builder = nir_builder_init_simple_shader( + MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(), + "panvk-meta-preload"); + nir_builder *b = &builder; + nir_def *sample_id = + key->samples != VK_SAMPLE_COUNT_1_BIT ? nir_load_sample_id(b) : NULL; + nir_def *coords = nir_u2u32(b, nir_load_pixel_coord(b)); + + if (key->view_type == VK_IMAGE_VIEW_TYPE_2D_ARRAY || + key->view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || + key->view_type == VK_IMAGE_VIEW_TYPE_CUBE || + key->view_type == VK_IMAGE_VIEW_TYPE_3D) { + coords = + nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1), + key->needs_layer_id ? get_layer_id(b) : nir_imm_int(b, 0)); + } + + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + for (uint32_t i = 0; i < ARRAY_SIZE(key->color); i++) { + if (key->color[i].type == nir_type_invalid) + continue; + + nir_def *texel = texel_fetch(b, key->view_type, key->color[i].type, i, + sample_id, coords); + + nir_store_output( + b, texel, nir_imm_int(b, 0), .base = i, + .src_type = key->color[i].type, + .io_semantics.location = FRAG_RESULT_DATA0 + i, + .io_semantics.num_slots = 1, + .write_mask = nir_component_mask(texel->num_components)); + } + } + + if (key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + nir_def *texel = texel_fetch(b, key->view_type, nir_type_float32, 0, + sample_id, coords); + + nir_store_output(b, nir_channel(b, texel, 0), nir_imm_int(b, 0), + .base = 0, .src_type = nir_type_float32, + .io_semantics.location = FRAG_RESULT_DEPTH, + .io_semantics.num_slots = 1, + .write_mask = nir_component_mask(1)); + } + + if (key->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + nir_def *texel = texel_fetch( + b, key->view_type, nir_type_uint32, + key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT ? 
1 : 0, sample_id, coords); + + nir_store_output(b, nir_channel(b, texel, 0), nir_imm_int(b, 0), + .base = 0, .src_type = nir_type_uint32, + .io_semantics.location = FRAG_RESULT_STENCIL, + .io_semantics.num_slots = 1, + .write_mask = nir_component_mask(1)); + } + + return b->shader; +} + +static VkResult +get_preload_shader(struct panvk_device *dev, + const struct panvk_fb_preload_shader_key *key, + struct panvk_internal_shader **shader_out) +{ + struct panvk_physical_device *phys_dev = + to_panvk_physical_device(dev->vk.physical); + struct panvk_internal_shader *shader; + VkShaderEXT shader_handle = (VkShaderEXT)vk_meta_lookup_object( + &dev->meta, VK_OBJECT_TYPE_SHADER_EXT, key, sizeof(*key)); + if (shader_handle != VK_NULL_HANDLE) + goto out; + + nir_shader *nir = get_preload_nir_shader(key); + + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + struct panfrost_compile_inputs inputs = { + .gpu_id = phys_dev->kmod.props.gpu_prod_id, + .no_ubo_to_push = true, + }; + + pan_shader_preprocess(nir, inputs.gpu_id); + + VkResult result = panvk_per_arch(create_internal_shader)( + dev, nir, &inputs, &shader); + if (result != VK_SUCCESS) + return result; + +#if PAN_ARCH >= 9 + shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM); + if (!panvk_priv_mem_host_addr(shader->spd)) { + vk_shader_destroy(&dev->vk, &shader->vk, NULL); + return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + pan_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_FRAGMENT; + cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; + cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem); + cfg.preload.r48_r63 = shader->info.preload >> 48; + } +#endif + + shader_handle = (VkShaderEXT)vk_meta_cache_object( + &dev->vk, &dev->meta, key, sizeof(*key), VK_OBJECT_TYPE_SHADER_EXT, + (uint64_t)panvk_internal_shader_to_handle(shader)); + +out: + shader = panvk_internal_shader_from_handle(shader_handle); + *shader_out = shader; + return VK_SUCCESS; +} + +static VkResult +alloc_pre_post_dcds(struct panvk_cmd_buffer *cmdbuf) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + + if (fbinfo->bifrost.pre_post.dcds.gpu) + return VK_SUCCESS; + + uint32_t dcd_count = + 3 * (PAN_ARCH <= 7 ? 
cmdbuf->state.gfx.render.layer_count : 1); + + fbinfo->bifrost.pre_post.dcds = panvk_cmd_alloc_desc_array(cmdbuf, dcd_count, DRAW); + if (!fbinfo->bifrost.pre_post.dcds.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + return VK_SUCCESS; +} + +static enum mali_register_file_format +get_reg_fmt(nir_alu_type type) +{ + switch (type) { + case nir_type_float32: + return MALI_REGISTER_FILE_FORMAT_F32; + case nir_type_uint32: + return MALI_REGISTER_FILE_FORMAT_U32; + case nir_type_int32: + return MALI_REGISTER_FILE_FORMAT_I32; + default: + assert(!"Invalid reg type"); + return MALI_REGISTER_FILE_FORMAT_F32; + } +} + +static void +fill_textures(struct panvk_cmd_buffer *cmdbuf, + const struct panvk_fb_preload_shader_key *key, + struct mali_texture_packed *textures) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + for (unsigned i = 0; i < fbinfo->rt_count; i++) { + struct panvk_image_view *iview = + cmdbuf->state.gfx.render.color_attachments.iviews[i]; + + if (iview) + textures[i] = iview->descs.tex; + else + textures[i] = (struct mali_texture_packed){0}; + } + return; + } + + uint32_t idx = 0; + if (key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + struct panvk_image_view *iview = + cmdbuf->state.gfx.render.z_attachment.iview + ?: cmdbuf->state.gfx.render.s_attachment.iview; + + textures[idx++] = vk_format_has_depth(iview->vk.view_format) + ? iview->descs.tex + : iview->descs.other_aspect_tex; + } + + if (key->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + struct panvk_image_view *iview = + cmdbuf->state.gfx.render.s_attachment.iview + ?: cmdbuf->state.gfx.render.z_attachment.iview; + + textures[idx++] = vk_format_has_depth(iview->vk.view_format) + ? iview->descs.other_aspect_tex + : iview->descs.tex; + } +} + +static void +fill_bds(struct panvk_cmd_buffer *cmdbuf, + const struct panvk_fb_preload_shader_key *key, + struct mali_blend_packed *bds) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + uint32_t bd_count = MAX2(fbinfo->rt_count, 1); + + for (unsigned i = 0; i < bd_count; i++) { + const struct pan_image_view *pview = + fbinfo->rts[i].preload ? 
fbinfo->rts[i].view : NULL; + + pan_pack(&bds[i], BLEND, cfg) { + if (key->aspects != VK_IMAGE_ASPECT_COLOR_BIT || !pview) { + cfg.enable = false; + cfg.internal.mode = MALI_BLEND_MODE_OFF; + continue; + } + + cfg.round_to_fb_precision = true; + cfg.srgb = util_format_is_srgb(pview->format); + cfg.internal.mode = MALI_BLEND_MODE_OPAQUE; + cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; + cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; + cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; + cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; + cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.equation.color_mask = 0xf; + + cfg.internal.fixed_function.num_comps = 4; + cfg.internal.fixed_function.conversion.memory_format = GENX( + panfrost_dithered_format_from_pipe_format)(pview->format, false); + cfg.internal.fixed_function.rt = i; +#if PAN_ARCH <= 7 + cfg.internal.fixed_function.conversion.register_format = + get_reg_fmt(key->color[i].type); +#endif + } + } +} + +#if PAN_ARCH <= 7 +static VkResult +cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, + const struct panvk_fb_preload_shader_key *key) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + struct panvk_internal_shader *shader = NULL; + + VkResult result = get_preload_shader(dev, key, &shader); + if (result != VK_SUCCESS) + return result; + + uint32_t tex_count = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT + ? fbinfo->rt_count + : util_bitcount(key->aspects); + uint32_t bd_count = MAX2(fbinfo->rt_count, 1); + + struct panfrost_ptr rsd = panvk_cmd_alloc_desc_aggregate( + cmdbuf, PAN_DESC(RENDERER_STATE), + PAN_DESC_ARRAY(bd_count, BLEND)); + if (!rsd.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + pan_pack(rsd.cpu, RENDERER_STATE, cfg) { + pan_shader_prepare_rsd(&shader->info, + panvk_priv_mem_dev_addr(shader->code_mem), &cfg); + + cfg.shader.texture_count = tex_count; + cfg.shader.sampler_count = 1; + + cfg.multisample_misc.sample_mask = 0xFFFF; + cfg.multisample_misc.multisample_enable = key->samples > 1; + cfg.multisample_misc.evaluate_per_sample = key->samples > 1; + + cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS; + cfg.multisample_misc.depth_write_mask = + (key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) != 0; + + cfg.stencil_mask_misc.stencil_enable = + (key->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) != 0; + cfg.stencil_mask_misc.stencil_mask_front = 0xFF; + cfg.stencil_mask_misc.stencil_mask_back = 0xFF; + cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS; + cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE; + cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE; + cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE; + cfg.stencil_front.mask = 0xFF; + + cfg.stencil_back = cfg.stencil_front; + + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + /* Skipping ATEST requires forcing Z/S */ + cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + } else { + /* Writing Z/S requires late updates */ + cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; + cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE; + } + + /* However, while shaders writing Z/S can normally be killed, on v6 + * for frame shaders it can cause GPU timeouts, so only allow colour + * blit shaders to be killed. 
*/ + cfg.properties.allow_forward_pixel_to_kill = + key->aspects == VK_IMAGE_ASPECT_COLOR_BIT; + + if (PAN_ARCH == 6) + cfg.properties.allow_forward_pixel_to_be_killed = + key->aspects == VK_IMAGE_ASPECT_COLOR_BIT; + } + + fill_bds(cmdbuf, key, rsd.cpu + pan_size(RENDERER_STATE)); + + struct panvk_batch *batch = cmdbuf->cur_batch; + uint16_t minx = 0, miny = 0, maxx, maxy; + + /* Align on 32x32 tiles */ + minx = fbinfo->extent.minx & ~31; + miny = fbinfo->extent.miny & ~31; + maxx = MIN2(ALIGN_POT(fbinfo->extent.maxx + 1, 32), fbinfo->width) - 1; + maxy = MIN2(ALIGN_POT(fbinfo->extent.maxy + 1, 32), fbinfo->height) - 1; + + struct panfrost_ptr vpd = panvk_cmd_alloc_desc(cmdbuf, VIEWPORT); + if (!vpd.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + pan_pack(vpd.cpu, VIEWPORT, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + + struct panfrost_ptr sampler = panvk_cmd_alloc_desc(cmdbuf, SAMPLER); + if (!sampler.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + pan_pack(sampler.cpu, SAMPLER, cfg) { + cfg.seamless_cube_map = false; + cfg.normalized_coordinates = false; + cfg.minify_nearest = true; + cfg.magnify_nearest = true; + } + + struct panfrost_ptr textures = + panvk_cmd_alloc_desc_array(cmdbuf, tex_count, TEXTURE); + if (!textures.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + fill_textures(cmdbuf, key, textures.cpu); + + result = alloc_pre_post_dcds(cmdbuf); + if (result != VK_SUCCESS) + return result; + + struct mali_draw_packed dcd_base; + + pan_pack(&dcd_base, DRAW, cfg) { + cfg.thread_storage = batch->tls.gpu; + cfg.state = rsd.gpu; + + cfg.viewport = vpd.gpu; + + cfg.textures = textures.gpu; + cfg.samplers = sampler.gpu; + +#if PAN_ARCH >= 6 + /* Until we decide to support FB CRC, we can consider that untouched tiles + * should never be written back. */ + cfg.clean_fragment_write = false; +#endif + } + + struct mali_draw_packed *dcds = fbinfo->bifrost.pre_post.dcds.cpu; + uint32_t dcd_idx = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT ? 0 : 1; + + if (key->needs_layer_id) { + struct panfrost_ptr layer_ids = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, + cmdbuf->state.gfx.render.layer_count * sizeof(uint64_t), + sizeof(uint64_t)); + uint32_t *layer_id = layer_ids.cpu; + + for (uint32_t l = 0; l < cmdbuf->state.gfx.render.layer_count; l++) { + struct mali_draw_packed dcd_layer; + + /* Push uniform pointer has to be 8-byte aligned, so we have to skip + * odd layer_id entries. */ + layer_id[2 * l] = l; + pan_pack(&dcd_layer, DRAW, cfg) { + cfg.push_uniforms = layer_ids.gpu + (sizeof(uint64_t) * l); + }; + + pan_merge(dcd_layer, dcd_base, DRAW); + dcds[(l * 3) + dcd_idx] = dcd_layer; + } + } else { + dcds[dcd_idx] = dcd_base; + } + + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + fbinfo->bifrost.pre_post.modes[dcd_idx] = + MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; + } else { + enum pipe_format fmt = fbinfo->zs.view.zs + ? fbinfo->zs.view.zs->planes[0]->layout.format + : fbinfo->zs.view.s->planes[0]->layout.format; + bool always = false; + + /* If we're dealing with a combined ZS resource and only one + * component is cleared, we need to reload the whole surface + * because the zs_clean_pixel_write_enable flag is set in that + * case. 
+ */ + if (util_format_is_depth_and_stencil(fmt) && + fbinfo->zs.clear.z != fbinfo->zs.clear.s) + always = true; + + /* We could use INTERSECT on Bifrost v7 too, but + * EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile + * buffer one or more tiles ahead, making ZS data immediately + * available for any ZS tests taking place in other shaders. + * Thing's haven't been benchmarked to determine what's + * preferable (saving bandwidth vs having ZS preloaded + * earlier), so let's leave it like that for now. + */ + fbinfo->bifrost.pre_post.modes[dcd_idx] = + PAN_ARCH > 6 + ? MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS + : always ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS + : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; + } + + return VK_SUCCESS; +} +#else +static VkResult +cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, + struct panvk_fb_preload_shader_key *key) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + struct panvk_internal_shader *shader = NULL; + + VkResult result = get_preload_shader(dev, key, &shader); + if (result != VK_SUCCESS) + return result; + + uint32_t bd_count = + key->aspects == VK_IMAGE_ASPECT_COLOR_BIT ? fbinfo->rt_count : 0; + struct panfrost_ptr bds = + panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND); + if (bd_count > 0 && !bds.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + uint32_t tex_count = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT + ? fbinfo->rt_count + : util_bitcount(key->aspects); + uint32_t desc_count = tex_count + 1; + + struct panfrost_ptr descs = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE); + if (!descs.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + struct mali_sampler_packed *sampler = descs.cpu; + + pan_pack(sampler, SAMPLER, cfg) { + cfg.seamless_cube_map = false; + cfg.normalized_coordinates = false; + cfg.minify_nearest = true; + cfg.magnify_nearest = true; + } + + fill_textures(cmdbuf, key, descs.cpu + PANVK_DESCRIPTOR_SIZE); + + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) + fill_bds(cmdbuf, key, bds.cpu); + + struct panfrost_ptr res_table = panvk_cmd_alloc_desc(cmdbuf, RESOURCE); + if (!res_table.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + pan_pack(res_table.cpu, RESOURCE, cfg) { + cfg.address = descs.gpu; + cfg.size = desc_count * PANVK_DESCRIPTOR_SIZE; + } + + struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL); + if (!zsd.cpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) { + cfg.depth_function = MALI_FUNC_ALWAYS; + cfg.depth_write_enable = fbinfo->zs.preload.z; + + if (fbinfo->zs.preload.z) + cfg.depth_source = MALI_DEPTH_SOURCE_SHADER; + + cfg.stencil_test_enable = fbinfo->zs.preload.s; + cfg.stencil_from_shader = fbinfo->zs.preload.s; + + cfg.front_compare_function = MALI_FUNC_ALWAYS; + cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE; + cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE; + cfg.front_depth_pass = MALI_STENCIL_OP_REPLACE; + cfg.front_write_mask = 0xFF; + cfg.front_value_mask = 0xFF; + + cfg.back_compare_function = MALI_FUNC_ALWAYS; + cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE; + cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE; + cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE; + cfg.back_write_mask = 0xFF; + cfg.back_value_mask = 0xFF; + + cfg.depth_cull_enable = false; + } + + result = alloc_pre_post_dcds(cmdbuf); + if (result != VK_SUCCESS) + return result; + + struct mali_draw_packed *dcds = 
fbinfo->bifrost.pre_post.dcds.cpu; + uint32_t dcd_idx = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT ? 0 : 1; + + pan_pack(&dcds[dcd_idx], DRAW, cfg) { + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + /* Skipping ATEST requires forcing Z/S */ + cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + + cfg.blend = bds.gpu; + cfg.blend_count = bd_count; + cfg.render_target_mask = cmdbuf->state.gfx.render.bound_attachments; + } else { + /* ZS_EMIT requires late update/kill */ + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE; + cfg.blend_count = 0; + } + + cfg.allow_forward_pixel_to_kill = + key->aspects == VK_IMAGE_ASPECT_COLOR_BIT; + cfg.allow_forward_pixel_to_be_killed = true; + cfg.depth_stencil = zsd.gpu; + cfg.sample_mask = 0xFFFF; + cfg.multisample_enable = key->samples > 1; + cfg.evaluate_per_sample = key->samples > 1; + cfg.maximum_z = 1.0; + cfg.clean_fragment_write = false; + cfg.shader.resources = res_table.gpu | 1; + cfg.shader.shader = panvk_priv_mem_dev_addr(shader->spd); + cfg.shader.thread_storage = cmdbuf->state.gfx.tsd; + } + + if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + fbinfo->bifrost.pre_post.modes[dcd_idx] = + MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; + } else { + /* We could use INTERSECT on Valhall too, but + * EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile + * buffer one or more tiles ahead, making ZS data immediately + * available for any ZS tests taking place in other shaders. + * Thing's haven't been benchmarked to determine what's + * preferable (saving bandwidth vs having ZS preloaded + * earlier), so let's leave it like that for now. + */ + fbinfo->bifrost.pre_post.modes[dcd_idx] = + MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS; + } + + return VK_SUCCESS; +} +#endif + +static VkResult +cmd_preload_zs_attachments(struct panvk_cmd_buffer *cmdbuf) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + + if (!fbinfo->zs.preload.s && !fbinfo->zs.preload.z) + return VK_SUCCESS; + + struct panvk_fb_preload_shader_key key = { + .type = PANVK_META_OBJECT_KEY_FB_PRELOAD_SHADER, + .samples = fbinfo->nr_samples, + .needs_layer_id = cmdbuf->state.gfx.render.layer_count > 1, + }; + + if (fbinfo->zs.preload.z) { + key.aspects = VK_IMAGE_ASPECT_DEPTH_BIT; + key.view_type = + cmdbuf->state.gfx.render.z_attachment.iview + ? cmdbuf->state.gfx.render.z_attachment.iview->vk.view_type + : cmdbuf->state.gfx.render.s_attachment.iview->vk.view_type; + } + + if (fbinfo->zs.preload.s) { + VkImageViewType view_type = + cmdbuf->state.gfx.render.s_attachment.iview + ? 
cmdbuf->state.gfx.render.s_attachment.iview->vk.view_type + : cmdbuf->state.gfx.render.z_attachment.iview->vk.view_type; + + key.aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + if (!fbinfo->zs.preload.z) + key.view_type = view_type; + + assert(key.view_type == view_type); + } + + return cmd_emit_dcd(cmdbuf, &key); +} + +static VkResult +cmd_preload_color_attachments(struct panvk_cmd_buffer *cmdbuf) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + struct panvk_fb_preload_shader_key key = { + .type = PANVK_META_OBJECT_KEY_FB_PRELOAD_SHADER, + .samples = fbinfo->nr_samples, + .needs_layer_id = cmdbuf->state.gfx.render.layer_count > 1, + .aspects = VK_IMAGE_ASPECT_COLOR_BIT, + }; + bool needs_preload = false; + + for (uint32_t i = 0; i < fbinfo->rt_count; i++) { + if (!fbinfo->rts[i].preload) + continue; + + enum pipe_format pfmt = fbinfo->rts[i].view->format; + struct panvk_image_view *iview = + cmdbuf->state.gfx.render.color_attachments.iviews[i]; + + key.color[i].type = util_format_is_pure_uint(pfmt) ? nir_type_uint32 + : util_format_is_pure_sint(pfmt) ? nir_type_int32 + : nir_type_float32; + + if (!needs_preload) { + key.view_type = iview->vk.view_type; + needs_preload = true; + } + + assert(key.view_type == iview->vk.view_type); + } + + if (!needs_preload) + return VK_SUCCESS; + + return cmd_emit_dcd(cmdbuf, &key); +} + +VkResult +panvk_per_arch(cmd_fb_preload)(struct panvk_cmd_buffer *cmdbuf) +{ + VkResult result = cmd_preload_color_attachments(cmdbuf); + if (result != VK_SUCCESS) + return result; + + return cmd_preload_zs_attachments(cmdbuf); +} diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c index c5cd9f31039..b178fc9a3e0 100644 --- a/src/panfrost/vulkan/panvk_vX_device.c +++ b/src/panfrost/vulkan/panvk_vX_device.c @@ -150,49 +150,6 @@ panvk_meta_cleanup(struct panvk_device *device) vk_meta_device_finish(&device->vk, &device->meta); } -static void -panvk_preload_blitter_init(struct panvk_device *device) -{ - const struct panvk_physical_device *physical_device = - to_panvk_physical_device(device->vk.physical); - - struct panvk_pool_properties bin_pool_props = { - .create_flags = PAN_KMOD_BO_FLAG_EXECUTABLE, - .slab_size = 16 * 1024, - .label = "panvk_meta blitter binary pool", - .owns_bos = true, - .needs_locking = false, - .prealloc = false, - }; - panvk_pool_init(&device->blitter.bin_pool, device, NULL, &bin_pool_props); - - struct panvk_pool_properties desc_pool_props = { - .create_flags = 0, - .slab_size = 16 * 1024, - .label = "panvk_meta blitter descriptor pool", - .owns_bos = true, - .needs_locking = false, - .prealloc = false, - }; - panvk_pool_init(&device->blitter.desc_pool, device, NULL, &desc_pool_props); - - pan_blend_shader_cache_init(&device->blitter.blend_shader_cache, - physical_device->kmod.props.gpu_prod_id); - GENX(pan_blitter_cache_init) - (&device->blitter.cache, physical_device->kmod.props.gpu_prod_id, - &device->blitter.blend_shader_cache, &device->blitter.bin_pool.base, - &device->blitter.desc_pool.base); -} - -static void -panvk_preload_blitter_cleanup(struct panvk_device *device) -{ - GENX(pan_blitter_cache_cleanup)(&device->blitter.cache); - pan_blend_shader_cache_cleanup(&device->blitter.blend_shader_cache); - panvk_pool_cleanup(&device->blitter.desc_pool); - panvk_pool_cleanup(&device->blitter.bin_pool); -} - /* Always reserve the lower 32MB. 
*/ #define PANVK_VA_RESERVE_BOTTOM 0x2000000ull @@ -311,11 +268,9 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device, vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd); - panvk_preload_blitter_init(device); - result = panvk_meta_init(device); if (result != VK_SUCCESS) - goto err_cleanup_blitter; + goto err_free_priv_bos; for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { const VkDeviceQueueCreateInfo *queue_create = @@ -356,9 +311,6 @@ err_finish_queues: panvk_meta_cleanup(device); -err_cleanup_blitter: - panvk_preload_blitter_cleanup(device); - err_free_priv_bos: panvk_priv_bo_unref(device->sample_positions); panvk_priv_bo_unref(device->tiler_heap); @@ -393,7 +345,6 @@ panvk_per_arch(destroy_device)(struct panvk_device *device, } panvk_meta_cleanup(device); - panvk_preload_blitter_cleanup(device); panvk_priv_bo_unref(device->tiler_heap); panvk_priv_bo_unref(device->sample_positions); panvk_device_cleanup_mempools(device);
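
Illustrative note, not part of the patch: the pre/post DCD layout that the pan_desc.c hunk and the new panvk_vX_cmd_fb_preload.c code agree on can be sketched as below. The helper name dcd_slot_index() and the PANVK_DCD_SLOT_* constants are invented for this example; the layout itself (one triplet of DRAW descriptors per layer on Bifrost, a single shared triplet on Valhall) is what alloc_pre_post_dcds() allocates and what GENX(pan_emit_fbd)() indexes with layer_idx * 3 * pan_size(DRAW).

#include <stdint.h>

/* Slots within one pre/post DCD triplet: 0 is used for the pre-frame color
 * preload DCD, 1 for the pre-frame Z/S preload DCD, and 2 is the post-frame
 * DCD (programmed via fbinfo->bifrost.pre_post.modes[2] / cfg.post_frame). */
enum {
   PANVK_DCD_SLOT_PRE_COLOR = 0,
   PANVK_DCD_SLOT_PRE_ZS = 1,
   PANVK_DCD_SLOT_POST = 2,
};

/* Index of a DCD inside the array allocated by alloc_pre_post_dcds().
 * On Bifrost (PAN_ARCH <= 7) the layer ID reaches the preload shader through
 * push uniforms, so each layer needs its own triplet and the FBD points at
 * dcds.gpu + layer_idx * 3 * pan_size(DRAW). On Valhall (PAN_ARCH >= 9) the
 * layer ID comes from the framebuffer frame_arg preloaded in r62, so all
 * layers share a single triplet. */
static inline uint32_t
dcd_slot_index(unsigned pan_arch, uint32_t layer_idx, uint32_t slot)
{
   return pan_arch <= 7 ? (layer_idx * 3) + slot : slot;
}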