panvk: Implement custom FB preload logic

This has several advantages over using pan_blitter for FB preloads:

- we can catch allocation failures and flag the command buffer invalid
- we can re-use the vk_meta_device object list to keep track of our
  preload shaders
- we can re-use surface descriptors instead of re-emitting them every
  time a preload is done

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31441>
commit 0bc3502ca3 (parent 607e517a11)
Author: Boris Brezillon
Date: 2024-09-24 15:30:36 +02:00
Committed by: Marge Bot
11 changed files with 792 additions and 103 deletions
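
The first bullet is the key behavioural change: descriptors that used to be
allocated from pan_blitter's internal pools are now allocated with
panvk_cmd_alloc_*, whose failures are caught and turned into a VkResult
instead of crashing. A minimal sketch of the pattern, with emit_something()
as a made-up stand-in for the emission helpers added by this commit:

static VkResult
emit_something(struct panvk_cmd_buffer *cmdbuf)
{
   /* panvk_cmd_alloc_desc() yields a NULL CPU pointer on allocation
    * failure instead of asserting. */
   struct panfrost_ptr vpd = panvk_cmd_alloc_desc(cmdbuf, VIEWPORT);
   if (!vpd.cpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* ... pack the descriptor ... */
   return VK_SUCCESS;
}

Callers bubble the result up (see the issue_fragment_jobs() and
cmd_close_batch() hunks below), which ultimately flags the command buffer
as invalid.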

@@ -2621,11 +2621,8 @@ dEQP-VK.pipeline.pipeline_library.depth.nocolor.format.x8_d24_unorm_pack32.compa
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount-multithread,Crash
# physical device and device needs more robustness in allocation handling
dEQP-VK.api.object_management.alloc_callback_fail.device,Crash
dEQP-VK.api.object_management.alloc_callback_fail.device_group,Crash
dEQP-VK.api.object_management.max_concurrent.device,Fail
dEQP-VK.api.object_management.max_concurrent.device_group,Fail
dEQP-VK.api.device_init.create_instance_device_intentional_alloc_fail.basic,Crash
# query pool not supported yet
dEQP-VK.api.null_handle.destroy_query_pool,Crash

@@ -750,7 +750,17 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
force_clean_write);
cfg.post_frame = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[2],
force_clean_write);
#if PAN_ARCH <= 7
/* On Bifrost, the layer_id is passed through a push_uniform, which forces
* us to have one pre/post DCD array per layer. */
cfg.frame_shader_dcds =
fb->bifrost.pre_post.dcds.gpu + (layer_idx * 3 * pan_size(DRAW));
#else
/* On Valhall, layer_id is passed through the framebuffer frame_arg, which
* is preloaded in r62, so we can use the same pre/post DCD array for all
* layers. */
cfg.frame_shader_dcds = fb->bifrost.pre_post.dcds.gpu;
#endif
cfg.tiler =
PAN_ARCH >= 9 ? tiler_ctx->valhall.desc : tiler_ctx->bifrost.desc;
#endif
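
To make the two addressing schemes concrete, here is an illustrative helper
(the function itself is hypothetical; the indices match the code above and
the preload logic below: slot 0 is the color preload DCD, slot 1 the ZS
preload DCD and slot 2 the post-frame DCD):

static mali_ptr
frame_shader_dcd(const struct pan_fb_info *fb, unsigned layer_idx,
                 unsigned slot)
{
#if PAN_ARCH <= 7
   /* One 3-entry DCD array per layer on Bifrost. */
   return fb->bifrost.pre_post.dcds.gpu +
          ((layer_idx * 3) + slot) * pan_size(DRAW);
#else
   /* A single 3-entry DCD array shared by all layers on Valhall. */
   return fb->bifrost.pre_post.dcds.gpu + slot * pan_size(DRAW);
#endif
}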

@@ -40,7 +40,6 @@
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "pan_blitter.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"

@@ -16,6 +16,7 @@
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_fb_preload.h"
#include "panvk_cmd_meta.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
@@ -1977,23 +1978,6 @@ resolve_attachments(struct panvk_cmd_buffer *cmdbuf)
static uint8_t
prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));
if (cmdbuf->state.tls.desc.gpu) {
ASSERTED unsigned num_preload_jobs =
GENX(pan_preload_fb)(&dev->blitter.cache, &cmdbuf->desc_pool.base,
&cmdbuf->state.gfx.render.fb.info, layer,
cmdbuf->state.tls.desc.gpu, NULL);
/* Valhall GPUs use pre-frame DCDs to preload the FB content. We
* thus expect num_preload_jobs to be zero.
*/
assert(!num_preload_jobs);
}
struct pan_tiler_context tiler_ctx = {
.valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
};
@@ -2092,11 +2076,11 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
vt_sync_addr);
}
static void
static VkResult
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
{
if (!cmdbuf->state.gfx.render.fbds.gpu)
return;
return VK_SUCCESS;
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
@@ -2132,6 +2116,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
uint8_t fbd_flags = 0;
VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf);
if (result != VK_SUCCESS)
return result;
/* We prepare all FB descriptors upfront. */
for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) {
uint32_t new_fbd_flags =
@@ -2280,6 +2268,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
memset(&cmdbuf->state.gfx.render.fbds, 0,
sizeof(cmdbuf->state.gfx.render.fbds));
cmdbuf->state.gfx.render.tiler = 0;
return VK_SUCCESS;
}
void

@@ -32,6 +32,7 @@
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_fb_preload.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
@@ -40,7 +41,6 @@
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "pan_blitter.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
@@ -140,22 +140,16 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
panfrost_sample_positions_offset(
pan_sample_pattern(fbinfo->nr_samples));
if (batch->vtc_jc.first_tiler) {
VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf);
if (result != VK_SUCCESS)
return;
}
for (uint32_t i = 0; i < batch->fb.layer_count; i++) {
VkResult result;
mali_ptr fbd = batch->fb.desc.gpu + (batch->fb.desc_stride * i);
if (batch->vtc_jc.first_tiler) {
cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds.gpu = 0;
ASSERTED unsigned num_preload_jobs = GENX(pan_preload_fb)(
&dev->blitter.cache, &cmdbuf->desc_pool.base,
&cmdbuf->state.gfx.render.fb.info, i, batch->tls.gpu, NULL);
/* Bifrost GPUs use pre-frame DCDs to preload the FB content. We
* thus expect num_preload_jobs to be zero.
*/
assert(!num_preload_jobs);
}
result = panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, i);
if (result != VK_SUCCESS)

@@ -72,6 +72,7 @@ common_per_arch_files = [
panvk_entrypoints[0],
'panvk_vX_blend.c',
'panvk_vX_buffer_view.c',
'panvk_vX_cmd_fb_preload.c',
'panvk_vX_cmd_desc_state.c',
'panvk_vX_cmd_meta.c',
'panvk_vX_cmd_push_constant.c',

@@ -0,0 +1,13 @@
/*
* Copyright © 2021 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_CMD_FB_PRELOAD_H
#define PANVK_CMD_FB_PRELOAD_H
#include "panvk_cmd_buffer.h"
VkResult panvk_per_arch(cmd_fb_preload)(struct panvk_cmd_buffer *cmdbuf);
#endif

@@ -20,7 +20,6 @@
#include "kmod/pan_kmod.h"
#include "util/pan_ir.h"
#include "pan_blitter.h"
#include "util/vma.h"
@@ -43,22 +42,6 @@ struct panvk_device {
struct panvk_priv_bo *tiler_heap;
struct panvk_priv_bo *sample_positions;
/* Access to the blitter pools are protected by the blitter
* shader/rsd locks. They can't be merged with other binary/desc
* pools unless we patch pan_blitter.c to support external pool locks.
*
* FIXME: The blitter infrastructure is only needed for FB preload.
* We should probably consider getting rid of the dependency we have
* on pan_desc.c and implement preload ourselves so we don't have
* to duplicate caches.
*/
struct {
struct panvk_pool bin_pool;
struct panvk_pool desc_pool;
struct pan_blitter_cache cache;
struct pan_blend_shader_cache blend_shader_cache;
} blitter;
struct vk_meta_device meta;
struct {

@@ -15,6 +15,7 @@
enum panvk_meta_object_key_type {
PANVK_META_OBJECT_KEY_BLEND_SHADER = VK_META_OBJECT_KEY_DRIVER_OFFSET,
PANVK_META_OBJECT_KEY_COPY_DESC_SHADER,
PANVK_META_OBJECT_KEY_FB_PRELOAD_SHADER,
};
static inline VkFormat

@@ -0,0 +1,750 @@
/*
* Copyright © 2021 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_fb_preload.h"
#include "panvk_image_view.h"
#include "panvk_meta.h"
#include "panvk_shader.h"
#include "nir_builder.h"
#include "pan_shader.h"
struct panvk_fb_preload_shader_key {
enum panvk_meta_object_key_type type;
VkImageViewType view_type;
VkSampleCountFlagBits samples;
VkImageAspectFlags aspects;
bool needs_layer_id;
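/* Per-RT register type; nir_type_invalid marks render targets that
 * aren't preloaded. */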
struct {
nir_alu_type type;
} color[8];
};
static nir_def *
texel_fetch(nir_builder *b, VkImageViewType view_type,
nir_alu_type reg_type, unsigned tex_idx,
nir_def *sample_id, nir_def *coords)
{
nir_tex_instr *tex = nir_tex_instr_create(b->shader, sample_id ? 3 : 2);
tex->op = sample_id ? nir_texop_txf_ms : nir_texop_txf;
tex->dest_type = reg_type;
tex->is_array = vk_image_view_type_is_array(view_type);
tex->sampler_dim = sample_id ? GLSL_SAMPLER_DIM_MS
: vk_image_view_type_to_sampler_dim(view_type);
tex->coord_components = coords->num_components;
tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coords);
tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(b, 0));
if (sample_id)
tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_id);
#if PAN_ARCH <= 7
tex->sampler_index = 0;
tex->texture_index = tex_idx;
#else
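/* On Valhall, resources live in a table whose slot 0 holds the
 * sampler, so texture indices are shifted by one (see the descriptor
 * layout built in cmd_emit_dcd()). */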
tex->sampler_index = pan_res_handle(0, 0);
tex->texture_index = pan_res_handle(0, tex_idx + 1);
#endif
nir_def_init(&tex->instr, &tex->def, 4, 32);
nir_builder_instr_insert(b, &tex->instr);
return &tex->def;
}
static nir_variable *
color_output_var(nir_builder *b, VkImageViewType view_type,
VkImageAspectFlags aspect, VkSampleCountFlagBits samples,
nir_alu_type fmt_type, unsigned rt)
{
enum glsl_base_type base_type =
nir_get_glsl_base_type_for_nir_type(fmt_type);
const struct glsl_type *var_type = glsl_vector_type(base_type, 4);
static const char *var_names[] = {
"gl_FragData[0]", "gl_FragData[1]", "gl_FragData[2]", "gl_FragData[3]",
"gl_FragData[4]", "gl_FragData[5]", "gl_FragData[6]", "gl_FragData[7]",
};
assert(rt < ARRAY_SIZE(var_names));
nir_variable *var = nir_variable_create(b->shader, nir_var_shader_out,
var_type, var_names[rt]);
var->data.location = FRAG_RESULT_DATA0 + rt;
return var;
}
static nir_def *
get_layer_id(nir_builder *b)
{
#if PAN_ARCH <= 7
return nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 0,
.range = 4);
#else
return nir_load_layer_id(b);
#endif
}
static nir_shader *
get_preload_nir_shader(const struct panvk_fb_preload_shader_key *key)
{
nir_builder builder = nir_builder_init_simple_shader(
MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(),
"panvk-meta-preload");
nir_builder *b = &builder;
nir_def *sample_id =
key->samples != VK_SAMPLE_COUNT_1_BIT ? nir_load_sample_id(b) : NULL;
nir_def *coords = nir_u2u32(b, nir_load_pixel_coord(b));
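/* Arrayed and 3D views need a third coordinate: the layer being
 * preloaded, or 0 when only one layer is rendered. */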
if (key->view_type == VK_IMAGE_VIEW_TYPE_2D_ARRAY ||
key->view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
key->view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
key->view_type == VK_IMAGE_VIEW_TYPE_3D) {
coords =
nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
key->needs_layer_id ? get_layer_id(b) : nir_imm_int(b, 0));
}
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
for (uint32_t i = 0; i < ARRAY_SIZE(key->color); i++) {
if (key->color[i].type == nir_type_invalid)
continue;
nir_def *texel = texel_fetch(b, key->view_type, key->color[i].type, i,
sample_id, coords);
nir_store_output(
b, texel, nir_imm_int(b, 0), .base = i,
.src_type = key->color[i].type,
.io_semantics.location = FRAG_RESULT_DATA0 + i,
.io_semantics.num_slots = 1,
.write_mask = nir_component_mask(texel->num_components));
}
}
if (key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
nir_def *texel = texel_fetch(b, key->view_type, nir_type_float32, 0,
sample_id, coords);
nir_store_output(b, nir_channel(b, texel, 0), nir_imm_int(b, 0),
.base = 0, .src_type = nir_type_float32,
.io_semantics.location = FRAG_RESULT_DEPTH,
.io_semantics.num_slots = 1,
.write_mask = nir_component_mask(1));
}
if (key->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
nir_def *texel = texel_fetch(
b, key->view_type, nir_type_uint32,
key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT ? 1 : 0, sample_id, coords);
nir_store_output(b, nir_channel(b, texel, 0), nir_imm_int(b, 0),
.base = 0, .src_type = nir_type_uint32,
.io_semantics.location = FRAG_RESULT_STENCIL,
.io_semantics.num_slots = 1,
.write_mask = nir_component_mask(1));
}
return b->shader;
}
static VkResult
get_preload_shader(struct panvk_device *dev,
const struct panvk_fb_preload_shader_key *key,
struct panvk_internal_shader **shader_out)
{
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
struct panvk_internal_shader *shader;
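/* Preload shaders are keyed and cached on the vk_meta device, so each
 * variant is compiled at most once per device. */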
VkShaderEXT shader_handle = (VkShaderEXT)vk_meta_lookup_object(
&dev->meta, VK_OBJECT_TYPE_SHADER_EXT, key, sizeof(*key));
if (shader_handle != VK_NULL_HANDLE)
goto out;
nir_shader *nir = get_preload_nir_shader(key);
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
struct panfrost_compile_inputs inputs = {
.gpu_id = phys_dev->kmod.props.gpu_prod_id,
.no_ubo_to_push = true,
};
pan_shader_preprocess(nir, inputs.gpu_id);
VkResult result = panvk_per_arch(create_internal_shader)(
dev, nir, &inputs, &shader);
if (result != VK_SUCCESS)
return result;
#if PAN_ARCH >= 9
shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
if (!panvk_priv_mem_host_addr(shader->spd)) {
vk_shader_destroy(&dev->vk, &shader->vk, NULL);
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
pan_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
cfg.preload.r48_r63 = shader->info.preload >> 48;
}
#endif
shader_handle = (VkShaderEXT)vk_meta_cache_object(
&dev->vk, &dev->meta, key, sizeof(*key), VK_OBJECT_TYPE_SHADER_EXT,
(uint64_t)panvk_internal_shader_to_handle(shader));
out:
shader = panvk_internal_shader_from_handle(shader_handle);
*shader_out = shader;
return VK_SUCCESS;
}
static VkResult
alloc_pre_post_dcds(struct panvk_cmd_buffer *cmdbuf)
{
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
if (fbinfo->bifrost.pre_post.dcds.gpu)
return VK_SUCCESS;
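/* Each DCD set has three entries: color preload (index 0), ZS preload
 * (index 1) and the post-frame shader (index 2). Bifrost needs one set
 * per layer because layer_id comes from a push uniform; Valhall shares
 * a single set across layers. */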
uint32_t dcd_count =
3 * (PAN_ARCH <= 7 ? cmdbuf->state.gfx.render.layer_count : 1);
fbinfo->bifrost.pre_post.dcds = panvk_cmd_alloc_desc_array(cmdbuf, dcd_count, DRAW);
if (!fbinfo->bifrost.pre_post.dcds.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
return VK_SUCCESS;
}
static enum mali_register_file_format
get_reg_fmt(nir_alu_type type)
{
switch (type) {
case nir_type_float32:
return MALI_REGISTER_FILE_FORMAT_F32;
case nir_type_uint32:
return MALI_REGISTER_FILE_FORMAT_U32;
case nir_type_int32:
return MALI_REGISTER_FILE_FORMAT_I32;
default:
assert(!"Invalid reg type");
return MALI_REGISTER_FILE_FORMAT_F32;
}
}
static void
fill_textures(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_fb_preload_shader_key *key,
struct mali_texture_packed *textures)
{
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
for (unsigned i = 0; i < fbinfo->rt_count; i++) {
struct panvk_image_view *iview =
cmdbuf->state.gfx.render.color_attachments.iviews[i];
if (iview)
textures[i] = iview->descs.tex;
else
textures[i] = (struct mali_texture_packed){0};
}
return;
}
uint32_t idx = 0;
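/* With combined depth/stencil formats, a single attachment may back
 * both aspects: fall back to the other aspect's view and pick the
 * texture descriptor matching the requested aspect. */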
if (key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
struct panvk_image_view *iview =
cmdbuf->state.gfx.render.z_attachment.iview
?: cmdbuf->state.gfx.render.s_attachment.iview;
textures[idx++] = vk_format_has_depth(iview->vk.view_format)
? iview->descs.tex
: iview->descs.other_aspect_tex;
}
if (key->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
struct panvk_image_view *iview =
cmdbuf->state.gfx.render.s_attachment.iview
?: cmdbuf->state.gfx.render.z_attachment.iview;
textures[idx++] = vk_format_has_depth(iview->vk.view_format)
? iview->descs.other_aspect_tex
: iview->descs.tex;
}
}
static void
fill_bds(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_fb_preload_shader_key *key,
struct mali_blend_packed *bds)
{
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
uint32_t bd_count = MAX2(fbinfo->rt_count, 1);
for (unsigned i = 0; i < bd_count; i++) {
const struct pan_image_view *pview =
fbinfo->rts[i].preload ? fbinfo->rts[i].view : NULL;
pan_pack(&bds[i], BLEND, cfg) {
if (key->aspects != VK_IMAGE_ASPECT_COLOR_BIT || !pview) {
cfg.enable = false;
cfg.internal.mode = MALI_BLEND_MODE_OFF;
continue;
}
cfg.round_to_fb_precision = true;
cfg.srgb = util_format_is_srgb(pview->format);
cfg.internal.mode = MALI_BLEND_MODE_OPAQUE;
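/* Pass-through blend equation: the output is the source value,
 * unmodified. */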
cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
cfg.equation.color_mask = 0xf;
cfg.internal.fixed_function.num_comps = 4;
cfg.internal.fixed_function.conversion.memory_format = GENX(
panfrost_dithered_format_from_pipe_format)(pview->format, false);
cfg.internal.fixed_function.rt = i;
#if PAN_ARCH <= 7
cfg.internal.fixed_function.conversion.register_format =
get_reg_fmt(key->color[i].type);
#endif
}
}
}
#if PAN_ARCH <= 7
static VkResult
cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_fb_preload_shader_key *key)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct panvk_internal_shader *shader = NULL;
VkResult result = get_preload_shader(dev, key, &shader);
if (result != VK_SUCCESS)
return result;
uint32_t tex_count = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT
? fbinfo->rt_count
: util_bitcount(key->aspects);
uint32_t bd_count = MAX2(fbinfo->rt_count, 1);
struct panfrost_ptr rsd = panvk_cmd_alloc_desc_aggregate(
cmdbuf, PAN_DESC(RENDERER_STATE),
PAN_DESC_ARRAY(bd_count, BLEND));
if (!rsd.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
pan_pack(rsd.cpu, RENDERER_STATE, cfg) {
pan_shader_prepare_rsd(&shader->info,
panvk_priv_mem_dev_addr(shader->code_mem), &cfg);
cfg.shader.texture_count = tex_count;
cfg.shader.sampler_count = 1;
cfg.multisample_misc.sample_mask = 0xFFFF;
cfg.multisample_misc.multisample_enable = key->samples > 1;
cfg.multisample_misc.evaluate_per_sample = key->samples > 1;
cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;
cfg.multisample_misc.depth_write_mask =
(key->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) != 0;
cfg.stencil_mask_misc.stencil_enable =
(key->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) != 0;
cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
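/* ALWAYS + REPLACE: unconditionally write the stencil value produced
 * by the preload shader. */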
cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
cfg.stencil_front.mask = 0xFF;
cfg.stencil_back = cfg.stencil_front;
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
/* Skipping ATEST requires forcing Z/S */
cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
} else {
/* Writing Z/S requires late updates */
cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
}
/* While shaders writing Z/S can normally be killed, doing so for frame
 * shaders on v6 can cause GPU timeouts, so only allow colour blit
 * shaders to be killed. */
cfg.properties.allow_forward_pixel_to_kill =
key->aspects == VK_IMAGE_ASPECT_COLOR_BIT;
if (PAN_ARCH == 6)
cfg.properties.allow_forward_pixel_to_be_killed =
key->aspects == VK_IMAGE_ASPECT_COLOR_BIT;
}
fill_bds(cmdbuf, key, rsd.cpu + pan_size(RENDERER_STATE));
struct panvk_batch *batch = cmdbuf->cur_batch;
uint16_t minx = 0, miny = 0, maxx, maxy;
/* Align on 32x32 tiles */
minx = fbinfo->extent.minx & ~31;
miny = fbinfo->extent.miny & ~31;
maxx = MIN2(ALIGN_POT(fbinfo->extent.maxx + 1, 32), fbinfo->width) - 1;
maxy = MIN2(ALIGN_POT(fbinfo->extent.maxy + 1, 32), fbinfo->height) - 1;
struct panfrost_ptr vpd = panvk_cmd_alloc_desc(cmdbuf, VIEWPORT);
if (!vpd.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
pan_pack(vpd.cpu, VIEWPORT, cfg) {
cfg.scissor_minimum_x = minx;
cfg.scissor_minimum_y = miny;
cfg.scissor_maximum_x = maxx;
cfg.scissor_maximum_y = maxy;
}
struct panfrost_ptr sampler = panvk_cmd_alloc_desc(cmdbuf, SAMPLER);
if (!sampler.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
pan_pack(sampler.cpu, SAMPLER, cfg) {
cfg.seamless_cube_map = false;
cfg.normalized_coordinates = false;
cfg.minify_nearest = true;
cfg.magnify_nearest = true;
}
struct panfrost_ptr textures =
panvk_cmd_alloc_desc_array(cmdbuf, tex_count, TEXTURE);
if (!textures.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
fill_textures(cmdbuf, key, textures.cpu);
result = alloc_pre_post_dcds(cmdbuf);
if (result != VK_SUCCESS)
return result;
struct mali_draw_packed dcd_base;
pan_pack(&dcd_base, DRAW, cfg) {
cfg.thread_storage = batch->tls.gpu;
cfg.state = rsd.gpu;
cfg.viewport = vpd.gpu;
cfg.textures = textures.gpu;
cfg.samplers = sampler.gpu;
#if PAN_ARCH >= 6
/* Until we decide to support FB CRC, assume untouched tiles never need
 * to be written back. */
cfg.clean_fragment_write = false;
#endif
}
struct mali_draw_packed *dcds = fbinfo->bifrost.pre_post.dcds.cpu;
uint32_t dcd_idx = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT ? 0 : 1;
if (key->needs_layer_id) {
struct panfrost_ptr layer_ids = panvk_cmd_alloc_dev_mem(
cmdbuf, desc,
cmdbuf->state.gfx.render.layer_count * sizeof(uint64_t),
sizeof(uint64_t));
uint32_t *layer_id = layer_ids.cpu;
for (uint32_t l = 0; l < cmdbuf->state.gfx.render.layer_count; l++) {
struct mali_draw_packed dcd_layer;
/* Push uniform pointer has to be 8-byte aligned, so we have to skip
* odd layer_id entries. */
layer_id[2 * l] = l;
pan_pack(&dcd_layer, DRAW, cfg) {
cfg.push_uniforms = layer_ids.gpu + (sizeof(uint64_t) * l);
};
pan_merge(dcd_layer, dcd_base, DRAW);
dcds[(l * 3) + dcd_idx] = dcd_layer;
}
} else {
dcds[dcd_idx] = dcd_base;
}
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
fbinfo->bifrost.pre_post.modes[dcd_idx] =
MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
} else {
enum pipe_format fmt = fbinfo->zs.view.zs
? fbinfo->zs.view.zs->planes[0]->layout.format
: fbinfo->zs.view.s->planes[0]->layout.format;
bool always = false;
/* If we're dealing with a combined ZS resource and only one
* component is cleared, we need to reload the whole surface
* because the zs_clean_pixel_write_enable flag is set in that
* case.
*/
if (util_format_is_depth_and_stencil(fmt) &&
fbinfo->zs.clear.z != fbinfo->zs.clear.s)
always = true;
/* We could use INTERSECT on Bifrost v7 too, but
* EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile
* buffer one or more tiles ahead, making ZS data immediately
* available for any ZS tests taking place in other shaders.
* Things haven't been benchmarked to determine what's
* preferable (saving bandwidth vs having ZS preloaded
* earlier), so let's leave it like that for now.
*/
fbinfo->bifrost.pre_post.modes[dcd_idx] =
PAN_ARCH > 6
? MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS
: always ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
: MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
}
return VK_SUCCESS;
}
#else
static VkResult
cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf,
struct panvk_fb_preload_shader_key *key)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct panvk_internal_shader *shader = NULL;
VkResult result = get_preload_shader(dev, key, &shader);
if (result != VK_SUCCESS)
return result;
uint32_t bd_count =
key->aspects == VK_IMAGE_ASPECT_COLOR_BIT ? fbinfo->rt_count : 0;
struct panfrost_ptr bds =
panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND);
if (bd_count > 0 && !bds.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
uint32_t tex_count = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT
? fbinfo->rt_count
: util_bitcount(key->aspects);
uint32_t desc_count = tex_count + 1;
struct panfrost_ptr descs = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
if (!descs.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
struct mali_sampler_packed *sampler = descs.cpu;
pan_pack(sampler, SAMPLER, cfg) {
cfg.seamless_cube_map = false;
cfg.normalized_coordinates = false;
cfg.minify_nearest = true;
cfg.magnify_nearest = true;
}
fill_textures(cmdbuf, key, descs.cpu + PANVK_DESCRIPTOR_SIZE);
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT)
fill_bds(cmdbuf, key, bds.cpu);
struct panfrost_ptr res_table = panvk_cmd_alloc_desc(cmdbuf, RESOURCE);
if (!res_table.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
pan_pack(res_table.cpu, RESOURCE, cfg) {
cfg.address = descs.gpu;
cfg.size = desc_count * PANVK_DESCRIPTOR_SIZE;
}
struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL);
if (!zsd.cpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
cfg.depth_function = MALI_FUNC_ALWAYS;
cfg.depth_write_enable = fbinfo->zs.preload.z;
if (fbinfo->zs.preload.z)
cfg.depth_source = MALI_DEPTH_SOURCE_SHADER;
cfg.stencil_test_enable = fbinfo->zs.preload.s;
cfg.stencil_from_shader = fbinfo->zs.preload.s;
cfg.front_compare_function = MALI_FUNC_ALWAYS;
cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE;
cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE;
cfg.front_depth_pass = MALI_STENCIL_OP_REPLACE;
cfg.front_write_mask = 0xFF;
cfg.front_value_mask = 0xFF;
cfg.back_compare_function = MALI_FUNC_ALWAYS;
cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE;
cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE;
cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE;
cfg.back_write_mask = 0xFF;
cfg.back_value_mask = 0xFF;
cfg.depth_cull_enable = false;
}
result = alloc_pre_post_dcds(cmdbuf);
if (result != VK_SUCCESS)
return result;
struct mali_draw_packed *dcds = fbinfo->bifrost.pre_post.dcds.cpu;
uint32_t dcd_idx = key->aspects == VK_IMAGE_ASPECT_COLOR_BIT ? 0 : 1;
pan_pack(&dcds[dcd_idx], DRAW, cfg) {
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
/* Skipping ATEST requires forcing Z/S */
cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
cfg.blend = bds.gpu;
cfg.blend_count = bd_count;
cfg.render_target_mask = cmdbuf->state.gfx.render.bound_attachments;
} else {
/* ZS_EMIT requires late update/kill */
cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
cfg.blend_count = 0;
}
cfg.allow_forward_pixel_to_kill =
key->aspects == VK_IMAGE_ASPECT_COLOR_BIT;
cfg.allow_forward_pixel_to_be_killed = true;
cfg.depth_stencil = zsd.gpu;
cfg.sample_mask = 0xFFFF;
cfg.multisample_enable = key->samples > 1;
cfg.evaluate_per_sample = key->samples > 1;
cfg.maximum_z = 1.0;
cfg.clean_fragment_write = false;
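/* The low bits of the resource address encode the resource-table
 * count: a single table here. */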
cfg.shader.resources = res_table.gpu | 1;
cfg.shader.shader = panvk_priv_mem_dev_addr(shader->spd);
cfg.shader.thread_storage = cmdbuf->state.gfx.tsd;
}
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
fbinfo->bifrost.pre_post.modes[dcd_idx] =
MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
} else {
/* We could use INTERSECT on Valhall too, but
* EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile
* buffer one or more tiles ahead, making ZS data immediately
* available for any ZS tests taking place in other shaders.
* Things haven't been benchmarked to determine what's
* preferable (saving bandwidth vs having ZS preloaded
* earlier), so let's leave it like that for now.
*/
fbinfo->bifrost.pre_post.modes[dcd_idx] =
MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS;
}
return VK_SUCCESS;
}
#endif
static VkResult
cmd_preload_zs_attachments(struct panvk_cmd_buffer *cmdbuf)
{
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
if (!fbinfo->zs.preload.s && !fbinfo->zs.preload.z)
return VK_SUCCESS;
struct panvk_fb_preload_shader_key key = {
.type = PANVK_META_OBJECT_KEY_FB_PRELOAD_SHADER,
.samples = fbinfo->nr_samples,
.needs_layer_id = cmdbuf->state.gfx.render.layer_count > 1,
};
if (fbinfo->zs.preload.z) {
key.aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
key.view_type =
cmdbuf->state.gfx.render.z_attachment.iview
? cmdbuf->state.gfx.render.z_attachment.iview->vk.view_type
: cmdbuf->state.gfx.render.s_attachment.iview->vk.view_type;
}
if (fbinfo->zs.preload.s) {
VkImageViewType view_type =
cmdbuf->state.gfx.render.s_attachment.iview
? cmdbuf->state.gfx.render.s_attachment.iview->vk.view_type
: cmdbuf->state.gfx.render.z_attachment.iview->vk.view_type;
key.aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
if (!fbinfo->zs.preload.z)
key.view_type = view_type;
assert(key.view_type == view_type);
}
return cmd_emit_dcd(cmdbuf, &key);
}
static VkResult
cmd_preload_color_attachments(struct panvk_cmd_buffer *cmdbuf)
{
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct panvk_fb_preload_shader_key key = {
.type = PANVK_META_OBJECT_KEY_FB_PRELOAD_SHADER,
.samples = fbinfo->nr_samples,
.needs_layer_id = cmdbuf->state.gfx.render.layer_count > 1,
.aspects = VK_IMAGE_ASPECT_COLOR_BIT,
};
bool needs_preload = false;
for (uint32_t i = 0; i < fbinfo->rt_count; i++) {
if (!fbinfo->rts[i].preload)
continue;
enum pipe_format pfmt = fbinfo->rts[i].view->format;
struct panvk_image_view *iview =
cmdbuf->state.gfx.render.color_attachments.iviews[i];
key.color[i].type = util_format_is_pure_uint(pfmt) ? nir_type_uint32
: util_format_is_pure_sint(pfmt) ? nir_type_int32
: nir_type_float32;
if (!needs_preload) {
key.view_type = iview->vk.view_type;
needs_preload = true;
}
assert(key.view_type == iview->vk.view_type);
}
if (!needs_preload)
return VK_SUCCESS;
return cmd_emit_dcd(cmdbuf, &key);
}
VkResult
panvk_per_arch(cmd_fb_preload)(struct panvk_cmd_buffer *cmdbuf)
{
VkResult result = cmd_preload_color_attachments(cmdbuf);
if (result != VK_SUCCESS)
return result;
return cmd_preload_zs_attachments(cmdbuf);
}

@@ -150,49 +150,6 @@ panvk_meta_cleanup(struct panvk_device *device)
vk_meta_device_finish(&device->vk, &device->meta);
}
static void
panvk_preload_blitter_init(struct panvk_device *device)
{
const struct panvk_physical_device *physical_device =
to_panvk_physical_device(device->vk.physical);
struct panvk_pool_properties bin_pool_props = {
.create_flags = PAN_KMOD_BO_FLAG_EXECUTABLE,
.slab_size = 16 * 1024,
.label = "panvk_meta blitter binary pool",
.owns_bos = true,
.needs_locking = false,
.prealloc = false,
};
panvk_pool_init(&device->blitter.bin_pool, device, NULL, &bin_pool_props);
struct panvk_pool_properties desc_pool_props = {
.create_flags = 0,
.slab_size = 16 * 1024,
.label = "panvk_meta blitter descriptor pool",
.owns_bos = true,
.needs_locking = false,
.prealloc = false,
};
panvk_pool_init(&device->blitter.desc_pool, device, NULL, &desc_pool_props);
pan_blend_shader_cache_init(&device->blitter.blend_shader_cache,
physical_device->kmod.props.gpu_prod_id);
GENX(pan_blitter_cache_init)
(&device->blitter.cache, physical_device->kmod.props.gpu_prod_id,
&device->blitter.blend_shader_cache, &device->blitter.bin_pool.base,
&device->blitter.desc_pool.base);
}
static void
panvk_preload_blitter_cleanup(struct panvk_device *device)
{
GENX(pan_blitter_cache_cleanup)(&device->blitter.cache);
pan_blend_shader_cache_cleanup(&device->blitter.blend_shader_cache);
panvk_pool_cleanup(&device->blitter.desc_pool);
panvk_pool_cleanup(&device->blitter.bin_pool);
}
/* Always reserve the lower 32MB. */
#define PANVK_VA_RESERVE_BOTTOM 0x2000000ull
@@ -311,11 +268,9 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd);
panvk_preload_blitter_init(device);
result = panvk_meta_init(device);
if (result != VK_SUCCESS)
goto err_cleanup_blitter;
goto err_free_priv_bos;
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queue_create =
@@ -356,9 +311,6 @@ err_finish_queues:
panvk_meta_cleanup(device);
err_cleanup_blitter:
panvk_preload_blitter_cleanup(device);
err_free_priv_bos:
panvk_priv_bo_unref(device->sample_positions);
panvk_priv_bo_unref(device->tiler_heap);
@@ -393,7 +345,6 @@ panvk_per_arch(destroy_device)(struct panvk_device *device,
}
panvk_meta_cleanup(device);
panvk_preload_blitter_cleanup(device);
panvk_priv_bo_unref(device->tiler_heap);
panvk_priv_bo_unref(device->sample_positions);
panvk_device_cleanup_mempools(device);