/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include <assert.h>
#include "agx_abi.h"
#include "agx_bg_eot.h"
#include "agx_bo.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_device.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_ppp.h"
#include "agx_tilebuffer.h"
#include "agx_usc.h"
#include "agx_uvs.h"
#include "hk_buffer.h"
#include "hk_cmd_buffer.h"
#include "hk_device.h"
#include "hk_entrypoints.h"
#include "hk_image.h"
#include "hk_image_view.h"
#include "hk_physical_device.h"
#include "hk_private.h"
#include "hk_shader.h"

#include "asahi/genxml/agx_pack.h"
#include "asahi/libagx/compression.h"
#include "asahi/libagx/geometry.h"
#include "asahi/libagx/libagx.h"
#include "asahi/libagx/query.h"
#include "asahi/libagx/tessellator.h"
#include "util/blend.h"
#include "util/format/format_utils.h"
#include "util/format/u_formats.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_prim.h"
#include "vulkan/vulkan_core.h"
#include "layout.h"
#include "libagx_dgc.h"
#include "libagx_shaders.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_lower_blend.h"
#include "nir_xfb_info.h"
#include "pool.h"
#include "shader_enums.h"
#include "vk_blend.h"
#include "vk_enum_to_str.h"
#include "vk_format.h"
#include "vk_graphics_state.h"
#include "vk_pipeline.h"
#include "vk_render_pass.h"
#include "vk_standard_sample_locations.h"
#include "vk_util.h"

#define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit)

#define IS_SHADER_DIRTY(bit)                                                   \
   (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit))

#define IS_LINKED_DIRTY(bit)                                                   \
   (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
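
/* Usage sketch: IS_DIRTY(RS_LINE_WIDTH) expands to
 * BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH); all three macros
 * assume locals named dyn/cmd in the enclosing function.
 */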

/* CTS coverage of indirect draws is pretty bad, so it's helpful to be able to
 * get some extra smoke testing.
 */
#define HK_TEST_INDIRECTS (0)

UNUSED static inline void
print_draw(struct agx_draw d, FILE *fp)
{
   if (agx_is_indirect(d.b))
      fprintf(fp, "indirect (buffer %" PRIx64 "):", d.b.ptr);
   else
      fprintf(fp, "direct (%ux%u):", d.b.count[0], d.b.count[1]);

   if (d.index_size)
      fprintf(fp, " index_size=%u", agx_index_size_to_B(d.index_size));
   else
      fprintf(fp, " non-indexed");

   if (d.restart)
      fprintf(fp, " restart");

   if (d.index_bias)
      fprintf(fp, " index_bias=%u", d.index_bias);

   if (d.start)
      fprintf(fp, " start=%u", d.start);

   if (d.start_instance)
      fprintf(fp, " start_instance=%u", d.start_instance);

   fprintf(fp, "\n");
}

/* XXX: deduplicate */
static inline enum mesa_prim
vk_conv_topology(VkPrimitiveTopology topology)
{
   switch (topology) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
      return MESA_PRIM_POINTS;
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      return MESA_PRIM_LINES;
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      return MESA_PRIM_LINE_STRIP;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch"
   case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
#pragma GCC diagnostic pop
      return MESA_PRIM_TRIANGLES;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      return MESA_PRIM_TRIANGLE_STRIP;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      return MESA_PRIM_TRIANGLE_FAN;
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      return MESA_PRIM_LINES_ADJACENCY;
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return MESA_PRIM_LINE_STRIP_ADJACENCY;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      return MESA_PRIM_TRIANGLES_ADJACENCY;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
      return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
      return MESA_PRIM_PATCHES;
   default:
      unreachable("invalid");
   }
}

static void
hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
{
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   /* These depend on color attachment count */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);

   /* These depend on the depth/stencil format */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);

   /* This may depend on render targets for ESO */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);

   /* This may depend on render targets */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
}

void
hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd,
                             const VkCommandBufferBeginInfo *pBeginInfo)
{
   if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
      char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)];
      const VkRenderingInfo *resume_info =
         vk_get_command_buffer_inheritance_as_rendering_resume(
            cmd->vk.level, pBeginInfo, gcbiar_data);
      if (resume_info) {
         hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info);
      } else {
         const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
            vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
                                                             pBeginInfo);
         assert(inheritance_info);

         struct hk_rendering_state *render = &cmd->state.gfx.render;
         render->flags = inheritance_info->flags;
         render->area = (VkRect2D){};
         render->layer_count = 0;
         render->view_mask = inheritance_info->viewMask;
         render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples;

         render->color_att_count = inheritance_info->colorAttachmentCount;
         for (uint32_t i = 0; i < render->color_att_count; i++) {
            render->color_att[i].vk_format =
               inheritance_info->pColorAttachmentFormats[i];
         }
         render->depth_att.vk_format = inheritance_info->depthAttachmentFormat;
         render->stencil_att.vk_format =
            inheritance_info->stencilAttachmentFormat;

         const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
            .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
            .colorAttachmentCount = inheritance_info->colorAttachmentCount,
         };
         const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
            vk_get_command_buffer_rendering_attachment_location_info(
               cmd->vk.level, pBeginInfo);
         if (att_loc_info == NULL)
            att_loc_info = &att_loc_info_default;

         vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);

         hk_cmd_buffer_dirty_render_pass(cmd);
      }
   }

   hk_cmd_buffer_dirty_all(cmd);

   /* If multiview is disabled, always read 0. If multiview is enabled,
    * hk_set_view_index will dirty the root each draw.
    */
   cmd->state.gfx.descriptors.root.draw.view_index = 0;
   cmd->state.gfx.descriptors.root_dirty = true;
}

void
hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd)
{
   hk_cmd_buffer_dirty_all(cmd);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "...There is one exception to this rule - if the primary command
    *    buffer is inside a render pass instance, then the render pass and
    *    subpass state is not disturbed by executing secondary command
    *    buffers."
    *
    * We need to reset everything EXCEPT the render pass state.
    */
   struct hk_rendering_state render_save = cmd->state.gfx.render;
   memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
   cmd->state.gfx.render = render_save;
}

static void
hk_attachment_init(struct hk_attachment *att,
                   const VkRenderingAttachmentInfo *info)
{
   if (info == NULL || info->imageView == VK_NULL_HANDLE) {
      *att = (struct hk_attachment){
         .iview = NULL,
      };
      return;
   }

   VK_FROM_HANDLE(hk_image_view, iview, info->imageView);
   *att = (struct hk_attachment){
      .vk_format = iview->vk.format,
      .iview = iview,
   };

   if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
      VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView);
      att->resolve_mode = info->resolveMode;
      att->resolve_iview = res_iview;
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_GetRenderingAreaGranularityKHR(
   VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
   VkExtent2D *pGranularity)
{
   *pGranularity = (VkExtent2D){.width = 1, .height = 1};
}

static bool
is_attachment_stored(const VkRenderingAttachmentInfo *att)
{
   /* When resolving, we store the intermediate multisampled image as the
    * resolve is a separate control stream. This could be optimized.
    */
   return att->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
          att->resolveMode != VK_RESOLVE_MODE_NONE;
}

static struct hk_bg_eot
hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info,
                bool store, bool partial_render, bool incomplete_render_area)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   /* Construct the key */
   struct agx_bg_eot_key key = {.tib = render->tilebuffer};
   static_assert(AGX_BG_EOT_NONE == 0, "default initializer");

   key.tib.layered = (render->cr.layers > 1);

   bool needs_textures_for_spilled_rts =
      agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store;

   for (unsigned i = 0; i < info->colorAttachmentCount; ++i) {
      const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
      if (att_info->imageView == VK_NULL_HANDLE)
         continue;

      /* Partial render programs exist only to store/load the tilebuffer to
       * main memory. When render targets are already spilled to main memory,
       * there's nothing to do.
       */
      if (key.tib.spilled[i] && (partial_render || store))
         continue;

      if (store) {
         bool should_store = is_attachment_stored(att_info);

         /* Partial renders always need to flush to memory. */
         should_store |= partial_render;

         if (should_store)
            key.op[i] = AGX_EOT_STORE;
      } else {
         bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD;
         bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR;

         /* The background program used for partial renders must always load
          * whatever was stored in the mid-frame end-of-tile program.
          */
         load |= partial_render;

         /* With an incomplete render area, we're forced to load back tiles
          * and then use the 3D pipe for the clear.
          */
         load |= incomplete_render_area;

         /* Don't read back spilled render targets, they're already in memory */
         load &= !key.tib.spilled[i];

         /* This is a very frustrating corner case. From the spec:
          *
          *    VK_ATTACHMENT_STORE_OP_NONE specifies the contents within the
          *    render area are not accessed by the store operation as long as
          *    no values are written to the attachment during the render pass.
          *
          * With VK_ATTACHMENT_STORE_OP_NONE, we suppress stores on the main
          * end-of-tile program. Unfortunately, that's not enough: we also need
          * to preserve the contents throughout partial renders. The easiest
          * way to do that is forcing a load in the background program, so that
          * partial stores for unused attachments will be no-op'd by writing
          * existing contents.
          *
          * Optimizing this would require nontrivial tracking. Fortunately,
          * this is all Android gunk and we don't have to care too much for
          * desktop games. So do the simple thing.
          */
         bool no_store = (att_info->storeOp == VK_ATTACHMENT_STORE_OP_NONE);
         bool no_store_wa = no_store && !load && !clear;
         if (no_store_wa) {
            perf_debug(cmd, "STORE_OP_NONE workaround");
         }

         load |= no_store_wa;

         /* Don't apply clears for spilled render targets when we clear the
          * render area explicitly after.
          */
         if (key.tib.spilled[i] && incomplete_render_area)
            continue;

         if (load)
            key.op[i] = AGX_BG_LOAD;
         else if (clear)
            key.op[i] = AGX_BG_CLEAR;
      }
   }

   /* Begin building the pipeline */
   size_t usc_size = agx_usc_size(3 + HK_MAX_RTS);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return (struct hk_bg_eot){.usc = t.gpu};

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   bool uses_txf = false;
   unsigned uniforms = 0;
   unsigned nr_tex = 0;

   for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) {
      const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt];
      struct hk_image_view *iview = render->color_att[rt].iview;

      if (key.op[rt] == AGX_BG_LOAD) {
         uses_txf = true;

         uint32_t index = key.tib.layered
                             ? iview->planes[0].layered_background_desc_index
                             : iview->planes[0].background_desc_index;

         agx_usc_pack(&b, TEXTURE, cfg) {
            /* Shifted to match eMRT indexing, could be optimized */
            cfg.start = rt * 2;
            cfg.count = 1;
            cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
         }

         nr_tex = (rt * 2) + 1;
      } else if (key.op[rt] == AGX_BG_CLEAR) {
         static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI");
         uint64_t colour =
            hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16);

         agx_usc_uniform(&b, 4 + (8 * rt), 8, colour);
         uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
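
         /* Clear colours use a fixed layout: 8 16-bit uniform registers (one
          * 128-bit colour) per render target starting at uniform 4, so RT0
          * lands at u4, RT1 at u12, and so on. The MAX2 keeps the preamble
          * reservation past the last pushed word.
          */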
      } else if (key.op[rt] == AGX_EOT_STORE) {
         uint32_t index = key.tib.layered
                             ? iview->planes[0].layered_eot_pbe_desc_index
                             : iview->planes[0].eot_pbe_desc_index;

         agx_usc_pack(&b, TEXTURE, cfg) {
            cfg.start = rt;
            cfg.count = 1;
            cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
         }

         nr_tex = rt + 1;
      }
   }

   if (needs_textures_for_spilled_rts) {
      hk_usc_upload_spilled_rt_descs(&b, cmd);
      uniforms = MAX2(uniforms, 4);
   }

   if (uses_txf) {
      agx_usc_push_packed(&b, SAMPLER, dev->dev.txf_sampler);
   }

   /* For attachmentless rendering, we don't know the sample count until
    * draw-time. But we have trivial bg/eot programs in that case too.
    */
   if (key.tib.nr_samples >= 1) {
      agx_usc_push_packed(&b, SHARED, &key.tib.usc);
   } else {
      assert(key.tib.sample_size_B == 0);
      agx_usc_shared_none(&b);

      key.tib.nr_samples = 1;
   }

   /* Get the shader */
   key.reserved_preamble = uniforms;
   /* XXX: locking? */
   struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key);

   agx_usc_pack(&b, SHADER, cfg) {
      cfg.code = agx_usc_addr(&dev->dev, shader->ptr);
      cfg.unk_2 = 0;
   }

   agx_usc_pack(&b, REGISTERS, cfg)
      cfg.register_count = shader->info.nr_gprs;

   if (shader->info.has_preamble) {
      agx_usc_pack(&b, PRESHADER, cfg) {
         cfg.code =
            agx_usc_addr(&dev->dev, shader->ptr + shader->info.preamble_offset);
      }
   } else {
      agx_usc_pack(&b, NO_PRESHADER, cfg)
         ;
   }

   struct hk_bg_eot ret = {.usc = t.gpu};

   agx_pack(&ret.counts, COUNTS, cfg) {
      cfg.uniform_register_count = shader->info.push_count;
      cfg.preshader_register_count = shader->info.nr_preamble_gprs;
      cfg.texture_state_register_count = nr_tex;
      cfg.sampler_state_register_count =
         agx_translate_sampler_state_count(uses_txf ? 1 : 0, false);
   }

   return ret;
}

static bool
is_aligned(unsigned x, unsigned pot_alignment)
{
   assert(util_is_power_of_two_nonzero(pot_alignment));
   return (x & (pot_alignment - 1)) == 0;
}

static void
hk_merge_render_iview(struct hk_rendering_state *render,
                      struct hk_image_view *iview, bool zls)
{
   if (iview) {
      unsigned samples = iview->vk.image->samples;
      /* TODO: is this right for ycbcr? */
      unsigned level = iview->vk.base_mip_level;
      unsigned width = u_minify(iview->vk.image->extent.width, level);
      unsigned height = u_minify(iview->vk.image->extent.height, level);

      assert(render->tilebuffer.nr_samples == 0 ||
             render->tilebuffer.nr_samples == samples);
      render->tilebuffer.nr_samples = samples;

      /* TODO: Is this merging logic sound? Not sure how this is supposed to
       * work conceptually.
       */
      render->cr.width = MAX2(render->cr.width, width);
      render->cr.height = MAX2(render->cr.height, height);

      if (zls) {
         render->cr.zls_width = width;
         render->cr.zls_height = height;
      }
   }
}

static void
hk_pack_zls_control(struct agx_zls_control_packed *packed,
                    struct ail_layout *z_layout, struct ail_layout *s_layout,
                    const VkRenderingAttachmentInfo *attach_z,
                    const VkRenderingAttachmentInfo *attach_s,
                    bool incomplete_render_area, bool partial_render)
{
   agx_pack(packed, ZLS_CONTROL, zls_control) {
      if (z_layout) {
         /* XXX: Dropping Z stores is wrong if the render pass gets split into
          * multiple control streams (can that ever happen?). We need more ZLS
          * variants. Force || true for now.
          */
         zls_control.z_store_enable =
            attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
            attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
            true;

         zls_control.z_load_enable =
            attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
            incomplete_render_area;

         if (z_layout->compressed) {
            zls_control.z_compress_1 = true;
            zls_control.z_compress_2 = true;
         }

         if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
            zls_control.z_format = AGX_ZLS_FORMAT_16;
         } else {
            zls_control.z_format = AGX_ZLS_FORMAT_32F;
         }
      }

      if (s_layout) {
         /* TODO: Without the force, we fail
          * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect
          * ... maybe a VkRenderPass emulation bug.
          */
         zls_control.s_store_enable =
            attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
            attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
            true;

         zls_control.s_load_enable =
            attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
            incomplete_render_area;

         if (s_layout->compressed) {
            zls_control.s_compress_1 = true;
            zls_control.s_compress_2 = true;
         }
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
                     const VkRenderingInfo *pRenderingInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   memset(render, 0, sizeof(*render));

   render->flags = pRenderingInfo->flags;
   render->area = pRenderingInfo->renderArea;
   render->view_mask = pRenderingInfo->viewMask;
   render->layer_count = pRenderingInfo->layerCount;
   render->tilebuffer.nr_samples = 0;

   const uint32_t layer_count = render->view_mask
                                   ? util_last_bit(render->view_mask)
                                   : render->layer_count;

   render->color_att_count = pRenderingInfo->colorAttachmentCount;
   for (uint32_t i = 0; i < render->color_att_count; i++) {
      hk_attachment_init(&render->color_att[i],
                         &pRenderingInfo->pColorAttachments[i]);
   }

   hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment);
   hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment);

   for (uint32_t i = 0; i < render->color_att_count; i++) {
      hk_merge_render_iview(render, render->color_att[i].iview, false);
   }

   hk_merge_render_iview(
      render, render->depth_att.iview ?: render->stencil_att.iview, true);

   /* Infer for attachmentless. Samples are inferred at draw-time. */
   render->cr.width =
      MAX2(render->cr.width, render->area.offset.x + render->area.extent.width);

   render->cr.height = MAX2(render->cr.height,
                            render->area.offset.y + render->area.extent.height);

   if (!render->cr.zls_width) {
      render->cr.zls_width = render->cr.width;
      render->cr.zls_height = render->cr.height;
   }

   render->cr.layers = layer_count;

   /* Choose a tilebuffer layout given the framebuffer key */
   enum pipe_format formats[HK_MAX_RTS] = {0};
   for (unsigned i = 0; i < render->color_att_count; ++i) {
      formats[i] = hk_format_to_pipe_format(render->color_att[i].vk_format);
   }

   /* For now, we force layered=true since it makes compatibility problems way
    * easier.
    */
   render->tilebuffer = agx_build_tilebuffer_layout(
      formats, render->color_att_count, render->tilebuffer.nr_samples, true);

   const VkRenderingAttachmentLocationInfoKHR ral_info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
      .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
   };
   vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);

   hk_cmd_buffer_dirty_render_pass(cmd);

   /* Determine whether the render area is complete, enabling us to use a
    * fast-clear.
    *
    * TODO: If it is incomplete but tile aligned, it should be possible to
    * fast-clear with the appropriate settings. This is critical for
    * performance.
    */
   bool incomplete_render_area =
      render->area.offset.x > 0 || render->area.offset.y > 0 ||
      render->area.extent.width < render->cr.width ||
      render->area.extent.height < render->cr.height ||
      (render->view_mask &&
       render->view_mask != BITFIELD64_MASK(render->cr.layers));
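
   /* For example, a 64x64 render area at offset (8, 8) inside a 128x128
    * framebuffer is incomplete: tiles outside the area must be loaded back
    * and the clear routed through the 3D pipe below.
    */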

   perf_debug(cmd, "Rendering %ux%ux%u@%u %s%s", render->cr.width,
              render->cr.height, render->cr.layers,
              render->tilebuffer.nr_samples,
              render->view_mask ? " multiview" : "",
              incomplete_render_area ? " incomplete" : "");

   render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false,
                                        incomplete_render_area);
   render->cr.bg.partial =
      hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area);

   render->cr.eot.main =
      hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area);
   render->cr.eot.partial =
      hk_build_bg_eot(cmd, pRenderingInfo, true, true, incomplete_render_area);

   render->cr.isp_bgobjvals = 0x300;

   const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment;
   const VkRenderingAttachmentInfo *attach_s =
      pRenderingInfo->pStencilAttachment;

   struct ail_layout *z_layout = NULL, *s_layout = NULL;

   if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) {
      struct hk_image_view *view = render->depth_att.iview;
      struct hk_image *image =
         container_of(view->vk.image, struct hk_image, vk);

      z_layout = &image->planes[0].layout;

      unsigned level = view->vk.base_mip_level;
      unsigned first_layer = view->vk.base_array_layer;

      const struct util_format_description *desc =
         util_format_description(hk_format_to_pipe_format(view->vk.format));

      assert(desc->format == PIPE_FORMAT_Z32_FLOAT ||
             desc->format == PIPE_FORMAT_Z16_UNORM ||
             desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);

      render->cr.depth.buffer =
         hk_image_base_address(image, 0) +
         ail_get_layer_level_B(z_layout, first_layer, level);

      /* Main stride in pages */
      assert((z_layout->depth_px == 1 ||
              is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) &&
             "Page aligned Z layers");

      unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE;
      render->cr.depth.stride = ((stride_pages - 1) << 14) | 1;
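
      /* Worked example, assuming 16 KiB pages (AIL_PAGESIZE): a 32 KiB layer
       * stride is 2 pages and packs as ((2 - 1) << 14) | 1 = 0x4001; the low
       * bit appears to flag a valid explicit layer stride.
       */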

      assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile");

      if (z_layout->compressed) {
         render->cr.depth.meta =
            hk_image_base_address(image, 0) + z_layout->metadata_offset_B +
            (first_layer * z_layout->compression_layer_stride_B) +
            z_layout->level_offsets_compressed_B[level];

         /* Meta stride in cache lines */
         assert(
            is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) &&
            "Cacheline aligned Z meta layers");

         unsigned stride_lines =
            z_layout->compression_layer_stride_B / AIL_CACHELINE;
         render->cr.depth.meta_stride = (stride_lines - 1) << 14;
      }

      float clear_depth = attach_z->clearValue.depthStencil.depth;

      if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
         render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16);
      } else {
         render->cr.isp_bgobjdepth = fui(clear_depth);
      }
   }

   if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) {
      struct hk_image_view *view = render->stencil_att.iview;
      struct hk_image *image =
         container_of(view->vk.image, struct hk_image, vk);

      /* Stencil is always the last plane (possibly the only plane) */
      unsigned plane = image->plane_count - 1;
      s_layout = &image->planes[plane].layout;
      assert(s_layout->format == PIPE_FORMAT_S8_UINT);

      unsigned level = view->vk.base_mip_level;
      unsigned first_layer = view->vk.base_array_layer;

      render->cr.stencil.buffer =
         hk_image_base_address(image, plane) +
         ail_get_layer_level_B(s_layout, first_layer, level);

      /* Main stride in pages */
      assert((s_layout->depth_px == 1 ||
              is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) &&
             "Page aligned S layers");
      unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE;
      render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1;

      if (s_layout->compressed) {
         render->cr.stencil.meta =
            hk_image_base_address(image, plane) + s_layout->metadata_offset_B +
            (first_layer * s_layout->compression_layer_stride_B) +
            s_layout->level_offsets_compressed_B[level];

         /* Meta stride in cache lines */
         assert(
            is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) &&
            "Cacheline aligned S meta layers");

         unsigned stride_lines =
            s_layout->compression_layer_stride_B / AIL_CACHELINE;

         render->cr.stencil.meta_stride = (stride_lines - 1) << 14;
      }

      render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil;
   }

   hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z,
                       attach_s, incomplete_render_area, false);

   hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout,
                       attach_z, attach_s, incomplete_render_area, true);

   /* If multiview is disabled, always read 0. If multiview is enabled,
    * hk_set_view_index will dirty the root each draw.
    */
   cmd->state.gfx.descriptors.root.draw.view_index = 0;
   cmd->state.gfx.descriptors.root_dirty = true;

   if (render->flags & VK_RENDERING_RESUMING_BIT)
      return;

   /* The first control stream of the render pass is special since it gets
    * the clears. Create it and swap in the clear.
    */
   assert(!cmd->current_cs.gfx && "not already in a render pass");
   struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
   if (!cs)
      return;

   cs->cr.bg.main = render->cr.bg.main;
   cs->cr.zls_control = render->cr.zls_control;

   /* Reordering barrier for post-gfx, in case we had any. */
   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);

   /* Don't reorder compute across render passes.
    *
    * TODO: Check if this is necessary if the proper PipelineBarriers are
    * handled... there may be CTS bugs...
    */
   hk_cmd_buffer_end_compute(cmd);

   /* If we spill colour attachments, we need to decompress them. This happens
    * at the start of the render; it is not re-emitted when resuming
    * secondaries. It could be hoisted to the start of the command buffer but
    * we're not that clever yet.
    */
   if (agx_tilebuffer_spills(&render->tilebuffer)) {
      perf_debug(cmd, "eMRT render pass");

      for (unsigned i = 0; i < render->color_att_count; ++i) {
         struct hk_image_view *view = render->color_att[i].iview;
         if (view) {
            struct hk_image *image =
               container_of(view->vk.image, struct hk_image, vk);

            /* TODO: YCbCr interaction? */
            uint8_t plane = 0;
            uint8_t image_plane = view->planes[plane].image_plane;
            struct ail_layout *layout = &image->planes[image_plane].layout;

            if (ail_is_level_compressed(layout, view->vk.base_mip_level)) {
               perf_debug(cmd, "Decompressing in-place");

               unsigned level = view->vk.base_mip_level;
               unsigned layer = view->vk.base_array_layer;
               uint64_t base = hk_image_base_address(image, image_plane);

               struct libagx_decompress_images imgs = {
                  .compressed = view->planes[plane].emrt_texture,
                  .uncompressed = view->planes[plane].emrt_pbe,
               };

               struct agx_grid grid =
                  agx_3d(ail_metadata_width_tl(layout, level) * 32,
                         ail_metadata_height_tl(layout, level), layer_count);

               libagx_decompress(cmd, grid, AGX_BARRIER_ALL | AGX_PREGFX,
                                 layout, layer, level, base,
                                 hk_pool_upload(cmd, &imgs, sizeof(imgs), 64));
            }
         }
      }
   }

   uint32_t clear_count = 0;
   VkClearAttachment clear_att[HK_MAX_RTS + 1];
   bool resolved_clear = false;

   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
      const VkRenderingAttachmentInfo *att_info =
         &pRenderingInfo->pColorAttachments[i];
      if (att_info->imageView == VK_NULL_HANDLE ||
          att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      clear_att[clear_count++] = (VkClearAttachment){
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .colorAttachment = i,
         .clearValue = att_info->clearValue,
      };

      resolved_clear |= is_attachment_stored(att_info);
   }

   clear_att[clear_count] = (VkClearAttachment){
      .aspectMask = 0,
   };

   if (attach_z && attach_z->imageView != VK_NULL_HANDLE &&
       attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
      clear_att[clear_count].clearValue.depthStencil.depth =
         attach_z->clearValue.depthStencil.depth;

      resolved_clear |= is_attachment_stored(attach_z);
   }

   if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE &&
       attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      clear_att[clear_count].clearValue.depthStencil.stencil =
         attach_s->clearValue.depthStencil.stencil;

      resolved_clear |= is_attachment_stored(attach_s);
   }

   if (clear_att[clear_count].aspectMask != 0)
      clear_count++;

   if (clear_count > 0 && incomplete_render_area) {
      const VkClearRect clear_rect = {
         .rect = render->area,
         .baseArrayLayer = 0,
         .layerCount = render->view_mask ? 1 : render->layer_count,
      };

      hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count,
                             clear_att, 1, &clear_rect);
   } else {
      /* If a tile is empty, we do not want to process it, as the redundant
       * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount
       * of memory bandwidth. Any draw marks a tile as non-empty, so we only
       * need to process empty tiles if the background+EOT programs have a
       * side effect. This is the case exactly when there is an attachment we
       * are fast clearing and then storing.
       */
      cs->cr.process_empty_tiles = resolved_clear;
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdEndRendering(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   /* The last control stream of the render pass is special since it gets its
    * stores dropped. Swap it in.
    */
   struct hk_cs *cs = cmd->current_cs.gfx;
   if (cs) {
      cs->cr.eot.main = render->cr.eot.main;
   }

   perf_debug(cmd, "End rendering");
   hk_cmd_buffer_end_graphics(cmd);

   bool need_resolve = false;

   /* Translate render state back to VK for meta */
   VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS];
   for (uint32_t i = 0; i < render->color_att_count; i++) {
      if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
         need_resolve = true;

      vk_color_att[i] = (VkRenderingAttachmentInfo){
         .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
         .imageView = hk_image_view_to_handle(render->color_att[i].iview),
         .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .resolveMode = render->color_att[i].resolve_mode,
         .resolveImageView =
            hk_image_view_to_handle(render->color_att[i].resolve_iview),
         .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
      };
   }

   const VkRenderingAttachmentInfo vk_depth_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = hk_image_view_to_handle(render->depth_att.iview),
      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
      .resolveMode = render->depth_att.resolve_mode,
      .resolveImageView =
         hk_image_view_to_handle(render->depth_att.resolve_iview),
      .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
      need_resolve = true;

   const VkRenderingAttachmentInfo vk_stencil_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = hk_image_view_to_handle(render->stencil_att.iview),
      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
      .resolveMode = render->stencil_att.resolve_mode,
      .resolveImageView =
         hk_image_view_to_handle(render->stencil_att.resolve_iview),
      .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
      need_resolve = true;

   const VkRenderingInfo vk_render = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .renderArea = render->area,
      .layerCount = render->layer_count,
      .viewMask = render->view_mask,
      .colorAttachmentCount = render->color_att_count,
      .pColorAttachments = vk_color_att,
      .pDepthAttachment = &vk_depth_att,
      .pStencilAttachment = &vk_stencil_att,
   };

   if (render->flags & VK_RENDERING_SUSPENDING_BIT)
      need_resolve = false;

   memset(render, 0, sizeof(*render));

   if (need_resolve) {
      perf_debug(cmd, "Resolving render pass, colour store op %u",
                 vk_color_att[0].storeOp);

      hk_meta_resolve_rendering(cmd, &vk_render);
   }
}

static uint64_t
hk_geometry_state(struct hk_cmd_buffer *cmd)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   /* We tie heap allocation to geometry state allocation, so allocate now. */
   if (unlikely(!dev->heap)) {
      perf_debug(cmd, "Allocating heap");

      size_t size = 128 * 1024 * 1024;
      dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");

      /* The geometry state buffer is initialized here and then treated by
       * the CPU as rodata, even though the GPU uses it for scratch internally.
       */
      off_t off = dev->rodata.geometry_state - dev->rodata.bo->va->addr;
      struct agx_geometry_state *map = agx_bo_map(dev->rodata.bo) + off;

      *map = (struct agx_geometry_state){
         .heap = dev->heap->va->addr,
         .heap_size = size,
      };
   }

   /* We need to free all allocations after each command buffer execution */
   if (!cmd->uses_heap) {
      perf_debug(cmd, "Freeing heap");
      uint64_t addr = dev->rodata.geometry_state;

      /* Zeroing the allocated index frees everything */
      hk_queue_write(cmd,
                     addr + offsetof(struct agx_geometry_state, heap_bottom), 0,
                     true /* after gfx */);

      cmd->uses_heap = true;
   }

   return dev->rodata.geometry_state;
}

static uint64_t
hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
{
   assert(!agx_is_indirect(draw.b) && "indirect params written by GPU");

   struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};

   if (draw.indexed) {
      unsigned index_size_B = agx_index_size_to_B(draw.index_size);
      unsigned range_el = agx_draw_index_range_el(draw);

      ia.index_buffer = libagx_index_buffer(agx_draw_index_buffer(draw),
                                            range_el, 0, index_size_B);

      ia.index_buffer_range_el = range_el;
   }

   return hk_pool_upload(cmd, &ia, sizeof(ia), 8);
}

static enum mesa_prim
hk_gs_in_prim(struct hk_cmd_buffer *cmd)
{
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];

   if (tes != NULL)
      return gfx->tess.prim;
   else
      return vk_conv_topology(dyn->ia.primitive_topology);
}

static enum mesa_prim
hk_rast_prim(struct hk_cmd_buffer *cmd)
{
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   if (gs != NULL) {
      return gs->variants[HK_GS_VARIANT_RAST].info.gs.mode;
   } else {
      switch (dyn->ia.primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return MESA_PRIM_LINES;
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return MESA_PRIM_TRIANGLES;
      default:
         return hk_gs_in_prim(cmd);
      }
   }
}

static uint64_t
hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
{
   struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
   struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);

   bool rast_disc = dyn->rs.rasterizer_discard_enable;
   struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);

   /* XXX: We should deduplicate this logic */
   bool indirect = agx_is_indirect(draw.b) ||
                   gfx->shaders[MESA_SHADER_TESS_EVAL] || draw.restart;
   enum mesa_prim mode = hk_gs_in_prim(cmd);

   if (draw.restart) {
      mode = u_decomposed_prim(mode);
   }

   struct agx_geometry_params params = {
      .state = hk_geometry_state(cmd),
      .flat_outputs = fs ? fs->info.fs.interp.flat : 0,
      .input_topology = mode,

      /* Overridden by the indirect setup kernel. As tess->GS is always
       * indirect, we can assume here that we're VS->GS.
       */
      .input_buffer = desc->root.draw.vertex_output_buffer,
      .input_mask = desc->root.draw.vertex_outputs,
   };

   if (gfx->xfb_enabled) {
      for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) {
         params.xfb_base_original[i] = gfx->xfb[i].addr;
         params.xfb_size[i] = gfx->xfb[i].range;
         params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t);
      }
   }

   for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) {
      uint64_t q = gfx->xfb_query[i];

      if (q) {
         params.xfb_prims_generated_counter[i] = q;
         params.prims_generated_counter[i] = q + sizeof(uint64_t);
      }
   }

   /* Calculate input primitive count for direct draws, and allocate the
    * vertex & count buffers. GPU calculates and allocates for indirect draws.
    */
   params.count_buffer_stride = count->info.gs.count_words * 4;

   if (!count->info.gs.prefix_sum && params.count_buffer_stride) {
      struct agx_ptr T = hk_pool_alloc(cmd, 16, 4);
      memset(T.cpu, 0, 16);
      params.count_buffer = T.gpu;
   }

   if (indirect) {
      /* TODO: size */
      cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu;

      params.indirect_desc = cmd->geom_indirect;
      params.vs_grid[2] = params.gs_grid[2] = 1;
   } else {
      uint32_t verts = draw.b.count[0], instances = draw.b.count[1];

      params.vs_grid[0] = verts;
      params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts);

      params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
      params.input_primitives = params.gs_grid[0] * instances;
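
      /* E.g. a direct TRIANGLES draw of 30 vertices with 2 instances gives
       * gs_grid[0] = 10 decomposed primitives and input_primitives = 20.
       */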

      unsigned size = params.input_primitives * params.count_buffer_stride;
      if (count->info.gs.prefix_sum && size) {
         params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
      }

      cmd->geom_index_count =
         params.input_primitives * count->info.gs.max_indices;

      params.output_index_buffer =
         hk_pool_alloc(cmd, cmd->geom_index_count * 4, 4).gpu;

      cmd->geom_index_buffer = params.output_index_buffer;
   }

   desc->root_dirty = true;
   return hk_pool_upload(cmd, &params, sizeof(params), 8);
}

static void
hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
                      struct agx_draw draw)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);

   enum libagx_tess_partitioning partitioning =
      gfx->tess.info.spacing == TESS_SPACING_EQUAL
         ? LIBAGX_TESS_PARTITIONING_INTEGER
      : gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
         ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
         : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;

   struct libagx_tess_args args = {
      .heap = hk_geometry_state(cmd),
      .tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
      .statistic = hk_pipeline_stat_addr(
         cmd,
         VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT),

      .input_patch_size = dyn->ts.patch_control_points,
      .output_patch_size = tcs->info.tess.tcs_output_patch_size,
      .tcs_patch_constants = tcs->info.tess.tcs_nr_patch_outputs,
      .tcs_per_vertex_outputs = tcs->info.tess.tcs_per_vertex_outputs,
      .partitioning = partitioning,
      .points_mode = gfx->tess.info.points,
      .isolines = gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES,
   };

   if (!args.points_mode && gfx->tess.info.mode != TESS_PRIMITIVE_ISOLINES) {
      args.ccw = gfx->tess.info.ccw;
      args.ccw ^=
         dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
   }

   uint32_t draw_stride_el = 5;
   size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
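
   /* Five words match the VkDrawIndexedIndirectCommand layout (indexCount,
    * instanceCount, firstIndex, vertexOffset, firstInstance) that the
    * tessellator fills in for the later indexed indirect draw.
    */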

   /* heap is allocated by hk_geometry_state */
   args.patch_coord_buffer = dev->heap->va->addr;

   if (!agx_is_indirect(draw.b)) {
      unsigned in_patches = draw.b.count[0] / args.input_patch_size;
      unsigned unrolled_patches = in_patches * draw.b.count[1];

      uint32_t alloc = 0;
      uint32_t tcs_out_offs = alloc;
      alloc += unrolled_patches * args.tcs_stride_el * 4 * 32;

      uint32_t patch_coord_offs = alloc;
      alloc += unrolled_patches * 4 * 32;

      uint32_t count_offs = alloc;
      alloc += unrolled_patches * sizeof(uint32_t) * 32;

      /* Single API draw */
      uint32_t draw_offs = alloc;
      alloc += draw_stride_B;

      struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4);
      args.tcs_buffer = blob.gpu + tcs_out_offs;
      args.patches_per_instance = in_patches;
      args.coord_allocs = blob.gpu + patch_coord_offs;
      args.nr_patches = unrolled_patches;
      args.out_draws = blob.gpu + draw_offs;
      args.counts = blob.gpu + count_offs;
   } else {
      /* Allocate 3x indirect global+local grids for VS/TCS/tess */
      uint32_t grid_stride = sizeof(uint32_t) * 6;
      gfx->tess.grids = hk_pool_alloc(cmd, grid_stride * 3, 4).gpu;

      args.out_draws = hk_pool_alloc(cmd, draw_stride_B, 4).gpu;
   }

   gfx->tess.out_draws = args.out_draws;
   memcpy(out, &args, sizeof(args));
}

static struct hk_api_shader *
hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key,
                            hk_internal_builder_t builder)
{
   /* Try to get the cached shader */
   struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key);
   if (ent)
      return ent->data;

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                  &agx_nir_options, NULL);
   builder(&b, key->key);

   const struct vk_pipeline_robustness_state rs = {
      .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
      .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
      .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
      .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
   };

   struct vk_shader_compile_info info = {
      .stage = b.shader->info.stage,
      .nir = b.shader,
      .robustness = &rs,
   };

   hk_preprocess_nir_internal(dev->vk.physical, b.shader);

   struct hk_api_shader *s;
   if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS)
      return NULL;

   /* ...and cache it before we return. The key is on the stack right now, so
    * clone it before using it as a hash table key. The clone is logically
    * owned by the hash table.
    */
   size_t total_key_size = sizeof(*key) + key->key_size;
   void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size);

   _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s);
   return s;
}

struct hk_api_shader *
hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data,
               size_t data_size)
{
   size_t total_key_size = sizeof(struct hk_internal_key) + data_size;

   struct hk_internal_key *key = alloca(total_key_size);
   key->builder = builder;
   key->key_size = data_size;

   if (data_size)
      memcpy(key->key, data, data_size);

   simple_mtx_lock(&dev->kernels.lock);
   struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder);
   simple_mtx_unlock(&dev->kernels.lock);

   return s;
}
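
/* Usage sketch (hypothetical builder, not from this file): meta kernels are
 * keyed by the builder pointer plus the raw key bytes, so repeated requests
 * with equal data hit the cache in dev->kernels.ht:
 *
 *    static void build_fill(nir_builder *b, const void *key);
 *
 *    uint32_t value = 0xdeadbeef;
 *    struct hk_api_shader *s =
 *       hk_meta_shader(dev, build_fill, &value, sizeof(value));
 */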

static struct agx_draw
hk_draw_as_indexed_indirect(struct hk_cmd_buffer *cmd, struct agx_draw draw)
{
   assert(draw.indexed);

   if (agx_is_indirect(draw.b))
      return draw;

   VkDrawIndexedIndirectCommand desc = {
      .indexCount = draw.b.count[0],
      .instanceCount = draw.b.count[1],
      .firstIndex = draw.start,
      .vertexOffset = draw.index_bias,
      .firstInstance = draw.start_instance,
   };

   return agx_draw_indexed_indirect(
      hk_pool_upload(cmd, &desc, sizeof(desc), 4), draw.index_buffer,
      draw.index_buffer_range_B, draw.index_size, draw.restart);
}

static struct agx_draw
hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct agx_draw draw,
                        uint32_t draw_count)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   perf_debug(cmd, "Unrolling primitive restart due to GS/XFB");

   /* The unroll kernel assumes an indirect draw. Synthesize one if needed */
   draw = hk_draw_as_indexed_indirect(cmd, draw);

   /* Next, we unroll the index buffer used by the indirect draw */
   enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);

   assert(draw_count == 1 && "TODO: multidraw");

   struct libagx_unroll_restart_args ia = {
      .heap = hk_geometry_state(cmd),
      .index_buffer = draw.index_buffer,
      .in_draw = draw.b.ptr,
      .out_draw = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu,
      .max_draws = 1 /* TODO: MDI */,
      .restart_index = gfx->index.restart,
      .index_buffer_size_el = agx_draw_index_range_el(draw),
      .index_size_log2 = draw.index_size,
      .flatshade_first =
         dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT,
   };

   libagx_unroll_restart_struct(cmd, agx_1d(1024 * draw_count),
                                AGX_BARRIER_ALL | AGX_PREGFX, ia,
                                libagx_compact_prim(prim));

   return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr,
                                    dev->heap->size, draw.index_size,
                                    false /* restart */);
}

static struct agx_draw
hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                     struct agx_draw draw)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
   struct agx_grid grid_vs, grid_gs;

   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   bool rast_disc = dyn->rs.rasterizer_discard_enable;

   struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
   struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
   struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
   struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);

   uint64_t geometry_params = desc->root.draw.geometry_params;
   unsigned count_words = count->info.gs.count_words;

   if (false /* TODO */)
      perf_debug(cmd, "Transform feedback");
   else if (count_words)
      perf_debug(cmd, "Geometry shader with counts");
   else
      perf_debug(cmd, "Geometry shader without counts");

   enum mesa_prim mode = hk_gs_in_prim(cmd);

   if (draw.restart) {
      draw = hk_draw_without_restart(cmd, draw, 1);
      mode = u_decomposed_prim(mode);
   }

   /* Setup grids */
   if (agx_is_indirect(draw.b)) {
      struct libagx_gs_setup_indirect_args gsi = {
         .index_buffer = draw.index_buffer,
         .draw = draw.b.ptr,
         .ia = desc->root.draw.input_assembly,
         .p = desc->root.draw.geometry_params,
         .vs_outputs = vs->b.info.outputs,
         .prim = mode,
         .is_prefix_summing = count->info.gs.prefix_sum,
         .indices_per_in_prim = count->info.gs.max_indices,
      };

      if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
         gsi.vertex_buffer = desc->root.draw.tess_params +
                             offsetof(struct libagx_tess_args, tes_buffer);
      } else {
         gsi.vertex_buffer = desc->root.root_desc_addr +
                             offsetof(struct hk_root_descriptor_table,
                                      draw.vertex_output_buffer);
      }

      if (draw.indexed) {
         gsi.index_size_B = agx_index_size_to_B(draw.index_size);
         gsi.index_buffer_range_el = agx_draw_index_range_el(draw);
      }

      libagx_gs_setup_indirect_struct(cmd, agx_1d(1),
                                      AGX_BARRIER_ALL | AGX_PREGFX, gsi);

      grid_vs = agx_grid_indirect(
         geometry_params + offsetof(struct agx_geometry_params, vs_grid));

      grid_gs = agx_grid_indirect(
         geometry_params + offsetof(struct agx_geometry_params, gs_grid));
   } else {
      grid_vs = grid_gs = draw.b;
      grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
   }

   /* Launch the vertex shader first */
   hk_reserve_scratch(cmd, cs, vs);
   hk_dispatch_with_usc(dev, cs, &vs->b.info,
                        hk_upload_usc_words(cmd, vs,
                                            vs->info.stage == MESA_SHADER_VERTEX
                                               ? gfx->linked[MESA_SHADER_VERTEX]
                                               : vs->only_linked),
                        grid_vs, agx_workgroup(1, 1, 1));

   /* Transform feedback and various queries require extra dispatching,
    * determine if we need that here.
    */
   VkQueryPipelineStatisticFlagBits gs_queries =
      VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
      VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
      VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
      VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT;

   struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root;
   bool xfb_or_queries =
      main->info.gs.xfb || (root->draw.pipeline_stats_flags & gs_queries);

   if (xfb_or_queries) {
      /* If we need counts, launch the count shader and prefix sum the
       * results.
       */
      if (count_words) {
         perf_debug(dev, "Geometry shader count");
         hk_dispatch_with_local_size(cmd, cs, count, grid_gs,
                                     agx_workgroup(1, 1, 1));
      }

      if (count->info.gs.prefix_sum) {
         perf_debug(dev, "Geometry shader transform feedback prefix sum");
         libagx_prefix_sum_geom(cmd, agx_1d(1024 * count_words),
                                AGX_BARRIER_ALL | AGX_PREGFX, geometry_params);
      }

      /* Transform feedback / query program */
      perf_debug(dev, "Transform feedback / geometry query");
      hk_dispatch_with_local_size(cmd, cs, pre_gs, agx_1d(1),
                                  agx_workgroup(1, 1, 1));
   }

   /* Pre-rast geometry shader */
   hk_dispatch_with_local_size(cmd, cs, main, grid_gs, agx_workgroup(1, 1, 1));

   if (agx_is_indirect(draw.b)) {
      return agx_draw_indexed_indirect(cmd->geom_indirect, dev->heap->va->addr,
                                       dev->heap->size, AGX_INDEX_SIZE_U32,
                                       true);
   } else {
      return agx_draw_indexed(cmd->geom_index_count, 1, 0, 0, 0,
                              cmd->geom_index_buffer, cmd->geom_index_count * 4,
                              AGX_INDEX_SIZE_U32, true);
   }
}

static struct agx_draw
hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
               struct agx_draw draw, uint64_t c_prims, uint64_t c_inv)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct agx_grid grid_vs, grid_tcs, grid_tess;

   struct hk_shader *vs = hk_bound_sw_vs(gfx);
   struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);

   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   uint32_t input_patch_size = dyn->ts.patch_control_points;
   uint64_t state = gfx->descriptors.root.draw.tess_params;
   struct hk_tess_info info = gfx->tess.info;

   perf_debug(cmd, "Tessellation");

   uint64_t tcs_stat = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT);

   /* Setup grids */
   if (agx_is_indirect(draw.b)) {
      perf_debug(cmd, "Indirect tessellation");

      struct libagx_tess_setup_indirect_args args = {
         .p = state,
         .grids = gfx->tess.grids,
         .indirect = draw.b.ptr,
         .ia = gfx->descriptors.root.draw.input_assembly,
         .vertex_outputs = vs->b.info.outputs,
         .vertex_output_buffer_ptr =
            gfx->root + offsetof(struct hk_root_descriptor_table,
                                 draw.vertex_output_buffer),
         .tcs_statistic = hk_pipeline_stat_addr(
            cmd,
            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT),
      };

      if (draw.indexed) {
         args.in_index_buffer = draw.index_buffer;
         args.in_index_size_B = agx_index_size_to_B(draw.index_size);
         args.in_index_buffer_range_el = agx_draw_index_range_el(draw);
      }

      libagx_tess_setup_indirect_struct(cmd, agx_1d(1),
                                        AGX_BARRIER_ALL | AGX_PREGFX, args);

      uint32_t grid_stride = sizeof(uint32_t) * 6;
      grid_vs = agx_grid_indirect_local(gfx->tess.grids + 0 * grid_stride);
      grid_tcs = agx_grid_indirect_local(gfx->tess.grids + 1 * grid_stride);
      grid_tess = agx_grid_indirect_local(gfx->tess.grids + 2 * grid_stride);
   } else {
      uint32_t patches = draw.b.count[0] / input_patch_size;
      grid_vs = grid_tcs = draw.b;

      grid_tcs.count[0] = patches * tcs->info.tess.tcs_output_patch_size;
      grid_tess = agx_1d(patches * draw.b.count[1]);
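
      /* E.g. a direct draw of 12 vertices with patch_control_points = 4 and
       * 2 instances yields patches = 3 and a 6-wide tessellation grid.
       */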
|
|
|
|
/* TCS invocation counter increments once per-patch */
|
|
if (tcs_stat) {
|
|
perf_debug(cmd, "Direct TCS statistic");
|
|
libagx_increment_statistic(
|
|
cmd, agx_1d(1), AGX_BARRIER_ALL | AGX_PREGFX, tcs_stat, patches);
|
|
}
|
|
}
|
|
|
|
/* First launch the VS and TCS */
|
|
hk_reserve_scratch(cmd, cs, vs);
|
|
hk_reserve_scratch(cmd, cs, tcs);
|
|
|
|
hk_dispatch_with_usc(
|
|
dev, cs, &vs->b.info,
|
|
hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs,
|
|
agx_workgroup(64, 1, 1));
|
|
|
|
hk_dispatch_with_usc(
|
|
dev, cs, &tcs->b.info, hk_upload_usc_words(cmd, tcs, tcs->only_linked),
|
|
grid_tcs, agx_workgroup(tcs->info.tess.tcs_output_patch_size, 1, 1));
|
|
|
|
/* First generate counts, then prefix sum them, and then tessellate. */
|
|
libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode,
|
|
LIBAGX_TESS_MODE_COUNT, state);
|
|
|
|
libagx_prefix_sum_tess(cmd, agx_1d(1024), AGX_BARRIER_ALL | AGX_PREGFX,
|
|
state, c_prims, c_inv, c_prims || c_inv);
|
|
|
|
libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode,
|
|
LIBAGX_TESS_MODE_WITH_COUNTS, state);
|
|
|
|
return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr,
|
|
dev->heap->size, AGX_INDEX_SIZE_U32, false);
|
|
}
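
/* The three libagx launches above implement a count / prefix-sum / emit
 * pipeline. As an illustrative sketch (assuming the count pass records
 * per-patch output sizes): counts {6, 9, 3} prefix-sum to offsets {0, 6, 15},
 * so the WITH_COUNTS pass knows where each patch's indices land in the heap.
 */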

void
hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd,
                            const gl_shader_stage stage,
                            struct hk_api_shader *shader)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   /* Null fragment shaders are annoying to handle, because we may still need an
    * actual fragment shader to attach a prolog to. Rather than adding special
    * cases, we just bind an empty fragment shader instead of NULL to make
    * everything work correctly.
    */
   if (stage == MESA_SHADER_FRAGMENT && shader == NULL) {
      shader = dev->null_fs;
   }

   assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
   if (cmd->state.gfx.shaders[stage] == shader)
      return;

   cmd->state.gfx.shaders[stage] = shader;
   cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);

   if (stage == MESA_SHADER_FRAGMENT) {
      BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
   }
}
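
/* Note (inferred from the fast-link key construction in
 * hk_flush_dynamic_state): rebinding the fragment shader dirties
 * MS_RASTERIZATION_SAMPLES because the FS epilog key embeds the sample
 * count, so the fast-linked FS must be revalidated.
 */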

static void
hk_flush_shaders(struct hk_cmd_buffer *cmd)
{
   if (cmd->state.gfx.shaders_dirty == 0)
      return;

   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
   desc->root_dirty = true;

   /* Geometry shading overrides the restart index, reemit on rebind */
   if (IS_SHADER_DIRTY(GEOMETRY)) {
      struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];

      desc->root.draw.api_gs = gs && !gs->is_passthrough;
   }

   struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
   struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];

   /* If we have a new VS/FS pair, UVS locations may have changed so we need
    * to relink. We do this here because there's no dependence on the fast
    * linked shaders.
    */
   agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs,
                  fs ? hk_only_variant(fs)->info.fs.interp.flat : 0,
                  fs ? hk_only_variant(fs)->info.fs.interp.linear : 0);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
      desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i];
   }
}

static struct agx_shader_part *
hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key,
                            hk_internal_builder_t builder, bool preprocess_nir,
                            bool stop, unsigned cf_base)
{
   /* Try to get the cached shader */
   struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key);
   if (ent)
      return ent->data;

   nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL);
   builder(&b, key->key);

   if (preprocess_nir)
      agx_preprocess_nir(b.shader);

   struct agx_shader_key backend_key = {
      .dev = agx_gather_device_key(&dev->dev),
      .secondary = true,
      .no_stop = !stop,
   };

   /* We always use dynamic sample shading in the GL driver. Indicate that. */
   if (b.shader->info.stage == MESA_SHADER_FRAGMENT) {
      backend_key.fs.cf_base = cf_base;

      if (b.shader->info.fs.uses_sample_shading)
         backend_key.fs.inside_sample_loop = true;
   }

   struct agx_shader_part *part =
      rzalloc(dev->prolog_epilog.ht, struct agx_shader_part);

   agx_compile_shader_nir(b.shader, &backend_key, part);

   ralloc_free(b.shader);

   /* ...and cache it before we return. The key is on the stack right now, so
    * clone it before using it as a hash table key. The clone is logically
    * owned by the hash table.
    */
   size_t total_key_size = sizeof(*key) + key->key_size;
   void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size);

   _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part);
   return part;
}

static struct agx_shader_part *
hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size,
                     hk_internal_builder_t builder, bool preprocess_nir,
                     bool stop, unsigned cf_base)
{
   /* Build the meta shader key */
   size_t total_key_size = sizeof(struct hk_internal_key) + data_size;

   struct hk_internal_key *key = alloca(total_key_size);
   key->builder = builder;
   key->key_size = data_size;

   if (data_size)
      memcpy(key->key, data, data_size);

   simple_mtx_lock(&dev->prolog_epilog.lock);

   struct agx_shader_part *part = hk_get_prolog_epilog_locked(
      dev, key, builder, preprocess_nir, stop, cf_base);

   simple_mtx_unlock(&dev->prolog_epilog.lock);
   return part;
}

static struct hk_linked_shader *
hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader,
                             struct hk_fast_link_key_vs *key)
{
   struct agx_shader_part *prolog =
      hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
                           agx_nir_vs_prolog, false, false, 0);

   struct hk_linked_shader *linked =
      hk_fast_link(dev, false, shader, prolog, NULL, 0);

   struct hk_fast_link_key *key_clone =
      ralloc_memdup(shader->linked.ht, key, sizeof(*key));

   /* XXX: Fix this higher up the stack */
   linked->sw_indexing = !key->prolog.hw || key->prolog.adjacency;
   linked->b.uses_base_param |= linked->sw_indexing;

   _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
   return linked;
}

static void
build_fs_prolog(nir_builder *b, const void *key)
{
   agx_nir_fs_prolog(b, key);

   /* Lower load_stat_query_address_agx, needed for FS statistics */
   NIR_PASS(_, b->shader, hk_lower_uvs_index, 0);
}

static struct hk_linked_shader *
hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader,
                             struct hk_fast_link_key_fs *key)
{
   /* TODO: prolog without fs needs to work too... */
   bool needs_prolog = key->prolog.statistics ||
                       key->prolog.cull_distance_size ||
                       key->prolog.api_sample_mask != 0xff;

   struct agx_shader_part *prolog = NULL;
   if (needs_prolog) {
      prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
                                    build_fs_prolog, false, false,
                                    key->prolog.cf_base);
   }

   /* If sample shading is used, don't stop at the epilog, there's a
    * footer that the fast linker will insert to stop.
    */
   bool epilog_stop = (key->nr_samples_shaded == 0);

   struct agx_shader_part *epilog =
      hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog),
                           agx_nir_fs_epilog, true, epilog_stop, 0);

   struct hk_linked_shader *linked =
      hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded);

   struct hk_fast_link_key *key_clone =
      ralloc_memdup(shader->linked.ht, key, sizeof(*key));

   _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
   return linked;
}

/*
 * First, look for a fully linked variant. Else, build the required shader
 * parts and link.
 */
static struct hk_linked_shader *
hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key)
{
   struct hk_linked_shader *linked;
   simple_mtx_lock(&shader->linked.lock);

   struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);

   if (ent)
      linked = ent->data;
   else if (shader->info.stage == MESA_SHADER_VERTEX)
      linked = hk_get_fast_linked_locked_vs(dev, shader, key);
   else if (shader->info.stage == MESA_SHADER_FRAGMENT)
      linked = hk_get_fast_linked_locked_fs(dev, shader, key);
   else
      unreachable("invalid stage");

   simple_mtx_unlock(&shader->linked.lock);
   return linked;
}

static void
hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader,
                      void *key)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key);
   gl_shader_stage stage = shader->info.stage;

   if (cmd->state.gfx.linked[stage] != new) {
      cmd->state.gfx.linked[stage] = new;
      cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage);
   }
}

static enum agx_polygon_mode
translate_polygon_mode(VkPolygonMode vk_mode)
{
   static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL ==
                 AGX_POLYGON_MODE_FILL);
   static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE ==
                 AGX_POLYGON_MODE_LINE);
   static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT ==
                 AGX_POLYGON_MODE_POINT);

   assert(vk_mode <= VK_POLYGON_MODE_POINT);
   return (enum agx_polygon_mode)vk_mode;
}

static enum agx_zs_func
translate_compare_op(VkCompareOp vk_mode)
{
   static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL ==
                 AGX_ZS_FUNC_LEQUAL);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER ==
                 AGX_ZS_FUNC_GREATER);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL ==
                 AGX_ZS_FUNC_NOT_EQUAL);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL ==
                 AGX_ZS_FUNC_GEQUAL);
   static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS);

   assert(vk_mode <= VK_COMPARE_OP_ALWAYS);
   return (enum agx_zs_func)vk_mode;
}

static enum agx_stencil_op
translate_stencil_op(VkStencilOp vk_op)
{
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP ==
                 AGX_STENCIL_OP_KEEP);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO ==
                 AGX_STENCIL_OP_ZERO);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE ==
                 AGX_STENCIL_OP_REPLACE);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP ==
                 AGX_STENCIL_OP_INCR_SAT);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP ==
                 AGX_STENCIL_OP_DECR_SAT);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT ==
                 AGX_STENCIL_OP_INVERT);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP ==
                 AGX_STENCIL_OP_INCR_WRAP);
   static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP ==
                 AGX_STENCIL_OP_DECR_WRAP);

   return (enum agx_stencil_op)vk_op;
}

static void
hk_ppp_push_stencil_face(struct agx_ppp_update *ppp,
                         struct vk_stencil_test_face_state s, bool enabled)
{
   if (enabled) {
      agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
         cfg.compare = translate_compare_op(s.op.compare);
         cfg.write_mask = s.write_mask;
         cfg.read_mask = s.compare_mask;

         cfg.depth_pass = translate_stencil_op(s.op.pass);
         cfg.depth_fail = translate_stencil_op(s.op.depth_fail);
         cfg.stencil_fail = translate_stencil_op(s.op.fail);
      }
   } else {
      agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
         cfg.compare = AGX_ZS_FUNC_ALWAYS;
         cfg.write_mask = 0xFF;
         cfg.read_mask = 0xFF;

         cfg.depth_pass = AGX_STENCIL_OP_KEEP;
         cfg.depth_fail = AGX_STENCIL_OP_KEEP;
         cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
      }
   }
}

static bool
hk_stencil_test_enabled(struct hk_cmd_buffer *cmd)
{
   const struct hk_rendering_state *render = &cmd->state.gfx.render;
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   return dyn->ds.stencil.test_enable &&
          render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
}

static void
hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
{
   const struct vk_dynamic_graphics_state *dyn =
      &cmd->vk.dynamic_graphics_state;

   /* We always need at least 1 viewport for the hardware. With rasterizer
    * discard the app may not supply any, but we can just program garbage.
    */
   unsigned count = MAX2(dyn->vp.viewport_count, 1);

   unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0};
   unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0};

   /* We implicitly scissor to the viewport. We need to do a min/max dance to
    * handle inverted viewports.
    */
   for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
      const VkViewport *vp = &dyn->vp.viewports[i];

      minx[i] = MIN2(vp->x, vp->x + vp->width);
      miny[i] = MIN2(vp->y, vp->y + vp->height);
      maxx[i] = MAX2(vp->x, vp->x + vp->width);
      maxy[i] = MAX2(vp->y, vp->y + vp->height);
   }

   /* Additionally clamp to the framebuffer so we don't rasterize
    * off-screen pixels. TODO: Is this necessary? The GL driver does this but
    * it might be cargoculted at this point.
    */
   for (unsigned i = 0; i < count; ++i) {
      minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width);
      maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width);
      miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height);
      maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height);
   }

   /* We additionally apply any API scissors */
   for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) {
      const VkRect2D *s = &dyn->vp.scissors[i];

      minx[i] = MAX2(minx[i], s->offset.x);
      miny[i] = MAX2(miny[i], s->offset.y);
      maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width);
      maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height);
   }

   /* Upload a hardware scissor for each viewport, whether there's a
    * corresponding API scissor or not.
    */
   unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH;
   struct agx_scissor_packed *scissors =
      util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH);

   for (unsigned i = 0; i < count; ++i) {
      const VkViewport *vp = &dyn->vp.viewports[i];

      agx_pack(scissors + i, SCISSOR, cfg) {
         cfg.min_x = minx[i];
         cfg.min_y = miny[i];
         cfg.max_x = maxx[i];
         cfg.max_y = maxy[i];

         /* These settings in conjunction with the PPP control depth clip/clamp
          * settings implement depth clip/clamping. Properly setting them
          * together is required for conformant depth clip enable.
          *
          * TODO: Reverse-engineer the finer interactions here.
          */
         if (dyn->rs.depth_clamp_enable) {
            cfg.min_z = MIN2(vp->minDepth, vp->maxDepth);
            cfg.max_z = MAX2(vp->minDepth, vp->maxDepth);
         } else {
            cfg.min_z = 0.0;
            cfg.max_z = 1.0;
         }
      }
   }

   /* Upload state */
   struct AGX_PPP_HEADER present = {
      .depth_bias_scissor = true,
      .region_clip = true,
      .viewport = true,
      .viewport_count = count,
   };

   size_t size = agx_ppp_update_size(&present);
   struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
   if (!T.cpu)
      return;

   struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);

   agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
      cfg.scissor = index;

      /* Use the current depth bias, we allocate linearly */
      unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
      cfg.depth_bias = count ? count - 1 : 0;
   };

   for (unsigned i = 0; i < count; ++i) {
      agx_ppp_push(&ppp, REGION_CLIP, cfg) {
         cfg.enable = true;
         cfg.min_x = minx[i] / 32;
         cfg.min_y = miny[i] / 32;
         cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
         cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
      }
   }

   agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
      ;

   /* Upload viewports */
   for (unsigned i = 0; i < count; ++i) {
      const VkViewport *vp = &dyn->vp.viewports[i];

      agx_ppp_push(&ppp, VIEWPORT, cfg) {
         cfg.translate_x = vp->x + 0.5f * vp->width;
         cfg.translate_y = vp->y + 0.5f * vp->height;
         cfg.translate_z = vp->minDepth;

         cfg.scale_x = vp->width * 0.5f;
         cfg.scale_y = vp->height * 0.5f;
         cfg.scale_z = vp->maxDepth - vp->minDepth;
      }
   }

   agx_ppp_fini(out, &ppp);
}
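
/* Worked example of the viewport transform above (illustrative): a viewport
 * with x = 0 and width = 1920 gives translate_x = 960 and scale_x = 960, so
 * NDC x = -1 maps to window x = 0 and NDC x = +1 maps to window x = 1920.
 */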

static enum agx_object_type
translate_object_type(enum mesa_prim topology)
{
   static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP);
   static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP);

   if (topology == MESA_PRIM_POINTS)
      return AGX_OBJECT_TYPE_POINT_SPRITE_UV01;
   else if (topology <= MESA_PRIM_LINE_STRIP)
      return AGX_OBJECT_TYPE_LINE;
   else
      return AGX_OBJECT_TYPE_TRIANGLE;
}

static enum agx_primitive
translate_hw_primitive_topology(enum mesa_prim prim)
{
   switch (prim) {
   case MESA_PRIM_POINTS:
      return AGX_PRIMITIVE_POINTS;
   case MESA_PRIM_LINES:
      return AGX_PRIMITIVE_LINES;
   case MESA_PRIM_LINE_STRIP:
      return AGX_PRIMITIVE_LINE_STRIP;
   case MESA_PRIM_TRIANGLES:
      return AGX_PRIMITIVE_TRIANGLES;
   case MESA_PRIM_TRIANGLE_STRIP:
      return AGX_PRIMITIVE_TRIANGLE_STRIP;
   case MESA_PRIM_TRIANGLE_FAN:
      return AGX_PRIMITIVE_TRIANGLE_FAN;
   default:
      unreachable("Invalid hardware primitive topology");
   }
}

static inline enum agx_vdm_vertex
translate_vdm_vertex(unsigned vtx)
{
   static_assert(AGX_VDM_VERTEX_0 == 0);
   static_assert(AGX_VDM_VERTEX_1 == 1);
   static_assert(AGX_VDM_VERTEX_2 == 2);

   assert(vtx <= 2);
   return vtx;
}

static inline enum agx_ppp_vertex
translate_ppp_vertex(unsigned vtx)
{
   static_assert(AGX_PPP_VERTEX_0 == 0 + 1);
   static_assert(AGX_PPP_VERTEX_1 == 1 + 1);
   static_assert(AGX_PPP_VERTEX_2 == 2 + 1);

   assert(vtx <= 2);
   return vtx + 1;
}
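
/* Per the static_asserts above, the VDM numbers provoking vertices from 0
 * while the PPP numbers them from 1, so the same gfx->provoking value feeds
 * both translate_vdm_vertex and translate_ppp_vertex.
 */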

static void
hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
{
   uint32_t index = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]
                       ? BITFIELD_MASK(32)
                       : cmd->state.gfx.index.restart;

   /* VDM state updates are relatively expensive, so only emit them when the
    * restart index changes. This is simpler than accurate dirty tracking.
    */
   if (cs->restart_index != index) {
      uint8_t *out = cs->current;
      agx_push(out, VDM_STATE, cfg) {
         cfg.restart_index_present = true;
      }

      agx_push(out, VDM_STATE_RESTART_INDEX, cfg) {
         cfg.value = index;
      }

      cs->current = out;
      cs->restart_index = index;
   }
}
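
/* When a geometry shader is bound, the hardware draw consumes an internally
 * generated U32 index buffer with restart enabled (see the
 * agx_draw_indexed(_indirect) calls in the geometry path above), so the
 * restart index is pinned to BITFIELD_MASK(32) rather than the API value.
 */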

/*
 * Return the given sample positions, packed into a 32-bit word with fixed
 * point nibbles for each x/y component of the (at most 4) samples. This is
 * suitable for programming the PPP_MULTISAMPLECTL control register.
 */
static uint32_t
hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl)
{
   uint32_t ctrl = 0;

   for (int32_t i = sl->per_pixel - 1; i >= 0; i--) {
      VkSampleLocationEXT loc = sl->locations[i];

      uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0;
      uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0;

      assert(x <= 15);
      assert(y <= 15);

      /* Push bytes in reverse order so we can use constant shifts. */
      ctrl = (ctrl << 8) | (y << 4) | x;
   }

   return ctrl;
}
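
/* Worked example (illustrative): a single sample at the pixel center
 * (0.5, 0.5) packs as x = y = 0.5 * 16 = 8, giving 0x88, the same value
 * hk_default_sample_positions returns for the 1-sample case below.
 */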

/*
 * Return the standard sample positions, prepacked as above for efficiency.
 */
uint32_t
hk_default_sample_positions(unsigned nr_samples)
{
   switch (nr_samples) {
   case 0:
   case 1:
      return 0x88;
   case 2:
      return 0x44cc;
   case 4:
      return 0xeaa26e26;
   default:
      unreachable("Invalid sample count");
   }
}
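
/* For reference, these match hk_pack_ppp_multisamplectrl applied to the
 * standard positions: e.g. the 4x pattern (0.375, 0.125), (0.875, 0.375),
 * (0.125, 0.625), (0.625, 0.875) scales by 16 to (6,2), (14,6), (2,10),
 * (10,14), i.e. bytes 0x26, 0x6e, 0xa2, 0xea from sample 0 up: 0xeaa26e26.
 */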

static void
hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
{
   const struct hk_rendering_state *render = &cmd->state.gfx.render;
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
   struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);

   bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) ||
                      IS_SHADER_DIRTY(GEOMETRY);
   bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);

   struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
   bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);

   bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS;

   bool face_dirty =
      IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) ||
      IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) ||
      IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty;

   bool stencil_face_dirty =
      IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) ||
      IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE);

   struct AGX_PPP_HEADER dirty = {
      .fragment_control =
         IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
         IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION,

      .fragment_control_2 =
         IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty,

      .fragment_front_face = face_dirty,
      .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
      .fragment_front_stencil = stencil_face_dirty,
      .fragment_back_face = face_dirty,
      .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
      .fragment_back_stencil = stencil_face_dirty,
      .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty,
      .varying_counts_32 = varyings_dirty,
      .varying_counts_16 = varyings_dirty,
      .cull = IS_DIRTY(RS_CULL_MODE) ||
              IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) ||
              IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) ||
              IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) ||
              IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
              (gfx->dirty & HK_DIRTY_PROVOKING) || IS_SHADER_DIRTY(TESS_CTRL) ||
              IS_SHADER_DIRTY(TESS_EVAL) || IS_DIRTY(TS_DOMAIN_ORIGIN),
      .cull_2 = varyings_dirty,

      /* With a null FS, the fragment shader PPP word is ignored and doesn't
       * need to be present.
       */
      .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty ||
                                gfx->descriptors.root_dirty),

      .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION,
      .output_size = hw_vs_dirty,
      .viewport_count = 1, /* irrelevant */
   };

   /* Calculate the update size. If it equals the header, there is nothing to
    * update so early-exit.
    */
   size_t size = agx_ppp_update_size(&dirty);
   if (size == AGX_PPP_HEADER_LENGTH)
      return;

   /* Otherwise, allocate enough space for the update and push it. */
   assert(size > AGX_PPP_HEADER_LENGTH);

   struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
   if (!T.cpu)
      return;

   struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);

   if (dirty.fragment_control) {
      agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
         cfg.visibility_mode = gfx->occlusion.mode;
         cfg.stencil_test_enable = hk_stencil_test_enabled(cmd);

         /* TODO: Consider optimizing this? */
         cfg.two_sided_stencil = cfg.stencil_test_enable;

         cfg.depth_bias_enable = dyn->rs.depth_bias.enable &&
                                 gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE;

         /* Always enable scissoring so we may scissor to the viewport (TODO:
          * optimize this out if the viewport is the default and the app does
          * not use the scissor test)
          */
         cfg.scissor_enable = true;

         /* This avoids broken derivatives along primitive edges */
         cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE;
      }
   }

   if (dirty.fragment_control_2) {
      if (linked_fs) {
         /* Annoyingly, rasterizer_discard seems to be ignored (sometimes?) in
          * the main fragment control word and has to be combined into the
          * secondary word for reliable behaviour.
          */
         agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
                             linked_fs->b.fragment_control) {
            cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
         }
      } else {
         /* If there is no fragment shader, we must disable tag writes to avoid
          * executing the missing shader. This optimizes depth-only passes.
          */
         agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
            cfg.tag_write_disable = true;
            cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
         }
      }
   }

   struct agx_fragment_face_packed fragment_face = {};
   struct agx_fragment_face_2_packed fragment_face_2 = {};

   if (dirty.fragment_front_face) {
      bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
      bool z_test = has_z && dyn->ds.depth.test_enable;

      agx_pack(&fragment_face, FRAGMENT_FACE, cfg) {
         cfg.line_width = agx_pack_line_width(dyn->rs.line.width);
         cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode);
         cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable);

         if (z_test && !gfx->descriptors.root.draw.force_never_in_shader)
            cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op);
         else
            cfg.depth_function = AGX_ZS_FUNC_ALWAYS;
      };

      agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
         cfg.stencil_reference = dyn->ds.stencil.front.reference;
      }
   }

   if (dirty.fragment_front_face_2) {
      if (fs) {
         agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) {
            cfg.object_type = gfx->object_type;
         }

         agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2);
         agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
      } else {
         agx_ppp_fragment_face_2(&ppp, gfx->object_type, NULL);
      }
   }

   if (dirty.fragment_front_stencil) {
      hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front,
                               hk_stencil_test_enabled(cmd));
   }

   if (dirty.fragment_back_face) {
      assert(dirty.fragment_front_face);

      agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
         cfg.stencil_reference = dyn->ds.stencil.back.reference;
      }
   }

   if (dirty.fragment_back_face_2) {
      assert(dirty.fragment_front_face_2);

      agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
   }

   if (dirty.fragment_back_stencil) {
      hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back,
                               hk_stencil_test_enabled(cmd));
   }

   if (dirty.output_select) {
      struct agx_output_select_packed osel = hw_vs->info.uvs.osel;

      if (linked_fs) {
         agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel,
                                   &linked_fs->b.osel);
      } else {
         agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT);
      }
   }

   assert(dirty.varying_counts_32 == dirty.varying_counts_16);

   if (dirty.varying_counts_32) {
      agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32,
                          VARYING_COUNTS);

      agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16,
                          VARYING_COUNTS);
   }

   if (dirty.cull) {
      agx_ppp_push(&ppp, CULL, cfg) {
         cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT;
         cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT;
         cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;

         if (gfx->shaders[MESA_SHADER_TESS_CTRL] &&
             !gfx->shaders[MESA_SHADER_GEOMETRY]) {
            cfg.front_face_ccw ^= gfx->tess.info.ccw;
            cfg.front_face_ccw ^= dyn->ts.domain_origin ==
                                  VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
         }

         cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
         cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;

         /* We do not support unrestricted depth, so clamping is inverted from
          * clipping. This implementation seems to pass CTS without
          * unrestricted depth support.
          *
          * TODO: Make sure this is right with gl_FragDepth.
          */
         cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
         cfg.depth_clamp = !cfg.depth_clip;

         cfg.primitive_msaa =
            gfx->object_type == AGX_OBJECT_TYPE_LINE &&
            dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
      }
   }

   if (dirty.cull_2) {
      agx_ppp_push(&ppp, CULL_2, cfg) {
         cfg.needs_primitive_id = gfx->generate_primitive_id;
         cfg.clamp_w = true;
      }
   }

   if (dirty.fragment_shader) {
      /* TODO: Do less often? */
      hk_reserve_scratch(cmd, cs, fs);

      agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0);

      agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
         cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs);
      }

      agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
         cfg.cf_bindings = gfx->varyings;
      }

      agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg)
         ;
   }

   if (dirty.occlusion_query) {
      agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
         cfg.index = gfx->occlusion.index;
      }
   }

   if (dirty.output_size) {
      agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) {
         cfg.count = hw_vs->info.uvs.size;
      }
   }

   agx_ppp_fini(out, &ppp);
}

/*
 * Based somewhat on the calculation in the PowerVR driver, and mostly trial &
 * error to pass CTS. This is a mess.
 */
static float
hk_depth_bias_factor(VkFormat format, bool exact, bool force_unorm)
{
   if (format == VK_FORMAT_D16_UNORM) {
      return exact ? (1 << 16) : (1 << 15);
   } else if (force_unorm) {
      return exact ? (1ull << 24) : (1ull << 23);
   } else {
      return 1.0;
   }
}
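
/* Illustrative numbers: the caller divides the API constant_factor by this
 * value, so for D16_UNORM with the exact representation a constant_factor of
 * 1.0 contributes 1 / 65536, roughly one least-significant bit of 16-bit
 * depth, per bias unit.
 */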

static void
hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                       uint32_t draw_id, struct agx_draw draw)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   const struct hk_rendering_state *render = &cmd->state.gfx.render;
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   struct hk_graphics_state *gfx = &cmd->state.gfx;

   struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
   struct hk_shader *sw_vs = hk_bound_sw_vs(gfx);

   if (!vk_dynamic_graphics_state_any_dirty(dyn) && !gfx->dirty &&
       !gfx->descriptors.root_dirty && !gfx->shaders_dirty &&
       !sw_vs->b.info.uses_draw_id && !sw_vs->b.info.uses_base_param &&
       !(gfx->linked[MESA_SHADER_VERTEX] &&
         gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param))
      return;

   struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;

   assert(cs->current + 0x1000 < cs->end && "already ensured space");
   uint8_t *out = cs->current;

   struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);

   bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) ||
                   IS_SHADER_DIRTY(GEOMETRY);
   bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty;
   bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);

   if (IS_DIRTY(CB_BLEND_CONSTANTS)) {
      static_assert(sizeof(desc->root.draw.blend_constant) ==
                       sizeof(dyn->cb.blend_constants) &&
                    "common size");

      memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants,
             sizeof(dyn->cb.blend_constants));
      desc->root_dirty = true;
   }

   if (IS_DIRTY(MS_SAMPLE_MASK)) {
      desc->root.draw.api_sample_mask = dyn->ms.sample_mask;
      desc->root_dirty = true;
   }

   if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
       IS_DIRTY(DS_DEPTH_COMPARE_OP)) {

      const struct hk_rendering_state *render = &cmd->state.gfx.render;
      bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
      bool z_test = has_z && dyn->ds.depth.test_enable;

      desc->root.draw.force_never_in_shader =
         z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs &&
         fs->info.fs.writes_memory;

      desc->root_dirty = true;
   }

   /* The main shader must not run tests if the epilog will. */
   bool nontrivial_force_early =
      fs && (fs->b.info.early_fragment_tests &&
             (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory));

   bool epilog_discards = dyn->ms.alpha_to_coverage_enable ||
                          (fs && (fs->info.fs.epilog_key.write_z ||
                                  fs->info.fs.epilog_key.write_s));
   epilog_discards &= !nontrivial_force_early;

   if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) {
      desc->root.draw.no_epilog_discard = !epilog_discards ? ~0 : 0;
      desc->root_dirty = true;
   }

   if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) ||
       IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) {

      struct hk_fast_link_key_vs key = {
         .prolog.hw = (sw_vs == hw_vs),

         /* FIXME: handle pipeline robustness "properly" */
         .prolog.robustness.level =
            (dev->vk.enabled_features.robustBufferAccess2 ||
             dev->vk.enabled_features.pipelineRobustness)
               ? AGX_ROBUSTNESS_D3D
            : dev->vk.enabled_features.robustBufferAccess
               ? AGX_ROBUSTNESS_GL
               : AGX_ROBUSTNESS_DISABLED,

         .prolog.robustness.soft_fault = agx_has_soft_fault(&dev->dev),
      };

      enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);

      if (mesa_prim_has_adjacency(prim)) {
         if (draw.restart) {
            prim = u_decomposed_prim(prim);
         }

         key.prolog.adjacency = prim;
      }

      if (key.prolog.adjacency || !key.prolog.hw) {
         key.prolog.sw_index_size_B =
            draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
      }

      static_assert(sizeof(key.prolog.component_mask) ==
                    sizeof(sw_vs->info.vs.attrib_components_read));
      BITSET_COPY(key.prolog.component_mask,
                  sw_vs->info.vs.attrib_components_read);

      u_foreach_bit(a, dyn->vi->attributes_valid) {
         struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];

         assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding));
         struct vk_vertex_binding_state binding =
            dyn->vi->bindings[attr.binding];

         /* nir_assign_io_var_locations compacts vertex inputs, eliminating
          * unused inputs. We need to do the same here to match the locations.
          */
         unsigned slot =
            util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a));

         key.prolog.attribs[slot] = (struct agx_velem_key){
            .format = hk_format_to_pipe_format(attr.format),
            .stride = dyn->vi_binding_strides[attr.binding],
            .divisor = binding.divisor,
            .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
         };
      }

      hk_update_fast_linked(cmd, sw_vs, &key);
   }
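
   /* Illustrative example of the slot compaction above: if attribs_read has
    * bits {0, 3, 7} set, attribute 3 maps to slot
    * util_bitcount64(attribs_read & BITFIELD_MASK(3)) = 1, matching
    * nir_assign_io_var_locations's compaction of unused inputs.
    */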

   if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty ||
       (gfx->dirty & HK_DIRTY_VB)) {

      unsigned slot = 0;
      u_foreach_bit(a, sw_vs->info.vs.attribs_read) {
         if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) {
            struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
            struct hk_addr_range vb = gfx->vb[attr.binding];

            desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp(
               vb.addr, hk_format_to_pipe_format(attr.format), vb.range,
               dyn->vi_binding_strides[attr.binding], attr.offset,
               &desc->root.draw.attrib_base[slot]);
         } else {
            desc->root.draw.attrib_base[slot] = AGX_ZERO_PAGE_ADDRESS;
            desc->root.draw.attrib_clamps[slot] = 0;
         }

         ++slot;
      }

      desc->root_dirty = true;
   }

   if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) ||
       IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) ||
       IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) ||
       IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) ||
       IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) ||
       IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) ||
       IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) ||
       IS_DIRTY(CB_BLEND_CONSTANTS) ||
       desc->root_dirty /* for pipeline stats */ || true) {

      unsigned tib_sample_mask = BITFIELD_MASK(dyn->ms.rasterization_samples);
      unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask;
      bool has_sample_mask = api_sample_mask != tib_sample_mask;

      if (hw_vs->info.cull_distance_array_size) {
         perf_debug(cmd, "Emulating cull distance (size %u, %s a frag shader)",
                    hw_vs->info.cull_distance_array_size,
                    fs ? "with" : "without");
      }

      if (has_sample_mask) {
         perf_debug(cmd, "Emulating sample mask (%s a frag shader)",
                    fs ? "with" : "without");
      }

      if (fs) {
         unsigned samples_shaded = 0;
         if (fs->info.fs.epilog_key.sample_shading)
            samples_shaded = dyn->ms.rasterization_samples;

         struct hk_fast_link_key_fs key = {
            .prolog.statistics = hk_pipeline_stat_addr(
               cmd,
               VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT),

            .prolog.cull_distance_size = hw_vs->info.cull_distance_array_size,
            .prolog.api_sample_mask = has_sample_mask ? api_sample_mask : 0xff,
            .nr_samples_shaded = samples_shaded,
         };

         bool prolog_discards =
            has_sample_mask || key.prolog.cull_distance_size;

         bool needs_prolog = key.prolog.statistics || prolog_discards;

         if (needs_prolog) {
            /* With late main shader tests, the prolog runs tests if neither
             * the main shader nor epilog will.
             *
             * With (nontrivial) early main shader tests, the prolog does not
             * run tests, the tests will run at the start of the main shader.
             * This ensures tests are after API sample mask and cull distance
             * discards.
             */
            key.prolog.run_zs_tests = !nontrivial_force_early &&
                                      !fs->b.info.writes_sample_mask &&
                                      !epilog_discards && prolog_discards;

            if (key.prolog.cull_distance_size) {
               key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf;
            }
         }

         key.epilog = (struct agx_fs_epilog_key){
            .link = fs->info.fs.epilog_key,
            .nr_samples = MAX2(dyn->ms.rasterization_samples, 1),
            .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
            .blend.alpha_to_one = dyn->ms.alpha_to_one_enable,
            .blend.logicop_enable = dyn->cb.logic_op_enable,
            .blend.logicop_func = vk_logic_op_to_pipe(dyn->cb.logic_op),
         };

         for (unsigned rt = 0; rt < ARRAY_SIZE(dyn->cal.color_map); ++rt) {
            int map = dyn->cal.color_map[rt];
            key.epilog.remap[rt] = map == MESA_VK_ATTACHMENT_UNUSED ? -1 : map;
         }

         if (dyn->ms.alpha_to_one_enable || dyn->ms.alpha_to_coverage_enable ||
             dyn->cb.logic_op_enable) {

            perf_debug(
               cmd, "Epilog with%s%s%s",
               dyn->ms.alpha_to_one_enable ? " alpha-to-one" : "",
               dyn->ms.alpha_to_coverage_enable ? " alpha-to-coverage" : "",
               dyn->cb.logic_op_enable ? " logic-op" : "");
         }

         key.epilog.link.already_ran_zs |= nontrivial_force_early;

         struct hk_rendering_state *render = &cmd->state.gfx.render;
         for (uint32_t i = 0; i < render->color_att_count; i++) {
            key.epilog.rt_formats[i] =
               hk_format_to_pipe_format(render->color_att[i].vk_format);

            const struct vk_color_blend_attachment_state *cb =
               &dyn->cb.attachments[i];

            bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i);
            unsigned write_mask = write_enable ? cb->write_mask : 0;

            /* nir_lower_blend always blends, so use a default blend state when
             * blending is disabled at an API level.
             */
            if (!dyn->cb.attachments[i].blend_enable) {
               key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
                  .colormask = write_mask,
                  .rgb_func = PIPE_BLEND_ADD,
                  .alpha_func = PIPE_BLEND_ADD,
                  .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
                  .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
                  .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
                  .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
               };
            } else {
               key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
                  .colormask = write_mask,

                  .rgb_src_factor =
                     vk_blend_factor_to_pipe(cb->src_color_blend_factor),

                  .rgb_dst_factor =
                     vk_blend_factor_to_pipe(cb->dst_color_blend_factor),

                  .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op),

                  .alpha_src_factor =
                     vk_blend_factor_to_pipe(cb->src_alpha_blend_factor),

                  .alpha_dst_factor =
                     vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor),

                  .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op),
               };
            }
         }

         hk_update_fast_linked(cmd, fs, &key);
      } else {
         /* TODO: prolog without fs needs to work too... */
         if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) {
            cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT);
            cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL;
         }
      }
   }

   /* If the vertex shader uses draw parameters, vertex uniforms are dirty
    * every draw. Fragment uniforms are unaffected.
    *
    * For a direct draw, we upload the draw parameters as-if indirect to
    * avoid keying to indirectness.
    */
   if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) {
      if (agx_is_indirect(draw.b)) {
         gfx->draw_params = draw.b.ptr;

         if (draw.indexed) {
            gfx->draw_params +=
               offsetof(VkDrawIndexedIndirectCommand, vertexOffset);
         } else {
            gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex);
         }
      } else {
         uint32_t params[] = {
            draw.indexed ? draw.index_bias : draw.start,
            draw.start_instance,
         };

         gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4);
      }
   } else {
      gfx->draw_params = 0;
   }
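
   /* The two words uploaded for the direct case mirror the tail of the
    * corresponding indirect command: (firstVertex | vertexOffset,
    * firstInstance) in VkDrawIndirectCommand / VkDrawIndexedIndirectCommand,
    * so the vertex shader reads base parameters identically for direct and
    * indirect draws.
    */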

   if (sw_vs->b.info.uses_draw_id) {
      /* TODO: rodata? */
      gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4);
   } else {
      gfx->draw_id_ptr = 0;
   }

   if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) {
      enum mesa_prim prim = hk_rast_prim(cmd);

      gfx->topology = translate_hw_primitive_topology(prim);
      gfx->object_type = translate_object_type(prim);
   }

   if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) {
      unsigned provoking;
      if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
         provoking = 2;
      else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN)
         provoking = 1;
      else
         provoking = 0;

      if (provoking != gfx->provoking) {
         gfx->provoking = provoking;
         gfx->dirty |= HK_DIRTY_PROVOKING;

         gfx->descriptors.root.draw.provoking = provoking;
         gfx->descriptors.root_dirty = true;
      }
   }

   /* With attachmentless rendering, we don't know the sample count until draw
    * time, so we do a late tilebuffer fix up. But with rasterizer discard,
    * rasterization_samples might be 0.
    *
    * Note that we ignore dyn->ms.rasterization_samples when we do have a
    * sample count from an attachment. In Vulkan, these have to match anyway,
    * but DX12 drivers are robust against this scenario and vkd3d-proton will
    * go out of spec here. No reason we can't be robust here too.
    */
   if (dyn->ms.rasterization_samples && !gfx->render.tilebuffer.nr_samples) {
      agx_tilebuffer_set_samples(&gfx->render.tilebuffer,
                                 dyn->ms.rasterization_samples);

      cs->tib = gfx->render.tilebuffer;
   }

   if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) ||
       IS_DIRTY(MS_RASTERIZATION_SAMPLES)) {

      uint32_t ctrl;
      if (dyn->ms.sample_locations_enable) {
         ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations);
      } else {
         ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples);
      }

      bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0;

      if (!cs->has_sample_locations) {
         cs->ppp_multisamplectl = ctrl;

         /* If we're in vk_meta, do not commit to the sample locations yet.
          * vk_meta doesn't care, but the app will!
          */
         cs->has_sample_locations |= !dont_commit;
      } else {
         assert(dont_commit || cs->ppp_multisamplectl == ctrl);
      }

      gfx->descriptors.root.draw.ppp_multisamplectl = ctrl;
      gfx->descriptors.root_dirty = true;
   }

   /* Link varyings before uploading tessellation state, because the
    * gfx->generate_primitive_id boolean needs to be plumbed.
    */
   struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX];
   struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
   bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX);
   bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);

   if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) {
      unsigned bindings = linked_fs ? linked_fs->b.cf.nr_bindings : 0;
      if (bindings) {
         size_t linkage_size =
            AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);

         struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16);
         if (!t.cpu)
            return;

         agx_link_varyings_vs_fs(
            t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size,
            &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id);

         gfx->varyings = agx_usc_addr(&dev->dev, t.gpu);
      } else {
         gfx->varyings = 0;
      }

      gfx->dirty |= HK_DIRTY_VARYINGS;
   }

   if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
       gfx->shaders[MESA_SHADER_GEOMETRY] || linked_vs->sw_indexing) {
      /* XXX: We should deduplicate this logic */
      bool indirect = agx_is_indirect(draw.b) || draw.restart;

      desc->root.draw.input_assembly =
         indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
                  : hk_upload_ia_params(cmd, draw);
      desc->root_dirty = true;
   }

   if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
       gfx->shaders[MESA_SHADER_GEOMETRY]) {

      struct hk_shader *vs = hk_bound_sw_vs(gfx);
      desc->root.draw.vertex_outputs = vs->b.info.outputs;

      /* XXX: We should deduplicate this logic */
      bool indirect = agx_is_indirect(draw.b) || draw.restart;

      if (!indirect) {
         uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
         unsigned vb_size =
            libagx_tcs_in_size(verts * instances, vs->b.info.outputs);

         /* Allocate if there are any outputs, or use the null sink to trap
          * reads if there aren't. Those reads are undefined but should not
          * fault. Affects:
          *
          * dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
          */
         desc->root.draw.vertex_output_buffer =
            vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
                    : dev->rodata.null_sink;
      }
   }

   struct agx_ptr tess_args = {0};
   if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
      tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4);
      gfx->descriptors.root.draw.tess_params = tess_args.gpu;
      gfx->descriptors.root_dirty = true;
   }

   if (gfx->shaders[MESA_SHADER_GEOMETRY]) {
      gfx->descriptors.root.draw.geometry_params =
         hk_upload_geometry_params(cmd, draw);

      gfx->descriptors.root_dirty = true;
   }

   /* Root must be uploaded after the above, which touch the root */
   if (gfx->descriptors.root_dirty) {
      gfx->root =
         hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);

      /* Tess parameters depend on the root address, so we defer the upload
       * until after uploading root. But the root depends on the tess address,
       * so we allocate tess parameters before uploading root.
       *
       * This whole mechanism is a mess ported over from the GL driver. I'm
       * planning to do a massive rework of indirect geom/tess so I'm trying
       * not to perfectionism it in the mean time.
       */
      if (tess_args.cpu) {
         hk_upload_tess_params(cmd, tess_args.cpu, draw);
      }
   }

   /* Hardware dynamic state must be deferred until after the root and fast
    * linking, since it will use the root address and the linked shaders.
    */
   if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) ||
       IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty ||
       vgt_dirty || gfx->descriptors.root_dirty || gfx->draw_id_ptr ||
       gfx->draw_params) {

      /* TODO: Do less often? */
      hk_reserve_scratch(cmd, cs, hw_vs);

      agx_push(out, VDM_STATE, cfg) {
         cfg.vertex_shader_word_0_present = true;
         cfg.vertex_shader_word_1_present = true;
         cfg.vertex_outputs_present = true;
         cfg.vertex_unknown_present = true;
      }

      agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0);

      struct hk_linked_shader *linked_hw_vs =
         (hw_vs == sw_vs) ? linked_vs : hw_vs->only_linked;

      agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
         cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs);
      }

      agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);

      agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
         cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
         cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
         cfg.generate_primitive_id = gfx->generate_primitive_id;
      }

      /* Pad up to a multiple of 8 bytes */
      memset(out, 0, 4);
      out += 4;
   }

   if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) {
      void *ptr =
         util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);

      bool exact = dyn->rs.depth_bias.exact;
      bool force_unorm =
         dyn->rs.depth_bias.representation ==
         VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT;

      agx_pack(ptr, DEPTH_BIAS, cfg) {
         cfg.slope_scale = dyn->rs.depth_bias.slope_factor;
         cfg.clamp = dyn->rs.depth_bias.clamp;
         cfg.depth_bias = dyn->rs.depth_bias.constant_factor;
         cfg.depth_bias /= hk_depth_bias_factor(render->depth_att.vk_format,
                                                exact, force_unorm);
      }
   }

   /* Hardware viewport/scissor state is entangled with depth bias. */
   if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) ||
       IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) ||
       IS_DIRTY(VP_VIEWPORT_COUNT) ||
       IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
       IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) {

      hk_flush_vp_state(cmd, cs, &out);
   }

   hk_flush_ppp_state(cmd, cs, &out);
   cs->current = out;

   vk_dynamic_graphics_state_clear_dirty(dyn);
   gfx->shaders_dirty = 0;
   gfx->linked_dirty = 0;
   gfx->dirty = 0;
   gfx->descriptors.root_dirty = false;
}

static bool
hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct agx_draw *draw)
{
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   if (!draw->indexed)
      return false;

   /* Geometry or tessellation use robust software index buffer fetch anyway */
   if (gfx->shaders[MESA_SHADER_GEOMETRY] ||
       gfx->shaders[MESA_SHADER_TESS_EVAL])
      return false;

   /* Soft fault does not cover the hardware index buffer fetch, use the zero
    * page instead.
    */
   if (draw->index_buffer_range_B == 0) {
      draw->index_buffer = AGX_ZERO_PAGE_ADDRESS;
      draw->index_buffer_range_B = 4;
      draw->start = 0;
      return false;
   }

   if (!(dev->vk.enabled_features.robustBufferAccess ||
         dev->vk.enabled_features.robustBufferAccess2 ||
         dev->vk.enabled_features.pipelineRobustness))
      return false;

   if (agx_is_indirect(draw->b))
      return true;

   return agx_direct_draw_overreads_indices(*draw);
}
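
/* Illustrative scenario: with robustBufferAccess enabled, a direct indexed
 * draw whose fetched index range exceeds the bound index buffer (as detected
 * by agx_direct_draw_overreads_indices) is reported as needing robustness so
 * the caller can take a software index fetch path, since soft fault does not
 * cover the hardware index buffer fetch.
 */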

static void
hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
{
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];

   /* If there's an application geometry shader, there's nothing to un/bind */
   if (gs && !gs->is_passthrough)
      return;

   /* Determine if we need a geometry shader to emulate XFB (adjacency is
    * handled in the VS prolog).
    */
   struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx);
   uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
   bool needs_gs = xfb_outputs;

   /* If we already have a matching GS configuration, we're done */
   if ((gs != NULL) == needs_gs)
      return;

   /* If we don't need a GS but we do have a passthrough, unbind it */
   if (gs) {
      assert(!needs_gs && gs->is_passthrough);
      hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
      return;
   }

   /* Else, we need to bind a passthrough GS */
   size_t key_size =
      sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
   struct hk_passthrough_gs_key *key = alloca(key_size);

   *key = (struct hk_passthrough_gs_key){
      .prim = u_decomposed_prim(hk_gs_in_prim(cmd)),
      .outputs = last_sw->b.info.outputs,
      .clip_distance_array_size = last_sw->info.clip_distance_array_size,
      .cull_distance_array_size = last_sw->info.cull_distance_array_size,
   };

   if (xfb_outputs) {
      typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride,
                   ARRAY_SIZE(key->xfb_stride));

      memcpy(&key->xfb_info, &last_sw->info.xfb_info,
             nir_xfb_info_size(xfb_outputs));
   }

   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   perf_debug(cmd, "Binding passthrough GS for%s\n", xfb_outputs ? " XFB" : "");

   gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
   gs->is_passthrough = true;
   hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs);
}

static struct hk_cs *
hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id,
                   struct agx_draw draw)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_descriptor_state *desc = &gfx->descriptors;

   struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd->vk.dynamic_graphics_state;

   if (!cs)
      return NULL;

   /* Annoyingly,
    * VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT is
    * render pass state on Imaginapple but draw state in Vulkan. In practice,
    * Proton never changes it within a render pass, but we technically need to
    * handle the switch regardless. Do so early since `cs` will be invalidated
    * if we need to split the render pass to switch representation mid-frame.
    *
    * Note we only do this dance when depth bias is actually enabled, to avoid
    * senseless control stream splits with DXVK.
    */
   if ((IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(RS_DEPTH_BIAS_ENABLE)) &&
       dyn->rs.depth_bias.enable) {

      bool dbias_is_int =
         (dyn->rs.depth_bias.representation ==
          VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) ||
         (gfx->render.depth_att.vk_format == VK_FORMAT_D16_UNORM);

      /* Attempt to set dbias_is_int per the draw requirement. If this fails,
       * flush the control stream and set it on the new control stream.
       */
      bool succ = u_tristate_set(&cs->cr.dbias_is_int, dbias_is_int);
      if (!succ) {
         perf_debug(cmd, "Splitting control stream due to depth bias");

         hk_cmd_buffer_end_graphics(cmd);
         cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);

         succ = u_tristate_set(&cs->cr.dbias_is_int, dbias_is_int);
         assert(succ && "can always set tri-state on a new control stream");
      }
   }

   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);

#ifndef NDEBUG
   if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) {
      hk_cmd_buffer_dirty_all(cmd);
   }
#endif

   /* Merge tess info before GS construction since that depends on
    * gfx->tess.prim
    */
   if ((IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) &&
       gfx->shaders[MESA_SHADER_TESS_CTRL]) {
      struct hk_api_shader *tcs = gfx->shaders[MESA_SHADER_TESS_CTRL];
      struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
      struct hk_shader *tese = hk_any_variant(tes);
      struct hk_shader *tesc = hk_only_variant(tcs);

      gfx->tess.info =
         hk_tess_info_merge(tese->info.tess.info, tesc->info.tess.info);

      /* Determine primitive based on the merged state */
      if (gfx->tess.info.points) {
         gfx->tess.prim = MESA_PRIM_POINTS;
      } else if (gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES) {
         gfx->tess.prim = MESA_PRIM_LINES;
      } else {
         gfx->tess.prim = MESA_PRIM_TRIANGLES;
      }
   }

   /* TODO: Try to reduce draw overhead of this */
   hk_handle_passthrough_gs(cmd, draw);

   hk_flush_shaders(cmd);

   if (desc->push_dirty)
      hk_cmd_buffer_flush_push_descriptors(cmd, desc);

   if (draw.restart || gfx->shaders[MESA_SHADER_GEOMETRY])
      hk_flush_index(cmd, cs);

   hk_flush_dynamic_state(cmd, cs, draw_id, draw);
   return cs;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                          VkDeviceSize offset, VkDeviceSize size,
                          VkIndexType indexType)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_buffer, buffer, _buffer);

   cmd->state.gfx.index = (struct hk_index_buffer_state){
      .buffer = hk_buffer_addr_range(buffer, offset, size, true),
      .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)),
      .restart = vk_index_to_restart(indexType),
   };

   /* TODO: check if necessary, blob does this */
   cmd->state.gfx.index.buffer.range =
      align(cmd->state.gfx.index.buffer.range, 4);
}

void
hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx,
                          struct hk_addr_range addr_range)
{
   cmd->state.gfx.vb[vb_idx] = addr_range;
   cmd->state.gfx.dirty |= HK_DIRTY_VB;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
                         uint32_t bindingCount, const VkBuffer *pBuffers,
                         const VkDeviceSize *pOffsets,
                         const VkDeviceSize *pSizes,
                         const VkDeviceSize *pStrides)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pStrides) {
      vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount,
                                        pStrides);
   }

   for (uint32_t i = 0; i < bindingCount; i++) {
      VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
      uint32_t idx = firstBinding + i;

      uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
      const struct hk_addr_range addr_range =
         hk_buffer_addr_range(buffer, pOffsets[i], size, true);

      hk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
   }
}

static bool
hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
{
   if (cmd->state.gfx.render.view_mask) {
      cmd->state.gfx.descriptors.root.draw.view_index = view_idx;
      cmd->state.gfx.descriptors.root_dirty = true;
   }
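
   /* Always true, so hk_foreach_view() can use this call directly as its
    * loop filter.
    */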
   return true;
}

/*
 * Iterator macro to duplicate a draw for each enabled view (when multiview is
 * enabled, else always view 0). Along with hk_lower_multiview, this forms the
 * world's worst multiview lowering.
 */
#define hk_foreach_view(cmd)                                                   \
   u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1)               \
      if (hk_set_view_index(cmd, view_idx))

static void
hk_ia_update(struct hk_cmd_buffer *cmd, struct agx_draw draw,
             uint64_t ia_vertices, uint64_t ia_prims, uint64_t vs_invocations,
             uint64_t c_prims, uint64_t c_inv)
{
   perf_debug(cmd, "Input assembly counters");

   uint64_t draw_ptr;
   if (agx_is_indirect(draw.b)) {
      draw_ptr = draw.b.ptr;
   } else {
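      /* Upload an equivalent descriptor for direct draws, so the increment
       * kernels can treat every draw as indirect.
       */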
      uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0};
      draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
   }

   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);

   bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
   bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];

   /* Clipper counters depend on geom/tess outputs and must be written with the
    * geom/tess output. They are updated as IA counters only when geom/tess is
    * not used.
    */
   if (geom || tess) {
      c_prims = 0;
      c_inv = 0;
   }

   uint32_t patch_size = dyn->ts.patch_control_points;

   if (draw.restart) {
      uint32_t index_size_B = agx_index_size_to_B(draw.index_size);

      libagx_increment_ia_restart(
         cmd, agx_1d(1024), AGX_BARRIER_ALL | AGX_PREGFX, ia_vertices, ia_prims,
         vs_invocations, c_prims, c_inv, draw_ptr, draw.index_buffer,
         agx_draw_index_range_el(draw), cmd->state.gfx.index.restart,
         index_size_B, prim, patch_size);
   } else {
      libagx_increment_ia(cmd, agx_1d(1), AGX_BARRIER_ALL | AGX_PREGFX,
                          ia_vertices, ia_prims, vs_invocations, c_prims, c_inv,
                          draw_ptr, prim, patch_size);
   }
}

static void
hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
{
   const struct vk_dynamic_graphics_state *dyn =
      &cmd->vk.dynamic_graphics_state;

   /* Filter trivial draws so we don't need to worry about null index buffers */
   if (!agx_is_indirect(draw_.b) &&
       (draw_.b.count[0] == 0 || draw_.b.count[1] == 0))
      return;

   draw_.restart = dyn->ia.primitive_restart_enable && draw_.indexed;
   draw_.index_size = cmd->state.gfx.index.size;
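
   /* hk_pipeline_stat_addr returns 0 when no query of the given type is
    * active, so these addresses double as enables below.
    */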
   uint64_t stat_ia_verts = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);

   uint64_t stat_ia_prims = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT);

   uint64_t stat_vs_inv = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);

   uint64_t stat_c_inv = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT);

   uint64_t stat_c_prims = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT);

   bool ia_stats = stat_ia_verts || stat_ia_prims || stat_vs_inv ||
                   stat_c_inv || stat_c_prims;
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   hk_foreach_view(cmd) {
      struct agx_draw draw = draw_;
      struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw);
      /* If we failed to allocate a control stream, we've already lost the
       * device. Just drop the draw so we don't crash.
       */
      if (!cs)
         return;

      struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
      bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
      bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
      bool needs_idx_robust = hk_needs_index_robustness(cmd, &draw);
      bool adj =
         mesa_prim_has_adjacency(vk_conv_topology(dyn->ia.primitive_topology));
      adj &= !geom;
      needs_idx_robust &= !adj;

      struct hk_cs *ccs = NULL;
      uint8_t *out = cs->current;
      assert(cs->current + 0x1000 < cs->end);

      if (tess && HK_PERF(dev, NOTESS))
         continue;

      cs->stats.calls++;

      if (geom || tess) {
         ccs =
            hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
         if (!ccs)
            return;
      }

      if (ia_stats) {
         hk_ia_update(cmd, draw, stat_ia_verts, stat_ia_prims, stat_vs_inv,
                      stat_c_prims, stat_c_inv);
      }

      if (tess) {
         draw = hk_launch_tess(cmd, ccs, draw, geom ? 0 : stat_c_prims,
                               geom ? 0 : stat_c_inv);
      }

      if (geom) {
         draw = hk_launch_gs_prerast(cmd, ccs, draw);

         /* We must not draw if the app specified rasterizer discard. This is
          * required both for performance (it is pointless to rasterize and
          * there are no side effects) and for correctness (no indirect draw
          * descriptor will be filled out).
          */
         if (dyn->rs.rasterizer_discard_enable)
            continue;
      }
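
      /* The hardware has no native adjacency support. Lower it here: indirect
       * draws are rewritten by a compute kernel, direct draws just remap the
       * vertex count.
       */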
      if (adj) {
         assert(!geom && "geometry shaders handle adj directly");
         enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);

         if (draw.restart) {
            draw = hk_draw_without_restart(cmd, draw, 1);
            prim = u_decomposed_prim(prim);
         }

         if (agx_is_indirect(draw.b)) {
            const size_t size = sizeof(VkDrawIndexedIndirectCommand);
            static_assert(sizeof(VkDrawIndexedIndirectCommand) >
                             sizeof(VkDrawIndirectCommand),
                          "allocation size is conservative");

            uint64_t out_draw = hk_pool_alloc(cmd, size, 4).gpu;
            struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;

            libagx_draw_without_adj(
               cmd, agx_1d(1), AGX_BARRIER_ALL | AGX_PREGFX, out_draw,
               draw.b.ptr, desc->root.draw.input_assembly, draw.index_buffer,
               draw.indexed ? agx_draw_index_range_el(draw) : 0,
               draw.indexed ? agx_index_size_to_B(draw.index_size) : 0, prim);

            draw = agx_draw_indirect(out_draw);
         } else {
            unsigned count = libagx_remap_adj_count(draw.b.count[0], prim);

            draw = (struct agx_draw){
               .b = agx_3d(count, draw.b.count[1], 1),
            };
         }
      }

      enum agx_primitive topology = cmd->state.gfx.topology;
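      /* For robust index buffer access, rewind and let a compute kernel emit
       * the draw words instead, clamping reads to the bound index buffer
       * range.
       */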
      if (needs_idx_robust) {
         assert(!geom && !tess && !adj);
         perf_debug(cmd, "lowering robust index buffer");

         cs->current = out;

         draw = hk_draw_as_indexed_indirect(cmd, draw);

         size_t size_B = libagx_draw_robust_index_vdm_size();
         uint64_t target = hk_cs_alloc_for_indirect(cs, size_B);

         libagx_draw_robust_index(cmd, agx_1d(32), AGX_BARRIER_ALL | AGX_PREGFX,
                                  target, hk_geometry_state(cmd), draw.b.ptr,
                                  draw.index_buffer, draw.index_buffer_range_B,
                                  draw.restart, topology, draw.index_size);
      } else {
         cs->current = (void *)agx_vdm_draw((uint32_t *)out, dev->dev.chip,
                                            draw, topology);
      }

      cs->stats.cmds++;
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount,
           uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct agx_draw draw;

   if (HK_TEST_INDIRECTS) {
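      /* Smoke-test the indirect path: the parameters below are laid out
       * exactly like a VkDrawIndirectCommand.
       */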
      uint32_t data[] = {
         vertexCount,
         instanceCount,
         firstVertex,
         firstInstance,
      };

      draw = agx_draw_indirect(hk_pool_upload(cmd, data, sizeof(data), 4));
   } else {
      draw = (struct agx_draw){
         .b = agx_3d(vertexCount, instanceCount, 1),
         .start = firstVertex,
         .start_instance = firstInstance,
      };
   }

   hk_draw(cmd, 0, draw);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
                   const VkMultiDrawInfoEXT *pVertexInfo,
                   uint32_t instanceCount, uint32_t firstInstance,
                   uint32_t stride)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   for (unsigned i = 0; i < drawCount; ++i) {
      struct agx_draw draw = {
         .b = agx_3d(pVertexInfo->vertexCount, instanceCount, 1),
         .start = pVertexInfo->firstVertex,
         .start_instance = firstInstance,
      };

      hk_draw(cmd, i, draw);
      pVertexInfo = ((void *)pVertexInfo) + stride;
   }
}

static void
hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id,
                uint32_t indexCount, uint32_t instanceCount,
                uint32_t firstIndex, int32_t vertexOffset,
                uint32_t firstInstance)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct agx_draw draw;
   struct hk_addr_range buf = cmd->state.gfx.index.buffer;

   if (HK_TEST_INDIRECTS && draw_id == 0) {
      uint32_t data[] = {
         indexCount, instanceCount, firstIndex, vertexOffset, firstInstance,
      };
      uint64_t addr = hk_pool_upload(cmd, data, sizeof(data), 4);

      draw = agx_draw_indexed_indirect(addr, buf.addr, buf.range, 0, 0);
   } else {
      draw =
         agx_draw_indexed(indexCount, instanceCount, firstIndex, vertexOffset,
                          firstInstance, buf.addr, buf.range, 0, 0);
   }

   hk_draw(cmd, draw_id, draw);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount,
                  uint32_t instanceCount, uint32_t firstIndex,
                  int32_t vertexOffset, uint32_t firstInstance)
{
   hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex,
                   vertexOffset, firstInstance);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
                          const VkMultiDrawIndexedInfoEXT *pIndexInfo,
                          uint32_t instanceCount, uint32_t firstInstance,
                          uint32_t stride, const int32_t *pVertexOffset)
{
   for (unsigned i = 0; i < drawCount; ++i) {
      const uint32_t vertex_offset =
         pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;

      hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount,
                      pIndexInfo->firstIndex, vertex_offset, firstInstance);

      pIndexInfo = ((void *)pIndexInfo) + stride;
   }
}

static void
hk_draw_indirect_inner(VkCommandBuffer commandBuffer, uint64_t base,
                       uint32_t drawCount, uint32_t stride)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   /* From the Vulkan 1.3.238 spec:
    *
    *    VUID-vkCmdDrawIndirect-drawCount-00476
    *
    *    "If drawCount is greater than 1, stride must be a multiple of 4 and
    *     must be greater than or equal to sizeof(VkDrawIndirectCommand)"
    *
    * and
    *
    *    "If drawCount is less than or equal to one, stride is ignored."
    */
   if (drawCount > 1) {
      assert(stride % 4 == 0);
      assert(stride >= sizeof(VkDrawIndirectCommand));
   }

   for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
      uint64_t addr = base + stride * draw_id;
      hk_draw(cmd, draw_id, agx_draw_indirect(addr));
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                   VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
{
   VK_FROM_HANDLE(hk_buffer, buffer, _buffer);

   hk_draw_indirect_inner(commandBuffer, hk_buffer_address_ro(buffer, offset),
                          drawCount, stride);
}

static void
hk_draw_indexed_indirect_inner(VkCommandBuffer commandBuffer, uint64_t buffer,
                               uint32_t drawCount, uint32_t stride)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   /* From the Vulkan 1.3.238 spec:
    *
    *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
    *
    *    "If drawCount is greater than 1, stride must be a multiple of 4 and
    *     must be greater than or equal to
    *     sizeof(VkDrawIndexedIndirectCommand)"
    *
    * and
    *
    *    "If drawCount is less than or equal to one, stride is ignored."
    */
   if (drawCount > 1) {
      assert(stride % 4 == 0);
      assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
   }

   for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
      uint64_t addr = buffer + stride * draw_id;
      struct hk_addr_range buf = cmd->state.gfx.index.buffer;

      hk_draw(cmd, draw_id,
              agx_draw_indexed_indirect(addr, buf.addr, buf.range, 0, 0));
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                          VkDeviceSize offset, uint32_t drawCount,
                          uint32_t stride)
{
   VK_FROM_HANDLE(hk_buffer, buffer, _buffer);

   hk_draw_indexed_indirect_inner(
      commandBuffer, hk_buffer_address_ro(buffer, offset), drawCount, stride);
}

/*
 * To implement drawIndirectCount generically, we dispatch a compute kernel to
 * patch the indirect buffer and then we dispatch the predicated maxDrawCount
 * indirect draws.
 */
static void
hk_draw_indirect_count(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                       VkDeviceSize offset, VkBuffer countBuffer,
                       VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
                       uint32_t stride, bool indexed)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
   VK_FROM_HANDLE(hk_buffer, count_buffer, countBuffer);

   perf_debug(cmd, "Draw indirect count");

   assert((stride % 4) == 0 && "aligned");
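
   /* The patched copy is tightly packed: 5 dwords per
    * VkDrawIndexedIndirectCommand, 4 per VkDrawIndirectCommand. Entries beyond
    * the real count are predicated off by the kernel.
    */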
   size_t out_stride = sizeof(uint32_t) * (indexed ? 5 : 4);
   uint64_t patched = hk_pool_alloc(cmd, out_stride * maxDrawCount, 4).gpu;
   uint64_t in = hk_buffer_address_ro(buffer, offset);
   uint64_t count_addr = hk_buffer_address_ro(count_buffer, countBufferOffset);

   libagx_predicate_indirect(cmd, agx_1d(maxDrawCount),
                             AGX_BARRIER_ALL | AGX_PREGFX, patched, in,
                             count_addr, stride / 4, indexed);

   if (indexed) {
      hk_draw_indexed_indirect_inner(commandBuffer, patched, maxDrawCount,
                                     out_stride);
   } else {
      hk_draw_indirect_inner(commandBuffer, patched, maxDrawCount, out_stride);
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                        VkDeviceSize offset, VkBuffer countBuffer,
                        VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
                        uint32_t stride)
{
   hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
                          countBufferOffset, maxDrawCount, stride, false);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                               VkDeviceSize offset, VkBuffer countBuffer,
                               VkDeviceSize countBufferOffset,
                               uint32_t maxDrawCount, uint32_t stride)
{
   hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
                          countBufferOffset, maxDrawCount, stride, true);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
                               uint32_t instanceCount, uint32_t firstInstance,
                               VkBuffer counterBuffer,
                               VkDeviceSize counterBufferOffset,
                               uint32_t counterOffset, uint32_t vertexStride)
{
   unreachable("TODO");
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                      uint32_t firstBinding,
                                      uint32_t bindingCount,
                                      const VkBuffer *pBuffers,
                                      const VkDeviceSize *pOffsets,
                                      const VkDeviceSize *pSizes)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_graphics_state *gfx = &cmd->state.gfx;

   for (uint32_t i = 0; i < bindingCount; i++) {
      VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
      uint32_t idx = firstBinding + i;
      uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;

      gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size, false);
   }
}

static void
hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                 const VkDeviceSize *pCounterBufferOffsets, bool begin)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_graphics_state *gfx = &cmd->state.gfx;

   gfx->xfb_enabled = begin;

   /* If we haven't reserved XFB offsets yet for the command buffer, do so. */
   if (!gfx->xfb_offsets) {
      gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu;
   }

   struct libagx_xfb_counter_copy params = {};
   unsigned copies = 0;

   /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */
   if (begin) {
      for (copies = 0; copies < 4; ++copies) {
         params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t);
      }
   }

   for (unsigned i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]);

      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = hk_buffer_address_rw(buffer, offset);
      uint32_t cmd_idx = firstCounterBuffer + i;

      if (begin) {
         params.src[cmd_idx] = cb_addr;
      } else {
         params.dest[copies] = cb_addr;
         params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t);
         ++copies;
      }
   }
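
   /* Begin always copies into all four internal offsets; slots without a
    * counter buffer keep a zero src and are reset to zero.
    */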
   if (begin)
      copies = 4;

   if (copies > 0) {
      perf_debug(cmd, "XFB counter copy");

      libagx_copy_xfb_counters(cmd, agx_1d(copies),
                               AGX_BARRIER_ALL | AGX_PREGFX,
                               hk_pool_upload(cmd, &params, sizeof(params), 8));
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount,
                                const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
                    pCounterBuffers, pCounterBufferOffsets, true);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                              uint32_t firstCounterBuffer,
                              uint32_t counterBufferCount,
                              const VkBuffer *pCounterBuffers,
                              const VkDeviceSize *pCounterBufferOffsets)
{
   hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
                    pCounterBuffers, pCounterBufferOffsets, false);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBeginConditionalRenderingEXT(
   VkCommandBuffer commandBuffer,
   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   unreachable("stub");
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   unreachable("stub");
}