panfrost: do not push "true" UBOs
Panfrost supports pushing uniforms to hardware uniform registers (RMU and FAU on Midgard and Bifrost respectively). Since OpenGL uniforms are lowered to UBO #0, it does this with a pass that pushes UBOs. That's good! The pass also pushes "true" OpenGL UBOs, since they look the same in the backend at this point. This is where the trouble comes in:

- True UBOs are allocated in GPU BOs, not CPU-allocated buffers. That means they are write-combine memory, which we cannot read from efficiently (at least depending on coherency details that were never plumbed through panfrost.ko and are unlikely to be replumbed now that panthor is the new hot stuff). So pushing true UBOs reduces GPU overhead at the cost of tremendous CPU overhead. This is dubious... When I benchmarked this on MT8192 in early 2023, this pushing improved FPS in SuperTuxKart but hurt FPS in Dolphin.

- True UBOs can be written on the GPU. In OpenGL, we have batch tracking infrastructure to sort this mess out, in theory. What this means is that pushing UBOs requires us to flush writers AND STALL at draw time. If this path is ever hit, our performance is utterly trashed. But it gets worse.

- True UBOs can be written in the same batch that reads them. For example, we could bind a buffer as a transform feedback buffer, do a draw with XFB, then rebind it as a UBO and do a draw that reads it. This is where we collapse -- our logic will flush the writer, which is the same batch we were in the middle of enqueueing a draw to. When we try to push words, we'll crash with theatrics. This could be solved by smartening the batch tracking logic, but it's not trivial by any means. (See the sketch after this message for the API sequence that hits this case.)

So, pushing true UBOs on the CPU is broken and can hurt performance. Stop doing it!

Long term, the solution will be to push on the GPU instead, which avoids all of these issues. This can be done with a compute kernel or with CSF instructions. The Vulkan driver will likely have to do this for performance, since pushing UBOs from the CPU is utterly broken in Vulkan for the above reasons. I have a branch somewhere doing this on v9, but I'm doing this on NIR time to unblock a core change that was crashing piglit due to this pile of unsoundness. Let's fix the correctness issues first; someone can look at recovering performance later, when we're not blocking unrelated work.

Fixes corruption in the piglit test gles-3.0-transform-feedback-uniform-buffer-object, which writes a UBO with transform feedback. (I suspect the test still doesn't pass, for the same reason it's broken on other tilers. But that's a better place to be than oodles of memory corruption.)

According to CI, fixes spec@arb_uniform_buffer_object@rendering{-dsa}-offset.

Cc: mesa-stable
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34193>
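The same-batch hazard from the third bullet, expressed as a minimal API sequence. This is an illustrative sketch in plain GL, assuming an already-linked XFB-capable program and a bound VAO; it is not the piglit test's actual code:

/* One buffer is written by transform feedback, then read as a UBO, all
 * within the same batch. If the driver pushes the UBO from the CPU at
 * draw time, it must flush and stall on the very batch it is still
 * recording. */
GLuint buf;
glGenBuffers(1, &buf);
glBindBuffer(GL_UNIFORM_BUFFER, buf);
glBufferData(GL_UNIFORM_BUFFER, 256, NULL, GL_DYNAMIC_COPY);

/* Draw 1: the GPU writes the buffer via XFB. */
glBindBufferBase(GL_TRANSFORM_FEEDBACK_BUFFER, 0, buf);
glBeginTransformFeedback(GL_POINTS);
glDrawArrays(GL_POINTS, 0, 4);
glEndTransformFeedback();

/* Draw 2: rebind the same buffer, its write still queued, as a UBO. */
glBindBufferBase(GL_UNIFORM_BUFFER, 0, buf);
glDrawArrays(GL_TRIANGLES, 0, 3);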
committed by Caterina Shablia
parent 2c75b6bb01
commit 59a3e12039
@@ -1321,28 +1321,6 @@ panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu,
    }
 }
 
-static const void *
-panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
-                                 struct panfrost_constant_buffer *buf,
-                                 unsigned index)
-{
-   struct pipe_constant_buffer *cb = &buf->cb[index];
-   struct panfrost_resource *rsrc = pan_resource(cb->buffer);
-
-   if (rsrc) {
-      if (panfrost_bo_mmap(rsrc->bo))
-         return NULL;
-
-      panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
-      panfrost_bo_wait(rsrc->bo, INT64_MAX, false);
-
-      return rsrc->bo->ptr.cpu + cb->buffer_offset;
-   } else if (cb->user_buffer) {
-      return cb->user_buffer + cb->buffer_offset;
-   } else
-      unreachable("No constant buffer");
-}
-
 /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
  * implemented with buffer descriptors in the resource table, sized in terms of
  * bytes. On Bifrost and older, UBOs have special uniform buffer data
@@ -1504,13 +1482,33 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
              sysval_comp < ARRAY_SIZE(batch->num_wg_sysval))
             batch->num_wg_sysval[sysval_comp] = ptr;
       }
-   /* Map the UBO, this should be cheap. For some buffers this may
-    * read from write-combine memory which is slow, though :-( */
-
-   const void *mapped_ubo =
-      (src.ubo == sysval_ubo)
-         ? sysvals
-         : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
+   /* Grab the mapped memory. We only do this path for sysvals & user
+    * buffers, which are already CPU mapped. We do not use this path for
+    * "real" UBOs for a few reasons. First, real UBOs are generally mapped
+    * write-combine, so reading them here is very expensive. Second, real
+    * UBOs may be written from the GPU, which would require a full stall to
+    * get the results from the GPU. Third, it may happen that *this* batch
+    * is writing the UBO which would require us to split the batch *and*
+    * stall, which we lack the batch tracking primitives to do correctly.
+    *
+    * The "proper" way to push true UBOs is on-device. Either we would
+    * dispatch a small compute kernel to run this logic at the start of the
+    * draw, or we would wire up nir_opt_preamble to compute kernels to the
+    * same effect. We will likely do this for Vulkan.
+    *
+    * For now, use the straightforward correct implementation.
+    */
+   const void *mapped_ubo;
+   if (src.ubo == sysval_ubo) {
+      mapped_ubo = sysvals;
+   } else {
+      struct pipe_constant_buffer *cb = &buf->cb[src.ubo];
+      assert(!cb->buffer && cb->user_buffer &&
+             "only user buffers use this path");
+
+      mapped_ubo = cb->user_buffer + cb->buffer_offset;
+   }
 
    if (!mapped_ubo)
       return 0;
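For context on the hunk above: "pushing" a range from a UBO on the CPU means reading each word through the CPU mapping and copying it into the staging buffer that backs the uniform (RMU/FAU) registers. A rough self-contained sketch of that data movement, with hypothetical names (this is not the driver's actual helper):

#include <stdint.h>
#include <string.h>

/* A contiguous range of 32-bit words selected for pushing. */
struct push_range {
   unsigned offset; /* byte offset of the first word in the UBO */
   unsigned count;  /* number of words to push */
};

/* Copy pushed words from the mapped UBO into the uniform staging buffer.
 * The source read is the expensive part: cheap for malloc'd user buffers,
 * painfully uncached through a write-combine GPU BO mapping. */
static void
push_words_cpu(uint32_t *staging, const uint8_t *mapped_ubo,
               const struct push_range *r)
{
   memcpy(staging, mapped_ubo + r->offset, r->count * sizeof(uint32_t));
}

With the change above, mapped_ubo can only ever point at sysvals or a user buffer, so this copy never touches GPU-written memory.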
@@ -74,8 +74,6 @@ spec@arb_texture_rg@fbo-blending-formats@GL_R8,Fail
 spec@arb_texture_rg@fbo-blending-formats@GL_RG8,Fail
 spec@arb_texture_rg@fbo-blending-formats@GL_RG,Fail
 spec@arb_transform_feedback_instanced@draw-auto instanced,Fail
-spec@arb_uniform_buffer_object@rendering-dsa-offset,Fail
-spec@arb_uniform_buffer_object@rendering-offset,Fail
 spec@egl 1.4@eglterminate then unbind context,Fail
 spec@egl_chromium_sync_control@conformance@eglGetSyncValuesCHROMIUM_msc_and_sbc_test,Fail
 spec@egl_chromium_sync_control@conformance,Fail
@@ -66,8 +66,6 @@ spec@arb_texture_rg@fbo-blending-formats@GL_R8,Fail
 spec@arb_texture_rg@fbo-blending-formats@GL_RG8,Fail
 spec@arb_texture_rg@fbo-blending-formats@GL_RG,Fail
 spec@arb_transform_feedback_instanced@draw-auto instanced,Fail
-spec@arb_uniform_buffer_object@rendering-dsa-offset,Fail
-spec@arb_uniform_buffer_object@rendering-offset,Fail
 spec@egl 1.4@eglterminate then unbind context,Fail
 spec@egl_chromium_sync_control@conformance@eglGetSyncValuesCHROMIUM_msc_and_sbc_test,Fail
 spec@egl_chromium_sync_control@conformance,Fail
@@ -36,12 +36,16 @@ bi_is_ubo(bi_instr *ins)
           (ins->seg == BI_SEG_UBO);
 }
 
+/* For now, we only allow pushing UBO 0. This matches the Gallium convention
+ * where UBO 0 is mapped on the CPU but other UBOs are not. When we switch to
+ * pushing UBOs with a compute kernel (or CSF instructions), we can relax this.
+ */
 static bool
-bi_is_direct_aligned_ubo(bi_instr *ins)
+bi_is_pushable_ubo(bi_instr *ins)
 {
    return bi_is_ubo(ins) && (ins->src[0].type == BI_INDEX_CONSTANT) &&
           (ins->src[1].type == BI_INDEX_CONSTANT) &&
-          ((ins->src[0].value & 0x3) == 0);
+          ((ins->src[0].value & 0x3) == 0) && (ins->src[1].value == 0);
 }
 
 /* Represents use data for a single UBO */
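The new (ins->src[1].value == 0) clause is what encodes the UBO-0-only rule on Bifrost: src[1] carries the buffer index (as a resource handle, per pan_res_handle_get_index in the next hunk), and only the default UBO stays pushable. A toy version of the predicate over a mock instruction, for illustration only (struct and field names are hypothetical):

#include <stdbool.h>
#include <stdint.h>

/* Mock of the two operands of a Bifrost UBO load: src[0] is the byte
 * offset, src[1] the buffer index. */
struct mock_ubo_load {
   bool offset_is_const; /* src[0] is BI_INDEX_CONSTANT */
   bool index_is_const;  /* src[1] is BI_INDEX_CONSTANT */
   uint32_t offset;      /* src[0].value */
   uint32_t index;       /* src[1].value */
};

/* Mirrors bi_is_pushable_ubo: both operands constant, offset 4-byte
 * aligned, and buffer index 0 (the default UBO). */
static bool
mock_is_pushable(const struct mock_ubo_load *l)
{
   return l->offset_is_const && l->index_is_const &&
          (l->offset & 0x3) == 0 && l->index == 0;
}

So a constant load at offset 8 from UBO 0 is still pushed, while the same load from UBO 1, or at a misaligned offset, now takes the conventional upload path.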
@@ -69,7 +73,7 @@ bi_analyze_ranges(bi_context *ctx)
    res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block));
 
    bi_foreach_instr_global(ctx, ins) {
-      if (!bi_is_direct_aligned_ubo(ins))
+      if (!bi_is_pushable_ubo(ins))
         continue;
 
       unsigned ubo = pan_res_handle_get_index(ins->src[1].value);
@@ -130,6 +134,9 @@ bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
 void
 bi_opt_push_ubo(bi_context *ctx)
 {
+   /* We only push from the "default" UBO 0 */
+   assert(ctx->nir->info.first_ubo_is_default_ubo && "precondition");
+
    struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
    bi_pick_ubo(ctx->info.push, &analysis);
 
@@ -142,7 +149,7 @@ bi_opt_push_ubo(bi_context *ctx)
       unsigned ubo = pan_res_handle_get_index(ins->src[1].value);
       unsigned offset = ins->src[0].value;
 
-      if (!bi_is_direct_aligned_ubo(ins)) {
+      if (!bi_is_pushable_ubo(ins)) {
          /* The load can't be pushed, so this UBO needs to be
           * uploaded conventionally */
          if (ins->src[1].type == BI_INDEX_CONSTANT)
@@ -42,11 +42,15 @@ mir_is_ubo(midgard_instruction *ins)
    return (ins->type == TAG_LOAD_STORE_4) && (OP_IS_UBO_READ(ins->op));
 }
 
+/* We only allow pushing UBO 0. This matches the Gallium convention
+ * where UBO 0 is mapped on the CPU but other UBOs are not.
+ */
 static bool
-mir_is_direct_aligned_ubo(midgard_instruction *ins)
+mir_is_pushable_ubo(midgard_instruction *ins)
 {
    return mir_is_ubo(ins) && !(ins->constants.u32[0] & 0xF) &&
-          (ins->src[1] == ~0) && (ins->src[2] == ~0);
+          (ins->src[1] == ~0) && (ins->src[2] == ~0) &&
+          midgard_unpack_ubo_index_imm(ins->load_store) == 0;
 }
 
 /* Represents use data for a single UBO */
@@ -74,7 +78,7 @@ mir_analyze_ranges(compiler_context *ctx)
    res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));
 
    mir_foreach_instr_global(ctx, ins) {
-      if (!mir_is_direct_aligned_ubo(ins))
+      if (!mir_is_pushable_ubo(ins))
         continue;
 
       unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
@@ -272,6 +276,9 @@ midgard_promote_uniforms(compiler_context *ctx)
       return;
    }
 
+   /* We only push from the "default" UBO 0 */
+   assert(ctx->nir->info.first_ubo_is_default_ubo && "precondition");
+
    struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);
 
    unsigned work_count = mir_work_heuristic(ctx, &analysis);
@@ -293,7 +300,7 @@ midgard_promote_uniforms(compiler_context *ctx)
       unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
       unsigned qword = ins->constants.u32[0] / 16;
 
-      if (!mir_is_direct_aligned_ubo(ins)) {
+      if (!mir_is_pushable_ubo(ins)) {
         if (ins->src[1] == ~0)
            ctx->ubo_mask |= BITSET_BIT(ubo);
         else