panfrost: Pass alignments explicitly
In most cases, GPU data structures need only be self-aligned; the worst-case 128-byte alignment is wasteful. By passing explicit alignments, we can reduce memory usage, avoid extra allocations, and improve descriptor cache locality.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6373>
This commit is contained in:

committed by
Tomeu Vizoso

parent
1cb47f8eea
commit
373a204bdd
@@ -217,9 +217,13 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
|
|||||||
} else {
|
} else {
|
||||||
/* Otherwise, we need to upload to transient memory */
|
/* Otherwise, we need to upload to transient memory */
|
||||||
const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
|
const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
|
||||||
out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
|
struct panfrost_transfer T =
|
||||||
info->count *
|
panfrost_pool_alloc_aligned(&batch->pool,
|
||||||
info->index_size);
|
info->count * info->index_size,
|
||||||
|
info->index_size);
|
||||||
|
|
||||||
|
memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
|
||||||
|
out = T.gpu;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (needs_indices) {
|
if (needs_indices) {
|
||||||
@@ -814,7 +818,7 @@ panfrost_emit_shader_meta(struct panfrost_batch *batch,
|
|||||||
|
|
||||||
panfrost_frag_shader_meta_init(ctx, &meta, rts);
|
panfrost_frag_shader_meta_init(ctx, &meta, rts);
|
||||||
|
|
||||||
xfer = panfrost_pool_alloc(&batch->pool, desc_size);
|
xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
|
||||||
|
|
||||||
memcpy(xfer.cpu, &meta, sizeof(meta));
|
memcpy(xfer.cpu, &meta, sizeof(meta));
|
||||||
memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
|
memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
|
||||||
@@ -1106,8 +1110,8 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
|
|||||||
size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
|
size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
|
||||||
size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
|
size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
|
||||||
size_t size = sys_size + uniform_size;
|
size_t size = sys_size + uniform_size;
|
||||||
struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer transfer =
|
||||||
size);
|
panfrost_pool_alloc_aligned(&batch->pool, size, 16);
|
||||||
|
|
||||||
/* Upload sysvals requested by the shader */
|
/* Upload sysvals requested by the shader */
|
||||||
panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
|
panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
|
||||||
@@ -1125,7 +1129,10 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
|
|||||||
assert(ubo_count >= 1);
|
assert(ubo_count >= 1);
|
||||||
|
|
||||||
size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
|
size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
|
||||||
struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
|
struct panfrost_transfer ubos =
|
||||||
|
panfrost_pool_alloc_aligned(&batch->pool, sz,
|
||||||
|
MALI_UNIFORM_BUFFER_LENGTH);
|
||||||
|
|
||||||
uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
|
uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
|
||||||
|
|
||||||
/* Upload uniforms as a UBO */
|
/* Upload uniforms as a UBO */
|
||||||
@@ -1244,9 +1251,10 @@ panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
if (device->quirks & IS_BIFROST) {
|
if (device->quirks & IS_BIFROST) {
|
||||||
struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
|
||||||
MALI_BIFROST_TEXTURE_LENGTH *
|
MALI_BIFROST_TEXTURE_LENGTH *
|
||||||
ctx->sampler_view_count[stage]);
|
ctx->sampler_view_count[stage],
|
||||||
|
MALI_BIFROST_TEXTURE_LENGTH);
|
||||||
|
|
||||||
struct mali_bifrost_texture_packed *out =
|
struct mali_bifrost_texture_packed *out =
|
||||||
(struct mali_bifrost_texture_packed *) T.cpu;
|
(struct mali_bifrost_texture_packed *) T.cpu;
|
||||||
@@ -1303,7 +1311,7 @@ panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
|
|||||||
assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
|
assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
|
||||||
|
|
||||||
size_t sz = desc_size * ctx->sampler_count[stage];
|
size_t sz = desc_size * ctx->sampler_count[stage];
|
||||||
struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, sz);
|
struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
|
||||||
struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
|
struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
|
||||||
|
|
||||||
for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
|
for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
|
||||||
@@ -1324,11 +1332,13 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch,
|
|||||||
|
|
||||||
/* Worst case: everything is NPOT */
|
/* Worst case: everything is NPOT */
|
||||||
|
|
||||||
struct panfrost_transfer S = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
|
||||||
MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2);
|
MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2,
|
||||||
|
MALI_ATTRIBUTE_LENGTH);
|
||||||
|
|
||||||
struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
|
||||||
MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1));
|
MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1),
|
||||||
|
MALI_ATTRIBUTE_LENGTH);
|
||||||
|
|
||||||
struct mali_attribute_buffer_packed *bufs =
|
struct mali_attribute_buffer_packed *bufs =
|
||||||
(struct mali_attribute_buffer_packed *) S.cpu;
|
(struct mali_attribute_buffer_packed *) S.cpu;
|
||||||
@@ -1496,7 +1506,7 @@ panfrost_emit_varyings(struct panfrost_batch *batch,
|
|||||||
unsigned stride, unsigned count)
|
unsigned stride, unsigned count)
|
||||||
{
|
{
|
||||||
unsigned size = stride * count;
|
unsigned size = stride * count;
|
||||||
mali_ptr ptr = panfrost_pool_alloc(&batch->invisible_pool, size).gpu;
|
mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
|
||||||
|
|
||||||
pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
|
pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
|
||||||
cfg.stride = stride;
|
cfg.stride = stride;
|
||||||
@@ -1931,9 +1941,8 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
|
|||||||
vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
|
vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
|
||||||
fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
|
fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
|
||||||
|
|
||||||
struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
|
||||||
vs_size +
|
&batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
|
||||||
fs_size);
|
|
||||||
|
|
||||||
struct pipe_stream_output_info *so = &vs->stream_output;
|
struct pipe_stream_output_info *so = &vs->stream_output;
|
||||||
unsigned present = pan_varying_present(vs, fs, dev->quirks);
|
unsigned present = pan_varying_present(vs, fs, dev->quirks);
|
||||||
@@ -1979,8 +1988,9 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
|
|||||||
}
|
}
|
||||||
|
|
||||||
unsigned xfb_base = pan_xfb_base(present);
|
unsigned xfb_base = pan_xfb_base(present);
|
||||||
struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
|
||||||
MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets));
|
MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
|
||||||
|
MALI_ATTRIBUTE_BUFFER_LENGTH);
|
||||||
struct mali_attribute_buffer_packed *varyings =
|
struct mali_attribute_buffer_packed *varyings =
|
||||||
(struct mali_attribute_buffer_packed *) T.cpu;
|
(struct mali_attribute_buffer_packed *) T.cpu;
|
||||||
|
|
||||||
|
@@ -748,7 +748,7 @@ panfrost_batch_reserve_framebuffer(struct panfrost_batch *batch)
|
|||||||
sizeof(struct mali_single_framebuffer) :
|
sizeof(struct mali_single_framebuffer) :
|
||||||
sizeof(struct mali_framebuffer);
|
sizeof(struct mali_framebuffer);
|
||||||
|
|
||||||
batch->framebuffer = panfrost_pool_alloc(&batch->pool, size);
|
batch->framebuffer = panfrost_pool_alloc_aligned(&batch->pool, size, 64);
|
||||||
|
|
||||||
/* Tag the pointer */
|
/* Tag the pointer */
|
||||||
if (!(dev->quirks & MIDGARD_SFBD))
|
if (!(dev->quirks & MIDGARD_SFBD))
|
||||||
@@ -870,8 +870,8 @@ panfrost_load_surface(struct panfrost_batch *batch, struct pipe_surface *surf, u
|
|||||||
blend_shader = bo->gpu | b->first_tag;
|
blend_shader = bo->gpu | b->first_tag;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
|
struct panfrost_transfer transfer = panfrost_pool_alloc_aligned(&batch->pool,
|
||||||
4 * 4 * 6 * rsrc->damage.inverted_len);
|
4 * 4 * 6 * rsrc->damage.inverted_len, 64);
|
||||||
|
|
||||||
for (unsigned i = 0; i < rsrc->damage.inverted_len; ++i) {
|
for (unsigned i = 0; i < rsrc->damage.inverted_len; ++i) {
|
||||||
float *o = (float *) (transfer.cpu + (4 * 4 * 6 * i));
|
float *o = (float *) (transfer.cpu + (4 * 4 * 6 * i));
|
||||||
|
@@ -395,7 +395,7 @@ panfrost_mfbd_upload(struct panfrost_batch *batch,
|
|||||||
sizeof(struct mali_render_target) * 8;
|
sizeof(struct mali_render_target) * 8;
|
||||||
|
|
||||||
struct panfrost_transfer m_f_trans =
|
struct panfrost_transfer m_f_trans =
|
||||||
panfrost_pool_alloc(&batch->pool, total_sz);
|
panfrost_pool_alloc_aligned(&batch->pool, total_sz, 64);
|
||||||
|
|
||||||
/* Do the transfer */
|
/* Do the transfer */
|
||||||
|
|
||||||
|
@@ -290,7 +290,8 @@ panfrost_load_midg(
|
|||||||
* textures, removing the need to separately key the blit shaders for
|
* textures, removing the need to separately key the blit shaders for
|
||||||
* 2D and 3D variants */
|
* 2D and 3D variants */
|
||||||
|
|
||||||
struct panfrost_transfer texture_t = panfrost_pool_alloc(pool, MALI_MIDGARD_TEXTURE_LENGTH + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1));
|
struct panfrost_transfer texture_t = panfrost_pool_alloc_aligned(
|
||||||
|
pool, MALI_MIDGARD_TEXTURE_LENGTH + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1), 128);
|
||||||
|
|
||||||
panfrost_new_texture(texture_t.cpu,
|
panfrost_new_texture(texture_t.cpu,
|
||||||
image->width0, image->height0,
|
image->width0, image->height0,
|
||||||
@@ -311,7 +312,9 @@ panfrost_load_midg(
|
|||||||
pan_pack(sampler.cpu, MIDGARD_SAMPLER, cfg)
|
pan_pack(sampler.cpu, MIDGARD_SAMPLER, cfg)
|
||||||
cfg.normalized_coordinates = false;
|
cfg.normalized_coordinates = false;
|
||||||
|
|
||||||
struct panfrost_transfer shader_meta_t = panfrost_pool_alloc(pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt));
|
struct panfrost_transfer shader_meta_t = panfrost_pool_alloc_aligned(
|
||||||
|
pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt), 128);
|
||||||
|
|
||||||
memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta));
|
memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta));
|
||||||
|
|
||||||
for (unsigned i = 0; i < 8; ++i) {
|
for (unsigned i = 0; i < 8; ++i) {
|
||||||
|
@@ -27,9 +27,6 @@
|
|||||||
#include "pan_bo.h"
|
#include "pan_bo.h"
|
||||||
#include "pan_pool.h"
|
#include "pan_pool.h"
|
||||||
|
|
||||||
/* TODO: What does this actually have to be? */
|
|
||||||
#define ALIGNMENT 128
|
|
||||||
|
|
||||||
/* Transient command stream pooling: command stream uploads try to simply copy
|
/* Transient command stream pooling: command stream uploads try to simply copy
|
||||||
* into wherever we left off. If there isn't space, we allocate a new entry
|
* into wherever we left off. If there isn't space, we allocate a new entry
|
||||||
* into the pool and copy there */
|
* into the pool and copy there */
|
||||||
@@ -80,14 +77,11 @@ panfrost_create_pool(void *memctx, struct panfrost_device *dev,
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct panfrost_transfer
|
struct panfrost_transfer
|
||||||
panfrost_pool_alloc(struct pan_pool *pool, size_t sz)
|
panfrost_pool_alloc_aligned(struct pan_pool *pool, size_t sz, unsigned alignment)
|
||||||
{
|
{
|
||||||
/* Pad the size */
|
|
||||||
sz = ALIGN_POT(sz, ALIGNMENT);
|
|
||||||
|
|
||||||
/* Find or create a suitable BO */
|
/* Find or create a suitable BO */
|
||||||
struct panfrost_bo *bo = pool->transient_bo;
|
struct panfrost_bo *bo = pool->transient_bo;
|
||||||
unsigned offset = pool->transient_offset;
|
unsigned offset = ALIGN_POT(pool->transient_offset, alignment);
|
||||||
|
|
||||||
/* If we don't fit, allocate a new backing */
|
/* If we don't fit, allocate a new backing */
|
||||||
if (unlikely(bo == NULL || (offset + sz) >= TRANSIENT_SLAB_SIZE)) {
|
if (unlikely(bo == NULL || (offset + sz) >= TRANSIENT_SLAB_SIZE)) {
|
||||||
@@ -96,7 +90,7 @@ panfrost_pool_alloc(struct pan_pool *pool, size_t sz)
|
|||||||
offset = 0;
|
offset = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
pool->transient_offset += sz;
|
pool->transient_offset = offset + sz;
|
||||||
|
|
||||||
struct panfrost_transfer ret = {
|
struct panfrost_transfer ret = {
|
||||||
.cpu = bo->cpu + offset,
|
.cpu = bo->cpu + offset,
|
||||||
@@ -104,7 +98,6 @@ panfrost_pool_alloc(struct pan_pool *pool, size_t sz)
|
|||||||
};
|
};
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mali_ptr
|
mali_ptr
|
||||||
|
@@ -61,6 +61,17 @@ struct panfrost_transfer {
|
|||||||
mali_ptr gpu;
|
mali_ptr gpu;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct panfrost_transfer
|
||||||
|
panfrost_pool_alloc_aligned(struct pan_pool *pool, size_t sz, unsigned alignment);
|
||||||
|
|
||||||
|
/* Default to self-alignment */
|
||||||
|
|
||||||
|
static inline struct panfrost_transfer
|
||||||
|
panfrost_pool_alloc(struct pan_pool *pool, size_t sz)
|
||||||
|
{
|
||||||
|
return panfrost_pool_alloc_aligned(pool, sz, util_next_power_of_two(sz));
|
||||||
|
}
|
||||||
|
|
||||||
struct panfrost_transfer
|
struct panfrost_transfer
|
||||||
panfrost_pool_alloc(struct pan_pool *pool, size_t sz);
|
panfrost_pool_alloc(struct pan_pool *pool, size_t sz);
|
||||||
|
|
||||||
|
@@ -145,7 +145,8 @@ panfrost_new_job(
|
|||||||
if (inject)
|
if (inject)
|
||||||
job.next_job = scoreboard->first_job;
|
job.next_job = scoreboard->first_job;
|
||||||
|
|
||||||
struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + payload_size);
|
struct panfrost_transfer transfer =
|
||||||
|
panfrost_pool_alloc_aligned(pool, sizeof(job) + payload_size, 64);
|
||||||
memcpy(transfer.cpu, &job, sizeof(job));
|
memcpy(transfer.cpu, &job, sizeof(job));
|
||||||
memcpy(transfer.cpu + sizeof(job), payload, payload_size);
|
memcpy(transfer.cpu + sizeof(job), payload, payload_size);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user