panfrost: Implement instanced rendering
We implement GLES 3.0 instanced rendering with full support for instanced arrays (via instance divisors). To do so, we use the new invocation helpers to invoke a triplet of (1, vertex_count, instance_count), rather than simply (1, vertex_count, 1).

We rewrite the attribute handling code into a new pan_instancing.c file, which handles both the simple LINEAR case for non-instanced draws as well as each of the new instancing cases: MODULO (for per-vertex attributes), and POT and NPOT divisors.

As a side effect, we rework how vertex buffers are handled, duplicating them to be 1:1 with vertex descriptors to simplify the instancing code paths dramatically. Whether this is a performance regression remains to be seen; if so, we can always deduplicate later with some added logic in pan_instancing.c.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
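(For context, the attribute addressing modes named above can be summarized with a small reference model. This is an illustrative sketch, not driver code: attr_read_index, linear_id, padded, and gl_divisor are hypothetical names, and the behavior is inferred from the comments in the diff below — MODULO for per-vertex attributes, and a hardware-level divisor equal to the instance divisor times the padded vertex count for instanced arrays.)

/* Sketch: the index an attribute fetch works from, given the linear
 * invocation ID produced by invoking (1, vertex_count, instance_count).
 * Conceptually, linear_id = instance_id * padded_num_vertices + vertex_id. */
static unsigned
attr_read_index(unsigned linear_id, unsigned padded, unsigned gl_divisor)
{
        /* LINEAR (non-instanced draws) would just use linear_id as-is */

        if (gl_divisor == 0) {
                /* MODULO: recover the vertex ID for per-vertex attributes */
                return linear_id % padded;
        }

        /* Instanced arrays: divide by (padded * gl_divisor). POT_DIVIDE
         * implements this with a shift, NPOT_DIVIDE with a magic-number
         * multiply; both compute the same quotient. */
        return linear_id / (padded * gl_divisor);
}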
@@ -834,8 +834,9 @@ struct mali_attr_meta {
         /* Always observed to be zero at the moment */
         unsigned unknown3 : 2;
 
-        /* When packing multiple attributes in a buffer, offset addresses by this value */
-        uint32_t src_offset;
+        /* When packing multiple attributes in a buffer, offset addresses by
+         * this value. Obscurely, this is signed. */
+        int32_t src_offset;
 } __attribute__((packed));
 
 enum mali_fbd_type {
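(The signed type matters: the per-instance fixup added to panfrost_stage_attributes later in this diff subtracts buf->stride * start from src_offset, which can drive it negative.)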
@@ -1061,7 +1062,16 @@ struct midgard_payload_vertex_tiler {
         u32 zero3;
 #endif
 
-        u32 gl_enables; // 0x5
+        u16 gl_enables; // 0x5
+
+        /* Both zero for non-instanced draws. For instanced draws, a
+         * decomposition of padded_num_vertices. See the comments about the
+         * corresponding fields in mali_attr for context. */
+
+        unsigned instance_shift : 5;
+        unsigned instance_odd : 3;
+
+        u8 zero4;
 
         /* Offset for first vertex in buffer */
         u32 draw_start;
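(The two new payload fields store padded_num_vertices factored as an odd number times a power of two; an illustrative decode, matching the expansion pandecode performs later in this diff:)

/* Sketch, not driver code: recover the padded vertex count */
unsigned padded_num_vertices = (2 * v->instance_odd + 1) << v->instance_shift;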
@@ -58,6 +58,7 @@ files_panfrost = files(
   'pan_pretty_print.c',
   'pan_fragment.c',
   'pan_invocation.c',
+  'pan_instancing.c',
   'pan_scoreboard.c',
   'pan_sfbd.c',
   'pan_mfbd.c',
@@ -1255,7 +1255,9 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
 
                 /* Get the base type of the intrinsic */
-                nir_alu_type t = nir_intrinsic_type(instr);
+                /* TODO: Infer type? Does it matter? */
+                nir_alu_type t =
+                        is_ubo ? nir_type_uint : nir_intrinsic_type(instr);
                 t = nir_alu_type_get_base_type(t);
 
                 if (!is_ubo) {
@@ -552,7 +552,7 @@ panfrost_emit_point_coord(union mali_attr *slot)
 static void
 panfrost_emit_varying_descriptor(
         struct panfrost_context *ctx,
-        unsigned invocation_count)
+        unsigned vertex_count)
 {
         /* Load the shaders */
 
@@ -638,19 +638,19 @@ panfrost_emit_varying_descriptor(
         unsigned idx = 0;
 
         panfrost_emit_varyings(ctx, &varyings[idx++], num_gen_varyings * 16,
-                               invocation_count);
+                               vertex_count);
 
         /* fp32 vec4 gl_Position */
         ctx->payload_tiler.postfix.position_varying =
                 panfrost_emit_varyings(ctx, &varyings[idx++],
-                                       sizeof(float) * 4, invocation_count);
+                                       sizeof(float) * 4, vertex_count);
 
 
         if (vs->writes_point_size || fs->reads_point_coord) {
                 /* fp16 vec1 gl_PointSize */
                 ctx->payload_tiler.primitive_size.pointer =
                         panfrost_emit_varyings(ctx, &varyings[idx++],
-                                               2, invocation_count);
+                                               2, vertex_count);
         }
 
         if (fs->reads_point_coord) {
@@ -663,7 +663,7 @@ panfrost_emit_varying_descriptor(
         ctx->payload_tiler.postfix.varyings = varyings_p;
 }
 
-static mali_ptr
+mali_ptr
 panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i)
 {
         struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i];
@@ -672,48 +672,6 @@ panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i)
         return rsrc->bo->gpu + buf->buffer_offset;
 }
 
-/* Emits attributes and varying descriptors, which should be called every draw,
- * excepting some obscure circumstances */
-
-static void
-panfrost_emit_vertex_data(struct panfrost_context *ctx, struct panfrost_job *job)
-{
-        /* Staged mali_attr, and index into them. i =/= k, depending on the
-         * vertex buffer mask */
-        union mali_attr attrs[PIPE_MAX_ATTRIBS];
-        unsigned k = 0;
-
-        unsigned invocation_count = MALI_NEGATIVE(ctx->payload_tiler.prefix.invocation_count);
-
-        for (int i = 0; i < ARRAY_SIZE(ctx->vertex_buffers); ++i) {
-                if (!(ctx->vb_mask & (1 << i))) continue;
-
-                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i];
-                struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);
-
-                if (!rsrc) continue;
-
-                /* Align to 64 bytes by masking off the lower bits. This
-                 * will be adjusted back when we fixup the src_offset in
-                 * mali_attr_meta */
-
-                mali_ptr addr = panfrost_vertex_buffer_address(ctx, i) & ~63;
-
-                /* Offset vertex count by draw_start to make sure we upload enough */
-                attrs[k].stride = buf->stride;
-                attrs[k].size = rsrc->base.width0;
-
-                panfrost_job_add_bo(job, rsrc->bo);
-                attrs[k].elements = addr | MALI_ATTR_LINEAR;
-
-                ++k;
-        }
-
-        ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, k * sizeof(union mali_attr));
-
-        panfrost_emit_varying_descriptor(ctx, invocation_count);
-}
-
 static bool
 panfrost_writes_point_size(struct panfrost_context *ctx)
 {
@@ -759,12 +717,24 @@ panfrost_stage_attributes(struct panfrost_context *ctx)
          * QED.
          */
 
+        unsigned start = ctx->payload_vertex.draw_start;
+
         for (unsigned i = 0; i < so->num_elements; ++i) {
                 unsigned vbi = so->pipe[i].vertex_buffer_index;
+                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                 mali_ptr addr = panfrost_vertex_buffer_address(ctx, vbi);
 
                 /* Adjust by the masked off bits of the offset */
                 target[i].src_offset += (addr & 63);
+
+                /* Also, somewhat obscurely, per-instance data needs to be
+                 * offset in response to a delayed start in an indexed draw */
+
+                if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start) {
+                        target[i].src_offset -= buf->stride * start;
+                }
+
         }
 
         ctx->payload_vertex.postfix.attribute_meta = transfer.gpu;
@@ -1028,7 +998,11 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
         struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
 
         if (with_vertex_data) {
-                panfrost_emit_vertex_data(ctx, job);
+                panfrost_emit_vertex_data(job);
+
+                /* Varyings emitted for -all- geometry */
+                unsigned total_count = ctx->padded_count * ctx->instance_count;
+                panfrost_emit_varying_descriptor(ctx, total_count);
         }
 
         bool msaa = ctx->rasterizer->base.multisample;
@@ -1580,9 +1554,11 @@ panfrost_get_index_buffer_mapped(struct panfrost_context *ctx, const struct pipe
         struct panfrost_resource *rsrc = (struct panfrost_resource *) (info->index.resource);
 
         off_t offset = info->start * info->index_size;
+        struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
 
         if (!info->has_user_indices) {
                 /* Only resources can be directly mapped */
+                panfrost_job_add_bo(batch, rsrc->bo);
                 return rsrc->bo->gpu + offset;
         } else {
                 /* Otherwise, we need to upload to transient memory */
@@ -1657,6 +1633,7 @@ panfrost_draw_vbo(
         ctx->payload_tiler.prefix.draw_mode = g2m_draw_mode(mode);
 
         ctx->vertex_count = info->count;
+        ctx->instance_count = info->instance_count;
 
         /* For non-indexed draws, they're the same */
         unsigned vertex_count = ctx->vertex_count;
@@ -1673,9 +1650,20 @@ panfrost_draw_vbo(
 
         /* For higher amounts of vertices (greater than what fits in a 16-bit
          * short), the other value is needed, otherwise there will be bizarre
-         * rendering artefacts. It's not clear what these values mean yet. */
+         * rendering artefacts. It's not clear what these values mean yet. This
+         * change is also needed for instancing and sometimes points (perhaps
+         * related to dynamically setting gl_PointSize) */
 
-        draw_flags |= (mode == PIPE_PRIM_POINTS || ctx->vertex_count > 65535) ? 0x3000 : 0x18000;
+        bool is_points = mode == PIPE_PRIM_POINTS;
+        bool many_verts = ctx->vertex_count > 0xFFFF;
+        bool instanced = ctx->instance_count > 1;
+
+        draw_flags |= (is_points || many_verts || instanced) ? 0x3000 : 0x18000;
+
+        /* This doesn't make much sense */
+        if (mode == PIPE_PRIM_LINE_STRIP) {
+                draw_flags |= 0x800;
+        }
 
         if (info->index_size) {
                 /* Calculate the min/max index used so we can figure out how
@@ -1721,11 +1709,42 @@ panfrost_draw_vbo(
         panfrost_pack_work_groups_fused(
                 &ctx->payload_vertex.prefix,
                 &ctx->payload_tiler.prefix,
-                1, vertex_count, 1,
+                1, vertex_count, info->instance_count,
                 1, 1, 1);
 
         ctx->payload_tiler.prefix.unknown_draw = draw_flags;
 
+        /* Encode the padded vertex count */
+
+        if (info->instance_count > 1) {
+                /* Triangles have non-even vertex counts so they change how
+                 * padding works internally */
+
+                bool is_triangle =
+                        mode == PIPE_PRIM_TRIANGLES ||
+                        mode == PIPE_PRIM_TRIANGLE_STRIP ||
+                        mode == PIPE_PRIM_TRIANGLE_FAN;
+
+                struct pan_shift_odd so =
+                        panfrost_padded_vertex_count(vertex_count, !is_triangle);
+
+                ctx->payload_vertex.instance_shift = so.shift;
+                ctx->payload_tiler.instance_shift = so.shift;
+
+                ctx->payload_vertex.instance_odd = so.odd;
+                ctx->payload_tiler.instance_odd = so.odd;
+
+                ctx->padded_count = pan_expand_shift_odd(so);
+        } else {
+                ctx->padded_count = ctx->vertex_count;
+
+                /* Reset instancing state */
+                ctx->payload_vertex.instance_shift = 0;
+                ctx->payload_vertex.instance_odd = 0;
+                ctx->payload_tiler.instance_shift = 0;
+                ctx->payload_tiler.instance_odd = 0;
+        }
+
         /* Fire off the draw itself */
         panfrost_queue_draw(ctx);
 }
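(Two worked examples of the shift/odd encoding, traced by hand through the pan_instancing.c helpers added below; the values assume the lookup table and high-bits heuristic quoted in that file.)

/* vertex_count = 17: small_lut[17] = { 2, 9 }, so
 *   instance_shift = ctz(2) = 1, instance_odd = 9 >> 1 = 4,
 *   padded_count = (2*4 + 1) << 1 = 18
 *
 * vertex_count = 100 (0b1100100): top nibble 0b1100 starts at bit 3,
 * middle two bits 0b10, giving pan_factored(1 << 4, 7):
 *   instance_shift = 4, instance_odd = 7 >> 1 = 3,
 *   padded_count = (2*3 + 1) << 4 = 112 */
struct pan_shift_odd so = panfrost_padded_vertex_count(17, false);
assert(pan_expand_shift_odd(so) == 18);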
@@ -1807,7 +1826,7 @@ panfrost_create_vertex_elements_state(
                 panfrost_allocate_chunk(pan_context(pctx), 0, HEAP_DESCRIPTOR);
 
         for (int i = 0; i < num_elements; ++i) {
-                so->hw[i].index = elements[i].vertex_buffer_index;
+                so->hw[i].index = i;
 
                 enum pipe_format fmt = elements[i].src_format;
                 const struct util_format_description *desc = util_format_description(fmt);
@@ -152,6 +152,11 @@ struct panfrost_context {
         int dirty;
 
         unsigned vertex_count;
+        unsigned instance_count;
+
+        /* If instancing is enabled, vertex count padded for instance; if
+         * it is disabled, just equal to plain vertex count */
+        unsigned padded_count;
 
         union mali_attr attributes[PIPE_MAX_ATTRIBS];
@@ -364,6 +369,27 @@ panfrost_pack_work_groups_fused(
         unsigned size_y,
         unsigned size_z);
 
+/* Instancing */
+
+mali_ptr
+panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i);
+
+void
+panfrost_emit_vertex_data(struct panfrost_job *batch);
+
+struct pan_shift_odd {
+        unsigned shift;
+        unsigned odd;
+};
+
+struct pan_shift_odd
+panfrost_padded_vertex_count(
+        unsigned vertex_count,
+        bool primitive_pot);
+
+unsigned
+pan_expand_shift_odd(struct pan_shift_odd o);
+
 #endif
src/gallium/drivers/panfrost/pan_instancing.c (new file, 341 lines)
@@ -0,0 +1,341 @@
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_context.h"

/* See mali_job for notes on how this works. But basically, for small vertex
 * counts, we have a lookup table, and for large vertex counts, we look at the
 * high bits as a heuristic. This has to match exactly how the hardware
 * calculates this (which is why the algorithm is so weird) or else instancing
 * will break. */

/* Given an odd number (of the form 2k + 1), compute k */
#define ODD(odd) ((odd - 1) >> 1)

/* Given the shift/odd pair, recover the original padded integer */

unsigned
pan_expand_shift_odd(struct pan_shift_odd o)
{
        unsigned odd = 2*o.odd + 1;
        unsigned shift = 1 << o.shift;
        return odd * shift;
}

static inline struct pan_shift_odd
pan_factored(unsigned pot, unsigned odd)
{
        struct pan_shift_odd out;

        assert(util_is_power_of_two_or_zero(pot));
        assert(odd & 1);

        /* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1.
         *
         * So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1)
         *               = k | 0 = k */

        out.odd = (odd >> 1);

        /* POT is of the form (1 << shift) */
        out.shift = __builtin_ctz(pot);

        return out;
}

/* For small vertex counts. Second argument is whether the primitive takes a
 * power-of-two argument, which determines how rounding works. True for POINTS
 * and LINES, false for TRIANGLES. Presumably true for QUADS but you'd be crazy
 * to try instanced quads on ES class hardware <3 */

static struct {
        unsigned pot;
        unsigned odd;
} small_lut[] = {
        {  0, 1 },
        {  1, 1 },
        {  2, 1 },
        {  1, 3 },
        {  4, 1 },
        {  1, 5 },
        {  2, 3 },
        {  1, 7 },
        {  8, 1 },
        {  1, 9 },
        {  2, 5 },
        {  4, 3 }, /* 11 */
        {  4, 3 },
        {  2, 7 }, /* 13 */
        {  2, 7 },
        { 16, 1 }, /* 15 */
        { 16, 1 },
        {  2, 9 },
        {  4, 5 }, /* 20 */
        {  4, 5 }
};

static struct pan_shift_odd
panfrost_small_padded_vertex_count(unsigned idx)
{
        return pan_factored(
                        small_lut[idx].pot,
                        small_lut[idx].odd);
}

static struct pan_shift_odd
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        struct pan_shift_odd out = { 0 };

        /* First, we have to find the highest set one */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we mask out the highest 4-bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottommost bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0b00:
                if (nibble & 1)
                        return pan_factored(1 << n, 9);
                else
                        return pan_factored(1 << (n + 1), 5);
        case 0b01:
                return pan_factored(1 << (n + 2), 3);
        case 0b10:
                return pan_factored(1 << (n + 1), 7);
        case 0b11:
                return pan_factored(1 << (n + 4), 1);
        default:
                unreachable("Invalid two bits");
        }

        return out;
}

struct pan_shift_odd
panfrost_padded_vertex_count(
        unsigned vertex_count,
        bool pot)
{
        assert(vertex_count > 0);

        if (vertex_count < 20) {
                /* Add an off-by-one if it won't align naturally (quirk of the hardware) */
                //if (!pot)
                //        vertex_count++;

                return panfrost_small_padded_vertex_count(vertex_count);
        } else
                return panfrost_large_padded_vertex_count(vertex_count);
}

/* The much, much more irritating case -- instancing is enabled. See
 * panfrost_job.h for notes on how this works */

static unsigned
panfrost_vertex_instanced(
        struct panfrost_job *batch,
        struct panfrost_resource *rsrc,
        unsigned divisor,
        union mali_attr *attrs,
        mali_ptr addr,
        unsigned vertex_count,
        unsigned instance_count)
{
        /* First, grab the padded vertex count */

        struct pan_shift_odd o = {
                .shift = batch->ctx->payload_tiler.instance_shift,
                .odd = batch->ctx->payload_tiler.instance_odd,
        };

        unsigned padded_count = batch->ctx->padded_count;

        /* Depending on whether there is an instance divisor, packing varies.
         * When there is a divisor, the hardware-level divisor is actually the
         * product of the instance divisor and the padded count */

        unsigned hw_divisor = padded_count * divisor;

        if (divisor == 0) {
                /* Per-vertex attributes use the MODULO mode. First, compute
                 * the modulus */

                attrs->elements |= MALI_ATTR_MODULO;
                attrs->shift = o.shift;
                attrs->extra_flags = o.odd;

                return 1;
        } else if (util_is_power_of_two_or_zero(hw_divisor)) {
                /* If there is a divisor but the hardware divisor works out to
                 * a power of two (not terribly exceptional), we can use an
                 * easy path (just shifting) */

                attrs->elements |= MALI_ATTR_POT_DIVIDE;
                attrs->shift = __builtin_ctz(hw_divisor);

                return 1;
        } else {
                /* We have an NPOT divisor. Here's the fun one (multiplying by
                 * the inverse and shifting) */

                /* floor(log2(d)) */
                unsigned shift = util_logbase2(hw_divisor);

                /* m = ceil(2^(32 + shift) / d) */
                uint64_t shift_hi = 32 + shift;
                uint64_t t = 1ll << shift_hi;
                double t_f = t;
                double hw_divisor_d = hw_divisor;
                double m_f = ceil(t_f / hw_divisor_d);
                unsigned m = m_f;

                /* Default case */
                unsigned magic_divisor = m, extra_flags = 0;

                /* e = 2^(shift + 32) % d */
                uint64_t e = t % hw_divisor;

                /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
                 * seems to use a different condition */
                if (e <= (1 << shift)) {
                        magic_divisor = m - 1;
                        extra_flags = 1;
                }

                /* Top flag implicitly set */
                assert(magic_divisor & (1 << 31));
                magic_divisor &= ~(1 << 31);

                /* Upload to two different slots */

                attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
                attrs[0].shift = shift;
                attrs[0].extra_flags = extra_flags;

                attrs[1].unk = 0x20;
                attrs[1].magic_divisor = magic_divisor;
                attrs[1].zero = 0;
                attrs[1].divisor = divisor;

                return 2;
        }
}

void
panfrost_emit_vertex_data(struct panfrost_job *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;

        /* Staged mali_attr, and index into them. i =/= k, depending on the
         * vertex buffer mask and instancing. Twice as much room is allocated,
         * for a worst case of NPOT_DIVIDEs which take up an extra slot */
        union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
        unsigned k = 0;

        unsigned vertex_count = ctx->vertex_count;
        unsigned instanced_count = ctx->instance_count;

        for (unsigned i = 0; i < so->num_elements; ++i) {
                /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
                 * means duplicating some vertex buffers (who cares? aside from
                 * maybe some caching implications but I somehow doubt that
                 * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;

                /* The exception to 1:1 mapping is that we can have multiple
                 * entries (NPOT divisors), so we fix up anyways */

                so->hw[i].index = k;

                if (!(ctx->vb_mask & (1 << vbi))) continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);

                if (!rsrc) continue;

                /* Align to 64 bytes by masking off the lower bits. This
                 * will be adjusted back when we fixup the src_offset in
                 * mali_attr_meta */

                mali_ptr raw_addr = panfrost_vertex_buffer_address(ctx, vbi);
                mali_ptr addr = raw_addr & ~63;
                unsigned chopped_addr = raw_addr - addr;

                /* Add a dependency of the batch on the vertex buffer */
                panfrost_job_add_bo(batch, rsrc->bo);

                /* Set common fields */
                attrs[k].elements = addr;
                attrs[k].stride = buf->stride;
                attrs[k].size = rsrc->base.width0;

                /* We need to add the extra size we masked off (for
                 * correctness) so the data doesn't get clamped away */
                attrs[k].size += chopped_addr;

                /* Instancing uses a dramatically different code path than
                 * linear, so dispatch for the actual emission now that the
                 * common code is finished */

                unsigned divisor = elem->instance_divisor;

                if (divisor && instanced_count == 1) {
                        /* Silly corner case where there's a divisor(=1) but
                         * there's no legitimate instancing. So we want *every*
                         * attribute to be the same. So set stride to zero so
                         * we don't go anywhere. */

                        attrs[k].size = attrs[k].stride + chopped_addr;
                        attrs[k].stride = 0;
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else if (instanced_count <= 1) {
                        /* Normal, non-instanced attributes */
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else {
                        k += panfrost_vertex_instanced(
                                     batch, rsrc, divisor, &attrs[k], addr,
                                     vertex_count, instanced_count);
                }
        }

        /* Upload whatever we emitted and go */

        ctx->payload_vertex.postfix.attributes =
                panfrost_upload_transient(ctx, attrs, k * sizeof(union mali_attr));
}
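(A standalone sketch of the NPOT arithmetic above for hw_divisor = 6, using an integer ceiling in place of the double-precision computation. Treating extra_flags as "add one to the dividend" is an assumption: it matches the round-down variant of magic-number division and the way pandecode re-inverts the value later in this diff, but the hardware's exact rule is only guessed at, per the XXX comment above.)

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint32_t d = 6;                              /* hw_divisor */
        unsigned shift = 2;                          /* floor(log2(6)) */
        uint64_t t = 1ull << (32 + shift);           /* 2^34 */
        uint32_t m = (uint32_t) ((t + d - 1) / d);   /* ceil: 0xAAAAAAAB */
        unsigned extra = 0;

        if (t % d <= (1u << shift)) {                /* e = 4 <= 4: round down */
                m--;                                 /* 0xAAAAAAAA */
                extra = 1;
        }

        /* The top bit is implicit, so the descriptor would carry 0x2AAAAAAA */
        assert(m & (1u << 31));

        /* The magic multiply reproduces true division */
        for (uint32_t n = 0; n < 100000; ++n) {
                uint32_t q = (uint32_t) (((uint64_t) (n + extra) * m) >> (32 + shift));
                assert(q == n / d);
        }

        return 0;
}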
@@ -120,6 +120,7 @@ panfrost_pack_work_groups_fused(
         tiler->size_y_shift = vertex->size_y_shift;
         tiler->size_z_shift = vertex->size_z_shift;
         tiler->workgroups_x_shift = vertex->workgroups_x_shift;
+        tiler->workgroups_x_shift_2 = vertex->workgroups_x_shift_2;
         tiler->workgroups_y_shift = vertex->workgroups_y_shift;
         tiler->workgroups_z_shift = vertex->workgroups_z_shift;
 
@@ -118,6 +118,10 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param)
+        case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+                return 1;
+
+        /* TODO: Where does this req come from in practice? */
         case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
                 return 1;
 
         case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
                 return 4096;
         case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
@@ -859,6 +859,100 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
         return MALI_NEGATIVE(fb->rt_count_1);
 }
 
+/* Just add a comment decoding the shift/odd fields forming the padded vertices
+ * count */
+
+static void
+pandecode_padded_vertices(unsigned shift, unsigned k)
+{
+        unsigned odd = 2*k + 1;
+        unsigned pot = 1 << shift;
+        pandecode_msg("padded_num_vertices = %d\n", odd * pot);
+}
+
+/* Given a magic divisor, recover what we were trying to divide by.
+ *
+ * Let m represent the magic divisor. By definition, m is an element of Z,
+ * where 0 <= m < 2^N, for N bits in m.
+ *
+ * Let q represent the number we would like to divide by.
+ *
+ * By definition of a magic divisor for N-bit unsigned integers (a number you
+ * multiply by to magically get division), m is a number such that:
+ *
+ *      (m * x) & (2^N - 1) = floor(x/q)
+ *      for all x in Z where 0 <= x < 2^N
+ *
+ * Ignore the case where any of the above values equals zero; it is irrelevant
+ * for our purposes (instanced arrays).
+ *
+ * Choose x = q. Then:
+ *
+ *      (m * x) & (2^N - 1) = floor(x/q)
+ *      (m * q) & (2^N - 1) = floor(q/q)
+ *
+ * floor(q/q) = floor(1) = 1, therefore:
+ *
+ *      (m * q) & (2^N - 1) = 1
+ *
+ * Recall the identity that ANDing with one less than a power of two equals
+ * the modulo by that power of two, i.e. for all x:
+ *
+ *      x & (2^N - 1) = x % 2^N
+ *
+ * Therefore:
+ *
+ *      mq % 2^N = 1
+ *
+ * By definition, a modular multiplicative inverse of a number m is the number
+ * q such that with respect to a modulus M:
+ *
+ *      mq % M = 1
+ *
+ * Therefore, q is the modular multiplicative inverse of m with modulus 2^N.
+ *
+ */
+
+static void
+pandecode_magic_divisor(uint32_t magic, unsigned shift, unsigned orig_divisor, unsigned extra)
+{
+        /* Compute the modular inverse of `magic` with respect to 2^(32 -
+         * shift) the most lame way possible... just repeatedly add.
+         * Asymptotically slow but nobody cares in practice, unless you have
+         * massive numbers of vertices or high divisors. */
+
+        unsigned inverse = 0;
+
+        /* Magic implicitly has the highest bit set */
+        magic |= (1 << 31);
+
+        /* Depending on rounding direction */
+        if (extra)
+                magic++;
+
+        for (;;) {
+                uint32_t product = magic * inverse;
+
+                if (shift) {
+                        product >>= shift;
+                }
+
+                if (product == 1)
+                        break;
+
+                ++inverse;
+        }
+
+        pandecode_msg("dividing by %d (maybe off by two)\n", inverse);
+
+        /* Recall we're supposed to divide by (gl_level_divisor *
+         * padded_num_vertices) */
+
+        unsigned padded_num_vertices = inverse / orig_divisor;
+
+        pandecode_msg("padded_num_vertices = %d\n", padded_num_vertices);
+}
+
 static void
 pandecode_replay_attributes(const struct pandecode_mapped_memory *mem,
                             mali_ptr addr, int job_no, char *suffix,
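(A toy check of the inverse relationship this derivation sets up, with N = 8 and q = 3, scanning for the inverse exactly the way pandecode_magic_divisor does, minus the shift handling. Illustrative values only.)

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        /* 3 * 171 = 513 = 2*256 + 1, so 171 is the multiplicative
         * inverse of 3 modulo 2^8 */
        uint32_t q = 3, inverse = 0;

        while (((q * inverse) & 0xFF) != 1)
                ++inverse;

        assert(inverse == 171);
        return 0;
}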
@@ -905,9 +999,9 @@ pandecode_replay_attributes(const struct pandecode_mapped_memory *mem,
                 /* Decode further where possible */
 
                 if (mode == MALI_ATTR_MODULO) {
-                        unsigned odd = (2 * attr[i].extra_flags) + 1;
-                        unsigned pot = (1 << attr[i].shift);
-                        pandecode_msg("padded_num_vertices = %d\n", odd * pot);
+                        pandecode_padded_vertices(
+                                attr[i].shift,
+                                attr[i].extra_flags);
                 }
 
                 pandecode_indent--;
@@ -922,6 +1016,7 @@ pandecode_replay_attributes(const struct pandecode_mapped_memory *mem,
                 if (attr[i].zero != 0)
                         pandecode_prop("zero = 0x%x /* XXX zero tripped */", attr[i].zero);
                 pandecode_prop("divisor = %d", attr[i].divisor);
+                pandecode_magic_divisor(attr[i].magic_divisor, attr[i - 1].shift, attr[i].divisor, attr[i - 1].extra_flags);
                 pandecode_indent--;
                 pandecode_log("}, \n");
         }
@@ -1114,7 +1209,7 @@ pandecode_replay_attribute_meta(int job_no, int count, const struct mali_vertex_
         pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1);
         pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3);
-        pandecode_prop("src_offset = 0x%" PRIx64, (u64) attr_meta->src_offset);
+        pandecode_prop("src_offset = %d", attr_meta->src_offset);
         pandecode_indent--;
         pandecode_log("},\n");
@@ -2040,6 +2135,15 @@ pandecode_replay_vertex_or_tiler_job_mdg(const struct mali_job_descriptor_header
         pandecode_replay_gl_enables(v->gl_enables, h->job_type);
 
+        if (v->instance_shift || v->instance_odd) {
+                pandecode_prop("instance_shift = 0x%d /* %d */",
+                               v->instance_shift, 1 << v->instance_shift);
+                pandecode_prop("instance_odd = 0x%X /* %d */",
+                               v->instance_odd, (2 * v->instance_odd) + 1);
+
+                pandecode_padded_vertices(v->instance_shift, v->instance_odd);
+        }
+
         if (v->draw_start)
                 pandecode_prop("draw_start = %d", v->draw_start);