panfrost: Use compute-based XFB on Midgard

Now we're back to a single XFB (transform feedback) implementation for all GPU generations. Fixes:

   KHR-GLES31.core.draw_indirect.advanced-twoPasses-transformFeedback-arrays
   KHR-GLES31.core.draw_indirect.advanced-twoPasses-transformFeedback-elements

Cc: mesa-stable
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19238>
(cherry picked from commit 0955fe8fe2)
This commit is contained in:
Alyssa Rosenzweig
2022-10-21 12:28:35 -04:00
committed by Dylan Baker
parent ca852a09af
commit 0add4f1f1a
9 changed files with 47 additions and 134 deletions

View File

@@ -2416,7 +2416,7 @@
"description": "panfrost: Use compute-based XFB on Midgard",
"nominated": true,
"nomination_type": 0,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null
},

View File

@@ -47,7 +47,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
nir_shader *s = nir_shader_clone(NULL, ir);
if (dev->arch >= 6 && s->xfb_info && !s->info.internal) {
if (s->xfb_info && !s->info.internal) {
/* Create compute shader doing transform feedback */
nir_shader *xfb = nir_shader_clone(NULL, s);
xfb->info.name = ralloc_asprintf(xfb, "%s@xfb", xfb->info.name);

View File

@@ -2001,9 +2001,14 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch,
unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
(PAN_ARCH >= 6 ? 1 : 0);
unsigned count = vs->info.attribute_count;
if (vs->xfb)
count = MAX2(count, vs->xfb->info.attribute_count);
#if PAN_ARCH <= 5
/* Midgard needs vertexid/instanceid handled specially */
bool special_vbufs = vs->info.attribute_count >= PAN_VERTEX_ID;
bool special_vbufs = count >= PAN_VERTEX_ID;
if (special_vbufs)
nr_bufs += 2;
@@ -2018,8 +2023,7 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch,
pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
ATTRIBUTE_BUFFER);
struct panfrost_ptr T =
pan_pool_alloc_desc_array(&batch->pool.base,
vs->info.attribute_count,
pan_pool_alloc_desc_array(&batch->pool.base, count,
ATTRIBUTE);
struct mali_attribute_buffer_packed *bufs =
@@ -2242,50 +2246,6 @@ panfrost_emit_varyings(struct panfrost_batch *batch,
return ptr;
}
#if PAN_ARCH <= 5
/* Pack the attribute buffer descriptor `slot` so it describes the stream out
 * (transform feedback) buffer bound to `target`, and record the access on the
 * batch and on the resource's valid range.
 *
 * batch:  batch the backing resource is attached to. It is marked as written
 *         for PIPE_SHADER_VERTEX and read for PIPE_SHADER_FRAGMENT.
 * slot:   destination descriptor.
 * stride: value stored in the descriptor's stride field (presumably bytes,
 *         the visible caller passes so->stride[i] * 4 -- TODO confirm).
 * count:  multiplied by stride to obtain the size the draw is expected to
 *         write.
 * target: stream output target supplying the buffer and its buffer_size.
 */
static void
panfrost_emit_streamout(struct panfrost_batch *batch,
                        struct mali_attribute_buffer_packed *slot,
                        unsigned stride, unsigned count,
                        struct pipe_stream_output_target *target)
{
        unsigned max_size = target->buffer_size;
        unsigned expected_size = stride * count;

        /* Never describe more than the target exposes */
        unsigned size = MIN2(max_size, expected_size);

        /* Grab the BO and bind it to the batch */
        struct panfrost_resource *rsrc = pan_resource(target->buffer);
        struct panfrost_bo *bo = rsrc->image.data.bo;

        panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
        panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT);

        unsigned offset = panfrost_xfb_offset(stride, target);

        /* util_range_add() takes a [start, end) pair, not (start, length), so
         * the end must include the offset. The low-bit fixup added to the
         * descriptor size below is not part of the written range: it only
         * compensates for rounding the pointer down. */
        util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
                       offset, offset + size);

        pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
                /* The pointer is rounded down to a multiple of 64; the
                 * dropped low bits are added back to the size so the
                 * descriptor still reaches the same end address. */
                cfg.pointer = bo->ptr.gpu + (offset & ~63);
                cfg.stride = stride;
                cfg.size = size + (offset & 63);
        }
}
/* Stream out lookup helper used when packing varyings: return the stream
 * output record that captures the varying at location `loc`.
 *
 * Only the first info->num_outputs entries of info->output are examined and
 * the first entry whose register_index equals `loc` wins. Callers are
 * expected to ask only for captured varyings; a miss hits unreachable(). */
static struct pipe_stream_output *
pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
{
        struct pipe_stream_output *cur = info->output;
        struct pipe_stream_output *end = cur + info->num_outputs;

        while (cur != end) {
                if (cur->register_index == loc)
                        return cur;

                cur++;
        }

        unreachable("Varying not captured");
}
#endif
/* Given a varying, figure out which index it corresponds to */
static inline unsigned
@@ -2294,16 +2254,6 @@ pan_varying_index(unsigned present, enum pan_special_varying v)
return util_bitcount(present & BITFIELD_MASK(v));
}
/* Index of the first XFB buffer. By convention the XFB buffers are placed
 * after every other varying buffer, so the base is simply the number of bits
 * set in `present`. Kept as a named wrapper so call sites read semantically. */
static inline unsigned
pan_xfb_base(unsigned present)
{
        unsigned nr_present = util_bitcount(present);

        return nr_present;
}
/* Determines which varying buffers are required */
static inline unsigned
@@ -2459,10 +2409,6 @@ panfrost_emit_varying(const struct panfrost_device *dev,
enum pipe_format pipe_format,
unsigned present,
uint16_t point_sprite_mask,
struct pipe_stream_output_info *xfb,
uint64_t xfb_loc_mask,
unsigned max_xfb,
unsigned *xfb_offsets,
signed offset,
enum pan_special_varying pos_varying)
{
@@ -2474,21 +2420,8 @@ panfrost_emit_varying(const struct panfrost_device *dev,
gl_varying_slot loc = varying.location;
mali_pixel_format format = dev->formats[pipe_format].hw;
#if PAN_ARCH <= 5
struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?
pan_get_so(xfb, loc) : NULL;
#else
struct pipe_stream_output *o = NULL;
#endif
if (util_varying_is_point_coord(loc, point_sprite_mask)) {
pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
} else if (o && o->output_buffer < max_xfb) {
unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63;
pan_emit_vary(dev, out,
pan_xfb_base(present) + o->output_buffer,
format, (o->dst_offset * 4) + fixup_offset);
} else if (loc == VARYING_SLOT_POS) {
pan_emit_vary_special(dev, out, present, pos_varying);
} else if (loc == VARYING_SLOT_PSIZ) {
@@ -2511,12 +2444,10 @@ panfrost_emit_varying_descs(
struct panfrost_pool *pool,
struct panfrost_shader_state *producer,
struct panfrost_shader_state *consumer,
struct panfrost_streamout *xfb,
uint16_t point_coord_mask,
struct pan_linkage *out)
{
struct panfrost_device *dev = pool->base.dev;
struct pipe_stream_output_info *xfb_info = &producer->stream_output;
unsigned producer_count = producer->info.varyings.output_count;
unsigned consumer_count = consumer->info.varyings.input_count;
@@ -2550,16 +2481,6 @@ panfrost_emit_varying_descs(
out->stride = pan_assign_varyings(dev, &producer->info,
&consumer->info, offsets);
unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS] = {0};
for (unsigned i = 0; i < xfb->num_targets; ++i) {
if (!xfb->targets[i])
continue;
xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4,
xfb->targets[i]);
}
for (unsigned i = 0; i < producer_count; ++i) {
signed j = pan_find_vary(consumer->info.varyings.input,
consumer->info.varyings.input_count,
@@ -2571,9 +2492,7 @@ panfrost_emit_varying_descs(
panfrost_emit_varying(dev, descs + i,
producer->info.varyings.output[i], format,
out->present, 0, &producer->stream_output,
producer->so_mask, xfb->num_targets,
xfb_offsets, offsets[i], PAN_VARY_POSITION);
out->present, 0, offsets[i], PAN_VARY_POSITION);
}
for (unsigned i = 0; i < consumer_count; ++i) {
@@ -2587,9 +2506,7 @@ panfrost_emit_varying_descs(
consumer->info.varyings.input[i],
consumer->info.varyings.input[i].format,
out->present, point_coord_mask,
&producer->stream_output, producer->so_mask,
xfb->num_targets, xfb_offsets, offset,
PAN_VARY_FRAGCOORD);
offset, PAN_VARY_FRAGCOORD);
}
}
@@ -2640,7 +2557,6 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
/* In good conditions, we only need to link varyings once */
bool prelink =
(point_coord_mask == 0) &&
(PAN_ARCH >= 6 || ctx->streamout.num_targets == 0) &&
!vs->info.separable &&
!fs->info.separable;
@@ -2653,43 +2569,24 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
struct panfrost_pool *pool =
prelink ? &ctx->descs : &batch->pool;
panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);
panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
}
unsigned present = linkage->present, stride = linkage->stride;
unsigned xfb_base = pan_xfb_base(present);
unsigned count = util_bitcount(present);
struct panfrost_ptr T =
pan_pool_alloc_desc_array(&batch->pool.base,
xfb_base +
ctx->streamout.num_targets + 1,
count + 1,
ATTRIBUTE_BUFFER);
struct mali_attribute_buffer_packed *varyings =
(struct mali_attribute_buffer_packed *) T.cpu;
if (buffer_count)
*buffer_count = xfb_base + ctx->streamout.num_targets;
*buffer_count = count;
#if PAN_ARCH >= 6
/* Suppress prefetch on Bifrost */
memset(varyings + xfb_base + ctx->streamout.num_targets, 0, sizeof(*varyings));
#else
/* Emit the stream out buffers. We need enough room for all the
* vertices we emit across all instances */
struct pipe_stream_output_info *so = &vs->stream_output;
unsigned out_count = ctx->instance_count *
u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
if (!ctx->streamout.targets[i])
continue;
panfrost_emit_streamout(batch, &varyings[xfb_base + i],
so->stride[i] * 4,
out_count,
ctx->streamout.targets[i]);
}
memset(varyings + count, 0, sizeof(*varyings));
#endif
if (stride) {
@@ -3656,7 +3553,7 @@ panfrost_launch_xfb(struct panfrost_batch *batch,
panfrost_pack_work_groups_compute(&invocation,
1, count, info->instance_count,
1, 1, 1, false, false);
1, 1, 1, PAN_ARCH <= 5, false);
batch->uniform_buffers[PIPE_SHADER_VERTEX] =
panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
@@ -3665,9 +3562,12 @@ panfrost_launch_xfb(struct panfrost_batch *batch,
panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0,
attribs, attrib_bufs, t.cpu);
#endif
panfrost_add_job(&batch->pool.base, &batch->scoreboard,
MALI_JOB_TYPE_COMPUTE, true, false,
0, 0, &t, false);
enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
job_type = MALI_JOB_TYPE_VERTEX;
#endif
panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type,
true, false, 0, 0, &t, false);
ctx->shader[PIPE_SHADER_VERTEX] = saved_vs;
batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
@@ -3801,14 +3701,12 @@ panfrost_direct_draw(struct panfrost_batch *batch,
panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
panfrost_clean_state_3d(ctx);
#if PAN_ARCH >= 6
if (vs->xfb) {
#if PAN_ARCH >= 9
mali_ptr attribs = 0, attrib_bufs = 0;
#endif
panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count);
}
#endif
/* Increment transform feedback offsets */
panfrost_update_streamout_offsets(ctx);

View File

@@ -4922,7 +4922,7 @@ bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
nir_var_shader_in | nir_var_shader_out);
NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
NIR_PASS_V(nir, bifrost_nir_lower_xfb);
NIR_PASS_V(nir, pan_lower_xfb);
}
bi_optimize_nir(nir, gpu_id, is_blend);

View File

@@ -29,7 +29,6 @@ libpanfrost_bifrost_files = files(
'bi_liveness.c',
'bi_lower_divergent_indirects.c',
'bi_lower_swizzle.c',
'bi_lower_xfb.c',
'bi_print.c',
'bi_opt_constant_fold.c',
'bi_opt_copy_prop.c',

View File

@@ -363,6 +363,13 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend, bool is_blit)
NIR_PASS(progress, nir, pan_lower_helper_invocation);
NIR_PASS(progress, nir, pan_lower_sample_pos);
if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) {
NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
nir_var_shader_in | nir_var_shader_out);
NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
NIR_PASS_V(nir, pan_lower_xfb);
}
NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);
NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);
@@ -2089,9 +2096,14 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);
break;
case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_ssbo_address:
case nir_intrinsic_load_xfb_address:
emit_sysval_read(ctx, &instr->instr, 2, 0);
break;
case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_work_dim:
case nir_intrinsic_load_num_vertices:
emit_sysval_read(ctx, &instr->instr, 1, 0);
break;
@@ -2100,6 +2112,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_base_instance:
case nir_intrinsic_get_ssbo_size:
emit_sysval_read(ctx, &instr->instr, 1, 8);
break;
@@ -2107,10 +2120,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
emit_sysval_read(ctx, &instr->instr, 2, 0);
break;
case nir_intrinsic_get_ssbo_size:
emit_sysval_read(ctx, &instr->instr, 1, 8);
break;
case nir_intrinsic_load_viewport_scale:
case nir_intrinsic_load_viewport_offset:
case nir_intrinsic_load_num_workgroups:

View File

@@ -30,6 +30,7 @@ libpanfrost_util_files = files(
'pan_lower_helper_invocation.c',
'pan_lower_sample_position.c',
'pan_lower_writeout.c',
'pan_lower_xfb.c',
'pan_lower_64bit_intrin.c',
'pan_sysval.c',
)

View File

@@ -502,6 +502,7 @@ bool pan_nir_lower_64bit_intrin(nir_shader *shader);
bool pan_lower_helper_invocation(nir_shader *shader);
bool pan_lower_sample_pos(nir_shader *shader);
bool pan_lower_xfb(nir_shader *nir);
/*
* Helper returning the subgroup size. Generally, this is equal to the number of

View File

@@ -21,7 +21,9 @@
* SOFTWARE.
*/
#include "bifrost_nir.h"
#include "pan_ir.h"
#include "compiler/nir/nir_builder.h"
static void
lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr,
@@ -42,6 +44,9 @@ lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr,
nir_load_num_vertices(b)),
nir_load_vertex_id_zero_base(b));
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
nir_ssa_def *addr =
nir_iadd(b, buf, nir_u2u64(b,
@@ -87,7 +92,7 @@ lower_xfb(nir_builder *b, nir_instr *instr, UNUSED void *data)
}
bool
bifrost_nir_lower_xfb(nir_shader *nir)
pan_lower_xfb(nir_shader *nir)
{
return nir_shader_instructions_pass(nir, lower_xfb,
nir_metadata_block_index |