panfrost: Use compute-based XFB on Midgard

Now we're back to a single XFB implementation for all gens.

Fixes:
KHR-GLES31.core.draw_indirect.advanced-twoPasses-transformFeedback-arrays
KHR-GLES31.core.draw_indirect.advanced-twoPasses-transformFeedback-elements

Cc: mesa-stable
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19238>
(cherry picked from commit 0955fe8fe2)
2022-10-21 12:28:35 -04:00
parent ca852a09af
commit 0add4f1f1a
9 changed files with 47 additions and 134 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2416,7 +2416,7 @@
        "description": "panfrost: Use compute-based XFB on Midgard",
        "nominated": true,
        "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": null
    },
--- a/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/src/gallium/drivers/panfrost/pan_assemble.c
@@ -47,7 +47,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,

        nir_shader *s = nir_shader_clone(NULL, ir);

-        if (dev->arch >= 6 && s->xfb_info && !s->info.internal) {
+        if (s->xfb_info && !s->info.internal) {
                /* Create compute shader doing transform feedback */
                nir_shader *xfb = nir_shader_clone(NULL, s);
                xfb->info.name = ralloc_asprintf(xfb, "%s@xfb", xfb->info.name);
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -2001,9 +2001,14 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch,
        unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
                           (PAN_ARCH >= 6 ? 1 : 0);

+        unsigned count = vs->info.attribute_count;
+
+        if (vs->xfb)
+                count = MAX2(count, vs->xfb->info.attribute_count);
+
 #if PAN_ARCH <= 5
        /* Midgard needs vertexid/instanceid handled specially */
-        bool special_vbufs = vs->info.attribute_count >= PAN_VERTEX_ID;
+        bool special_vbufs = count >= PAN_VERTEX_ID;

        if (special_vbufs)
                nr_bufs += 2;
@@ -2018,8 +2023,7 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch,
                pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
                                          ATTRIBUTE_BUFFER);
        struct panfrost_ptr T =
-                pan_pool_alloc_desc_array(&batch->pool.base,
-                                          vs->info.attribute_count,
+                pan_pool_alloc_desc_array(&batch->pool.base, count,
                                          ATTRIBUTE);

        struct mali_attribute_buffer_packed *bufs =
@@ -2242,50 +2246,6 @@ panfrost_emit_varyings(struct panfrost_batch *batch,
        return ptr;
 }

-#if PAN_ARCH <= 5
-static void
-panfrost_emit_streamout(struct panfrost_batch *batch,
-                        struct mali_attribute_buffer_packed *slot,
-                        unsigned stride, unsigned count,
-                        struct pipe_stream_output_target *target)
-{
-        unsigned max_size = target->buffer_size;
-        unsigned expected_size = stride * count;
-
-        /* Grab the BO and bind it to the batch */
-        struct panfrost_resource *rsrc = pan_resource(target->buffer);
-        struct panfrost_bo *bo = rsrc->image.data.bo;
-
-        panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
-        panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT);
-
-        unsigned offset = panfrost_xfb_offset(stride, target);
-
-        pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
-                cfg.pointer = bo->ptr.gpu + (offset & ~63);
-                cfg.stride = stride;
-                cfg.size = MIN2(max_size, expected_size) + (offset & 63);
-
-                util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
-                                offset, cfg.size);
-        }
-}
-
-/* Helpers for manipulating stream out information so we can pack varyings
- * accordingly. Compute the src_offset for a given captured varying */
-
-static struct pipe_stream_output *
-pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
-{
-        for (unsigned i = 0; i < info->num_outputs; ++i) {
-                if (info->output[i].register_index == loc)
-                        return &info->output[i];
-        }
-
-        unreachable("Varying not captured");
-}
-#endif
-
 /* Given a varying, figure out which index it corresponds to */

 static inline unsigned
@@ -2294,16 +2254,6 @@ pan_varying_index(unsigned present, enum pan_special_varying v)
        return util_bitcount(present & BITFIELD_MASK(v));
 }

-/* Get the base offset for XFB buffers, which by convention come after
- * everything else. Wrapper function for semantic reasons; by construction this
- * is just popcount. */
-
-static inline unsigned
-pan_xfb_base(unsigned present)
-{
-        return util_bitcount(present);
-}
-
 /* Determines which varying buffers are required */

 static inline unsigned
@@ -2459,10 +2409,6 @@ panfrost_emit_varying(const struct panfrost_device *dev,
                      enum pipe_format pipe_format,
                      unsigned present,
                      uint16_t point_sprite_mask,
-                      struct pipe_stream_output_info *xfb,
-                      uint64_t xfb_loc_mask,
-                      unsigned max_xfb,
-                      unsigned *xfb_offsets,
                      signed offset,
                      enum pan_special_varying pos_varying)
 {
@@ -2474,21 +2420,8 @@ panfrost_emit_varying(const struct panfrost_device *dev,
        gl_varying_slot loc = varying.location;
        mali_pixel_format format = dev->formats[pipe_format].hw;

-#if PAN_ARCH <= 5
-        struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?
-                pan_get_so(xfb, loc) : NULL;
-#else
-        struct pipe_stream_output *o = NULL;
-#endif
-
        if (util_varying_is_point_coord(loc, point_sprite_mask)) {
                pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
-        } else if (o && o->output_buffer < max_xfb) {
-                unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63;
-
-                pan_emit_vary(dev, out,
-                                pan_xfb_base(present) + o->output_buffer,
-                                format, (o->dst_offset * 4) + fixup_offset);
        } else if (loc == VARYING_SLOT_POS) {
                pan_emit_vary_special(dev, out, present, pos_varying);
        } else if (loc == VARYING_SLOT_PSIZ) {
@@ -2511,12 +2444,10 @@ panfrost_emit_varying_descs(
                struct panfrost_pool *pool,
                struct panfrost_shader_state *producer,
                struct panfrost_shader_state *consumer,
-                struct panfrost_streamout *xfb,
                uint16_t point_coord_mask,
                struct pan_linkage *out)
 {
        struct panfrost_device *dev = pool->base.dev;
-        struct pipe_stream_output_info *xfb_info = &producer->stream_output;
        unsigned producer_count = producer->info.varyings.output_count;
        unsigned consumer_count = consumer->info.varyings.input_count;

@@ -2550,16 +2481,6 @@ panfrost_emit_varying_descs(
        out->stride = pan_assign_varyings(dev, &producer->info,
                        &consumer->info, offsets);

-        unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS] = {0};
-
-        for (unsigned i = 0; i < xfb->num_targets; ++i) {
-                if (!xfb->targets[i])
-                        continue;
-
-                xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4,
-                                                     xfb->targets[i]);
-        }
-
        for (unsigned i = 0; i < producer_count; ++i) {
                signed j = pan_find_vary(consumer->info.varyings.input,
                                consumer->info.varyings.input_count,
@@ -2571,9 +2492,7 @@ panfrost_emit_varying_descs(

                panfrost_emit_varying(dev, descs + i,
                                producer->info.varyings.output[i], format,
-                                out->present, 0, &producer->stream_output,
-                                producer->so_mask, xfb->num_targets,
-                                xfb_offsets, offsets[i], PAN_VARY_POSITION);
+                                out->present, 0, offsets[i], PAN_VARY_POSITION);
        }

        for (unsigned i = 0; i < consumer_count; ++i) {
@@ -2587,9 +2506,7 @@ panfrost_emit_varying_descs(
                                consumer->info.varyings.input[i],
                                consumer->info.varyings.input[i].format,
                                out->present, point_coord_mask,
-                                &producer->stream_output, producer->so_mask,
-                                xfb->num_targets, xfb_offsets, offset,
-                                PAN_VARY_FRAGCOORD);
+                                offset, PAN_VARY_FRAGCOORD);
        }
 }

@@ -2640,7 +2557,6 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
        /* In good conditions, we only need to link varyings once */
        bool prelink =
                (point_coord_mask == 0) &&
-                (PAN_ARCH >= 6 || ctx->streamout.num_targets == 0) &&
                !vs->info.separable &&
                !fs->info.separable;

@@ -2653,43 +2569,24 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
                struct panfrost_pool *pool =
                        prelink ? &ctx->descs : &batch->pool;

-                panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);
+                panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
        }

        unsigned present = linkage->present, stride = linkage->stride;
-        unsigned xfb_base = pan_xfb_base(present);
+        unsigned count = util_bitcount(present);
        struct panfrost_ptr T =
                pan_pool_alloc_desc_array(&batch->pool.base,
-                                          xfb_base +
-                                          ctx->streamout.num_targets + 1,
+                                          count + 1,
                                          ATTRIBUTE_BUFFER);
        struct mali_attribute_buffer_packed *varyings =
                (struct mali_attribute_buffer_packed *) T.cpu;

        if (buffer_count)
-                *buffer_count = xfb_base + ctx->streamout.num_targets;
+                *buffer_count = count;

 #if PAN_ARCH >= 6
        /* Suppress prefetch on Bifrost */
-        memset(varyings + xfb_base + ctx->streamout.num_targets, 0, sizeof(*varyings));
-#else
-        /* Emit the stream out buffers. We need enough room for all the
-         * vertices we emit across all instances */
-
-        struct pipe_stream_output_info *so = &vs->stream_output;
-
-        unsigned out_count = ctx->instance_count *
-                u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
-
-        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
-                if (!ctx->streamout.targets[i])
-                        continue;
-
-                panfrost_emit_streamout(batch, &varyings[xfb_base + i],
-                                        so->stride[i] * 4,
-                                        out_count,
-                                        ctx->streamout.targets[i]);
-        }
+        memset(varyings + count, 0, sizeof(*varyings));
 #endif

        if (stride) {
@@ -3656,7 +3553,7 @@ panfrost_launch_xfb(struct panfrost_batch *batch,

        panfrost_pack_work_groups_compute(&invocation,
                        1, count, info->instance_count,
-                        1, 1, 1, false, false);
+                        1, 1, 1, PAN_ARCH <= 5, false);

        batch->uniform_buffers[PIPE_SHADER_VERTEX] =
                panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
@@ -3665,9 +3562,12 @@ panfrost_launch_xfb(struct panfrost_batch *batch,
        panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0,
                                  attribs, attrib_bufs, t.cpu);
 #endif
-        panfrost_add_job(&batch->pool.base, &batch->scoreboard,
-                        MALI_JOB_TYPE_COMPUTE, true, false,
-                        0, 0, &t, false);
+        enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
+#if PAN_ARCH <= 5
+        job_type = MALI_JOB_TYPE_VERTEX;
+#endif
+        panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type,
+                         true, false, 0, 0, &t, false);

        ctx->shader[PIPE_SHADER_VERTEX] = saved_vs;
        batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
@@ -3801,14 +3701,12 @@ panfrost_direct_draw(struct panfrost_batch *batch,
        panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
        panfrost_clean_state_3d(ctx);

-#if PAN_ARCH >= 6
        if (vs->xfb) {
 #if PAN_ARCH >= 9
                mali_ptr attribs = 0, attrib_bufs = 0;
 #endif
                panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count);
        }
-#endif

        /* Increment transform feedback offsets */
        panfrost_update_streamout_offsets(ctx);
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -4922,7 +4922,7 @@ bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
                NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
                           nir_var_shader_in | nir_var_shader_out);
                NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
-                NIR_PASS_V(nir, bifrost_nir_lower_xfb);
+                NIR_PASS_V(nir, pan_lower_xfb);
        }

        bi_optimize_nir(nir, gpu_id, is_blend);
--- a/src/panfrost/bifrost/meson.build
+++ b/src/panfrost/bifrost/meson.build
@@ -29,7 +29,6 @@ libpanfrost_bifrost_files = files(
  'bi_liveness.c',
  'bi_lower_divergent_indirects.c',
  'bi_lower_swizzle.c',
-  'bi_lower_xfb.c',
  'bi_print.c',
  'bi_opt_constant_fold.c',
  'bi_opt_copy_prop.c',
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -363,6 +363,13 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend, bool is_blit)
        NIR_PASS(progress, nir, pan_lower_helper_invocation);
        NIR_PASS(progress, nir, pan_lower_sample_pos);

+        if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) {
+                NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
+                           nir_var_shader_in | nir_var_shader_out);
+                NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
+                NIR_PASS_V(nir, pan_lower_xfb);
+        }
+
        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);
        NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);

@@ -2089,9 +2096,14 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);
                break;

-        case nir_intrinsic_load_first_vertex:
        case nir_intrinsic_load_ssbo_address:
+        case nir_intrinsic_load_xfb_address:
+                emit_sysval_read(ctx, &instr->instr, 2, 0);
+                break;
+
+        case nir_intrinsic_load_first_vertex:
        case nir_intrinsic_load_work_dim:
+        case nir_intrinsic_load_num_vertices:
                emit_sysval_read(ctx, &instr->instr, 1, 0);
                break;

@@ -2100,6 +2112,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                break;

        case nir_intrinsic_load_base_instance:
+        case nir_intrinsic_get_ssbo_size:
                emit_sysval_read(ctx, &instr->instr, 1, 8);
                break;

@@ -2107,10 +2120,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                emit_sysval_read(ctx, &instr->instr, 2, 0);
                break;

-        case nir_intrinsic_get_ssbo_size:
-                emit_sysval_read(ctx, &instr->instr, 1, 8);
-                break;
-
        case nir_intrinsic_load_viewport_scale:
        case nir_intrinsic_load_viewport_offset:
        case nir_intrinsic_load_num_workgroups:
--- a/src/panfrost/util/meson.build
+++ b/src/panfrost/util/meson.build
@@ -30,6 +30,7 @@ libpanfrost_util_files = files(
  'pan_lower_helper_invocation.c',
  'pan_lower_sample_position.c',
  'pan_lower_writeout.c',
+  'pan_lower_xfb.c',
  'pan_lower_64bit_intrin.c',
  'pan_sysval.c',
 )
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -502,6 +502,7 @@ bool pan_nir_lower_64bit_intrin(nir_shader *shader);

 bool pan_lower_helper_invocation(nir_shader *shader);
 bool pan_lower_sample_pos(nir_shader *shader);
+bool pan_lower_xfb(nir_shader *nir);

 /*
 * Helper returning the subgroup size. Generally, this is equal to the number of
--- a/src/panfrost/bifrost/bi_lower_xfb.c
+++ b/src/panfrost/bifrost/bi_lower_xfb.c
@@ -21,7 +21,9 @@
 * SOFTWARE.
 */

-#include "bifrost_nir.h"
+
+#include "pan_ir.h"
+#include "compiler/nir/nir_builder.h"

 static void
 lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr,
@@ -42,6 +44,9 @@ lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr,
                            nir_load_num_vertices(b)),
                nir_load_vertex_id_zero_base(b));

+        BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+        BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
        nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
        nir_ssa_def *addr =
                nir_iadd(b, buf, nir_u2u64(b,
@@ -87,7 +92,7 @@ lower_xfb(nir_builder *b, nir_instr *instr, UNUSED void *data)
 }

 bool
-bifrost_nir_lower_xfb(nir_shader *nir)
+pan_lower_xfb(nir_shader *nir)
 {
        return nir_shader_instructions_pass(nir, lower_xfb,
                                            nir_metadata_block_index |