diff --git a/docs/features.txt b/docs/features.txt index f8e88faf89d..34709295c87 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -54,7 +54,7 @@ GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv GL_EXT_draw_buffers2 (Per-buffer blend and masks) DONE (v3d, asahi) GL_EXT_texture_compression_rgtc DONE (all drivers that support GL_EXT_texture_snorm) GL_ARB_texture_rg DONE (v3d, lima, asahi) - GL_EXT_transform_feedback (Transform feedback) DONE (v3d) + GL_EXT_transform_feedback (Transform feedback) DONE (v3d, asahi) GL_ARB_vertex_array_object (Vertex array objects) DONE (v3d, vc4, lima, asahi) GL_EXT_framebuffer_sRGB (sRGB framebuffer format) DONE (v3d, vc4, lima, asahi) glClearBuffer commands DONE @@ -136,8 +136,8 @@ GL 4.0, GLSL 4.00 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi GL_ARB_texture_cube_map_array DONE (freedreno/a4xx+, i965/gen6+, nv50, softpipe, v3d) GL_ARB_texture_gather DONE (freedreno, i965/gen6+, nv50, softpipe, v3d, panfrost, asahi) GL_ARB_texture_query_lod DONE (freedreno, i965, nv50, softpipe, v3d, panfrost) - GL_ARB_transform_feedback2 DONE (freedreno/a3xx+, i965/gen6+, nv50, softpipe, v3d, panfrost) - GL_ARB_transform_feedback3 DONE (freedreno/a3xx+, i965/gen7+, softpipe, ) + GL_ARB_transform_feedback2 DONE (freedreno/a3xx+, i965/gen6+, nv50, softpipe, v3d, panfrost, asahi) + GL_ARB_transform_feedback3 DONE (freedreno/a3xx+, i965/gen7+, softpipe, asahi) GL 4.1, GLSL 4.10 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi, llvmpipe, virgl, zink, d3d12 @@ -156,7 +156,7 @@ GL 4.2, GLSL 4.20 -- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi, GL_ARB_compressed_texture_pixel_storage DONE (all drivers) GL_ARB_shader_atomic_counters DONE (freedreno/a5xx+, i965, softpipe, v3d, panfrost) GL_ARB_texture_storage DONE (all drivers) - GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, softpipe, v3d) + GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, softpipe, v3d, asahi) GL_ARB_base_instance DONE (freedreno, i965, nv50, softpipe, v3d) GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965, softpipe, panfrost) GL_ARB_conservative_depth DONE (all drivers that support GLSL 1.30) diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index 873307c4164..5e104ba93ca 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -27,7 +27,8 @@ agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter, util_blitter_save_blend(blitter, ctx->blend); util_blitter_save_depth_stencil_alpha(blitter, ctx->zs); util_blitter_save_stencil_ref(blitter, &ctx->stencil_ref); - util_blitter_save_so_targets(blitter, 0, NULL); + util_blitter_save_so_targets(blitter, ctx->streamout.num_targets, + ctx->streamout.targets); util_blitter_save_sample_mask(blitter, ctx->sample_mask, 0); util_blitter_save_framebuffer(blitter, &ctx->framebuffer); diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index b57d9e10503..3ffe61ac36d 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -111,6 +111,18 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr) &u->ssbo_size, intr->src[0].ssa); case nir_intrinsic_load_num_workgroups: return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0); + case nir_intrinsic_load_xfb_address: + return load_sysval_root(b, 1, 64, + 
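+                              /* 64-bit GPU address of this XFB buffer,
+                               * pushed by agx_upload_uniforms via
+                               * agx_batch_get_so_address()
+                               */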
&u->vs.xfb.base[nir_intrinsic_base(intr)]); + case nir_intrinsic_load_xfb_size: + return load_sysval_root(b, 1, 32, + &u->vs.xfb.size[nir_intrinsic_base(intr)]); + case nir_intrinsic_load_xfb_index_buffer: + return load_sysval_root(b, 1, 64, &u->vs.xfb.index_buffer); + case nir_intrinsic_load_base_vertex: + return load_sysval_root(b, 1, 32, &u->vs.xfb.base_vertex); + case nir_intrinsic_load_num_vertices: + return load_sysval_root(b, 1, 32, &u->vs.xfb.num_vertices); default: return NULL; } diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index 351bd8f7b06..6ce37d9396b 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -1397,6 +1397,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags) agx_init_state_functions(pctx); agx_init_query_functions(pctx); + agx_init_streamout_functions(pctx); agx_meta_init(&ctx->meta, agx_device(screen)); @@ -1556,15 +1557,15 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 0; case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - return is_deqp ? PIPE_MAX_SO_BUFFERS : 0; + return PIPE_MAX_SO_BUFFERS; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return is_deqp ? PIPE_MAX_SO_OUTPUTS : 0; + return PIPE_MAX_SO_OUTPUTS; case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - return is_deqp ? 1 : 0; + return 1; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: return 2048; @@ -1587,6 +1588,13 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_ATTRIB_ELEMENT_ALIGNED_ONLY: return 1; + /* We run nir_lower_point_size so we need the GLSL linker to copy + * the original gl_PointSize when captured by transform feedback. We could + * also copy it ourselves but it's easier to set the CAP. + */ + case PIPE_CAP_PSIZ_CLAMPED: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: return 16384; case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c index 0fe2e3c8ebb..fcea7fb41e7 100644 --- a/src/gallium/drivers/asahi/agx_query.c +++ b/src/gallium/drivers/asahi/agx_query.c @@ -1,8 +1,10 @@ /* * Copyright 2022 Alyssa Rosenzweig + * Copyright 2019-2020 Collabora, Ltd. * SPDX-License-Identifier: MIT */ +#include "util/u_prim.h" #include "agx_state.h" static struct pipe_query * @@ -39,12 +41,13 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) struct agx_context *ctx = agx_context(pctx); struct agx_query *query = (struct agx_query *)pquery; + ctx->dirty |= AGX_DIRTY_QUERY; + switch (query->type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: ctx->occlusion_query = query; - ctx->dirty |= AGX_DIRTY_QUERY; /* begin_query zeroes, flush so we can do that write. If anything (i.e. 
* other than piglit) actually hits this, we could shadow the query to @@ -60,6 +63,16 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) query->value = 0; return true; + case PIPE_QUERY_PRIMITIVES_GENERATED: + ctx->prims_generated = query; + query->value = 0; + return true; + + case PIPE_QUERY_PRIMITIVES_EMITTED: + ctx->tf_prims_generated = query; + query->value = 0; + return true; + default: return false; } @@ -71,14 +84,20 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery) struct agx_context *ctx = agx_context(pctx); struct agx_query *query = (struct agx_query *)pquery; + ctx->dirty |= AGX_DIRTY_QUERY; + switch (query->type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: ctx->occlusion_query = NULL; - ctx->dirty |= AGX_DIRTY_QUERY; return true; - + case PIPE_QUERY_PRIMITIVES_GENERATED: + ctx->prims_generated = NULL; + return true; + case PIPE_QUERY_PRIMITIVES_EMITTED: + ctx->tf_prims_generated = NULL; + return true; default: return false; } @@ -117,6 +136,11 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, return true; + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + vresult->u64 = query->value; + return true; + default: unreachable("Other queries not yet supported"); } diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index f71fd9065ad..870f77b13cb 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -22,6 +22,7 @@ #include "gallium/auxiliary/util/u_draw.h" #include "gallium/auxiliary/util/u_framebuffer.h" #include "gallium/auxiliary/util/u_helpers.h" +#include "gallium/auxiliary/util/u_prim_restart.h" #include "gallium/auxiliary/util/u_viewport.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -37,59 +38,6 @@ #include "util/u_transfer.h" #include "agx_disk_cache.h" -static struct pipe_stream_output_target * -agx_create_stream_output_target(struct pipe_context *pctx, - struct pipe_resource *prsc, - unsigned buffer_offset, unsigned buffer_size) -{ - struct pipe_stream_output_target *target; - - target = &rzalloc(pctx, struct agx_streamout_target)->base; - - if (!target) - return NULL; - - pipe_reference_init(&target->reference, 1); - pipe_resource_reference(&target->buffer, prsc); - - target->context = pctx; - target->buffer_offset = buffer_offset; - target->buffer_size = buffer_size; - - return target; -} - -static void -agx_stream_output_target_destroy(struct pipe_context *pctx, - struct pipe_stream_output_target *target) -{ - pipe_resource_reference(&target->buffer, NULL); - ralloc_free(target); -} - -static void -agx_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets, - struct pipe_stream_output_target **targets, - const unsigned *offsets) -{ - struct agx_context *ctx = agx_context(pctx); - struct agx_streamout *so = &ctx->streamout; - - assert(num_targets <= ARRAY_SIZE(so->targets)); - - for (unsigned i = 0; i < num_targets; i++) { - if (offsets[i] != -1) - agx_so_target(targets[i])->offset = offsets[i]; - - pipe_so_target_reference(&so->targets[i], targets[i]); - } - - for (unsigned i = 0; i < so->num_targets; i++) - pipe_so_target_reference(&so->targets[i], NULL); - - so->num_targets = num_targets; -} - static void agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start_slot, unsigned count, @@ -1403,6 +1351,9 @@ agx_compile_variant(struct 
agx_device *dev, struct agx_uncompiled_shader *so, struct asahi_vs_shader_key *key = &key_->vs; NIR_PASS_V(nir, agx_nir_lower_vbo, &key->vbuf); + + if (key->xfb.active && nir->xfb_info != NULL) + NIR_PASS_V(nir, agx_nir_lower_xfb, &key->xfb); } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { struct asahi_fs_shader_key *key = &key_->fs; @@ -1672,12 +1623,14 @@ agx_update_vs(struct agx_context *ctx) /* Only proceed if the shader or anything the key depends on changes * * vb_mask, attributes, vertex_buffers: VERTEX + * streamout.active: XFB */ - if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX))) + if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB))) return false; struct asahi_vs_shader_key key = { .vbuf.count = util_last_bit(ctx->vb_mask), + .xfb = ctx->streamout.key, }; memcpy(key.vbuf.attributes, ctx->attributes, @@ -2563,7 +2516,46 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } + if (indirect && indirect->count_from_stream_output) { + agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect); + return; + } + + const nir_shader *nir_vs = ctx->stage[PIPE_SHADER_VERTEX].shader->nir; + bool uses_xfb = nir_vs->xfb_info && ctx->streamout.num_targets; + bool uses_prims_generated = ctx->active_queries && ctx->prims_generated; + + if (indirect && (uses_prims_generated || uses_xfb)) { + perf_debug_ctx(ctx, "Emulating indirect draw due to XFB"); + util_draw_indirect(pctx, info, indirect); + return; + } + + if (uses_xfb && info->primitive_restart) { + perf_debug_ctx(ctx, "Emulating primitive restart due to XFB"); + util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect, + draws); + return; + } + + if (!ctx->streamout.key.active && uses_prims_generated) { + agx_primitives_update_direct(ctx, info, draws); + } + struct agx_batch *batch = agx_get_batch(ctx); + unsigned idx_size = info->index_size; + uint64_t ib = 0; + size_t ib_extent = 0; + + if (idx_size) { + if (indirect != NULL) + ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent); + else + ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent); + } + + if (uses_xfb) + agx_launch_so(pctx, info, draws, ib); #ifndef NDEBUG if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY)) @@ -2573,8 +2565,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, if (agx_scissor_culls_everything(ctx)) return; - /* We don't support side effects in vertex stages, so this is trivial */ - if (ctx->rast->base.rasterizer_discard) + /* We don't support side effects in vertex stages (only used internally for + * transform feedback lowering), so this is trivial. 
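+    * Streamout is the exception: its "vertex shader" is the internal
+    * transform feedback kernel, which must still run when rasterizer
+    * discard is set.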
+ */ + if (ctx->rast->base.rasterizer_discard && !ctx->streamout.key.active) return; /* Dirty track the reduced prim: lines vs points vs triangles */ @@ -2631,17 +2625,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, reduced_prim == PIPE_PRIM_POINTS); enum agx_primitive prim = agx_primitive_for_pipe(info->mode); - unsigned idx_size = info->index_size; - uint64_t ib = 0; - size_t ib_extent = 0; - - if (idx_size) { - if (indirect != NULL) - ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent); - else - ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent); - } - if (idx_size) { /* Index sizes are encoded logarithmically */ STATIC_ASSERT(__builtin_ctz(1) == AGX_INDEX_SIZE_U8); @@ -2729,6 +2712,21 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, out += AGX_INDEX_LIST_BUFFER_SIZE_LENGTH; } + /* Insert a memory barrier after transform feedback so the result may be + * consumed by a subsequent vertex shader. + */ + if (ctx->streamout.key.active) { + agx_pack(out, VDM_BARRIER, cfg) { + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_8 = true; + cfg.unk_11 = true; + cfg.unk_20 = true; + } + + out += AGX_VDM_BARRIER_LENGTH; + } + batch->encoder_current = out; assert((batch->encoder_current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->encoder_end && @@ -2889,8 +2887,5 @@ agx_init_state_functions(struct pipe_context *ctx) ctx->surface_destroy = agx_surface_destroy; ctx->draw_vbo = agx_draw_vbo; ctx->launch_grid = agx_launch_grid; - ctx->create_stream_output_target = agx_create_stream_output_target; - ctx->stream_output_target_destroy = agx_stream_output_target_destroy; - ctx->set_stream_output_targets = agx_set_stream_output_targets; ctx->texture_barrier = agx_texture_barrier; } diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index e44d3010946..ab43824cd62 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -39,17 +39,45 @@ struct agx_streamout_target { uint32_t offset; }; -struct agx_streamout { - struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; - unsigned num_targets; -}; - static inline struct agx_streamout_target * agx_so_target(struct pipe_stream_output_target *target) { return (struct agx_streamout_target *)target; } +struct agx_xfb_key { + /* If true, compiles a "transform feedback" program instead of a vertex + * shader. This is a kernel that runs on the VDM and writes out the transform + * feedback buffers, with no rasterization. + */ + bool active; + + /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */ + uint8_t index_size; + + /* The primitive mode for unrolling the vertex ID */ + enum pipe_prim_type mode; + + /* Use first vertex as the provoking vertex for flat shading */ + bool flatshade_first; +}; + +struct agx_xfb_params { + uint64_t base[PIPE_MAX_SO_BUFFERS]; + uint32_t size[PIPE_MAX_SO_BUFFERS]; + uint64_t index_buffer; + uint32_t base_vertex; + uint32_t num_vertices; +}; + +struct agx_streamout { + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned num_targets; + + struct agx_xfb_key key; + struct agx_xfb_params params; +}; + /* Shaders can access fixed-function state through system values. * It is convenient to stash all of this information into a single "root" * descriptor, then push individual parts as needed. 
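The agx_streamout state above is driven entirely through Gallium's standard
stream-output hooks, which agx_init_streamout_functions wires up in
agx_streamout.c below. A minimal sketch of how a frontend exercises those
hooks, assuming a pipe_context *pipe and a pipe_resource *xfb_buf that
already exist:

    struct pipe_stream_output_target *tgt =
       pipe->create_stream_output_target(pipe, xfb_buf, 0, xfb_buf->width0);

    /* Any offset other than -1 sets the internal write offset; -1 appends */
    unsigned offset = 0;
    pipe->set_stream_output_targets(pipe, 1, &tgt, &offset);

    /* ...draws issued here are captured... */

    pipe->set_stream_output_targets(pipe, 0, NULL, NULL);
    pipe->stream_output_target_destroy(pipe, tgt);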
@@ -87,6 +115,9 @@ struct PACKED agx_draw_uniforms { struct { /* Vertex buffer object bases, if present */ uint64_t vbo_base[PIPE_MAX_ATTRIBS]; + + /* Transform feedback info for a transform feedback shader */ + struct agx_xfb_params xfb; } vs; struct { @@ -239,6 +270,7 @@ struct agx_blend { struct asahi_vs_shader_key { struct agx_vbufs vbuf; + struct agx_xfb_key xfb; }; struct asahi_fs_shader_key { @@ -277,6 +309,7 @@ enum agx_dirty { AGX_DIRTY_BLEND = BITFIELD_BIT(12), AGX_DIRTY_QUERY = BITFIELD_BIT(13), + AGX_DIRTY_XFB = BITFIELD_BIT(14), }; /* Maximum number of in-progress + under-construction GPU batches. @@ -336,6 +369,8 @@ struct agx_context { enum pipe_render_cond_flag cond_mode; struct agx_query *occlusion_query; + struct agx_query *prims_generated; + struct agx_query *tf_prims_generated; bool active_queries; struct util_debug_callback debug; @@ -410,6 +445,27 @@ agx_context(struct pipe_context *pctx) void agx_init_query_functions(struct pipe_context *ctx); +void +agx_primitives_update_direct(struct agx_context *ctx, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw); + +void agx_nir_lower_xfb(nir_shader *shader, struct agx_xfb_key *key); + +void agx_draw_vbo_from_xfb(struct pipe_context *pctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect); + +void agx_launch_so(struct pipe_context *pctx, const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draws, + uint64_t index_buffer); + +uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer, + uint32_t *size); + +void agx_init_streamout_functions(struct pipe_context *ctx); + static inline void agx_dirty_all(struct agx_context *ctx) { @@ -438,7 +494,7 @@ struct agx_query { struct agx_batch *writer; unsigned writer_index; - /* For occlusion queries, which use some CPU work */ + /* Accumulator flushed to the CPU */ uint64_t value; }; diff --git a/src/gallium/drivers/asahi/agx_streamout.c b/src/gallium/drivers/asahi/agx_streamout.c new file mode 100644 index 00000000000..190211ca7ee --- /dev/null +++ b/src/gallium/drivers/asahi/agx_streamout.c @@ -0,0 +1,574 @@ +/* + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_xfb_info.h"
+#include "util/u_draw.h"
+#include "util/u_dump.h"
+#include "util/u_prim.h"
+#include "agx_state.h"
+
+static struct pipe_stream_output_target *
+agx_create_stream_output_target(struct pipe_context *pctx,
+                                struct pipe_resource *prsc,
+                                unsigned buffer_offset, unsigned buffer_size)
+{
+   struct pipe_stream_output_target *target;
+
+   target = &rzalloc(pctx, struct agx_streamout_target)->base;
+
+   if (!target)
+      return NULL;
+
+   pipe_reference_init(&target->reference, 1);
+   pipe_resource_reference(&target->buffer, prsc);
+
+   target->context = pctx;
+   target->buffer_offset = buffer_offset;
+   target->buffer_size = buffer_size;
+
+   return target;
+}
+
+static void
+agx_stream_output_target_destroy(struct pipe_context *pctx,
+                                 struct pipe_stream_output_target *target)
+{
+   pipe_resource_reference(&target->buffer, NULL);
+   ralloc_free(target);
+}
+
+static void
+agx_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets,
+                              struct pipe_stream_output_target **targets,
+                              const unsigned *offsets)
+{
+   struct agx_context *ctx = agx_context(pctx);
+   struct agx_streamout *so = &ctx->streamout;
+
+   assert(num_targets <= ARRAY_SIZE(so->targets));
+
+   for (unsigned i = 0; i < num_targets; i++) {
+      /* From the Gallium documentation:
+       *
+       *    -1 means the buffer should be appended to, and everything else
+       *    sets the internal offset.
+       *
+       * We append regardless, so just check for != -1. Yes, using a negative
+       * sentinel value with an unsigned type is bananas. But it's in the
+       * Gallium contract and it will work out fine. Probably should be
+       * redefined to be ~0 instead of -1 but it doesn't really matter.
+       */
+      if (offsets[i] != -1)
+         agx_so_target(targets[i])->offset = offsets[i];
+
+      pipe_so_target_reference(&so->targets[i], targets[i]);
+   }
+
+   for (unsigned i = num_targets; i < so->num_targets; i++)
+      pipe_so_target_reference(&so->targets[i], NULL);
+
+   so->num_targets = num_targets;
+}
+
+static struct pipe_stream_output_target *
+get_target(struct agx_context *ctx, unsigned buffer)
+{
+   if (buffer < ctx->streamout.num_targets)
+      return ctx->streamout.targets[buffer];
+   else
+      return NULL;
+}
+
+/*
+ * Return the address of the indexed streamout buffer. This will be
+ * pushed into the streamout shader.
+ */
+uint64_t
+agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
+                         uint32_t *size)
+{
+   struct pipe_stream_output_target *target = get_target(batch->ctx, buffer);
+
+   /* If there's no target, don't write anything */
+   if (!target) {
+      *size = 0;
+      return 0;
+   }
+
+   /* Otherwise, write the target */
+   struct pipe_stream_output_info *so =
+      &batch->ctx->stage[PIPE_SHADER_VERTEX].shader->base.stream_output;
+
+   struct agx_resource *rsrc = agx_resource(target->buffer);
+   agx_batch_writes(batch, rsrc);
+
+   /* The amount of space left depends on how much we've already consumed */
+   unsigned stride = so->stride[buffer] * 4;
+   uint32_t offset = agx_so_target(target)->offset * stride;
+
+   *size = offset < target->buffer_size ? (target->buffer_size - offset) : 0;
+   return rsrc->bo->ptr.gpu + target->buffer_offset + offset;
+}
+
+void
+agx_draw_vbo_from_xfb(struct pipe_context *pctx,
+                      const struct pipe_draw_info *info, unsigned drawid_offset,
+                      const struct pipe_draw_indirect_info *indirect)
+{
+   struct pipe_draw_start_count_bias draw = {
+      .start = 0,
+      .count = agx_so_target(indirect->count_from_stream_output)->offset,
+   };
+
+   pctx->draw_vbo(pctx, info, drawid_offset, NULL, &draw, 1);
+}
+
+static uint32_t
+xfb_prims_for_vertices(enum pipe_prim_type mode, unsigned verts)
+{
+   uint32_t prims = u_decomposed_prims_for_vertices(mode, verts);
+
+   /* The GL spec isn't super clear about this, but it implies that quads are
+    * supposed to be tessellated into primitives, and piglit
+    * (ext_transform_feedback-tessellation quads) checks this.
+    */
+   if (u_decomposed_prim(mode) == PIPE_PRIM_QUADS)
+      prims *= 2;
+
+   return prims;
+}
+
+/*
+ * Launch a streamout pipeline.
+ */
+void
+agx_launch_so(struct pipe_context *pctx, const struct pipe_draw_info *info,
+              const struct pipe_draw_start_count_bias *draw,
+              uint64_t index_buffer)
+{
+   struct agx_context *ctx = agx_context(pctx);
+
+   /* Break recursion from draw_vbo creating draw calls below: Do not do a
+    * streamout draw for a streamout draw.
+    */
+   if (ctx->streamout.key.active)
+      return;
+
+   /* Configure the below draw to launch streamout rather than a regular draw */
+   ctx->streamout.key.active = true;
+   ctx->dirty |= AGX_DIRTY_XFB;
+
+   ctx->streamout.key.index_size = info->index_size;
+   ctx->streamout.key.mode = info->mode;
+   ctx->streamout.key.flatshade_first = ctx->rast->base.flatshade_first;
+   ctx->streamout.params.index_buffer = index_buffer;
+
+   /* Ignore provoking vertex for modes that don't depend on the provoking
+    * vertex, to reduce shader variants.
+    */
+   if (info->mode != PIPE_PRIM_TRIANGLE_STRIP)
+      ctx->streamout.key.flatshade_first = false;
+
+   /* Determine how many XFB output vertices there will be */
+   unsigned num_outputs =
+      u_stream_outputs_for_vertices(info->mode, draw->count);
+   unsigned count = draw->count;
+   u_trim_pipe_prim(info->mode, &count);
+
+   ctx->streamout.params.base_vertex =
+      info->index_size ? draw->index_bias : draw->start;
+   ctx->streamout.params.num_vertices = count;
+
+   /* Streamout runs as a vertex shader with rasterizer discard */
+   void *saved_rast = ctx->rast;
+   pctx->bind_rasterizer_state(
+      pctx, util_blitter_get_discard_rasterizer_state(ctx->blitter));
+
+   /* Dispatch a grid of points; this is compute-like */
+   util_draw_arrays_instanced(pctx, PIPE_PRIM_POINTS, 0, num_outputs, 0,
+                              info->instance_count);
+   pctx->bind_rasterizer_state(pctx, saved_rast);
+
+   /*
+    * Finally, if needed, update the counter of primitives written. The spec
+    * requires:
+    *
+    *    If recording the vertices of a primitive to the buffer objects being
+    *    used for transform feedback purposes would result in [overflow]...
+    *    the counter corresponding to the asynchronous query target
+    *    TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN (see section 13.4) is not
+    *    incremented.
+    *
+    * So clamp the number of primitives generated to the number of primitives
+    * we actually have space to write.
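+    *
+    * For example, capturing GL_TRIANGLES at a 16 byte stride into a target
+    * with 100 bytes of space remaining fits 100 / 16 = 6 whole vertices,
+    * hence at most 6 / 3 = 2 whole triangles are counted as written.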
+ */ + if (ctx->tf_prims_generated) { + uint32_t min_max = ~0; + + for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { + struct pipe_stream_output_target *target = get_target(ctx, i); + + if (!target) + continue; + + struct pipe_stream_output_info *so = + &ctx->stage[PIPE_SHADER_VERTEX].shader->base.stream_output; + unsigned stride = so->stride[i] * 4; + + /* Ignore spurious targets. I don't see anything in the Gallium + * contract specifically forbidding this. + */ + if (stride == 0) + continue; + + uint32_t offset = agx_so_target(target)->offset * stride; + uint32_t remaining = + offset < target->buffer_size ? (target->buffer_size - offset) : 0; + uint32_t max_vertices = stride ? (remaining / stride) : ~0; + + min_max = MIN2(min_max, max_vertices); + } + + /* We now have the maximum vertices written, round down to primitives */ + uint32_t max_prims = xfb_prims_for_vertices(info->mode, min_max); + uint32_t prims = xfb_prims_for_vertices(info->mode, draw->count); + + ctx->tf_prims_generated->value += MIN2(prims, max_prims); + } + + /* Update the offsets into the streamout buffers */ + for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { + if (ctx->streamout.targets[i]) + agx_so_target(ctx->streamout.targets[i])->offset += num_outputs; + } + + ctx->dirty |= AGX_DIRTY_XFB; + ctx->streamout.key.active = false; +} + +/* + * Count generated primitives on the CPU for transform feedback. This only works + * in the absence of indirect draws, geometry shaders, or tessellation. + */ +void +agx_primitives_update_direct(struct agx_context *ctx, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw) +{ + assert(ctx->active_queries && ctx->prims_generated && "precondition"); + + ctx->prims_generated->value += + xfb_prims_for_vertices(info->mode, draw->count); +} + +/* The OpenGL spec says: + * + * If recording the vertices of a primitive to the buffer objects being + * used for transform feedback purposes would result in either exceeding + * the limits of any buffer object’s size, or in exceeding the end + * position offset + size − 1, as set by BindBufferRange, then no vertices + * of that primitive are recorded in any buffer object. + * + * This function checks for the absence of overflow. + * + * The difficulty is that we are processing a single vertex at a time, so we + * need to do some arithmetic to figure out the bounds for the whole containing + * primitive. + * + * XXX: How do quads get tessellated? + */ +static nir_ssa_def * +primitive_fits(nir_builder *b, struct agx_xfb_key *key) +{ + /* Get the number of vertices per primitive in the current mode, usually just + * the base number but quads are tessellated. + */ + uint32_t verts_per_prim = u_vertices_per_prim(key->mode); + + if (u_decomposed_prim(key->mode) == PIPE_PRIM_QUADS) + verts_per_prim = 6; + + /* Get the ID for this invocation */ + nir_ssa_def *id = nir_load_vertex_id_zero_base(b); + + /* Figure out the ID for the first vertex of the next primitive. Since + * transform feedback buffers are tightly packed, that's one byte after the + * end of this primitive, which will make bounds checking convenient. 
That + * will be: + * + * (id - (id % prim size)) + prim size + */ + nir_ssa_def *rem = nir_umod_imm(b, id, verts_per_prim); + nir_ssa_def *next_id = nir_iadd_imm(b, nir_isub(b, id, rem), verts_per_prim); + + /* Figure out where that vertex will land */ + nir_ssa_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + next_id); + + /* Now check for overflow in each written buffer */ + nir_ssa_def *all_fits = nir_imm_true(b); + + u_foreach_bit(buffer, b->shader->xfb_info->buffers_written) { + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); + + /* For this primitive to fit, the next primitive cannot start after the + * end of the transform feedback buffer. + */ + nir_ssa_def *end_offset = nir_imul_imm(b, index, stride); + + /* Check whether that will remain in bounds */ + nir_ssa_def *fits = + nir_uge(b, nir_load_xfb_size(b, .base = buffer), end_offset); + + /* Accumulate */ + all_fits = nir_iand(b, all_fits, fits); + } + + return all_fits; +} + +static void +insert_overflow_check(nir_shader *nir, struct agx_xfb_key *key) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + /* Extract the current transform feedback shader */ + nir_cf_list list; + nir_cf_extract(&list, nir_before_block(nir_start_block(impl)), + nir_after_block(nir_impl_last_block(impl))); + + /* Get a builder for the (now empty) shader */ + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_after_block(nir_start_block(impl)); + + /* Rebuild the shader as + * + * if (!overflow) { + * shader(); + * } + */ + nir_push_if(&b, primitive_fits(&b, key)); + { + b.cursor = nir_cf_reinsert(&list, b.cursor); + } + nir_pop_if(&b, NULL); +} + +static void +lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr, + unsigned start_component, unsigned num_components, + unsigned buffer, unsigned offset_words) +{ + assert(buffer < MAX_XFB_BUFFERS); + assert(nir_intrinsic_component(intr) == 0); // TODO + + /* Transform feedback info in units of words, convert to bytes. */ + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); + + uint16_t offset = offset_words * 4; + + nir_ssa_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + nir_load_vertex_id_zero_base(b)); + + nir_ssa_def *xfb_offset = + nir_iadd_imm(b, nir_imul_imm(b, index, stride), offset); + + nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer); + nir_ssa_def *addr = nir_iadd(b, buf, nir_u2u64(b, xfb_offset)); + + nir_ssa_def *value = nir_channels( + b, intr->src[0].ssa, BITFIELD_MASK(num_components) << start_component); + nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components)); +} + +static bool +lower_xfb(nir_builder *b, nir_instr *instr, UNUSED void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + /* Assume the inputs are read */ + BITSET_SET(b->shader->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + + bool progress = false; + + for (unsigned i = 0; i < 2; ++i) { + nir_io_xfb xfb = + i ? 
nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr);
+
+      for (unsigned j = 0; j < 2; ++j) {
+         if (xfb.out[j].num_components > 0) {
+            b->cursor = nir_before_instr(&intr->instr);
+            lower_xfb_output(b, intr, i * 2 + j, xfb.out[j].num_components,
+                             xfb.out[j].buffer, xfb.out[j].offset);
+            progress = true;
+         }
+      }
+   }
+
+   nir_instr_remove(instr);
+   return progress;
+}
+
+static bool
+lower_xfb_intrinsics(struct nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   b->cursor = nir_before_instr(instr);
+
+   struct agx_xfb_key *key = data;
+
+   switch (intr->intrinsic) {
+   /* XXX: Rename to "xfb index" to avoid the clash */
+   case nir_intrinsic_load_vertex_id_zero_base: {
+      nir_ssa_def *id = nir_load_vertex_id(b);
+      nir_ssa_def_rewrite_uses(&intr->dest.ssa, id);
+      return true;
+   }
+
+   case nir_intrinsic_load_vertex_id: {
+      /* Get the raw invocation ID */
+      nir_ssa_def *id = nir_load_vertex_id(b);
+
+      /* Tessellate by primitive mode */
+      if (key->mode == PIPE_PRIM_LINE_STRIP ||
+          key->mode == PIPE_PRIM_LINE_LOOP) {
+         /* The last vertex is special for a loop. Check if that's what we're
+          * dealing with.
+          */
+         nir_ssa_def *num_invocations =
+            nir_imul_imm(b, nir_load_num_vertices(b), 2);
+         nir_ssa_def *last_vertex =
+            nir_ieq(b, id, nir_iadd_imm(b, num_invocations, -1));
+
+         /* (0, 1), (1, 2) */
+         id = nir_iadd(b, nir_ushr_imm(b, id, 1), nir_iand_imm(b, id, 1));
+
+         /* (0, 1), (1, 2), (2, 0) */
+         if (key->mode == PIPE_PRIM_LINE_LOOP) {
+            id = nir_bcsel(b, last_vertex, nir_imm_int(b, 0), id);
+         }
+      } else if (key->mode == PIPE_PRIM_TRIANGLE_STRIP) {
+         /* Order depends on the provoking vertex.
+          *
+          * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
+          * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
+          */
+         nir_ssa_def *prim = nir_udiv_imm(b, id, 3);
+         nir_ssa_def *rem = nir_umod_imm(b, id, 3);
+
+         unsigned pv = key->flatshade_first ? 0 : 2;
+
+         /* Swap the two non-provoking vertices in odd triangles */
+         nir_ssa_def *even = nir_ieq_imm(b, nir_iand_imm(b, prim, 1), 0);
+         nir_ssa_def *is_provoking = nir_ieq_imm(b, rem, pv);
+         nir_ssa_def *no_swap = nir_ior(b, is_provoking, even);
+         nir_ssa_def *swapped = nir_isub_imm(b, 3 - pv, rem);
+         nir_ssa_def *off = nir_bcsel(b, no_swap, rem, swapped);
+
+         /* Pull the (maybe swapped) vertex from the corresponding primitive */
+         id = nir_iadd(b, prim, off);
+      } else if (key->mode == PIPE_PRIM_TRIANGLE_FAN) {
+         /* (0, 1, 2), (0, 2, 3) */
+         nir_ssa_def *prim = nir_udiv_imm(b, id, 3);
+         nir_ssa_def *rem = nir_umod_imm(b, id, 3);
+
+         id = nir_bcsel(b, nir_ieq_imm(b, rem, 0), nir_imm_int(b, 0),
+                        nir_iadd(b, prim, rem));
+      } else if (key->mode == PIPE_PRIM_QUADS ||
+                 key->mode == PIPE_PRIM_QUAD_STRIP) {
+         /* Quads: [(0, 1, 3), (3, 1, 2)], [(4, 5, 7), (7, 5, 6)]
+          * Quad strips: [(0, 1, 3), (0, 3, 2)], [(2, 3, 5), (2, 5, 4)]
+          */
+         bool strips = key->mode == PIPE_PRIM_QUAD_STRIP;
+
+         nir_ssa_def *prim = nir_udiv_imm(b, id, 6);
+         nir_ssa_def *rem = nir_umod_imm(b, id, 6);
+         nir_ssa_def *base = nir_imul_imm(b, prim, strips ? 2 : 4);
+
+         /* Quads: [0, 1, 3, 3, 1, 2]
+          * Quad strips: [0, 1, 3, 0, 3, 2]
+          */
+         uint32_t order_quads = 0x213310;
+         uint32_t order_strips = 0x230310;
+         uint32_t order = strips ?
order_strips : order_quads; + + /* Index out of the bitpacked array */ + nir_ssa_def *offset = nir_iand_imm( + b, nir_ushr(b, nir_imm_int(b, order), nir_imul_imm(b, rem, 4)), + 0xF); + + id = nir_iadd(b, base, offset); + } + + /* Add the "start", either an index bias or a base vertex */ + id = nir_iadd(b, id, nir_load_base_vertex(b)); + + /* If drawing with an index buffer, pull the vertex ID. Otherwise, the + * vertex ID is just the index as-is. + */ + if (key->index_size) { + nir_ssa_def *index_buffer = nir_load_xfb_index_buffer(b, 64); + nir_ssa_def *offset = nir_imul_imm(b, id, key->index_size); + nir_ssa_def *address = nir_iadd(b, index_buffer, nir_u2u64(b, offset)); + nir_ssa_def *index = nir_load_global_constant( + b, address, key->index_size, 1, key->index_size * 8); + + id = nir_u2uN(b, index, id->bit_size); + } + + nir_ssa_def_rewrite_uses(&intr->dest.ssa, id); + return true; + } + + default: + return false; + } +} + +void +agx_nir_lower_xfb(nir_shader *nir, struct agx_xfb_key *key) +{ + assert(nir->info.stage == MESA_SHADER_VERTEX); + + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); + + NIR_PASS_V(nir, insert_overflow_check, key); + NIR_PASS_V(nir, nir_shader_instructions_pass, lower_xfb, + nir_metadata_block_index | nir_metadata_dominance, key); + NIR_PASS_V(nir, nir_shader_instructions_pass, lower_xfb_intrinsics, + nir_metadata_block_index | nir_metadata_dominance, key); + + /* Lowering XFB creates piles of dead code. Eliminate now so we don't + * push unnecessary sysvals. + */ + NIR_PASS_V(nir, nir_opt_dce); +} + +void +agx_init_streamout_functions(struct pipe_context *ctx) +{ + ctx->create_stream_output_target = agx_create_stream_output_target; + ctx->stream_output_target_destroy = agx_stream_output_target_destroy; + ctx->set_stream_output_targets = agx_set_stream_output_targets; +} diff --git a/src/gallium/drivers/asahi/agx_uniforms.c b/src/gallium/drivers/asahi/agx_uniforms.c index ad946fc144b..5999f69ecba 100644 --- a/src/gallium/drivers/asahi/agx_uniforms.c +++ b/src/gallium/drivers/asahi/agx_uniforms.c @@ -87,6 +87,16 @@ agx_upload_uniforms(struct agx_batch *batch, uint64_t textures, u_foreach_bit(vbo, ctx->vb_mask) { uniforms.vs.vbo_base[vbo] = agx_vertex_buffer_ptr(batch, vbo); } + + if (ctx->streamout.key.active) { + uniforms.vs.xfb = ctx->streamout.params; + + for (unsigned i = 0; i < batch->ctx->streamout.num_targets; ++i) { + uint32_t size = 0; + uniforms.vs.xfb.base[i] = agx_batch_get_so_address(batch, i, &size); + uniforms.vs.xfb.size[i] = size; + } + } } else if (stage == PIPE_SHADER_FRAGMENT) { memcpy(uniforms.fs.blend_constant, &ctx->blend_color, sizeof(ctx->blend_color)); diff --git a/src/gallium/drivers/asahi/meson.build b/src/gallium/drivers/asahi/meson.build index 3a6bc9e6922..6a80e1f803a 100644 --- a/src/gallium/drivers/asahi/meson.build +++ b/src/gallium/drivers/asahi/meson.build @@ -10,6 +10,7 @@ files_asahi = files( 'agx_nir_lower_sysvals.c', 'agx_query.c', 'agx_state.c', + 'agx_streamout.c', 'agx_uniforms.c', )
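Appendix-style sanity check (not part of the patch): the triangle-strip
unrolling in lower_xfb_intrinsics can be mirrored on the CPU and tested
against the orderings documented in its comments. A self-contained C sketch:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* CPU mirror of the triangle strip case in lower_xfb_intrinsics:
     * map an XFB invocation ID to the strip vertex it sources.
     */
    static uint32_t
    strip_vertex(uint32_t id, bool flatshade_first)
    {
       uint32_t prim = id / 3, rem = id % 3;
       uint32_t pv = flatshade_first ? 0 : 2;

       bool even = (prim & 1) == 0;
       bool is_provoking = (rem == pv);
       uint32_t swapped = (3 - pv) - rem;
       uint32_t off = (is_provoking || even) ? rem : swapped;

       return prim + off;
    }

    int
    main(void)
    {
       /* First: (0, 1, 2), (1, 3, 2), (2, 3, 4) */
       const uint32_t first[] = {0, 1, 2, 1, 3, 2, 2, 3, 4};

       /* Last: (0, 1, 2), (2, 1, 3), (2, 3, 4) */
       const uint32_t last[] = {0, 1, 2, 2, 1, 3, 2, 3, 4};

       for (uint32_t id = 0; id < 9; ++id) {
          assert(strip_vertex(id, true) == first[id]);
          assert(strip_vertex(id, false) == last[id]);
       }

       return 0;
    }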