diff --git a/docs/features.txt b/docs/features.txt index f8e88faf89d..34709295c87 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -54,7 +54,7 @@ GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv GL_EXT_draw_buffers2 (Per-buffer blend and masks) DONE (v3d, asahi) GL_EXT_texture_compression_rgtc DONE (all drivers that support GL_EXT_texture_snorm) GL_ARB_texture_rg DONE (v3d, lima, asahi) - GL_EXT_transform_feedback (Transform feedback) DONE (v3d) + GL_EXT_transform_feedback (Transform feedback) DONE (v3d, asahi) GL_ARB_vertex_array_object (Vertex array objects) DONE (v3d, vc4, lima, asahi) GL_EXT_framebuffer_sRGB (sRGB framebuffer format) DONE (v3d, vc4, lima, asahi) glClearBuffer commands DONE @@ -136,8 +136,8 @@ GL 4.0, GLSL 4.00 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi GL_ARB_texture_cube_map_array DONE (freedreno/a4xx+, i965/gen6+, nv50, softpipe, v3d) GL_ARB_texture_gather DONE (freedreno, i965/gen6+, nv50, softpipe, v3d, panfrost, asahi) GL_ARB_texture_query_lod DONE (freedreno, i965, nv50, softpipe, v3d, panfrost) - GL_ARB_transform_feedback2 DONE (freedreno/a3xx+, i965/gen6+, nv50, softpipe, v3d, panfrost) - GL_ARB_transform_feedback3 DONE (freedreno/a3xx+, i965/gen7+, softpipe, ) + GL_ARB_transform_feedback2 DONE (freedreno/a3xx+, i965/gen6+, nv50, softpipe, v3d, panfrost, asahi) + GL_ARB_transform_feedback3 DONE (freedreno/a3xx+, i965/gen7+, softpipe, asahi) GL 4.1, GLSL 4.10 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi, llvmpipe, virgl, zink, d3d12 @@ -156,7 +156,7 @@ GL 4.2, GLSL 4.20 -- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi, GL_ARB_compressed_texture_pixel_storage DONE (all drivers) GL_ARB_shader_atomic_counters DONE (freedreno/a5xx+, i965, softpipe, v3d, panfrost) GL_ARB_texture_storage DONE (all drivers) - GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, softpipe, v3d) + GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, softpipe, v3d, asahi) GL_ARB_base_instance DONE (freedreno, i965, nv50, softpipe, v3d) GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965, softpipe, panfrost) GL_ARB_conservative_depth DONE (all drivers that support GLSL 1.30) diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index 873307c4164..5e104ba93ca 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -27,7 +27,8 @@ agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter, util_blitter_save_blend(blitter, ctx->blend); util_blitter_save_depth_stencil_alpha(blitter, ctx->zs); util_blitter_save_stencil_ref(blitter, &ctx->stencil_ref); - util_blitter_save_so_targets(blitter, 0, NULL); + util_blitter_save_so_targets(blitter, ctx->streamout.num_targets, + ctx->streamout.targets); util_blitter_save_sample_mask(blitter, ctx->sample_mask, 0); util_blitter_save_framebuffer(blitter, &ctx->framebuffer); diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index b57d9e10503..3ffe61ac36d 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -111,6 +111,18 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr) &u->ssbo_size, intr->src[0].ssa); case nir_intrinsic_load_num_workgroups: return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0); + case nir_intrinsic_load_xfb_address: + return load_sysval_root(b, 1, 64, + 
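+                              /* 64-bit GPU address of this XFB buffer,
+                               * pushed by agx_upload_uniforms via
+                               * agx_batch_get_so_address()
+                               */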
&u->vs.xfb.base[nir_intrinsic_base(intr)]); + case nir_intrinsic_load_xfb_size: + return load_sysval_root(b, 1, 32, + &u->vs.xfb.size[nir_intrinsic_base(intr)]); + case nir_intrinsic_load_xfb_index_buffer: + return load_sysval_root(b, 1, 64, &u->vs.xfb.index_buffer); + case nir_intrinsic_load_base_vertex: + return load_sysval_root(b, 1, 32, &u->vs.xfb.base_vertex); + case nir_intrinsic_load_num_vertices: + return load_sysval_root(b, 1, 32, &u->vs.xfb.num_vertices); default: return NULL; } diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index 351bd8f7b06..6ce37d9396b 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -1397,6 +1397,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags) agx_init_state_functions(pctx); agx_init_query_functions(pctx); + agx_init_streamout_functions(pctx); agx_meta_init(&ctx->meta, agx_device(screen)); @@ -1556,15 +1557,15 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 0; case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - return is_deqp ? PIPE_MAX_SO_BUFFERS : 0; + return PIPE_MAX_SO_BUFFERS; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return is_deqp ? PIPE_MAX_SO_OUTPUTS : 0; + return PIPE_MAX_SO_OUTPUTS; case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - return is_deqp ? 1 : 0; + return 1; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: return 2048; @@ -1587,6 +1588,13 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_ATTRIB_ELEMENT_ALIGNED_ONLY: return 1; + /* We run nir_lower_point_size so we need the GLSL linker to copy + * the original gl_PointSize when captured by transform feedback. We could + * also copy it ourselves but it's easier to set the CAP. + */ + case PIPE_CAP_PSIZ_CLAMPED: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: return 16384; case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c index 0fe2e3c8ebb..fcea7fb41e7 100644 --- a/src/gallium/drivers/asahi/agx_query.c +++ b/src/gallium/drivers/asahi/agx_query.c @@ -1,8 +1,10 @@ /* * Copyright 2022 Alyssa Rosenzweig + * Copyright 2019-2020 Collabora, Ltd. * SPDX-License-Identifier: MIT */ +#include "util/u_prim.h" #include "agx_state.h" static struct pipe_query * @@ -39,12 +41,13 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) struct agx_context *ctx = agx_context(pctx); struct agx_query *query = (struct agx_query *)pquery; + ctx->dirty |= AGX_DIRTY_QUERY; + switch (query->type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: ctx->occlusion_query = query; - ctx->dirty |= AGX_DIRTY_QUERY; /* begin_query zeroes, flush so we can do that write. If anything (i.e. 
* other than piglit) actually hits this, we could shadow the query to @@ -60,6 +63,16 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) query->value = 0; return true; + case PIPE_QUERY_PRIMITIVES_GENERATED: + ctx->prims_generated = query; + query->value = 0; + return true; + + case PIPE_QUERY_PRIMITIVES_EMITTED: + ctx->tf_prims_generated = query; + query->value = 0; + return true; + default: return false; } @@ -71,14 +84,20 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery) struct agx_context *ctx = agx_context(pctx); struct agx_query *query = (struct agx_query *)pquery; + ctx->dirty |= AGX_DIRTY_QUERY; + switch (query->type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: ctx->occlusion_query = NULL; - ctx->dirty |= AGX_DIRTY_QUERY; return true; - + case PIPE_QUERY_PRIMITIVES_GENERATED: + ctx->prims_generated = NULL; + return true; + case PIPE_QUERY_PRIMITIVES_EMITTED: + ctx->tf_prims_generated = NULL; + return true; default: return false; } @@ -117,6 +136,11 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, return true; + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + vresult->u64 = query->value; + return true; + default: unreachable("Other queries not yet supported"); } diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index f71fd9065ad..870f77b13cb 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -22,6 +22,7 @@ #include "gallium/auxiliary/util/u_draw.h" #include "gallium/auxiliary/util/u_framebuffer.h" #include "gallium/auxiliary/util/u_helpers.h" +#include "gallium/auxiliary/util/u_prim_restart.h" #include "gallium/auxiliary/util/u_viewport.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -37,59 +38,6 @@ #include "util/u_transfer.h" #include "agx_disk_cache.h" -static struct pipe_stream_output_target * -agx_create_stream_output_target(struct pipe_context *pctx, - struct pipe_resource *prsc, - unsigned buffer_offset, unsigned buffer_size) -{ - struct pipe_stream_output_target *target; - - target = &rzalloc(pctx, struct agx_streamout_target)->base; - - if (!target) - return NULL; - - pipe_reference_init(&target->reference, 1); - pipe_resource_reference(&target->buffer, prsc); - - target->context = pctx; - target->buffer_offset = buffer_offset; - target->buffer_size = buffer_size; - - return target; -} - -static void -agx_stream_output_target_destroy(struct pipe_context *pctx, - struct pipe_stream_output_target *target) -{ - pipe_resource_reference(&target->buffer, NULL); - ralloc_free(target); -} - -static void -agx_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets, - struct pipe_stream_output_target **targets, - const unsigned *offsets) -{ - struct agx_context *ctx = agx_context(pctx); - struct agx_streamout *so = &ctx->streamout; - - assert(num_targets <= ARRAY_SIZE(so->targets)); - - for (unsigned i = 0; i < num_targets; i++) { - if (offsets[i] != -1) - agx_so_target(targets[i])->offset = offsets[i]; - - pipe_so_target_reference(&so->targets[i], targets[i]); - } - - for (unsigned i = 0; i < so->num_targets; i++) - pipe_so_target_reference(&so->targets[i], NULL); - - so->num_targets = num_targets; -} - static void agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start_slot, unsigned count, @@ -1403,6 +1351,9 @@ agx_compile_variant(struct 
agx_device *dev, struct agx_uncompiled_shader *so, struct asahi_vs_shader_key *key = &key_->vs; NIR_PASS_V(nir, agx_nir_lower_vbo, &key->vbuf); + + if (key->xfb.active && nir->xfb_info != NULL) + NIR_PASS_V(nir, agx_nir_lower_xfb, &key->xfb); } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { struct asahi_fs_shader_key *key = &key_->fs; @@ -1672,12 +1623,14 @@ agx_update_vs(struct agx_context *ctx) /* Only proceed if the shader or anything the key depends on changes * * vb_mask, attributes, vertex_buffers: VERTEX + * streamout.active: XFB */ - if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX))) + if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB))) return false; struct asahi_vs_shader_key key = { .vbuf.count = util_last_bit(ctx->vb_mask), + .xfb = ctx->streamout.key, }; memcpy(key.vbuf.attributes, ctx->attributes, @@ -2563,7 +2516,46 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } + if (indirect && indirect->count_from_stream_output) { + agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect); + return; + } + + const nir_shader *nir_vs = ctx->stage[PIPE_SHADER_VERTEX].shader->nir; + bool uses_xfb = nir_vs->xfb_info && ctx->streamout.num_targets; + bool uses_prims_generated = ctx->active_queries && ctx->prims_generated; + + if (indirect && (uses_prims_generated || uses_xfb)) { + perf_debug_ctx(ctx, "Emulating indirect draw due to XFB"); + util_draw_indirect(pctx, info, indirect); + return; + } + + if (uses_xfb && info->primitive_restart) { + perf_debug_ctx(ctx, "Emulating primitive restart due to XFB"); + util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect, + draws); + return; + } + + if (!ctx->streamout.key.active && uses_prims_generated) { + agx_primitives_update_direct(ctx, info, draws); + } + struct agx_batch *batch = agx_get_batch(ctx); + unsigned idx_size = info->index_size; + uint64_t ib = 0; + size_t ib_extent = 0; + + if (idx_size) { + if (indirect != NULL) + ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent); + else + ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent); + } + + if (uses_xfb) + agx_launch_so(pctx, info, draws, ib); #ifndef NDEBUG if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY)) @@ -2573,8 +2565,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, if (agx_scissor_culls_everything(ctx)) return; - /* We don't support side effects in vertex stages, so this is trivial */ - if (ctx->rast->base.rasterizer_discard) + /* We don't support side effects in vertex stages (only used internally for + * transform feedback lowering), so this is trivial. 
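+    * Streamout is the exception: its "vertex shader" is the internal
+    * transform feedback kernel, which must still run when rasterizer
+    * discard is set.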
+ */ + if (ctx->rast->base.rasterizer_discard && !ctx->streamout.key.active) return; /* Dirty track the reduced prim: lines vs points vs triangles */ @@ -2631,17 +2625,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, reduced_prim == PIPE_PRIM_POINTS); enum agx_primitive prim = agx_primitive_for_pipe(info->mode); - unsigned idx_size = info->index_size; - uint64_t ib = 0; - size_t ib_extent = 0; - - if (idx_size) { - if (indirect != NULL) - ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent); - else - ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent); - } - if (idx_size) { /* Index sizes are encoded logarithmically */ STATIC_ASSERT(__builtin_ctz(1) == AGX_INDEX_SIZE_U8); @@ -2729,6 +2712,21 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, out += AGX_INDEX_LIST_BUFFER_SIZE_LENGTH; } + /* Insert a memory barrier after transform feedback so the result may be + * consumed by a subsequent vertex shader. + */ + if (ctx->streamout.key.active) { + agx_pack(out, VDM_BARRIER, cfg) { + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_8 = true; + cfg.unk_11 = true; + cfg.unk_20 = true; + } + + out += AGX_VDM_BARRIER_LENGTH; + } + batch->encoder_current = out; assert((batch->encoder_current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->encoder_end && @@ -2889,8 +2887,5 @@ agx_init_state_functions(struct pipe_context *ctx) ctx->surface_destroy = agx_surface_destroy; ctx->draw_vbo = agx_draw_vbo; ctx->launch_grid = agx_launch_grid; - ctx->create_stream_output_target = agx_create_stream_output_target; - ctx->stream_output_target_destroy = agx_stream_output_target_destroy; - ctx->set_stream_output_targets = agx_set_stream_output_targets; ctx->texture_barrier = agx_texture_barrier; } diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index e44d3010946..ab43824cd62 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -39,17 +39,45 @@ struct agx_streamout_target { uint32_t offset; }; -struct agx_streamout { - struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; - unsigned num_targets; -}; - static inline struct agx_streamout_target * agx_so_target(struct pipe_stream_output_target *target) { return (struct agx_streamout_target *)target; } +struct agx_xfb_key { + /* If true, compiles a "transform feedback" program instead of a vertex + * shader. This is a kernel that runs on the VDM and writes out the transform + * feedback buffers, with no rasterization. + */ + bool active; + + /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */ + uint8_t index_size; + + /* The primitive mode for unrolling the vertex ID */ + enum pipe_prim_type mode; + + /* Use first vertex as the provoking vertex for flat shading */ + bool flatshade_first; +}; + +struct agx_xfb_params { + uint64_t base[PIPE_MAX_SO_BUFFERS]; + uint32_t size[PIPE_MAX_SO_BUFFERS]; + uint64_t index_buffer; + uint32_t base_vertex; + uint32_t num_vertices; +}; + +struct agx_streamout { + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned num_targets; + + struct agx_xfb_key key; + struct agx_xfb_params params; +}; + /* Shaders can access fixed-function state through system values. * It is convenient to stash all of this information into a single "root" * descriptor, then push individual parts as needed. 
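The agx_streamout state above is driven entirely through Gallium's standard
stream-output hooks, which agx_init_streamout_functions wires up in
agx_streamout.c below. A minimal sketch of how a frontend exercises those
hooks, assuming a pipe_context *pipe and a pipe_resource *xfb_buf that
already exist:

    struct pipe_stream_output_target *tgt =
       pipe->create_stream_output_target(pipe, xfb_buf, 0, xfb_buf->width0);

    /* Any offset other than -1 sets the internal write offset; -1 appends */
    unsigned offset = 0;
    pipe->set_stream_output_targets(pipe, 1, &tgt, &offset);

    /* ...draws issued here are captured... */

    pipe->set_stream_output_targets(pipe, 0, NULL, NULL);
    pipe->stream_output_target_destroy(pipe, tgt);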
@@ -87,6 +115,9 @@ struct PACKED agx_draw_uniforms { struct { /* Vertex buffer object bases, if present */ uint64_t vbo_base[PIPE_MAX_ATTRIBS]; + + /* Transform feedback info for a transform feedback shader */ + struct agx_xfb_params xfb; } vs; struct { @@ -239,6 +270,7 @@ struct agx_blend { struct asahi_vs_shader_key { struct agx_vbufs vbuf; + struct agx_xfb_key xfb; }; struct asahi_fs_shader_key { @@ -277,6 +309,7 @@ enum agx_dirty { AGX_DIRTY_BLEND = BITFIELD_BIT(12), AGX_DIRTY_QUERY = BITFIELD_BIT(13), + AGX_DIRTY_XFB = BITFIELD_BIT(14), }; /* Maximum number of in-progress + under-construction GPU batches. @@ -336,6 +369,8 @@ struct agx_context { enum pipe_render_cond_flag cond_mode; struct agx_query *occlusion_query; + struct agx_query *prims_generated; + struct agx_query *tf_prims_generated; bool active_queries; struct util_debug_callback debug; @@ -410,6 +445,27 @@ agx_context(struct pipe_context *pctx) void agx_init_query_functions(struct pipe_context *ctx); +void +agx_primitives_update_direct(struct agx_context *ctx, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw); + +void agx_nir_lower_xfb(nir_shader *shader, struct agx_xfb_key *key); + +void agx_draw_vbo_from_xfb(struct pipe_context *pctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect); + +void agx_launch_so(struct pipe_context *pctx, const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draws, + uint64_t index_buffer); + +uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer, + uint32_t *size); + +void agx_init_streamout_functions(struct pipe_context *ctx); + static inline void agx_dirty_all(struct agx_context *ctx) { @@ -438,7 +494,7 @@ struct agx_query { struct agx_batch *writer; unsigned writer_index; - /* For occlusion queries, which use some CPU work */ + /* Accumulator flushed to the CPU */ uint64_t value; }; diff --git a/src/gallium/drivers/asahi/agx_streamout.c b/src/gallium/drivers/asahi/agx_streamout.c new file mode 100644 index 00000000000..190211ca7ee --- /dev/null +++ b/src/gallium/drivers/asahi/agx_streamout.c @@ -0,0 +1,574 @@ +/* + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_xfb_info.h"
+#include "util/u_draw.h"
+#include "util/u_dump.h"
+#include "util/u_prim.h"
+#include "agx_state.h"
+
+static struct pipe_stream_output_target *
+agx_create_stream_output_target(struct pipe_context *pctx,
+                                struct pipe_resource *prsc,
+                                unsigned buffer_offset, unsigned buffer_size)
+{
+   struct pipe_stream_output_target *target;
+
+   target = &rzalloc(pctx, struct agx_streamout_target)->base;
+
+   if (!target)
+      return NULL;
+
+   pipe_reference_init(&target->reference, 1);
+   pipe_resource_reference(&target->buffer, prsc);
+
+   target->context = pctx;
+   target->buffer_offset = buffer_offset;
+   target->buffer_size = buffer_size;
+
+   return target;
+}
+
+static void
+agx_stream_output_target_destroy(struct pipe_context *pctx,
+                                 struct pipe_stream_output_target *target)
+{
+   pipe_resource_reference(&target->buffer, NULL);
+   ralloc_free(target);
+}
+
+static void
+agx_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets,
+                              struct pipe_stream_output_target **targets,
+                              const unsigned *offsets)
+{
+   struct agx_context *ctx = agx_context(pctx);
+   struct agx_streamout *so = &ctx->streamout;
+
+   assert(num_targets <= ARRAY_SIZE(so->targets));
+
+   for (unsigned i = 0; i < num_targets; i++) {
+      /* From the Gallium documentation:
+       *
+       *    -1 means the buffer should be appended to, and everything else
+       *    sets the internal offset.
+       *
+       * We append regardless, so just check for != -1. Yes, using a negative
+       * sentinel value with an unsigned type is bananas. But it's in the
+       * Gallium contract and it will work out fine. Probably should be
+       * redefined to be ~0 instead of -1 but it doesn't really matter.
+       */
+      if (offsets[i] != -1)
+         agx_so_target(targets[i])->offset = offsets[i];
+
+      pipe_so_target_reference(&so->targets[i], targets[i]);
+   }
+
+   for (unsigned i = num_targets; i < so->num_targets; i++)
+      pipe_so_target_reference(&so->targets[i], NULL);
+
+   so->num_targets = num_targets;
+}
+
+static struct pipe_stream_output_target *
+get_target(struct agx_context *ctx, unsigned buffer)
+{
+   if (buffer < ctx->streamout.num_targets)
+      return ctx->streamout.targets[buffer];
+   else
+      return NULL;
+}
+
+/*
+ * Return the address of the indexed streamout buffer. This will be
+ * pushed into the streamout shader.
+ */
+uint64_t
+agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
+                         uint32_t *size)
+{
+   struct pipe_stream_output_target *target = get_target(batch->ctx, buffer);
+
+   /* If there's no target, don't write anything */
+   if (!target) {
+      *size = 0;
+      return 0;
+   }
+
+   /* Otherwise, write the target */
+   struct pipe_stream_output_info *so =
+      &batch->ctx->stage[PIPE_SHADER_VERTEX].shader->base.stream_output;
+
+   struct agx_resource *rsrc = agx_resource(target->buffer);
+   agx_batch_writes(batch, rsrc);
+
+   /* The amount of space left depends on how much we've already consumed */
+   unsigned stride = so->stride[buffer] * 4;
+   uint32_t offset = agx_so_target(target)->offset * stride;
+
+   *size = offset < target->buffer_size ? (target->buffer_size - offset) : 0;
+   return rsrc->bo->ptr.gpu + target->buffer_offset + offset;
+}
+
+void
+agx_draw_vbo_from_xfb(struct pipe_context *pctx,
+                      const struct pipe_draw_info *info, unsigned drawid_offset,
+                      const struct pipe_draw_indirect_info *indirect)
+{
+   struct pipe_draw_start_count_bias draw = {
+      .start = 0,
+      .count = agx_so_target(indirect->count_from_stream_output)->offset,
+   };
+
+   pctx->draw_vbo(pctx, info, drawid_offset, NULL, &draw, 1);
+}
+
+static uint32_t
+xfb_prims_for_vertices(enum pipe_prim_type mode, unsigned verts)
+{
+   uint32_t prims = u_decomposed_prims_for_vertices(mode, verts);
+
+   /* The GL spec isn't super clear about this, but it implies that quads are
+    * supposed to be tessellated into primitives, and piglit
+    * (ext_transform_feedback-tessellation quads) checks this.
+    */
+   if (u_decomposed_prim(mode) == PIPE_PRIM_QUADS)
+      prims *= 2;
+
+   return prims;
+}
+
+/*
+ * Launch a streamout pipeline.
+ */
+void
+agx_launch_so(struct pipe_context *pctx, const struct pipe_draw_info *info,
+              const struct pipe_draw_start_count_bias *draw,
+              uint64_t index_buffer)
+{
+   struct agx_context *ctx = agx_context(pctx);
+
+   /* Break recursion from draw_vbo creating draw calls below: Do not do a
+    * streamout draw for a streamout draw.
+    */
+   if (ctx->streamout.key.active)
+      return;
+
+   /* Configure the below draw to launch streamout rather than a regular draw */
+   ctx->streamout.key.active = true;
+   ctx->dirty |= AGX_DIRTY_XFB;
+
+   ctx->streamout.key.index_size = info->index_size;
+   ctx->streamout.key.mode = info->mode;
+   ctx->streamout.key.flatshade_first = ctx->rast->base.flatshade_first;
+   ctx->streamout.params.index_buffer = index_buffer;
+
+   /* Ignore provoking vertex for modes that don't depend on the provoking
+    * vertex, to reduce shader variants.
+    */
+   if (info->mode != PIPE_PRIM_TRIANGLE_STRIP)
+      ctx->streamout.key.flatshade_first = false;
+
+   /* Determine how many XFB output vertices there will be */
+   unsigned num_outputs =
+      u_stream_outputs_for_vertices(info->mode, draw->count);
+   unsigned count = draw->count;
+   u_trim_pipe_prim(info->mode, &count);
+
+   ctx->streamout.params.base_vertex =
+      info->index_size ? draw->index_bias : draw->start;
+   ctx->streamout.params.num_vertices = count;
+
+   /* Streamout runs as a vertex shader with rasterizer discard */
+   void *saved_rast = ctx->rast;
+   pctx->bind_rasterizer_state(
+      pctx, util_blitter_get_discard_rasterizer_state(ctx->blitter));
+
+   /* Dispatch a grid of points; this is compute-like */
+   util_draw_arrays_instanced(pctx, PIPE_PRIM_POINTS, 0, num_outputs, 0,
+                              info->instance_count);
+   pctx->bind_rasterizer_state(pctx, saved_rast);
+
+   /*
+    * Finally, if needed, update the counter of primitives written. The spec
+    * requires:
+    *
+    *    If recording the vertices of a primitive to the buffer objects being
+    *    used for transform feedback purposes would result in [overflow]...
+    *    the counter corresponding to the asynchronous query target
+    *    TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN (see section 13.4) is not
+    *    incremented.
+    *
+    * So clamp the number of primitives generated to the number of primitives
+    * we actually have space to write.
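+    *
+    * For example, capturing GL_TRIANGLES at a 16 byte stride into a target
+    * with 100 bytes of space remaining fits 100 / 16 = 6 whole vertices,
+    * hence at most 6 / 3 = 2 whole triangles are counted as written.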
+ */ + if (ctx->tf_prims_generated) { + uint32_t min_max = ~0; + + for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { + struct pipe_stream_output_target *target = get_target(ctx, i); + + if (!target) + continue; + + struct pipe_stream_output_info *so = + &ctx->stage[PIPE_SHADER_VERTEX].shader->base.stream_output; + unsigned stride = so->stride[i] * 4; + + /* Ignore spurious targets. I don't see anything in the Gallium + * contract specifically forbidding this. + */ + if (stride == 0) + continue; + + uint32_t offset = agx_so_target(target)->offset * stride; + uint32_t remaining = + offset < target->buffer_size ? (target->buffer_size - offset) : 0; + uint32_t max_vertices = stride ? (remaining / stride) : ~0; + + min_max = MIN2(min_max, max_vertices); + } + + /* We now have the maximum vertices written, round down to primitives */ + uint32_t max_prims = xfb_prims_for_vertices(info->mode, min_max); + uint32_t prims = xfb_prims_for_vertices(info->mode, draw->count); + + ctx->tf_prims_generated->value += MIN2(prims, max_prims); + } + + /* Update the offsets into the streamout buffers */ + for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { + if (ctx->streamout.targets[i]) + agx_so_target(ctx->streamout.targets[i])->offset += num_outputs; + } + + ctx->dirty |= AGX_DIRTY_XFB; + ctx->streamout.key.active = false; +} + +/* + * Count generated primitives on the CPU for transform feedback. This only works + * in the absence of indirect draws, geometry shaders, or tessellation. + */ +void +agx_primitives_update_direct(struct agx_context *ctx, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw) +{ + assert(ctx->active_queries && ctx->prims_generated && "precondition"); + + ctx->prims_generated->value += + xfb_prims_for_vertices(info->mode, draw->count); +} + +/* The OpenGL spec says: + * + * If recording the vertices of a primitive to the buffer objects being + * used for transform feedback purposes would result in either exceeding + * the limits of any buffer object’s size, or in exceeding the end + * position offset + size − 1, as set by BindBufferRange, then no vertices + * of that primitive are recorded in any buffer object. + * + * This function checks for the absence of overflow. + * + * The difficulty is that we are processing a single vertex at a time, so we + * need to do some arithmetic to figure out the bounds for the whole containing + * primitive. + * + * XXX: How do quads get tessellated? + */ +static nir_ssa_def * +primitive_fits(nir_builder *b, struct agx_xfb_key *key) +{ + /* Get the number of vertices per primitive in the current mode, usually just + * the base number but quads are tessellated. + */ + uint32_t verts_per_prim = u_vertices_per_prim(key->mode); + + if (u_decomposed_prim(key->mode) == PIPE_PRIM_QUADS) + verts_per_prim = 6; + + /* Get the ID for this invocation */ + nir_ssa_def *id = nir_load_vertex_id_zero_base(b); + + /* Figure out the ID for the first vertex of the next primitive. Since + * transform feedback buffers are tightly packed, that's one byte after the + * end of this primitive, which will make bounds checking convenient. 
That + * will be: + * + * (id - (id % prim size)) + prim size + */ + nir_ssa_def *rem = nir_umod_imm(b, id, verts_per_prim); + nir_ssa_def *next_id = nir_iadd_imm(b, nir_isub(b, id, rem), verts_per_prim); + + /* Figure out where that vertex will land */ + nir_ssa_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + next_id); + + /* Now check for overflow in each written buffer */ + nir_ssa_def *all_fits = nir_imm_true(b); + + u_foreach_bit(buffer, b->shader->xfb_info->buffers_written) { + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); + + /* For this primitive to fit, the next primitive cannot start after the + * end of the transform feedback buffer. + */ + nir_ssa_def *end_offset = nir_imul_imm(b, index, stride); + + /* Check whether that will remain in bounds */ + nir_ssa_def *fits = + nir_uge(b, nir_load_xfb_size(b, .base = buffer), end_offset); + + /* Accumulate */ + all_fits = nir_iand(b, all_fits, fits); + } + + return all_fits; +} + +static void +insert_overflow_check(nir_shader *nir, struct agx_xfb_key *key) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + /* Extract the current transform feedback shader */ + nir_cf_list list; + nir_cf_extract(&list, nir_before_block(nir_start_block(impl)), + nir_after_block(nir_impl_last_block(impl))); + + /* Get a builder for the (now empty) shader */ + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_after_block(nir_start_block(impl)); + + /* Rebuild the shader as + * + * if (!overflow) { + * shader(); + * } + */ + nir_push_if(&b, primitive_fits(&b, key)); + { + b.cursor = nir_cf_reinsert(&list, b.cursor); + } + nir_pop_if(&b, NULL); +} + +static void +lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr, + unsigned start_component, unsigned num_components, + unsigned buffer, unsigned offset_words) +{ + assert(buffer < MAX_XFB_BUFFERS); + assert(nir_intrinsic_component(intr) == 0); // TODO + + /* Transform feedback info in units of words, convert to bytes. */ + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); + + uint16_t offset = offset_words * 4; + + nir_ssa_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + nir_load_vertex_id_zero_base(b)); + + nir_ssa_def *xfb_offset = + nir_iadd_imm(b, nir_imul_imm(b, index, stride), offset); + + nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer); + nir_ssa_def *addr = nir_iadd(b, buf, nir_u2u64(b, xfb_offset)); + + nir_ssa_def *value = nir_channels( + b, intr->src[0].ssa, BITFIELD_MASK(num_components) << start_component); + nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components)); +} + +static bool +lower_xfb(nir_builder *b, nir_instr *instr, UNUSED void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + /* Assume the inputs are read */ + BITSET_SET(b->shader->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + + bool progress = false; + + for (unsigned i = 0; i < 2; ++i) { + nir_io_xfb xfb = + i ? 
nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr);
+
+      for (unsigned j = 0; j < 2; ++j) {
+         if (xfb.out[j].num_components > 0) {
+            b->cursor = nir_before_instr(&intr->instr);
+            lower_xfb_output(b, intr, i * 2 + j, xfb.out[j].num_components,
+                             xfb.out[j].buffer, xfb.out[j].offset);
+            progress = true;
+         }
+      }
+   }
+
+   nir_instr_remove(instr);
+   return progress;
+}
+
+static bool
+lower_xfb_intrinsics(struct nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   b->cursor = nir_before_instr(instr);
+
+   struct agx_xfb_key *key = data;
+
+   switch (intr->intrinsic) {
+   /* XXX: Rename to "xfb index" to avoid the clash */
+   case nir_intrinsic_load_vertex_id_zero_base: {
+      nir_ssa_def *id = nir_load_vertex_id(b);
+      nir_ssa_def_rewrite_uses(&intr->dest.ssa, id);
+      return true;
+   }
+
+   case nir_intrinsic_load_vertex_id: {
+      /* Get the raw invocation ID */
+      nir_ssa_def *id = nir_load_vertex_id(b);
+
+      /* Tessellate by primitive mode */
+      if (key->mode == PIPE_PRIM_LINE_STRIP ||
+          key->mode == PIPE_PRIM_LINE_LOOP) {
+         /* The last vertex is special for a loop. Check if that's what we're
+          * dealing with.
+          */
+         nir_ssa_def *num_invocations =
+            nir_imul_imm(b, nir_load_num_vertices(b), 2);
+         nir_ssa_def *last_vertex =
+            nir_ieq(b, id, nir_iadd_imm(b, num_invocations, -1));
+
+         /* (0, 1), (1, 2) */
+         id = nir_iadd(b, nir_ushr_imm(b, id, 1), nir_iand_imm(b, id, 1));
+
+         /* (0, 1), (1, 2), (2, 0) */
+         if (key->mode == PIPE_PRIM_LINE_LOOP) {
+            id = nir_bcsel(b, last_vertex, nir_imm_int(b, 0), id);
+         }
+      } else if (key->mode == PIPE_PRIM_TRIANGLE_STRIP) {
+         /* Order depends on the provoking vertex.
+          *
+          * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
+          * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
+          */
+         nir_ssa_def *prim = nir_udiv_imm(b, id, 3);
+         nir_ssa_def *rem = nir_umod_imm(b, id, 3);
+
+         unsigned pv = key->flatshade_first ? 0 : 2;
+
+         /* Swap the two non-provoking vertices in odd triangles */
+         nir_ssa_def *even = nir_ieq_imm(b, nir_iand_imm(b, prim, 1), 0);
+         nir_ssa_def *is_provoking = nir_ieq_imm(b, rem, pv);
+         nir_ssa_def *no_swap = nir_ior(b, is_provoking, even);
+         nir_ssa_def *swapped = nir_isub_imm(b, 3 - pv, rem);
+         nir_ssa_def *off = nir_bcsel(b, no_swap, rem, swapped);
+
+         /* Pull the (maybe swapped) vertex from the corresponding primitive */
+         id = nir_iadd(b, prim, off);
+      } else if (key->mode == PIPE_PRIM_TRIANGLE_FAN) {
+         /* (0, 1, 2), (0, 2, 3) */
+         nir_ssa_def *prim = nir_udiv_imm(b, id, 3);
+         nir_ssa_def *rem = nir_umod_imm(b, id, 3);
+
+         id = nir_bcsel(b, nir_ieq_imm(b, rem, 0), nir_imm_int(b, 0),
+                        nir_iadd(b, prim, rem));
+      } else if (key->mode == PIPE_PRIM_QUADS ||
+                 key->mode == PIPE_PRIM_QUAD_STRIP) {
+         /* Quads: [(0, 1, 3), (3, 1, 2)], [(4, 5, 7), (7, 5, 6)]
+          * Quad strips: [(0, 1, 3), (0, 3, 2)], [(2, 3, 5), (2, 5, 4)]
+          */
+         bool strips = key->mode == PIPE_PRIM_QUAD_STRIP;
+
+         nir_ssa_def *prim = nir_udiv_imm(b, id, 6);
+         nir_ssa_def *rem = nir_umod_imm(b, id, 6);
+         nir_ssa_def *base = nir_imul_imm(b, prim, strips ? 2 : 4);
+
+         /* Quads: [0, 1, 3, 3, 1, 2]
+          * Quad strips: [0, 1, 3, 0, 3, 2]
+          */
+         uint32_t order_quads = 0x213310;
+         uint32_t order_strips = 0x230310;
+         uint32_t order = strips ?
order_strips : order_quads; + + /* Index out of the bitpacked array */ + nir_ssa_def *offset = nir_iand_imm( + b, nir_ushr(b, nir_imm_int(b, order), nir_imul_imm(b, rem, 4)), + 0xF); + + id = nir_iadd(b, base, offset); + } + + /* Add the "start", either an index bias or a base vertex */ + id = nir_iadd(b, id, nir_load_base_vertex(b)); + + /* If drawing with an index buffer, pull the vertex ID. Otherwise, the + * vertex ID is just the index as-is. + */ + if (key->index_size) { + nir_ssa_def *index_buffer = nir_load_xfb_index_buffer(b, 64); + nir_ssa_def *offset = nir_imul_imm(b, id, key->index_size); + nir_ssa_def *address = nir_iadd(b, index_buffer, nir_u2u64(b, offset)); + nir_ssa_def *index = nir_load_global_constant( + b, address, key->index_size, 1, key->index_size * 8); + + id = nir_u2uN(b, index, id->bit_size); + } + + nir_ssa_def_rewrite_uses(&intr->dest.ssa, id); + return true; + } + + default: + return false; + } +} + +void +agx_nir_lower_xfb(nir_shader *nir, struct agx_xfb_key *key) +{ + assert(nir->info.stage == MESA_SHADER_VERTEX); + + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); + + NIR_PASS_V(nir, insert_overflow_check, key); + NIR_PASS_V(nir, nir_shader_instructions_pass, lower_xfb, + nir_metadata_block_index | nir_metadata_dominance, key); + NIR_PASS_V(nir, nir_shader_instructions_pass, lower_xfb_intrinsics, + nir_metadata_block_index | nir_metadata_dominance, key); + + /* Lowering XFB creates piles of dead code. Eliminate now so we don't + * push unnecessary sysvals. + */ + NIR_PASS_V(nir, nir_opt_dce); +} + +void +agx_init_streamout_functions(struct pipe_context *ctx) +{ + ctx->create_stream_output_target = agx_create_stream_output_target; + ctx->stream_output_target_destroy = agx_stream_output_target_destroy; + ctx->set_stream_output_targets = agx_set_stream_output_targets; +} diff --git a/src/gallium/drivers/asahi/agx_uniforms.c b/src/gallium/drivers/asahi/agx_uniforms.c index ad946fc144b..5999f69ecba 100644 --- a/src/gallium/drivers/asahi/agx_uniforms.c +++ b/src/gallium/drivers/asahi/agx_uniforms.c @@ -87,6 +87,16 @@ agx_upload_uniforms(struct agx_batch *batch, uint64_t textures, u_foreach_bit(vbo, ctx->vb_mask) { uniforms.vs.vbo_base[vbo] = agx_vertex_buffer_ptr(batch, vbo); } + + if (ctx->streamout.key.active) { + uniforms.vs.xfb = ctx->streamout.params; + + for (unsigned i = 0; i < batch->ctx->streamout.num_targets; ++i) { + uint32_t size = 0; + uniforms.vs.xfb.base[i] = agx_batch_get_so_address(batch, i, &size); + uniforms.vs.xfb.size[i] = size; + } + } } else if (stage == PIPE_SHADER_FRAGMENT) { memcpy(uniforms.fs.blend_constant, &ctx->blend_color, sizeof(ctx->blend_color)); diff --git a/src/gallium/drivers/asahi/meson.build b/src/gallium/drivers/asahi/meson.build index 3a6bc9e6922..6a80e1f803a 100644 --- a/src/gallium/drivers/asahi/meson.build +++ b/src/gallium/drivers/asahi/meson.build @@ -10,6 +10,7 @@ files_asahi = files( 'agx_nir_lower_sysvals.c', 'agx_query.c', 'agx_state.c', + 'agx_streamout.c', 'agx_uniforms.c', )
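Appendix-style sanity check (not part of the patch): the triangle-strip
unrolling in lower_xfb_intrinsics can be mirrored on the CPU and tested
against the orderings documented in its comments. A self-contained C sketch:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* CPU mirror of the triangle strip case in lower_xfb_intrinsics:
     * map an XFB invocation ID to the strip vertex it sources.
     */
    static uint32_t
    strip_vertex(uint32_t id, bool flatshade_first)
    {
       uint32_t prim = id / 3, rem = id % 3;
       uint32_t pv = flatshade_first ? 0 : 2;

       bool even = (prim & 1) == 0;
       bool is_provoking = (rem == pv);
       uint32_t swapped = (3 - pv) - rem;
       uint32_t off = (is_provoking || even) ? rem : swapped;

       return prim + off;
    }

    int
    main(void)
    {
       /* First: (0, 1, 2), (1, 3, 2), (2, 3, 4) */
       const uint32_t first[] = {0, 1, 2, 1, 3, 2, 2, 3, 4};

       /* Last: (0, 1, 2), (2, 1, 3), (2, 3, 4) */
       const uint32_t last[] = {0, 1, 2, 2, 1, 3, 2, 3, 4};

       for (uint32_t id = 0; id < 9; ++id) {
          assert(strip_vertex(id, true) == first[id]);
          assert(strip_vertex(id, false) == last[id]);
       }

       return 0;
    }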