asahi,libagx: tessellate on device
Add OpenCL kernels implementing the tessellation algorithm on device. This is
an OpenCL C port of the D3D11 reference tessellator, originally written by
Microsoft in C++. There are significant differences compared to the CPU based
reference implementation:

* significant simplifications and clean up. The reference code did a lot of
  things in weird ways that would be inefficient on the GPU. I did a *lot* of
  work here to get good AGX assembly generated for the tessellation kernels
  ... the first attempts were quite bad! Notably, everything is carefully
  written to ensure that all private memory access is optimized out in NIR;
  the resulting kernels do not use scratch and do not spill on G13.

* prefix sum variants. To implement geom+tess efficiently, we need to first
  calculate the count of indices generated by the tessellator, then prefix
  sum that, then tessellate using the prefix sum results writing into 1 large
  index buffer for a single indirect draw. This isn't too bad, we already
  have most of the logic and the guts of the prefix sum kernel is shared with
  geometry shaders.

* VDM generation variant. To implement tess alone, it's fastest to generate a
  hardware Index List word for each patch, adding an appropriate 32-bit index
  bias to the dynamically allocated U16 index buffers. Then from the CPU, we
  have the illusion of a single draw to Stream Link with Return to. This
  requires packing hardware control words from the tessellator kernel.
  Fortunately, we have GenXML available so we just use agx_pack like we would
  in the driver.

Along the way, we pick up indirect tess support (this follows on naturally),
which gets rid of the other bit of tessellation-related cheating. Implementing
this requires reworking our internal agx_launch data structures, but that has
the nice side effect of speeding up GS invocations too (by fixing the
workgroup size).

Don't get me wrong. tessellator.cl is the single most unhinged file of my
career, featuring GenXML-based pack macros fed by dynamic memory allocation
fed by the inscrutable tessellation algorithm. But it works *really* well.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30051>
committed by Marge Bot
parent cc9b815efa
commit d26ae4f455
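
The following is a condensed, illustrative sketch of the "with counts"
dispatch sequence described above, as it appears in the agx_draw_patches
changes further down in this diff (allocation and setup omitted; not a
complete excerpt):

   /* Pass 1: run the tessellator in COUNT mode, so each patch writes the
    * number of indices it will emit into the per-patch counts buffer.
    */
   key.mode = LIBAGX_TESS_MODE_COUNT;
   agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key,
                                 sizeof(key), state);

   /* Pass 2: prefix sum the counts and allocate one large 32-bit index
    * buffer from the heap (see libagx_prefix_sum_tess below).
    */
   agx_launch_with_uploaded_data(batch, &prefix_sum_grid,
                                 agx_nir_prefix_sum_tess, NULL, 0, state);

   /* Pass 3: tessellate for real; each patch writes its indices at the
    * offset produced by the prefix sum, feeding a single indirect draw.
    */
   key.mode = LIBAGX_TESS_MODE_WITH_COUNTS;
   agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key,
                                 sizeof(key), state);

In the tess-only path (LIBAGX_TESS_MODE_VDM), passes 1 and 2 are skipped: the
tessellator packs the per-patch Index List words itself, terminates the buffer
with a VDM_BARRIER word with returns set, and the CPU issues a single
VDM_STREAM_LINK with with_return into that buffer.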
@@ -1534,6 +1534,13 @@ agx_nir_prefix_sum_gs(nir_builder *b, const void *data)
                     nir_channel(b, nir_load_workgroup_id(b), 0));
}

void
agx_nir_prefix_sum_tess(nir_builder *b, const void *data)
{
   b->shader->info.workgroup_size[0] = 1024;
   libagx_prefix_sum_tess(b, nir_load_preamble(b, 1, 64, .base = 0));
}

void
agx_nir_gs_setup_indirect(nir_builder *b, const void *data)
{
@@ -1564,3 +1571,37 @@ agx_nir_unroll_restart(nir_builder *b, const void *data)
   else
      unreachable("invalid index size");
}

void
agx_nir_tessellate(nir_builder *b, const void *data)
{
   const struct agx_tessellator_key *key = data;
   b->shader->info.workgroup_size[0] = 64;

   nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
   nir_def *patch = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
   nir_def *mode = nir_imm_int(b, key->mode);
   nir_def *partitioning = nir_imm_int(b, key->partitioning);
   nir_def *output_prim = nir_imm_int(b, key->output_primitive);

   if (key->prim == TESS_PRIMITIVE_ISOLINES)
      libagx_tess_isoline(b, params, mode, partitioning, output_prim, patch);
   else if (key->prim == TESS_PRIMITIVE_TRIANGLES)
      libagx_tess_tri(b, params, mode, partitioning, output_prim, patch);
   else if (key->prim == TESS_PRIMITIVE_QUADS)
      libagx_tess_quad(b, params, mode, partitioning, output_prim, patch);
   else
      unreachable("invalid tess primitive");
}

void
agx_nir_tess_setup_indirect(nir_builder *b, const void *data)
{
   const struct agx_tess_setup_indirect_key *key = data;

   nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
   nir_def *with_counts = nir_imm_bool(b, key->with_counts);
   nir_def *point_mode = nir_imm_bool(b, key->point_mode);

   libagx_tess_setup_indirect(b, params, with_counts, point_mode);
}
@@ -7,6 +7,7 @@

#include <stdbool.h>
#include <stdint.h>
#include "shaders/tessellator.h"
#include "nir.h"
#include "shader_enums.h"

@@ -39,6 +40,8 @@ bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx,

void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data);

void agx_nir_prefix_sum_tess(struct nir_builder *b, const void *data);

struct agx_gs_setup_indirect_key {
   enum mesa_prim prim;
};
@@ -52,6 +55,23 @@ struct agx_unroll_restart_key {

void agx_nir_unroll_restart(struct nir_builder *b, const void *key);

struct agx_tessellator_key {
   enum tess_primitive_mode prim : 8;
   enum libagx_tess_output_primitive output_primitive : 8;
   enum libagx_tess_partitioning partitioning : 8;
   enum libagx_tess_mode mode : 8;
};
static_assert(sizeof(struct agx_tessellator_key) == 4, "padded");

struct agx_tess_setup_indirect_key {
   bool point_mode;
   bool with_counts;
   bool padding[2];
};
static_assert(sizeof(struct agx_tess_setup_indirect_key) == 4, "padded");

void agx_nir_tessellate(struct nir_builder *b, const void *key);

bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *libagx);

bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx);
@@ -59,3 +79,5 @@ bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx);
uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir);

unsigned agx_tcs_output_stride(const struct nir_shader *nir);

void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data);
@@ -42,6 +42,7 @@ libagx_shader_files = files(
  'shaders/query.h',
  'shaders/tessellation.cl',
  'shaders/tessellator.cl',
  'shaders/tessellator.h',
  'shaders/texture.cl',
  'shaders/helper.cl',
)
@@ -4,6 +4,7 @@
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "shaders/tessellator.h"
|
||||
#include "geometry.h"
|
||||
|
||||
/* Compatible with util/u_math.h */
|
||||
@@ -282,6 +283,17 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch)
|
||||
return (first_group * 32) + off;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate memory from the heap (thread-safe). Returns the offset into the
|
||||
* heap. The allocation will be word-aligned.
|
||||
*/
|
||||
static inline uint
|
||||
libagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B)
|
||||
{
|
||||
return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom),
|
||||
align(size_B, 8));
|
||||
}
|
||||
|
||||
/*
|
||||
* When unrolling the index buffer for a draw, we translate the old indirect
|
||||
* draws to new indirect draws. This routine allocates the new index buffer and
|
||||
@@ -300,12 +312,11 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p,
|
||||
uint max_verts = max_prims * mesa_vertices_per_prim(mode);
|
||||
uint alloc_size = max_verts * index_size_B;
|
||||
|
||||
/* Allocate memory from the heap for the unrolled index buffer. Use an atomic
|
||||
* since multiple threads may be running to handle multidraw in parallel.
|
||||
/* Allocate unrolled index buffer. Atomic since multiple threads may be
|
||||
* running to handle multidraw in parallel.
|
||||
*/
|
||||
global struct agx_geometry_state *heap = p->heap;
|
||||
uint old_heap_bottom_B = atomic_fetch_add(
|
||||
(volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4));
|
||||
uint old_heap_bottom_B = libagx_atomic_alloc(p->heap, alloc_size);
|
||||
|
||||
/* Regardless of the input stride, we use tightly packed output draws */
|
||||
global uint *out = &p->out_draws[5 * draw];
|
||||
@@ -612,6 +623,40 @@ libagx_prefix_sum(global uint *buffer, uint len, uint words, uint word)
|
||||
}
|
||||
}
|
||||
|
||||
kernel void
|
||||
libagx_prefix_sum_tess(global struct libagx_tess_args *p)
|
||||
{
|
||||
libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */);
|
||||
|
||||
/* After prefix summing, we know the total # of indices, so allocate the
|
||||
* index buffer now. Elect a thread for the allocation.
|
||||
*/
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (get_local_id(0) != 0)
|
||||
return;
|
||||
|
||||
/* The last element of an inclusive prefix sum is the total sum */
|
||||
uint total = p->counts[p->nr_patches - 1];
|
||||
|
||||
/* Allocate 4-byte indices */
|
||||
uint32_t elsize_B = sizeof(uint32_t);
|
||||
uint32_t size_B = total * elsize_B;
|
||||
uint alloc_B = p->heap->heap_bottom;
|
||||
p->heap->heap_bottom += size_B;
|
||||
p->heap->heap_bottom = align(p->heap->heap_bottom, 8);
|
||||
|
||||
p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B);
|
||||
|
||||
/* ...and now we can generate the API indexed draw */
|
||||
global uint32_t *desc = p->out_draws;
|
||||
|
||||
desc[0] = total; /* count */
|
||||
desc[1] = 1; /* instance_count */
|
||||
desc[2] = alloc_B / elsize_B; /* start */
|
||||
desc[3] = 0; /* index_bias */
|
||||
desc[4] = 0; /* start_instance */
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
|
||||
gl_varying_slot location)
|
||||
|
@@ -203,47 +203,6 @@ struct agx_geometry_params {
|
||||
} PACKED;
|
||||
AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 78 * 4);
|
||||
|
||||
struct agx_tess_params {
|
||||
/* Persistent (cross-draw) geometry state */
|
||||
GLOBAL(struct agx_geometry_state) state;
|
||||
|
||||
/* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */
|
||||
GLOBAL(uint) patch_coord_offs;
|
||||
|
||||
/* Patch coordinate buffer, indexed as:
|
||||
*
|
||||
* patch_coord_offs[patch_ID] + vertex_in_patch
|
||||
*
|
||||
* Currently float2s, but we might be able to compact later?
|
||||
*/
|
||||
GLOBAL(float2) patch_coord_buffer;
|
||||
|
||||
/* Tessellation control shader output buffer, indexed by patch ID. */
|
||||
GLOBAL(uchar) tcs_buffer;
|
||||
|
||||
/* Bitfield of TCS per-vertex outputs */
|
||||
uint64_t tcs_per_vertex_outputs;
|
||||
|
||||
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
|
||||
* Unused in Vulkan and OpenGL ES.
|
||||
*/
|
||||
float tess_level_outer_default[4];
|
||||
float tess_level_inner_default[4];
|
||||
|
||||
/* Number of vertices in the input patch */
|
||||
uint input_patch_size;
|
||||
|
||||
/* Number of vertices in the TCS output patch */
|
||||
uint output_patch_size;
|
||||
|
||||
/* Number of patch constants written by TCS */
|
||||
uint tcs_patch_constants;
|
||||
|
||||
/* Number of input patches per instance of the VS/TCS */
|
||||
uint patches_per_instance;
|
||||
} PACKED;
|
||||
AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4);
|
||||
|
||||
/* TCS shared memory layout:
|
||||
*
|
||||
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
|
||||
|
@@ -4,15 +4,17 @@
|
||||
*/
|
||||
|
||||
#include "geometry.h"
|
||||
#include "tessellator.h"
|
||||
#include <agx_pack.h>
|
||||
|
||||
uint
|
||||
libagx_tcs_patch_vertices_in(constant struct agx_tess_params *p)
|
||||
libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p)
|
||||
{
|
||||
return p->input_patch_size;
|
||||
}
|
||||
|
||||
uint
|
||||
libagx_tes_patch_vertices_in(constant struct agx_tess_params *p)
|
||||
libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p)
|
||||
{
|
||||
return p->output_patch_size;
|
||||
}
|
||||
@@ -25,7 +27,7 @@ libagx_tcs_in_offset(uint vtx, gl_varying_slot location,
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
libagx_tcs_out_address(constant struct agx_tess_params *p, uint patch_id,
|
||||
libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id,
|
||||
uint vtx_id, gl_varying_slot location, uint nr_patch_out,
|
||||
uint out_patch_size, uint64_t vtx_out_mask)
|
||||
{
|
||||
@@ -45,7 +47,7 @@ libagx_tes_unrolled_patch_id(uint raw_id)
|
||||
}
|
||||
|
||||
uint
|
||||
libagx_tes_patch_id(constant struct agx_tess_params *p, uint raw_id)
|
||||
libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id)
|
||||
{
|
||||
return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance;
|
||||
}
|
||||
@@ -57,16 +59,20 @@ tes_vertex_id_in_patch(uint raw_id)
|
||||
}
|
||||
|
||||
float2
|
||||
libagx_load_tess_coord(constant struct agx_tess_params *p, uint raw_id)
|
||||
libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id)
|
||||
{
|
||||
uint patch = libagx_tes_unrolled_patch_id(raw_id);
|
||||
uint vtx = tes_vertex_id_in_patch(raw_id);
|
||||
|
||||
return p->patch_coord_buffer[p->patch_coord_offs[patch] + vtx];
|
||||
global struct libagx_tess_point *t =
|
||||
&p->patch_coord_buffer[p->coord_allocs[patch] + vtx];
|
||||
|
||||
/* Written weirdly because NIR struggles with loads of structs */
|
||||
return *((global float2 *)t);
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id,
|
||||
libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id,
|
||||
uint vtx_id, gl_varying_slot location)
|
||||
{
|
||||
uint patch = libagx_tes_unrolled_patch_id(raw_id);
|
||||
@@ -77,7 +83,7 @@ libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id,
|
||||
}
|
||||
|
||||
float4
|
||||
libagx_tess_level_outer_default(constant struct agx_tess_params *p)
|
||||
libagx_tess_level_outer_default(constant struct libagx_tess_args *p)
|
||||
{
|
||||
return (
|
||||
float4)(p->tess_level_outer_default[0], p->tess_level_outer_default[1],
|
||||
@@ -85,8 +91,98 @@ libagx_tess_level_outer_default(constant struct agx_tess_params *p)
|
||||
}
|
||||
|
||||
float2
|
||||
libagx_tess_level_inner_default(constant struct agx_tess_params *p)
|
||||
libagx_tess_level_inner_default(constant struct libagx_tess_args *p)
|
||||
{
|
||||
return (float2)(p->tess_level_inner_default[0],
|
||||
p->tess_level_inner_default[1]);
|
||||
}
|
||||
|
||||
void
|
||||
libagx_tess_setup_indirect(global struct libagx_tess_args *p, bool with_counts,
|
||||
bool point_mode)
|
||||
{
|
||||
uint count = p->indirect[0], instance_count = p->indirect[1];
|
||||
unsigned in_patches = count / p->input_patch_size;
|
||||
|
||||
/* TCS invocation counter increments once per-patch */
|
||||
if (p->tcs_statistic) {
|
||||
*(p->tcs_statistic) += in_patches;
|
||||
}
|
||||
|
||||
size_t draw_stride =
|
||||
((!with_counts && point_mode) ? 4 : 6) * sizeof(uint32_t);
|
||||
|
||||
unsigned unrolled_patches = in_patches * instance_count;
|
||||
|
||||
uint32_t alloc = 0;
|
||||
uint32_t tcs_out_offs = alloc;
|
||||
alloc += unrolled_patches * p->tcs_stride_el * 4;
|
||||
|
||||
uint32_t patch_coord_offs = alloc;
|
||||
alloc += unrolled_patches * 4;
|
||||
|
||||
uint32_t count_offs = alloc;
|
||||
if (with_counts)
|
||||
alloc += unrolled_patches * sizeof(uint32_t);
|
||||
|
||||
uint vb_offs = alloc;
|
||||
uint vb_size = libagx_tcs_in_size(count * instance_count, p->vertex_outputs);
|
||||
alloc += vb_size;
|
||||
|
||||
/* Allocate all patch calculations in one go */
|
||||
global uchar *blob = p->heap->heap + p->heap->heap_bottom;
|
||||
p->heap->heap_bottom += alloc;
|
||||
|
||||
p->tcs_buffer = (global float *)(blob + tcs_out_offs);
|
||||
p->patches_per_instance = in_patches;
|
||||
p->coord_allocs = (global uint *)(blob + patch_coord_offs);
|
||||
p->nr_patches = unrolled_patches;
|
||||
|
||||
*(p->vertex_output_buffer_ptr) = (uintptr_t)(blob + vb_offs);
|
||||
|
||||
if (with_counts) {
|
||||
p->counts = (global uint32_t *)(blob + count_offs);
|
||||
} else {
|
||||
#if 0
|
||||
/* Arrange so we return after all generated draws. agx_pack would be nicer
|
||||
* here but designated initializers lead to scratch access...
|
||||
*/
|
||||
global uint32_t *ret =
|
||||
(global uint32_t *)(blob + draw_offs +
|
||||
(draw_stride * unrolled_patches));
|
||||
|
||||
*ret = (AGX_VDM_BLOCK_TYPE_BARRIER << 29) | /* with return */ (1u << 27);
|
||||
#endif
|
||||
/* TODO */
|
||||
}
|
||||
|
||||
/* VS grid size */
|
||||
p->grids[0] = count;
|
||||
p->grids[1] = instance_count;
|
||||
p->grids[2] = 1;
|
||||
|
||||
/* VS workgroup size */
|
||||
p->grids[3] = 64;
|
||||
p->grids[4] = 1;
|
||||
p->grids[5] = 1;
|
||||
|
||||
/* TCS grid size */
|
||||
p->grids[6] = in_patches * p->output_patch_size;
|
||||
p->grids[7] = instance_count;
|
||||
p->grids[8] = 1;
|
||||
|
||||
/* TCS workgroup size */
|
||||
p->grids[9] = p->output_patch_size;
|
||||
p->grids[10] = 1;
|
||||
p->grids[11] = 1;
|
||||
|
||||
/* Tess grid size */
|
||||
p->grids[12] = unrolled_patches;
|
||||
p->grids[13] = 1;
|
||||
p->grids[14] = 1;
|
||||
|
||||
/* Tess workgroup size */
|
||||
p->grids[15] = 64;
|
||||
p->grids[16] = 1;
|
||||
p->grids[17] = 1;
|
||||
}
|
||||
|
(File diff suppressed because it is too large)

src/asahi/lib/shaders/tessellator.h (new file, 124 lines)
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright 2024 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "libagx.h"
|
||||
|
||||
enum libagx_tess_partitioning {
|
||||
LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD,
|
||||
LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN,
|
||||
LIBAGX_TESS_PARTITIONING_INTEGER,
|
||||
};
|
||||
|
||||
enum libagx_tess_output_primitive {
|
||||
LIBAGX_TESS_OUTPUT_POINT,
|
||||
LIBAGX_TESS_OUTPUT_TRIANGLE_CW,
|
||||
LIBAGX_TESS_OUTPUT_TRIANGLE_CCW,
|
||||
};
|
||||
|
||||
enum libagx_tess_mode {
|
||||
/* Do not actually tessellate, just write the index counts */
|
||||
LIBAGX_TESS_MODE_COUNT,
|
||||
|
||||
/* Tessellate using the count buffers to allocate indices */
|
||||
LIBAGX_TESS_MODE_WITH_COUNTS,
|
||||
|
||||
/* Tessellate without count buffers by generating VDM index list words */
|
||||
LIBAGX_TESS_MODE_VDM,
|
||||
};
|
||||
|
||||
struct libagx_tess_point {
|
||||
float u;
|
||||
float v;
|
||||
};
|
||||
AGX_STATIC_ASSERT(sizeof(struct libagx_tess_point) == 8);
|
||||
|
||||
struct libagx_tess_args {
|
||||
/* Heap to allocate tessellator outputs in */
|
||||
GLOBAL(struct agx_geometry_state) heap;
|
||||
|
||||
/* Patch coordinate buffer, indexed as:
|
||||
*
|
||||
* coord_allocs[patch_ID] + vertex_in_patch
|
||||
*/
|
||||
GLOBAL(struct libagx_tess_point) patch_coord_buffer;
|
||||
|
||||
/* Per-patch index within the heap for the tess coords, written by the
|
||||
* tessellator based on the allocated memory.
|
||||
*/
|
||||
GLOBAL(uint32_t) coord_allocs;
|
||||
|
||||
/* Space for output draws from the tessellator. Either API draw calls or
|
||||
* VDM control words, depending on the mode. */
|
||||
GLOBAL(uint32_t) out_draws;
|
||||
|
||||
/* Tessellation control shader output buffer. */
|
||||
GLOBAL(float) tcs_buffer;
|
||||
|
||||
/* Count buffer. # of indices per patch written here, then prefix summed. */
|
||||
GLOBAL(uint32_t) counts;
|
||||
|
||||
/* Allocated index buffer for all patches, if we're prefix summing counts */
|
||||
GLOBAL(uint32_t) index_buffer;
|
||||
|
||||
/* Address of the tess eval invocation counter for implementing pipeline
|
||||
* statistics, if active. Zero if inactive. Incremented by tessellator.
|
||||
*/
|
||||
GLOBAL(uint32_t) statistic;
|
||||
|
||||
/* Address of the tess control invocation counter for implementing pipeline
|
||||
* statistics, if active. Zero if inactive. Incremented by indirect tess
|
||||
* setup kernel.
|
||||
*/
|
||||
GLOBAL(uint32_t) tcs_statistic;
|
||||
|
||||
/* For indirect draws with tessellation, the grid sizes. VS then TCS then
|
||||
* tess. Allocated by the CPU and written by the tessellation
|
||||
* setup indirect kernel.
|
||||
*/
|
||||
GLOBAL(uint32_t) grids;
|
||||
|
||||
/* For indirect draws, the indirect draw descriptor. */
|
||||
GLOBAL(uint32_t) indirect;
|
||||
|
||||
/* For indirect draws, the allocation for the vertex buffer.
|
||||
*
|
||||
* TODO: We could move these fields to an indirect setup kernel, not sure if
|
||||
* it's worth it though...
|
||||
*/
|
||||
GLOBAL(uint64_t) vertex_output_buffer_ptr;
|
||||
|
||||
/* For indirect draws, the bitfield of VS outputs */
|
||||
uint64_t vertex_outputs;
|
||||
|
||||
/* Bitfield of TCS per-vertex outputs */
|
||||
uint64_t tcs_per_vertex_outputs;
|
||||
|
||||
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
|
||||
* Unused in Vulkan and OpenGL ES.
|
||||
*/
|
||||
float tess_level_outer_default[4];
|
||||
float tess_level_inner_default[2];
|
||||
|
||||
/* Number of vertices in the input patch */
|
||||
uint32_t input_patch_size;
|
||||
|
||||
/* Number of vertices in the TCS output patch */
|
||||
uint32_t output_patch_size;
|
||||
|
||||
/* Number of patch constants written by TCS */
|
||||
uint32_t tcs_patch_constants;
|
||||
|
||||
/* Number of input patches per instance of the VS/TCS */
|
||||
uint32_t patches_per_instance;
|
||||
|
||||
/* Stride between tessellation factors in the TCS output buffer. */
|
||||
uint32_t tcs_stride_el;
|
||||
|
||||
/* Number of patches being tessellated */
|
||||
uint32_t nr_patches;
|
||||
} PACKED;
|
||||
AGX_STATIC_ASSERT(sizeof(struct libagx_tess_args) == 40 * 4);
|
@@ -553,8 +553,8 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx,
   ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, false,
                                 &cb);

   struct pipe_grid_info grid = {.block = {1, 1, 1}, .grid = {1, 1, 1}};
   agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE);
   struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1);
   agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, 0);

   /* take_ownership=true so do not unreference */
   ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, true,
@@ -16,6 +16,7 @@
|
||||
#include "asahi/lib/agx_nir_passes.h"
|
||||
#include "asahi/lib/agx_ppp.h"
|
||||
#include "asahi/lib/agx_usc.h"
|
||||
#include "asahi/lib/shaders/tessellator.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
#include "compiler/nir/nir_serialize.h"
|
||||
#include "compiler/shader_enums.h"
|
||||
@@ -32,7 +33,6 @@
|
||||
#include "pipe/p_defines.h"
|
||||
#include "pipe/p_screen.h"
|
||||
#include "pipe/p_state.h"
|
||||
#include "tessellator/p_tessellator.h"
|
||||
#include "util/bitscan.h"
|
||||
#include "util/bitset.h"
|
||||
#include "util/blend.h"
|
||||
@@ -3047,7 +3047,7 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
||||
|
||||
static uint32_t
|
||||
agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
||||
void *data, size_t data_size)
|
||||
uint64_t data)
|
||||
{
|
||||
size_t usc_size = agx_usc_size(12);
|
||||
|
||||
@@ -3056,8 +3056,7 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
||||
|
||||
struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
|
||||
|
||||
uint64_t ptr = agx_pool_upload(&batch->pool, data, data_size);
|
||||
agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &ptr, 8));
|
||||
agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &data, 8));
|
||||
agx_usc_immediates(&b, batch, cs);
|
||||
|
||||
assert(cs->b.info.scratch_size == 0 && "internal kernels don't spill");
|
||||
@@ -3093,16 +3092,26 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
||||
return t.gpu;
|
||||
}
|
||||
|
||||
void
|
||||
agx_launch_with_data(struct agx_batch *batch, const struct pipe_grid_info *info,
|
||||
meta_shader_builder_t builder, void *key, size_t key_size,
|
||||
void *data, size_t data_size)
|
||||
static void
|
||||
agx_launch_with_uploaded_data(struct agx_batch *batch,
|
||||
const struct agx_grid *grid,
|
||||
meta_shader_builder_t builder, void *key,
|
||||
size_t key_size, uint64_t data)
|
||||
{
|
||||
struct agx_compiled_shader *cs = agx_build_meta_shader_internal(
|
||||
batch->ctx, builder, key, key_size, false, false, 0, true);
|
||||
|
||||
uint32_t usc = agx_build_internal_usc(batch, cs, data, data_size);
|
||||
agx_launch_internal(batch, info, cs, PIPE_SHADER_COMPUTE, usc);
|
||||
uint32_t usc = agx_build_internal_usc(batch, cs, data);
|
||||
agx_launch_internal(batch, grid, cs, PIPE_SHADER_COMPUTE, usc);
|
||||
}
|
||||
|
||||
void
|
||||
agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid,
|
||||
meta_shader_builder_t builder, void *key, size_t key_size,
|
||||
void *data, size_t data_size)
|
||||
{
|
||||
uint64_t upload = agx_pool_upload_aligned(&batch->pool, data, data_size, 4);
|
||||
agx_launch_with_uploaded_data(batch, grid, builder, key, key_size, upload);
|
||||
}
|
||||
|
||||
struct asahi_bg_eot
|
||||
@@ -4053,6 +4062,17 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
|
||||
8, &batch->geom_params_bo);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
agx_indirect_buffer_ptr(struct agx_batch *batch,
|
||||
const struct pipe_draw_indirect_info *indirect)
|
||||
{
|
||||
assert(indirect->buffer && "drawauto already handled");
|
||||
|
||||
struct agx_resource *rsrc = agx_resource(indirect->buffer);
|
||||
agx_batch_reads(batch, rsrc);
|
||||
return rsrc->bo->ptr.gpu + indirect->offset;
|
||||
}
|
||||
|
||||
static void
|
||||
agx_launch_gs_prerast(struct agx_batch *batch,
|
||||
const struct pipe_draw_info *info,
|
||||
@@ -4084,23 +4104,14 @@ agx_launch_gs_prerast(struct agx_batch *batch,
|
||||
|
||||
assert(!info->primitive_restart && "should have been lowered");
|
||||
|
||||
struct pipe_grid_info grid_vs = {.block = {1, 1, 1}};
|
||||
struct pipe_grid_info grid_gs = {.block = {1, 1, 1}};
|
||||
struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};
|
||||
struct agx_grid grid_vs, grid_gs;
|
||||
|
||||
/* Setup grids */
|
||||
if (indirect) {
|
||||
assert(indirect->buffer && "drawauto already handled");
|
||||
|
||||
struct agx_gs_setup_indirect_key key = {
|
||||
.prim = info->mode,
|
||||
};
|
||||
|
||||
const struct pipe_grid_info grid_setup = {
|
||||
.block = {1, 1, 1},
|
||||
.grid = {1, 1, 1},
|
||||
};
|
||||
|
||||
uint64_t ib = 0;
|
||||
size_t ib_extent = 0;
|
||||
|
||||
@@ -4109,13 +4120,10 @@ agx_launch_gs_prerast(struct agx_batch *batch,
|
||||
&ib_extent);
|
||||
}
|
||||
|
||||
struct agx_resource *rsrc = agx_resource(indirect->buffer);
|
||||
agx_batch_reads(batch, rsrc);
|
||||
|
||||
struct agx_gs_setup_indirect_params gsi = {
|
||||
.index_buffer = ib,
|
||||
.index_buffer_range_el = ib_extent / info->index_size,
|
||||
.draw = rsrc->bo->ptr.gpu + indirect->offset,
|
||||
.draw = agx_indirect_buffer_ptr(batch, indirect),
|
||||
.vertex_buffer = batch->uniforms.vertex_output_buffer_ptr,
|
||||
.ia = batch->uniforms.input_assembly,
|
||||
.geom = batch->uniforms.geometry_params,
|
||||
@@ -4123,62 +4131,50 @@ agx_launch_gs_prerast(struct agx_batch *batch,
|
||||
.index_size_B = info->index_size,
|
||||
};
|
||||
|
||||
const struct agx_grid grid_setup = agx_grid_direct(1, 1, 1, 1, 1, 1);
|
||||
agx_launch_with_data(batch, &grid_setup, agx_nir_gs_setup_indirect, &key,
|
||||
sizeof(key), &gsi, sizeof(gsi));
|
||||
|
||||
/* Wrap the pool allocation in a fake resource for meta-Gallium use */
|
||||
assert(batch->geom_params_bo != NULL);
|
||||
grid_vs.indirect = &grid_indirect_rsrc.base;
|
||||
grid_gs.indirect = &grid_indirect_rsrc.base;
|
||||
uint64_t gp = batch->uniforms.geometry_params;
|
||||
|
||||
unsigned param_offs =
|
||||
(batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu);
|
||||
grid_vs = agx_grid_indirect(
|
||||
gp + offsetof(struct agx_geometry_params, vs_grid), 1, 1, 1);
|
||||
|
||||
grid_vs.indirect_offset =
|
||||
param_offs + offsetof(struct agx_geometry_params, vs_grid);
|
||||
|
||||
grid_gs.indirect_offset =
|
||||
param_offs + offsetof(struct agx_geometry_params, gs_grid);
|
||||
grid_gs = agx_grid_indirect(
|
||||
gp + offsetof(struct agx_geometry_params, gs_grid), 1, 1, 1);
|
||||
} else {
|
||||
grid_vs.grid[0] = draws->count;
|
||||
grid_vs.grid[1] = info->instance_count;
|
||||
grid_vs.grid[2] = 1;
|
||||
grid_vs =
|
||||
agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1);
|
||||
|
||||
grid_gs.grid[0] =
|
||||
u_decomposed_prims_for_vertices(info->mode, draws->count);
|
||||
grid_gs.grid[1] = info->instance_count;
|
||||
grid_gs.grid[2] = 1;
|
||||
grid_gs = agx_grid_direct(
|
||||
u_decomposed_prims_for_vertices(info->mode, draws->count),
|
||||
info->instance_count, 1, 64, 1, 1);
|
||||
}
|
||||
|
||||
/* Launch the vertex shader first */
|
||||
agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage);
|
||||
agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0);
|
||||
|
||||
/* If there is a count shader, launch it and prefix sum the results. */
|
||||
if (gs->gs_count) {
|
||||
perf_debug(dev, "Geometry shader count");
|
||||
agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY);
|
||||
agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY, 0);
|
||||
|
||||
unsigned words = gs->gs_count_words;
|
||||
agx_launch(batch,
|
||||
&(const struct pipe_grid_info){
|
||||
.block = {1024, 1, 1},
|
||||
.grid = {gs->gs_count_words, 1, 1},
|
||||
},
|
||||
struct agx_grid grid =
|
||||
agx_grid_direct(1024 * gs->gs_count_words, 1, 1, 1024, 1, 1);
|
||||
|
||||
agx_launch(batch, &grid,
|
||||
agx_build_meta_shader(ctx, agx_nir_prefix_sum_gs, &words,
|
||||
sizeof(words)),
|
||||
NULL, PIPE_SHADER_COMPUTE);
|
||||
NULL, PIPE_SHADER_COMPUTE, 0);
|
||||
}
|
||||
|
||||
/* Pre-GS shader */
|
||||
agx_launch(batch,
|
||||
&(const struct pipe_grid_info){
|
||||
.block = {1, 1, 1},
|
||||
.grid = {1, 1, 1},
|
||||
},
|
||||
gs->pre_gs, NULL, PIPE_SHADER_COMPUTE);
|
||||
struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1);
|
||||
agx_launch(batch, &grid, gs->pre_gs, NULL, PIPE_SHADER_COMPUTE, 0);
|
||||
|
||||
/* Pre-rast geometry shader */
|
||||
agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY);
|
||||
agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -4249,10 +4245,8 @@ agx_draw_without_restart(struct agx_batch *batch,
|
||||
};
|
||||
|
||||
/* Unroll the index buffer for each draw */
|
||||
const struct pipe_grid_info grid_setup = {
|
||||
.block = {1024, 1, 1},
|
||||
.grid = {indirect->draw_count, 1, 1},
|
||||
};
|
||||
const struct agx_grid grid_setup =
|
||||
agx_grid_direct(1024 * indirect->draw_count, 1, 1, 1024, 1, 1);
|
||||
|
||||
agx_launch_with_data(batch, &grid_setup, agx_nir_unroll_restart, &key,
|
||||
sizeof(key), &unroll, sizeof(unroll));
|
||||
@@ -4310,8 +4304,9 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* TODO: this is sloppy, we should add a VDM kernel for this. */
|
||||
if (indirect && ctx->active_queries && ctx->prims_generated[0]) {
|
||||
/* TODO: this is really sloppy, we should add a VDM kernel for this. */
|
||||
if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries &&
|
||||
ctx->prims_generated[0]) {
|
||||
perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query");
|
||||
return true;
|
||||
}
|
||||
@@ -4333,8 +4328,11 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
|
||||
/* Transform feedback is layered on geometry shaders, so if transform
|
||||
* feedback is used, we need a GS.
|
||||
*/
|
||||
if (ctx->stage[PIPE_SHADER_VERTEX].shader->has_xfb_info &&
|
||||
ctx->streamout.num_targets) {
|
||||
struct agx_uncompiled_shader *last_vtx =
|
||||
ctx->stage[PIPE_SHADER_TESS_EVAL].shader
|
||||
?: ctx->stage[PIPE_SHADER_VERTEX].shader;
|
||||
|
||||
if (last_vtx->has_xfb_info && ctx->streamout.num_targets) {
|
||||
*xfb_only = true;
|
||||
return true;
|
||||
}
|
||||
@@ -4343,6 +4341,20 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
|
||||
return false;
|
||||
}
|
||||
|
||||
static enum mesa_prim
|
||||
agx_tess_output_prim(struct agx_uncompiled_shader *tcs,
|
||||
struct agx_uncompiled_shader *tes)
|
||||
{
|
||||
if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) {
|
||||
return MESA_PRIM_POINTS;
|
||||
} else if (TESS_PRIMITIVE_ISOLINES ==
|
||||
MAX2(tcs ? tcs->tess.primitive : 0, tes->tess.primitive)) {
|
||||
return MESA_PRIM_LINES;
|
||||
} else {
|
||||
return MESA_PRIM_TRIANGLES;
|
||||
}
|
||||
}
|
||||
|
||||
static struct agx_uncompiled_shader *
|
||||
agx_get_passthrough_gs(struct agx_context *ctx,
|
||||
struct agx_uncompiled_shader *prev_cso,
|
||||
@@ -4350,11 +4362,16 @@ agx_get_passthrough_gs(struct agx_context *ctx,
|
||||
{
|
||||
bool edgeflags = has_edgeflags(ctx, mode);
|
||||
|
||||
if (mode == MESA_PRIM_PATCHES) {
|
||||
mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader,
|
||||
ctx->stage[MESA_SHADER_TESS_EVAL].shader);
|
||||
}
|
||||
|
||||
/* Only handle the polygon mode when edge flags are in use, because
|
||||
* nir_passthrough_gs doesn't handle transform feedback + polygon mode
|
||||
* properly. Technically this can break edge flags + transform feedback but
|
||||
* that's firmly in "doctor, it hurts when I do this" territory, and I'm not
|
||||
* sure that's even possible to hit. TODO: Reevaluate.
|
||||
* properly. Technically this can break edge flags + transform feedback
|
||||
* but that's firmly in "doctor, it hurts when I do this" territory, and
|
||||
* I'm not sure that's even possible to hit. TODO: Reevaluate.
|
||||
*/
|
||||
unsigned poly_mode =
|
||||
edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL;
|
||||
@@ -4525,35 +4542,39 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
|
||||
unbind_tcs_when_done = true;
|
||||
}
|
||||
|
||||
unsigned in_vertices = draws->count;
|
||||
unsigned in_patches = in_vertices / patch_vertices;
|
||||
enum tess_primitive_mode mode =
|
||||
MAX2(tcs->tess.primitive, tes->tess.primitive);
|
||||
enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
|
||||
|
||||
if (in_patches == 0)
|
||||
return;
|
||||
enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
|
||||
? PIPE_TESS_SPACING_EQUAL
|
||||
: spacing == TESS_SPACING_FRACTIONAL_ODD
|
||||
? PIPE_TESS_SPACING_FRACTIONAL_ODD
|
||||
: PIPE_TESS_SPACING_FRACTIONAL_EVEN;
|
||||
|
||||
/* TCS invocation counter increments once per-patch */
|
||||
agx_query_increment_cpu(
|
||||
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
|
||||
in_patches);
|
||||
bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
|
||||
enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes);
|
||||
|
||||
struct agx_batch *batch = agx_get_compute_batch(ctx);
|
||||
enum libagx_tess_partitioning partitioning =
|
||||
(enum libagx_tess_partitioning)pspacing;
|
||||
|
||||
enum libagx_tess_output_primitive prim =
|
||||
point_mode ? LIBAGX_TESS_OUTPUT_POINT
|
||||
: !tes->tess.ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW
|
||||
: LIBAGX_TESS_OUTPUT_TRIANGLE_CW;
|
||||
|
||||
struct agx_bo *draw_bo = NULL;
|
||||
bool with_counts =
|
||||
indirect || ctx->stage[MESA_SHADER_GEOMETRY].shader != NULL;
|
||||
size_t draw_stride =
|
||||
((!with_counts && point_mode) ? 4 : 6) * sizeof(uint32_t);
|
||||
|
||||
struct agx_batch *batch = agx_get_batch(ctx);
|
||||
agx_batch_init_state(batch);
|
||||
|
||||
struct pipe_resource *heap =
|
||||
pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL, PIPE_USAGE_DEFAULT,
|
||||
1024 * 1024 * 128);
|
||||
|
||||
uint64_t heap_gpu = agx_resource(heap)->bo->ptr.gpu;
|
||||
uint8_t *heap_cpu = agx_resource(heap)->bo->ptr.cpu;
|
||||
|
||||
unsigned unrolled_patch_count = in_patches * info->instance_count;
|
||||
|
||||
uint32_t heap_water = 0;
|
||||
uint32_t tcs_out_offs = heap_water;
|
||||
heap_water += ALIGN(unrolled_patch_count * tcs->tess.output_stride, 4);
|
||||
|
||||
agx_batch_writes(batch, agx_resource(heap), 0);
|
||||
batch->incoherent_writes = true;
|
||||
if (!batch->cdm.bo) {
|
||||
batch->cdm = agx_encoder_allocate(batch, dev);
|
||||
}
|
||||
|
||||
uint64_t ib = 0;
|
||||
size_t ib_extent = 0;
|
||||
@@ -4573,25 +4594,29 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
|
||||
agx_upload_draw_params(batch, indirect, draws, info);
|
||||
|
||||
/* Setup parameters */
|
||||
struct agx_tess_params tess_params = {
|
||||
.tcs_buffer = heap_gpu + tcs_out_offs,
|
||||
uint64_t geom_state = agx_batch_geometry_state(batch);
|
||||
assert((tcs->tess.output_stride & 3) == 0 && "must be aligned");
|
||||
|
||||
struct libagx_tess_args args = {
|
||||
.heap = geom_state,
|
||||
.tcs_stride_el = tcs->tess.output_stride / 4,
|
||||
.statistic = agx_get_query_address(
|
||||
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]),
|
||||
.input_patch_size = patch_vertices,
|
||||
.output_patch_size = tcs->tess.output_patch_size,
|
||||
.tcs_patch_constants = tcs->tess.nr_patch_outputs,
|
||||
.tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs,
|
||||
.patch_coord_buffer = heap_gpu,
|
||||
.patches_per_instance = in_patches,
|
||||
.patch_coord_buffer = agx_resource(ctx->heap)->bo->ptr.gpu,
|
||||
};
|
||||
|
||||
memcpy(&tess_params.tess_level_outer_default, ctx->default_outer_level,
|
||||
memcpy(&args.tess_level_outer_default, ctx->default_outer_level,
|
||||
sizeof(ctx->default_outer_level));
|
||||
memcpy(&tess_params.tess_level_inner_default, ctx->default_inner_level,
|
||||
memcpy(&args.tess_level_inner_default, ctx->default_inner_level,
|
||||
sizeof(ctx->default_inner_level));
|
||||
|
||||
batch->uniforms.tess_params =
|
||||
agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params));
|
||||
struct agx_grid vs_grid, tcs_grid, tess_grid;
|
||||
unsigned tess_wg_size = 64;
|
||||
|
||||
/* Run VS+TCS as compute */
|
||||
agx_upload_vbos(batch);
|
||||
agx_update_vs(ctx, info->index_size);
|
||||
agx_update_tcs(ctx, info);
|
||||
@@ -4605,153 +4630,184 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
|
||||
|
||||
batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs;
|
||||
|
||||
unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
|
||||
batch->uniforms.vertex_outputs);
|
||||
uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
|
||||
batch->uniforms.vertex_output_buffer_ptr =
|
||||
agx_pool_upload(&batch->pool, &addr, 8);
|
||||
if (indirect == NULL) {
|
||||
unsigned in_patches = draws->count / patch_vertices;
|
||||
if (in_patches == 0)
|
||||
return;
|
||||
|
||||
struct pipe_grid_info vs_grid = {
|
||||
.block = {1, 1, 1},
|
||||
.grid = {draws->count, info->instance_count, 1},
|
||||
};
|
||||
/* TCS invocation counter increments once per-patch */
|
||||
agx_query_increment_cpu(
|
||||
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
|
||||
in_patches);
|
||||
|
||||
agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX);
|
||||
unsigned unrolled_patches = in_patches * info->instance_count;
|
||||
|
||||
struct pipe_grid_info tcs_grid = {
|
||||
.block = {tcs->tess.output_patch_size, 1, 1},
|
||||
.grid = {in_patches, info->instance_count, 1},
|
||||
};
|
||||
uint32_t alloc = 0;
|
||||
uint32_t tcs_out_offs = alloc;
|
||||
alloc += unrolled_patches * tcs->tess.output_stride;
|
||||
|
||||
agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL);
|
||||
uint32_t patch_coord_offs = alloc;
|
||||
alloc += unrolled_patches * 4;
|
||||
|
||||
uint32_t count_offs = alloc;
|
||||
if (with_counts)
|
||||
alloc += unrolled_patches * sizeof(uint32_t);
|
||||
|
||||
uint32_t draw_offs = alloc;
|
||||
|
||||
if (with_counts) {
|
||||
alloc += draw_stride;
|
||||
} else {
|
||||
/* Padding added because VDM overreads */
|
||||
alloc +=
|
||||
(draw_stride * unrolled_patches) + (AGX_VDM_BARRIER_LENGTH + 0x800);
|
||||
}
|
||||
|
||||
struct agx_ptr blob =
|
||||
agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo);
|
||||
|
||||
args.tcs_buffer = blob.gpu + tcs_out_offs;
|
||||
args.patches_per_instance = in_patches;
|
||||
args.coord_allocs = blob.gpu + patch_coord_offs;
|
||||
args.nr_patches = unrolled_patches;
|
||||
args.out_draws = blob.gpu + draw_offs;
|
||||
|
||||
if (with_counts) {
|
||||
args.counts = blob.gpu + count_offs;
|
||||
} else {
|
||||
/* Arrange so we return after all generated draws */
|
||||
uint8_t *ret =
|
||||
(uint8_t *)blob.cpu + draw_offs + (draw_stride * unrolled_patches);
|
||||
|
||||
agx_pack(ret, VDM_BARRIER, cfg) {
|
||||
cfg.returns = true;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
|
||||
batch->uniforms.vertex_outputs);
|
||||
uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
|
||||
batch->uniforms.vertex_output_buffer_ptr =
|
||||
agx_pool_upload(&batch->pool, &addr, 8);
|
||||
|
||||
vs_grid =
|
||||
agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1);
|
||||
|
||||
tcs_grid = agx_grid_direct(in_patches * tcs->tess.output_patch_size,
|
||||
info->instance_count, 1,
|
||||
tcs->tess.output_patch_size, 1, 1);
|
||||
|
||||
tess_grid = agx_grid_direct(unrolled_patches, 1, 1, tess_wg_size, 1, 1);
|
||||
} else if (indirect) {
|
||||
args.tcs_statistic = agx_get_query_address(
|
||||
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]);
|
||||
|
||||
args.indirect = agx_indirect_buffer_ptr(batch, indirect);
|
||||
|
||||
/* Allocate 3x indirect global+local grids for VS/TCS/tess */
|
||||
uint32_t grid_stride = sizeof(uint32_t) * 6;
|
||||
args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
|
||||
|
||||
vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride);
|
||||
tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride);
|
||||
tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride);
|
||||
|
||||
args.vertex_outputs = ctx->vs->b.info.outputs;
|
||||
args.vertex_output_buffer_ptr =
|
||||
agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
|
||||
|
||||
batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr;
|
||||
|
||||
if (with_counts) {
|
||||
args.out_draws = agx_pool_alloc_aligned_with_bo(
|
||||
&batch->pool, draw_stride, 4, &draw_bo)
|
||||
.gpu;
|
||||
} else {
|
||||
unreachable("need an extra indirection...");
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t state =
|
||||
agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4);
|
||||
|
||||
if (indirect) {
|
||||
const struct agx_grid indirect_grid = agx_grid_direct(1, 1, 1, 1, 1, 1);
|
||||
struct agx_tess_setup_indirect_key indirect_key = {
|
||||
.point_mode = point_mode,
|
||||
.with_counts = with_counts,
|
||||
};
|
||||
|
||||
agx_launch_with_uploaded_data(batch, &indirect_grid,
|
||||
agx_nir_tess_setup_indirect, &indirect_key,
|
||||
sizeof(indirect_key), state);
|
||||
}
|
||||
|
||||
batch->uniforms.tess_params = state;
|
||||
|
||||
agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX, 0);
|
||||
agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0);
|
||||
batch->uniforms.vertex_output_buffer_ptr = 0;
|
||||
|
||||
agx_flush_all(ctx, "HACK");
|
||||
agx_sync_all(ctx, "HACK");
|
||||
struct agx_tessellator_key key = {
|
||||
.prim = mode,
|
||||
.output_primitive = prim,
|
||||
.partitioning = partitioning,
|
||||
};
|
||||
|
||||
/* Setup batch */
|
||||
batch = agx_get_batch(ctx);
|
||||
if (with_counts) {
|
||||
/* Generate counts */
|
||||
key.mode = LIBAGX_TESS_MODE_COUNT;
|
||||
agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key,
|
||||
sizeof(key), state);
|
||||
|
||||
enum tess_primitive_mode mode =
|
||||
MAX2(tcs->tess.primitive, tes->tess.primitive);
|
||||
enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
|
||||
/* Prefix sum counts, allocating index buffer space. */
|
||||
const struct agx_grid prefix_sum_grid =
|
||||
agx_grid_direct(1024, 1, 1, 1024, 1, 1);
|
||||
|
||||
enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
|
||||
? PIPE_TESS_SPACING_EQUAL
|
||||
: spacing == TESS_SPACING_FRACTIONAL_ODD
|
||||
? PIPE_TESS_SPACING_FRACTIONAL_ODD
|
||||
: PIPE_TESS_SPACING_FRACTIONAL_EVEN;
|
||||
agx_launch_with_uploaded_data(batch, &prefix_sum_grid,
|
||||
agx_nir_prefix_sum_tess, NULL, 0, state);
|
||||
|
||||
bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
|
||||
enum mesa_prim in_prim = mode == TESS_PRIMITIVE_ISOLINES ? MESA_PRIM_LINES
|
||||
: mode == TESS_PRIMITIVE_QUADS
|
||||
? MESA_PRIM_QUADS
|
||||
: MESA_PRIM_TRIANGLES;
|
||||
enum mesa_prim out_prim = point_mode ? MESA_PRIM_POINTS
|
||||
: mode == TESS_PRIMITIVE_ISOLINES
|
||||
? MESA_PRIM_LINES
|
||||
: MESA_PRIM_TRIANGLES;
|
||||
|
||||
struct pipe_tessellator *tess =
|
||||
p_tess_init(in_prim, pspacing, tes->tess.ccw, point_mode);
|
||||
|
||||
struct pipe_tessellator_data data = {0};
|
||||
|
||||
/* Mem allocate */
|
||||
uint32_t patch_coord_offs_offs = heap_water;
|
||||
tess_params.patch_coord_offs = heap_gpu + heap_water;
|
||||
heap_water += align(4 * unrolled_patch_count, 4);
|
||||
|
||||
uint32_t draws_off = heap_water;
|
||||
uint32_t *patch_draws = (uint32_t *)(heap_cpu + heap_water);
|
||||
heap_water += align(sizeof(uint32_t) * 5 * unrolled_patch_count, 4);
|
||||
|
||||
uint32_t *patch_offs = (uint32_t *)(heap_cpu + patch_coord_offs_offs);
|
||||
|
||||
for (unsigned patch = 0; patch < unrolled_patch_count; ++patch) {
|
||||
float *addr =
|
||||
(float *)(heap_cpu + tcs_out_offs + tcs->tess.output_stride * patch);
|
||||
|
||||
struct pipe_tessellation_factors factors = {
|
||||
.outer_tf = {addr[0], addr[1], addr[2], addr[3]},
|
||||
.inner_tf = {addr[4], addr[5]},
|
||||
};
|
||||
p_tessellate(tess, &factors, &data);
|
||||
|
||||
/* Mem allocate indices */
|
||||
uint32_t index_off = heap_water;
|
||||
uint16_t *indices = (uint16_t *)(heap_cpu + heap_water);
|
||||
heap_water += align(sizeof(*indices) * data.num_indices, 4);
|
||||
|
||||
for (unsigned idx = 0; idx < data.num_indices; ++idx) {
|
||||
indices[idx] = data.indices[idx];
|
||||
}
|
||||
|
||||
/* Mem allocate patch coords */
|
||||
heap_water = align(heap_water, 8);
|
||||
patch_offs[patch] = heap_water / 8;
|
||||
float *patch_coords = (float *)(heap_cpu + heap_water);
|
||||
heap_water += align(8 * data.num_domain_points, 4);
|
||||
|
||||
for (unsigned p = 0; p < data.num_domain_points; ++p) {
|
||||
patch_coords[2 * p + 0] = data.domain_points_u[p];
|
||||
patch_coords[2 * p + 1] = data.domain_points_v[p];
|
||||
}
|
||||
assert(data.num_indices < 32768);
|
||||
assert(data.num_domain_points < 8192);
|
||||
|
||||
/* Generate a draw for the patch */
|
||||
uint32_t *desc = patch_draws + (patch * 5);
|
||||
|
||||
desc[0] = data.num_indices; /* count */
|
||||
desc[1] = 1; /* instance_count */
|
||||
desc[2] = index_off / sizeof(*indices); /* start */
|
||||
desc[3] = patch * LIBAGX_TES_PATCH_ID_STRIDE; /* index_bias */
|
||||
desc[4] = 0; /* start_instance */
|
||||
|
||||
/* TES invocation counter increments once per tessellated vertex */
|
||||
agx_query_increment_cpu(
|
||||
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
|
||||
data.num_domain_points);
|
||||
key.mode = LIBAGX_TESS_MODE_WITH_COUNTS;
|
||||
} else {
|
||||
key.mode = LIBAGX_TESS_MODE_VDM;
|
||||
}
|
||||
p_tess_destroy(tess);
|
||||
|
||||
/* Now we can tessellate */
|
||||
agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key,
|
||||
sizeof(key), state);
|
||||
|
||||
/* Run TES as VS */
|
||||
void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
|
||||
void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader;
|
||||
ctx->base.bind_vs_state(&ctx->base, tes_cso);
|
||||
ctx->in_tess = true;
|
||||
ctx->in_generated_vdm = !with_counts;
|
||||
|
||||
struct pipe_draw_info draw_info = {
|
||||
.mode = out_prim,
|
||||
.index_size = 2,
|
||||
.index.resource = heap,
|
||||
.index_size = with_counts ? 4 : (point_mode ? 0 : 2),
|
||||
.index.resource = (!with_counts && point_mode) ? NULL : ctx->heap,
|
||||
.instance_count = 1,
|
||||
.view_mask = info->view_mask,
|
||||
};
|
||||
|
||||
/* Wrap the pool allocation in a fake resource for meta-Gallium use */
|
||||
struct pipe_draw_indirect_info copy_indirect = {
|
||||
.buffer = heap,
|
||||
.offset = draws_off,
|
||||
.stride = 5 * sizeof(uint32_t),
|
||||
.draw_count = in_patches * info->instance_count,
|
||||
};
|
||||
struct agx_resource indirect_rsrc = {.bo = draw_bo};
|
||||
|
||||
/* Tess param upload is deferred to draw_vbo since the batch may change
|
||||
* within draw_vbo for various reasons, so we can't upload it to the batch
|
||||
* upfront.
|
||||
*/
|
||||
memcpy(&ctx->tess_params, &tess_params, sizeof(tess_params));
|
||||
struct pipe_draw_indirect_info copy_indirect = {
|
||||
.buffer = &indirect_rsrc.base,
|
||||
.offset = args.out_draws - draw_bo->ptr.gpu,
|
||||
.stride = draw_stride,
|
||||
.draw_count = 1,
|
||||
};
|
||||
|
||||
ctx->base.draw_vbo(&ctx->base, &draw_info, 0, ©_indirect, NULL, 1);
|
||||
|
||||
/* Restore vertex state */
|
||||
ctx->base.bind_vs_state(&ctx->base, vs_cso);
|
||||
ctx->in_generated_vdm = false;
|
||||
ctx->in_tess = false;
|
||||
|
||||
pipe_resource_reference(&heap, NULL);
|
||||
|
||||
if (unbind_tcs_when_done) {
|
||||
ctx->base.bind_tcs_state(&ctx->base, NULL);
|
||||
}
|
||||
@@ -4860,13 +4916,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
return;
|
||||
}
|
||||
|
||||
/* TODO: stop cheating */
|
||||
if (info->mode == MESA_PRIM_PATCHES && indirect) {
|
||||
perf_debug_ctx(ctx, "indirect tessellation");
|
||||
util_draw_indirect(pctx, info, drawid_offset, indirect);
|
||||
return;
|
||||
}
|
||||
|
||||
/* TODO: stop cheating */
|
||||
if (ctx->active_queries && !ctx->active_draw_without_restart &&
|
||||
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
|
||||
@@ -4878,11 +4927,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
return;
|
||||
}
|
||||
|
||||
if (info->mode == MESA_PRIM_PATCHES) {
|
||||
agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
|
||||
return;
|
||||
}
|
||||
|
||||
bool xfb_passthrough = false;
|
||||
if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
|
||||
agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
|
||||
@@ -4890,6 +4934,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
return;
|
||||
}
|
||||
|
||||
if (info->mode == MESA_PRIM_PATCHES) {
|
||||
agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
|
||||
return;
|
||||
}
|
||||
|
||||
agx_legalize_feedback_loops(ctx);
|
||||
|
||||
/* Only the rasterization stream counts */
|
||||
@@ -5017,11 +5066,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
IS_DIRTY(BLEND_COLOR) || IS_DIRTY(QUERY) || IS_DIRTY(POLY_STIPPLE) ||
|
||||
IS_DIRTY(RS) || IS_DIRTY(PRIM) || ctx->in_tess) {
|
||||
|
||||
if (ctx->in_tess) {
|
||||
batch->uniforms.tess_params = agx_pool_upload(
|
||||
&batch->pool, &ctx->tess_params, sizeof(ctx->tess_params));
|
||||
}
|
||||
|
||||
if (IS_DIRTY(VERTEX)) {
|
||||
agx_upload_vbos(batch);
|
||||
}
|
||||
@@ -5143,63 +5187,75 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
|
||||
uint8_t *out = agx_encode_state(batch, batch->vdm.current);
|
||||
|
||||
if (info->index_size) {
|
||||
agx_push(out, VDM_STATE, cfg)
|
||||
cfg.restart_index_present = true;
|
||||
|
||||
agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
|
||||
cfg.value = info->restart_index;
|
||||
}
|
||||
|
||||
agx_push(out, INDEX_LIST, cfg) {
|
||||
cfg.primitive = agx_primitive_for_pipe(info->mode);
|
||||
|
||||
if (indirect != NULL) {
|
||||
cfg.indirect_buffer_present = true;
|
||||
} else {
|
||||
cfg.instance_count_present = true;
|
||||
cfg.index_count_present = true;
|
||||
cfg.start_present = true;
|
||||
}
|
||||
|
||||
if (info->index_size) {
|
||||
cfg.restart_enable = info->primitive_restart;
|
||||
cfg.index_buffer_hi = (ib >> 32);
|
||||
cfg.index_size = agx_translate_index_size(info->index_size);
|
||||
cfg.index_buffer_present = true;
|
||||
cfg.index_buffer_size_present = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (info->index_size) {
|
||||
agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
|
||||
cfg.buffer_lo = ib & BITFIELD_MASK(32);
|
||||
}
|
||||
}
|
||||
|
||||
if (indirect) {
|
||||
if (ctx->in_generated_vdm) {
|
||||
struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
|
||||
uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset;
|
||||
|
||||
agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
|
||||
cfg.address_hi = address >> 32;
|
||||
cfg.address_lo = address & BITFIELD_MASK(32);
|
||||
agx_push(out, VDM_STREAM_LINK, cfg) {
|
||||
cfg.target_lo = address & BITFIELD_MASK(32);
|
||||
cfg.target_hi = address >> 32;
|
||||
cfg.with_return = true;
|
||||
}
|
||||
} else {
|
||||
agx_push(out, INDEX_LIST_COUNT, cfg)
|
||||
cfg.count = draws->count;
|
||||
|
||||
agx_push(out, INDEX_LIST_INSTANCES, cfg)
|
||||
cfg.count = info->instance_count;
|
||||
if (info->index_size && info->primitive_restart) {
|
||||
agx_push(out, VDM_STATE, cfg)
|
||||
cfg.restart_index_present = true;
|
||||
|
||||
agx_push(out, INDEX_LIST_START, cfg) {
|
||||
cfg.start = info->index_size ? draws->index_bias : draws->start;
|
||||
agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
|
||||
cfg.value = info->restart_index;
|
||||
}
|
||||
}
|
||||
|
||||
if (info->index_size) {
|
||||
agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
|
||||
cfg.size = ib_extent;
|
||||
agx_push(out, INDEX_LIST, cfg) {
|
||||
cfg.primitive = agx_primitive_for_pipe(info->mode);
|
||||
|
||||
if (indirect != NULL) {
|
||||
cfg.indirect_buffer_present = true;
|
||||
} else {
|
||||
cfg.instance_count_present = true;
|
||||
cfg.index_count_present = true;
|
||||
cfg.start_present = true;
|
||||
}
|
||||
|
||||
if (info->index_size) {
|
||||
cfg.restart_enable = info->primitive_restart;
|
||||
cfg.index_buffer_hi = (ib >> 32);
|
||||
cfg.index_size = agx_translate_index_size(info->index_size);
|
||||
cfg.index_buffer_present = true;
|
||||
cfg.index_buffer_size_present = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (info->index_size) {
|
||||
agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
|
||||
cfg.buffer_lo = ib & BITFIELD_MASK(32);
|
||||
}
|
||||
}
|
||||
|
||||
if (indirect) {
|
||||
struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
|
||||
uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset;
|
||||
|
||||
agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
|
||||
cfg.address_hi = address >> 32;
|
||||
cfg.address_lo = address & BITFIELD_MASK(32);
|
||||
}
|
||||
} else {
|
||||
agx_push(out, INDEX_LIST_COUNT, cfg)
|
||||
cfg.count = draws->count;
|
||||
|
||||
agx_push(out, INDEX_LIST_INSTANCES, cfg)
|
||||
cfg.count = info->instance_count;
|
||||
|
||||
agx_push(out, INDEX_LIST_START, cfg) {
|
||||
cfg.start = info->index_size ? draws->index_bias : draws->start;
|
||||
}
|
||||
}
|
||||
|
||||
if (info->index_size) {
|
||||
agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
|
||||
cfg.size = ib_extent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5242,7 +5298,7 @@ agx_texture_barrier(struct pipe_context *pipe, unsigned flags)
}

void
agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info,
agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid,
                    struct agx_compiled_shader *cs, enum pipe_shader_type stage,
                    uint32_t usc)
{
@@ -5253,11 +5309,7 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info,
   uint8_t *out = batch->cdm.current;

   agx_push(out, CDM_LAUNCH_WORD_0, cfg) {
      if (info->indirect)
         cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL;
      else
         cfg.mode = AGX_CDM_MODE_DIRECT;

      cfg.mode = grid->mode;
      cfg.uniform_register_count = cs->b.info.push_count;
      cfg.preshader_register_count = cs->b.info.nr_preamble_gprs;
      cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, cs);
@@ -5275,32 +5327,25 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info,
         ;
   }

   if (info->indirect) {
      struct agx_resource *indirect = agx_resource(info->indirect);
      uint64_t addr = indirect->bo->ptr.gpu + info->indirect_offset;

      agx_push(out, CDM_INDIRECT, cfg) {
         cfg.address_hi = addr >> 32;
         cfg.address_lo = addr & BITFIELD64_MASK(32);
   if (grid->mode == AGX_CDM_MODE_DIRECT) {
      agx_push(out, CDM_GLOBAL_SIZE, cfg) {
         cfg.x = grid->global[0];
         cfg.y = grid->global[1];
         cfg.z = grid->global[2];
      }
   } else {
      uint32_t size[3];
      for (unsigned d = 0; d < 3; ++d) {
         size[d] = ((info->grid[d] - 1) * info->block[d]) +
                   (info->last_block[d] ?: info->block[d]);
      }

      agx_push(out, CDM_GLOBAL_SIZE, cfg) {
         cfg.x = size[0];
         cfg.y = size[1];
         cfg.z = size[2];
      agx_push(out, CDM_INDIRECT, cfg) {
         cfg.address_hi = grid->indirect >> 32;
         cfg.address_lo = grid->indirect & BITFIELD64_MASK(32);
      }
   }

   agx_push(out, CDM_LOCAL_SIZE, cfg) {
      cfg.x = info->block[0];
      cfg.y = info->block[1];
      cfg.z = info->block[2];
   if (grid->mode != AGX_CDM_MODE_INDIRECT_LOCAL) {
      agx_push(out, CDM_LOCAL_SIZE, cfg) {
         cfg.x = grid->local[0];
         cfg.y = grid->local[1];
         cfg.z = grid->local[2];
      }
   }

   agx_push(out, CDM_BARRIER, cfg) {
@@ -5352,9 +5397,9 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info,
}

void
agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
agx_launch(struct agx_batch *batch, const struct agx_grid *grid,
           struct agx_compiled_shader *cs, struct agx_linked_shader *linked,
           enum pipe_shader_type stage)
           enum pipe_shader_type stage, unsigned variable_shared_mem)
{
   struct agx_context *ctx = batch->ctx;

@@ -5362,18 +5407,17 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
    * available in GPU memory. This is either the indirect buffer, or just a
    * buffer we upload ourselves if not indirect.
    */
   if (info->indirect) {
      struct agx_resource *indirect = agx_resource(info->indirect);
      agx_batch_reads(batch, indirect);
   if (grid->mode == AGX_CDM_MODE_DIRECT) {
      uint32_t groups[3] = {
         grid->global[0] / grid->local[0],
         grid->global[1] / grid->local[1],
         grid->global[2] / grid->local[2],
      };

      batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] =
         indirect->bo->ptr.gpu + info->indirect_offset;
         agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4);
   } else {
      static_assert(sizeof(info->grid) == 12,
                    "matches indirect dispatch buffer");

      batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = agx_pool_upload_aligned(
         &batch->pool, info->grid, sizeof(info->grid), 4);
      batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid->indirect;
   }

   util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
@@ -5403,11 +5447,10 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
   }
#endif

   uint32_t usc =
      agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE,
                         info->variable_shared_mem, subgroups_per_core);
   uint32_t usc = agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE,
                                     variable_shared_mem, subgroups_per_core);

   agx_launch_internal(batch, info, cs, stage, usc);
   agx_launch_internal(batch, grid, cs, stage, usc);
}

static void
@@ -5454,7 +5497,29 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
   struct agx_compiled_shader *cs =
      _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data;

   agx_launch(batch, info, cs, NULL, PIPE_SHADER_COMPUTE);
   struct agx_grid grid = {
      .local[0] = info->block[0],
      .local[1] = info->block[1],
      .local[2] = info->block[2],
   };

   if (info->indirect) {
      struct agx_resource *indirect = agx_resource(info->indirect);
      agx_batch_reads(batch, indirect);

      grid.mode = AGX_CDM_MODE_INDIRECT_GLOBAL;
      grid.indirect = indirect->bo->ptr.gpu + info->indirect_offset;
   } else {
      grid.mode = AGX_CDM_MODE_DIRECT;

      for (unsigned d = 0; d < 3; ++d) {
         grid.global[d] = ((info->grid[d] - 1) * info->block[d]) +
                          (info->last_block[d] ?: info->block[d]);
      }
   }

   agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE,
              info->variable_shared_mem);

   /* TODO: Dirty tracking? */
   agx_dirty_all(ctx);

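Both agx_launch_internal and agx_launch_grid above derive the total per-axis size from the gallium workgroup counts, letting last_block describe a partial final workgroup. A standalone sketch of that arithmetic with made-up numbers (the GNU `?:` shorthand used in the driver is spelled out here):

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Hypothetical dispatch: 10 workgroups of 64 on X, where only 32
       * invocations of the last workgroup are live. */
      uint32_t grid[3]       = {10, 1, 1}; /* workgroup counts */
      uint32_t block[3]      = {64, 1, 1}; /* workgroup size */
      uint32_t last_block[3] = {32, 0, 0}; /* 0 means "full block" */

      uint32_t global[3];
      for (unsigned d = 0; d < 3; ++d) {
         uint32_t last = last_block[d] ? last_block[d] : block[d];
         global[d] = (grid[d] - 1) * block[d] + last;
      }

      /* Prints 608 1 1: nine full groups of 64 plus a final group of 32. */
      printf("%u %u %u\n", global[0], global[1], global[2]);
      return 0;
   }
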
@@ -682,8 +682,8 @@ struct agx_context {
   struct util_debug_callback debug;
   bool is_noop;

   struct agx_tess_params tess_params;
   bool in_tess;
   bool in_generated_vdm;

   struct blitter_context *blitter;
   struct asahi_blitter compute_blitter;
@@ -781,19 +781,67 @@ struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx,
                                                  meta_shader_builder_t builder,
                                                  void *data, size_t data_size);

void agx_launch_with_data(struct agx_batch *batch,
                          const struct pipe_grid_info *info,
struct agx_grid {
   /* Tag for the union */
   enum agx_cdm_mode mode;

   /* If mode != INDIRECT_LOCAL, the local size */
   uint32_t local[3];

   union {
      /* If mode == DIRECT, the global size. This is *not* multiplied by the
       * local size, differing from the API definition but matching AGX.
       */
      uint32_t global[3];

      /* Address of the indirect buffer if mode != DIRECT */
      uint64_t indirect;
   };
};
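The union is deliberately tagged by mode: global[] and indirect overlap in memory, so only the member selected by the tag is meaningful. A minimal sketch of how a consumer is expected to read it, mirroring the mode checks in agx_launch_internal above (illustrative only; assumes the driver headers that define these types):

   #include <inttypes.h>
   #include <stdio.h>

   static void
   print_grid(const struct agx_grid *grid)
   {
      if (grid->mode == AGX_CDM_MODE_DIRECT) {
         /* Direct dispatch: global[] holds the per-axis size. */
         printf("direct %u x %u x %u\n",
                grid->global[0], grid->global[1], grid->global[2]);
      } else {
         /* Indirect dispatch: the size lives in GPU memory at `indirect`. */
         printf("indirect @ 0x%" PRIx64 "\n", grid->indirect);
      }
   }
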

static inline const struct agx_grid
agx_grid_direct(uint32_t global_x, uint32_t global_y, uint32_t global_z,
                uint32_t local_x, uint32_t local_y, uint32_t local_z)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_DIRECT,
      .global = {global_x, global_y, global_z},
      .local = {local_x, local_y, local_z},
   };
}

static inline const struct agx_grid
agx_grid_indirect(uint64_t indirect, uint32_t local_x, uint32_t local_y,
                  uint32_t local_z)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_INDIRECT_GLOBAL,
      .local = {local_x, local_y, local_z},
      .indirect = indirect,
   };
}

static inline const struct agx_grid
agx_grid_indirect_local(uint64_t indirect)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_INDIRECT_LOCAL,
      .indirect = indirect,
   };
}
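For reference, a hypothetical call site using the new constructors together with the reworked agx_launch signature; the sizes and the `addr` variable are made up for illustration and do not correspond to any real call site in the driver:

   /* Direct dispatch with made-up sizes and no variable shared memory. */
   struct agx_grid direct = agx_grid_direct(1024, 1, 1, 64, 1, 1);
   agx_launch(batch, &direct, cs, NULL, PIPE_SHADER_COMPUTE, 0);

   /* Indirect dispatch: `addr` is a GPU address of dispatch parameters. */
   struct agx_grid indirect = agx_grid_indirect(addr, 32, 1, 1);
   agx_launch(batch, &indirect, cs, NULL, PIPE_SHADER_COMPUTE, 0);
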

void agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid,
                          meta_shader_builder_t builder, void *key,
                          size_t key_size, void *data, size_t data_size);

void agx_launch_internal(struct agx_batch *batch,
                         const struct pipe_grid_info *info,
void agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid,
                         struct agx_compiled_shader *cs,
                         enum pipe_shader_type stage, uint32_t usc);

void agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
void agx_launch(struct agx_batch *batch, const struct agx_grid *grid,
                struct agx_compiled_shader *cs,
                struct agx_linked_shader *linked, enum pipe_shader_type stage);
                struct agx_linked_shader *linked, enum pipe_shader_type stage,
                unsigned variable_shared_mem);

void agx_init_query_functions(struct pipe_context *ctx);