diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 567aab61da4..e287afd1b4f 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -1534,6 +1534,13 @@ agx_nir_prefix_sum_gs(nir_builder *b, const void *data) nir_channel(b, nir_load_workgroup_id(b), 0)); } +void +agx_nir_prefix_sum_tess(nir_builder *b, const void *data) +{ + b->shader->info.workgroup_size[0] = 1024; + libagx_prefix_sum_tess(b, nir_load_preamble(b, 1, 64, .base = 0)); +} + void agx_nir_gs_setup_indirect(nir_builder *b, const void *data) { @@ -1564,3 +1571,37 @@ agx_nir_unroll_restart(nir_builder *b, const void *data) else unreachable("invalid index size"); } + +void +agx_nir_tessellate(nir_builder *b, const void *data) +{ + const struct agx_tessellator_key *key = data; + b->shader->info.workgroup_size[0] = 64; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *patch = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + nir_def *mode = nir_imm_int(b, key->mode); + nir_def *partitioning = nir_imm_int(b, key->partitioning); + nir_def *output_prim = nir_imm_int(b, key->output_primitive); + + if (key->prim == TESS_PRIMITIVE_ISOLINES) + libagx_tess_isoline(b, params, mode, partitioning, output_prim, patch); + else if (key->prim == TESS_PRIMITIVE_TRIANGLES) + libagx_tess_tri(b, params, mode, partitioning, output_prim, patch); + else if (key->prim == TESS_PRIMITIVE_QUADS) + libagx_tess_quad(b, params, mode, partitioning, output_prim, patch); + else + unreachable("invalid tess primitive"); +} + +void +agx_nir_tess_setup_indirect(nir_builder *b, const void *data) +{ + const struct agx_tess_setup_indirect_key *key = data; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *with_counts = nir_imm_bool(b, key->with_counts); + nir_def *point_mode = nir_imm_bool(b, key->point_mode); + + libagx_tess_setup_indirect(b, params, with_counts, point_mode); +} diff --git a/src/asahi/lib/agx_nir_lower_gs.h 
b/src/asahi/lib/agx_nir_lower_gs.h index f73084552f0..74fca3849ed 100644 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -7,6 +7,7 @@ #include #include +#include "shaders/tessellator.h" #include "nir.h" #include "shader_enums.h" @@ -39,6 +40,8 @@ bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx, void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data); +void agx_nir_prefix_sum_tess(struct nir_builder *b, const void *data); + struct agx_gs_setup_indirect_key { enum mesa_prim prim; }; @@ -52,6 +55,23 @@ struct agx_unroll_restart_key { void agx_nir_unroll_restart(struct nir_builder *b, const void *key); +struct agx_tessellator_key { + enum tess_primitive_mode prim : 8; + enum libagx_tess_output_primitive output_primitive : 8; + enum libagx_tess_partitioning partitioning : 8; + enum libagx_tess_mode mode : 8; +}; +static_assert(sizeof(struct agx_tessellator_key) == 4, "padded"); + +struct agx_tess_setup_indirect_key { + bool point_mode; + bool with_counts; + bool padding[2]; +}; +static_assert(sizeof(struct agx_tess_setup_indirect_key) == 4, "padded"); + +void agx_nir_tessellate(struct nir_builder *b, const void *key); + bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *libagx); bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx); @@ -59,3 +79,5 @@ bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx); uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir); unsigned agx_tcs_output_stride(const struct nir_shader *nir); + +void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index 0d78932adc4..ced44d9f679 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -42,6 +42,7 @@ libagx_shader_files = files( 'shaders/query.h', 'shaders/tessellation.cl', 'shaders/tessellator.cl', + 'shaders/tessellator.h', 
'shaders/texture.cl', 'shaders/helper.cl', ) diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index ec4e0f970e0..88468ef9cb2 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "shaders/tessellator.h" #include "geometry.h" /* Compatible with util/u_math.h */ @@ -282,6 +283,17 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch) return (first_group * 32) + off; } +/* + * Allocate memory from the heap (thread-safe). Returns the offset into the + * heap. The allocation will be word-aligned. + */ +static inline uint +libagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B) +{ + return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), + align(size_B, 8)); +} + /* * When unrolling the index buffer for a draw, we translate the old indirect * draws to new indirect draws. This routine allocates the new index buffer and @@ -300,12 +312,11 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p, uint max_verts = max_prims * mesa_vertices_per_prim(mode); uint alloc_size = max_verts * index_size_B; - /* Allocate memory from the heap for the unrolled index buffer. Use an atomic - * since multiple threads may be running to handle multidraw in parallel. + /* Allocate unrolled index buffer. Atomic since multiple threads may be + * running to handle multidraw in parallel. 
*/ global struct agx_geometry_state *heap = p->heap; - uint old_heap_bottom_B = atomic_fetch_add( - (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4)); + uint old_heap_bottom_B = libagx_atomic_alloc(p->heap, alloc_size); /* Regardless of the input stride, we use tightly packed output draws */ global uint *out = &p->out_draws[5 * draw]; @@ -612,6 +623,40 @@ libagx_prefix_sum(global uint *buffer, uint len, uint words, uint word) } } +kernel void +libagx_prefix_sum_tess(global struct libagx_tess_args *p) +{ + libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */); + + /* After prefix summing, we know the total # of indices, so allocate the + * index buffer now. Elect a thread for the allocation. + */ + barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0) != 0) + return; + + /* The last element of an inclusive prefix sum is the total sum */ + uint total = p->counts[p->nr_patches - 1]; + + /* Allocate 4-byte indices */ + uint32_t elsize_B = sizeof(uint32_t); + uint32_t size_B = total * elsize_B; + uint alloc_B = p->heap->heap_bottom; + p->heap->heap_bottom += size_B; + p->heap->heap_bottom = align(p->heap->heap_bottom, 8); + + p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B); + + /* ...and now we can generate the API indexed draw */ + global uint32_t *desc = p->out_draws; + + desc[0] = total; /* count */ + desc[1] = 1; /* instance_count */ + desc[2] = alloc_B / elsize_B; /* start */ + desc[3] = 0; /* index_bias */ + desc[4] = 0; /* start_instance */ +} + uintptr_t libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx, gl_varying_slot location) diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h index 745d17d6b26..b4a2b44f7d4 100644 --- a/src/asahi/lib/shaders/geometry.h +++ b/src/asahi/lib/shaders/geometry.h @@ -203,47 +203,6 @@ struct agx_geometry_params { } PACKED; AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 78 * 4); -struct agx_tess_params { - /* 
Persistent (cross-draw) geometry state */ - GLOBAL(struct agx_geometry_state) state; - - /* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */ - GLOBAL(uint) patch_coord_offs; - - /* Patch coordinate buffer, indexed as: - * - * patch_coord_offs[patch_ID] + vertex_in_patch - * - * Currently float2s, but we might be able to compact later? - */ - GLOBAL(float2) patch_coord_buffer; - - /* Tessellation control shader output buffer, indexed by patch ID. */ - GLOBAL(uchar) tcs_buffer; - - /* Bitfield of TCS per-vertex outputs */ - uint64_t tcs_per_vertex_outputs; - - /* Default tess levels used in OpenGL when there is no TCS in the pipeline. - * Unused in Vulkan and OpenGL ES. - */ - float tess_level_outer_default[4]; - float tess_level_inner_default[4]; - - /* Number of vertices in the input patch */ - uint input_patch_size; - - /* Number of vertices in the TCS output patch */ - uint output_patch_size; - - /* Number of patch constants written by TCS */ - uint tcs_patch_constants; - - /* Number of input patches per instance of the VS/TCS */ - uint patches_per_instance; -} PACKED; -AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4); - /* TCS shared memory layout: * * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS]; diff --git a/src/asahi/lib/shaders/tessellation.cl b/src/asahi/lib/shaders/tessellation.cl index c4d549b9ff7..5dfea8345a1 100644 --- a/src/asahi/lib/shaders/tessellation.cl +++ b/src/asahi/lib/shaders/tessellation.cl @@ -4,15 +4,17 @@ */ #include "geometry.h" +#include "tessellator.h" +#include uint -libagx_tcs_patch_vertices_in(constant struct agx_tess_params *p) +libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p) { return p->input_patch_size; } uint -libagx_tes_patch_vertices_in(constant struct agx_tess_params *p) +libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p) { return p->output_patch_size; } @@ -25,7 +27,7 @@ libagx_tcs_in_offset(uint vtx, gl_varying_slot location, } uintptr_t 
-libagx_tcs_out_address(constant struct agx_tess_params *p, uint patch_id, +libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id, uint vtx_id, gl_varying_slot location, uint nr_patch_out, uint out_patch_size, uint64_t vtx_out_mask) { @@ -45,7 +47,7 @@ libagx_tes_unrolled_patch_id(uint raw_id) } uint -libagx_tes_patch_id(constant struct agx_tess_params *p, uint raw_id) +libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id) { return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance; } @@ -57,16 +59,20 @@ tes_vertex_id_in_patch(uint raw_id) } float2 -libagx_load_tess_coord(constant struct agx_tess_params *p, uint raw_id) +libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id) { uint patch = libagx_tes_unrolled_patch_id(raw_id); uint vtx = tes_vertex_id_in_patch(raw_id); - return p->patch_coord_buffer[p->patch_coord_offs[patch] + vtx]; + global struct libagx_tess_point *t = + &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; + + /* Written weirdly because NIR struggles with loads of structs */ + return *((global float2 *)t); } uintptr_t -libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id, +libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id, uint vtx_id, gl_varying_slot location) { uint patch = libagx_tes_unrolled_patch_id(raw_id); @@ -77,7 +83,7 @@ libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id, } float4 -libagx_tess_level_outer_default(constant struct agx_tess_params *p) +libagx_tess_level_outer_default(constant struct libagx_tess_args *p) { return ( float4)(p->tess_level_outer_default[0], p->tess_level_outer_default[1], @@ -85,8 +91,98 @@ libagx_tess_level_outer_default(constant struct agx_tess_params *p) } float2 -libagx_tess_level_inner_default(constant struct agx_tess_params *p) +libagx_tess_level_inner_default(constant struct libagx_tess_args *p) { return (float2)(p->tess_level_inner_default[0], p->tess_level_inner_default[1]); } + 
+void +libagx_tess_setup_indirect(global struct libagx_tess_args *p, bool with_counts, + bool point_mode) +{ + uint count = p->indirect[0], instance_count = p->indirect[1]; + unsigned in_patches = count / p->input_patch_size; + + /* TCS invocation counter increments once per-patch */ + if (p->tcs_statistic) { + *(p->tcs_statistic) += in_patches; + } + + size_t draw_stride = + ((!with_counts && point_mode) ? 4 : 6) * sizeof(uint32_t); + + unsigned unrolled_patches = in_patches * instance_count; + + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * p->tcs_stride_el * 4; + + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t); + + uint vb_offs = alloc; + uint vb_size = libagx_tcs_in_size(count * instance_count, p->vertex_outputs); + alloc += vb_size; + + /* Allocate all patch calculations in one go */ + global uchar *blob = p->heap->heap + p->heap->heap_bottom; + p->heap->heap_bottom += alloc; + + p->tcs_buffer = (global float *)(blob + tcs_out_offs); + p->patches_per_instance = in_patches; + p->coord_allocs = (global uint *)(blob + patch_coord_offs); + p->nr_patches = unrolled_patches; + + *(p->vertex_output_buffer_ptr) = (uintptr_t)(blob + vb_offs); + + if (with_counts) { + p->counts = (global uint32_t *)(blob + count_offs); + } else { +#if 0 + /* Arrange so we return after all generated draws. agx_pack would be nicer + * here but designated initializers lead to scratch access... 
+ */ + global uint32_t *ret = + (global uint32_t *)(blob + draw_offs + + (draw_stride * unrolled_patches)); + + *ret = (AGX_VDM_BLOCK_TYPE_BARRIER << 29) | /* with return */ (1u << 27); +#endif + /* TODO */ + } + + /* VS grid size */ + p->grids[0] = count; + p->grids[1] = instance_count; + p->grids[2] = 1; + + /* VS workgroup size */ + p->grids[3] = 64; + p->grids[4] = 1; + p->grids[5] = 1; + + /* TCS grid size */ + p->grids[6] = in_patches * p->output_patch_size; + p->grids[7] = instance_count; + p->grids[8] = 1; + + /* TCS workgroup size */ + p->grids[9] = p->output_patch_size; + p->grids[10] = 1; + p->grids[11] = 1; + + /* Tess grid size */ + p->grids[12] = unrolled_patches; + p->grids[13] = 1; + p->grids[14] = 1; + + /* Tess workgroup size */ + p->grids[15] = 64; + p->grids[16] = 1; + p->grids[17] = 1; +} diff --git a/src/asahi/lib/shaders/tessellator.cl b/src/asahi/lib/shaders/tessellator.cl index 0a1fe63e66a..82079242f47 100644 --- a/src/asahi/lib/shaders/tessellator.cl +++ b/src/asahi/lib/shaders/tessellator.cl @@ -1,8 +1,1748 @@ /* - * Copyright 2023 Alyssa Rosenzweig - * Copyright (c) Microsoft Corporation - * SPDX-License-Identifier: MIT - */ + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ #include "geometry.h" +#include "tessellator.h" +#include +#if 0 +#include +#include +#include +#include +#include "util/macros.h" +#define min(x, y) (x < y ? x : y) +#define max(x, y) (x > y ? x : y) +#define clz(x) (x ? __builtin_clz(x) : (8 * sizeof(x))) +#define clamp(x, y, z) (x < y ? y : x > z ? z : x) +#define align(x, y) ALIGN_POT(x, y) +#else +#define assert(x) +#endif + +#define LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f +#define LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f + +typedef unsigned int FXP; // fixed point number + +enum { + U = 0, // points on a tri patch + V = 1, +}; + +enum { + Ueq0 = 0, // edges on a tri patch + Veq0 = 1, + Weq0 = 2, +}; + +enum { + Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 + Veq1 = 3, +}; + +#define QUAD_AXES 2 +#define QUAD_EDGES 4 +#define TRI_EDGES 3 + +// The interior can just use a simpler stitch. 
+typedef enum DIAGONALS { + DIAGONALS_INSIDE_TO_OUTSIDE, + DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, + DIAGONALS_MIRRORED +} DIAGONALS; + +typedef struct TESS_FACTOR_CONTEXT { + FXP fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpInvNumSegmentsOnCeilTessFactor; + FXP fxpHalfTessFactorFraction; + int numHalfTessFactorPoints; + int splitPointOnFloorHalfTessFactor; +} TESS_FACTOR_CONTEXT; + +struct INDEX_PATCH_CONTEXT { + int insidePointIndexDeltaToRealValue; + int insidePointIndexBadValue; + int insidePointIndexReplacementValue; + int outsidePointIndexPatchBase; + int outsidePointIndexDeltaToRealValue; + int outsidePointIndexBadValue; + int outsidePointIndexReplacementValue; +}; + +struct INDEX_PATCH_CONTEXT2 { + int baseIndexToInvert; + int indexInversionEndPoint; + int cornerCaseBadValue; + int cornerCaseReplacementValue; +}; + +struct CHWTessellator { + enum libagx_tess_output_primitive outputPrimitive; + enum libagx_tess_mode mode; + uint index_bias; + + // array where we will store u/v's for the points we generate + global struct libagx_tess_point *Point; + + // array where we will store index topology + global void *Index; + + // A second index patch we have to do handles the leftover strip of quads in + // the middle of an odd quad patch after finishing all the concentric rings. 
+ // This also handles the leftover strip of points in the middle of an even + // quad patch, when stitching the row of triangles up the left side (V major + // quad) or bottom (U major quad) of the inner ring + bool bUsingPatchedIndices; + bool bUsingPatchedIndices2; + struct INDEX_PATCH_CONTEXT IndexPatchCtx; + struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; +}; + +#define FXP_INTEGER_BITS 15 +#define FXP_FRACTION_BITS 16 +#define FXP_FRACTION_MASK 0x0000ffff +#define FXP_INTEGER_MASK 0x7fff0000 +#define FXP_ONE (1 << FXP_FRACTION_BITS) +#define FXP_ONE_THIRD 0x00005555 +#define FXP_TWO_THIRDS 0x0000aaaa +#define FXP_ONE_HALF 0x00008000 + +static global float * +tess_factors(constant struct libagx_tess_args *p, uint patch) +{ + return p->tcs_buffer + (patch * p->tcs_stride_el); +} + +static inline uint +libagx_heap_alloc(global struct agx_geometry_state *heap, uint size_B) +{ + // TODO: drop align to 4 I think + return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), + align(size_B, 8)); +} + +/* + * Generate an indexed draw for a patch with the computed number of indices. + * This allocates heap memory for the index buffer, returning the allocated + * memory. + */ +static global void * +libagx_draw(constant struct libagx_tess_args *p, enum libagx_tess_mode mode, + bool lines, uint patch, uint count) +{ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = count; + } + + if (mode == LIBAGX_TESS_MODE_VDM) { + uint32_t elsize_B = sizeof(uint16_t); + uint32_t alloc_B = libagx_heap_alloc(p->heap, elsize_B * count); + uint64_t ib = ((uintptr_t)p->heap->heap) + alloc_B; + + global uint32_t *desc = p->out_draws + (patch * 6); + agx_pack(&desc[0], INDEX_LIST, cfg) { + cfg.index_buffer_hi = (ib >> 32); + cfg.primitive = lines ? 
AGX_PRIMITIVE_LINES : AGX_PRIMITIVE_TRIANGLES; + cfg.restart_enable = false; + cfg.index_size = AGX_INDEX_SIZE_U16; + cfg.index_buffer_size_present = true; + cfg.index_buffer_present = true; + cfg.index_count_present = true; + cfg.instance_count_present = true; + cfg.start_present = true; + cfg.unk_1_present = false; + cfg.indirect_buffer_present = false; + cfg.unk_2_present = false; + cfg.block_type = AGX_VDM_BLOCK_TYPE_INDEX_LIST; + } + + agx_pack(&desc[1], INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib & 0xffffffff; + } + + agx_pack(&desc[2], INDEX_LIST_COUNT, cfg) { + cfg.count = count; + } + + agx_pack(&desc[3], INDEX_LIST_INSTANCES, cfg) { + cfg.count = 1; + } + + agx_pack(&desc[4], INDEX_LIST_START, cfg) { + cfg.start = patch * LIBAGX_TES_PATCH_ID_STRIDE; + } + + agx_pack(&desc[5], INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = align(count * 2, 4); + } + + return (global void *)ib; + } + + if (mode == LIBAGX_TESS_MODE_WITH_COUNTS) { + /* The index buffer is already allocated, get a pointer inside it. + * p->counts has had an inclusive prefix sum hence the subtraction. + */ + uint offset_el = p->counts[sub_sat(patch, 1u)]; + if (patch == 0) + offset_el = 0; + + return &p->index_buffer[offset_el]; + } + + return NULL; +} + +static void +libagx_draw_points(private struct CHWTessellator *ctx, + constant struct libagx_tess_args *p, uint patch, uint count) +{ + if (ctx->mode == LIBAGX_TESS_MODE_VDM) { + /* Generate a non-indexed draw for points mode tessellation. 
*/ + global uint32_t *desc = p->out_draws + (patch * 4); + agx_pack(&desc[0], INDEX_LIST, cfg) { + cfg.index_buffer_hi = 0; + cfg.primitive = AGX_PRIMITIVE_POINTS; + cfg.restart_enable = false; + cfg.index_size = 0; + cfg.index_buffer_size_present = false; + cfg.index_buffer_present = false; + cfg.index_count_present = true; + cfg.instance_count_present = true; + cfg.start_present = true; + cfg.unk_1_present = false; + cfg.indirect_buffer_present = false; + cfg.unk_2_present = false; + cfg.block_type = AGX_VDM_BLOCK_TYPE_INDEX_LIST; + } + + agx_pack(&desc[1], INDEX_LIST_COUNT, cfg) { + cfg.count = count; + } + + agx_pack(&desc[2], INDEX_LIST_INSTANCES, cfg) { + cfg.count = 1; + } + + agx_pack(&desc[3], INDEX_LIST_START, cfg) { + cfg.start = patch * LIBAGX_TES_PATCH_ID_STRIDE; + } + } else { + /* For points mode with a single draw, we need to generate a trivial index + * buffer to stuff in the patch ID in the right place. + */ + global uint32_t *indices = libagx_draw(p, ctx->mode, false, patch, count); + + if (ctx->mode == LIBAGX_TESS_MODE_COUNT) + return; + + for (int i = 0; i < count; ++i) { + indices[i] = ctx->index_bias + i; + } + } +} + +static void +libagx_draw_empty(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_output_primitive output_primitive, + uint patch) +{ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = 0; + } else if (mode == LIBAGX_TESS_MODE_VDM) { + uint32_t words = (output_primitive == LIBAGX_TESS_OUTPUT_POINT) ? 4 : 6; + global uint32_t *desc = p->out_draws + (patch * words); + uint32_t nop_token = AGX_VDM_BLOCK_TYPE_BARRIER << 29; + + for (uint32_t i = 0; i < words; ++i) { + desc[i] = nop_token; + } + } +} + +/* + * Allocate heap memory for domain points for a patch. The allocation + * is recorded in the coord_allocs[] array, which is in elements. 
+ */ +static global struct libagx_tess_point * +libagx_heap_alloc_points(constant struct libagx_tess_args *p, uint patch, + uint count) +{ + /* If we're recording statistics, increment now. The statistic is for + * tessellation evaluation shader invocations, which is equal to the number + * of domain points generated. + */ + if (p->statistic) { + atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); + } + + uint32_t elsize_B = sizeof(struct libagx_tess_point); + uint32_t alloc_B = libagx_heap_alloc(p->heap, elsize_B * count); + uint32_t alloc_el = alloc_B / elsize_B; + + p->coord_allocs[patch] = alloc_el; + return (global struct libagx_tess_point *)(((uintptr_t)p->heap->heap) + + alloc_B); +} + +// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 +// amar.patel@microsoft.com + +#define LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR 1 +#define LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR 63 +#define LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 +#define LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 + +// 2^(-16), min positive fixed point fraction +#define EPSILON 0.0000152587890625f +#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ + (LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) + +static float clamp_factor(float factor, + enum libagx_tess_partitioning partitioning, + float maxf) +{ + float lower = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) + ? LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR + : LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR; + + float upper = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD) + ? LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR + : LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR; + + // If any TessFactor will end up > 1 after floatToFixed conversion later, + // then force the inside TessFactors to be > 1 so there is a picture frame. 
/* n.16 unsigned fixed-point helpers (D3D11 reference tessellator).
 * FXP and its constants mirror the file-wide definitions so this unit
 * stands alone; the redefinitions are identical and therefore legal.
 */
typedef unsigned int FXP;

#define FXP_FRACTION_BITS 16
#define FXP_FRACTION_MASK 0x0000ffff
#define FXP_INTEGER_MASK 0x7fff0000
#define FXP_ONE (1 << FXP_FRACTION_BITS)

/* Convert a float to n.16 fixed point, rounding to nearest. */
static FXP
floatToFixed(const float input)
{
   return (FXP)(input * FXP_ONE + 0.5f);
}

/* Convert n.16 fixed point back to float. Exact for our bounded range. */
static float
fixedToFloat(const FXP input)
{
   return ((float)input) / FXP_ONE;
}

/* True when the integer part of the float is odd. */
static bool
isOdd(const float input)
{
   return ((int)input) & 1;
}

/* Round an n.16 value up to the next integer. */
static FXP
fxpCeil(const FXP input)
{
   if (input & FXP_FRACTION_MASK)
      return (input & FXP_INTEGER_MASK) + FXP_ONE;

   return input;
}

/* Truncate an n.16 value down to its integer part. */
static FXP
fxpFloor(const FXP input)
{
   return input & FXP_INTEGER_MASK;
}
+static void +DefinePoint(global struct libagx_tess_point *out, FXP fxpU, FXP fxpV) +{ + out->u = fixedToFloat(fxpU); + out->v = fixedToFloat(fxpV); +} + +static void +DefineIndex(private struct CHWTessellator *ctx, int index, + int indexStorageOffset) +{ + int patched = PatchIndexValue(ctx, index); + + if (ctx->mode == LIBAGX_TESS_MODE_WITH_COUNTS) { + global uint32_t *indices = (global uint32_t *)ctx->Index; + indices[indexStorageOffset] = ctx->index_bias + patched; + } else { + global uint16_t *indices = (global uint16_t *)ctx->Index; + indices[indexStorageOffset] = patched; + } +} + +static void +DefineClockwiseTriangle(private struct CHWTessellator *ctx, int index0, + int index1, int index2, int indexStorageBaseOffset) +{ + // inputs a clockwise triangle, stores a CW or CCW triangle per state state + bool cw = ctx->outputPrimitive == LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + DefineIndex(ctx, index0, indexStorageBaseOffset); + DefineIndex(ctx, cw ? index1 : index2, indexStorageBaseOffset + 1); + DefineIndex(ctx, cw ? index2 : index1, indexStorageBaseOffset + 2); +} + +static uint32_t +RemoveMSB(uint32_t val) +{ + uint32_t bit = val ? (1 << (31 - clz(val))) : 0; + return val & ~bit; +} + +static int +NumPointsForTessFactor(bool odd, FXP fxpTessFactor) +{ + // Add epsilon for rounding and add 1 for odd + FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1); + int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1); + return odd ? r : r + 1; +} + +static void +ComputeTessFactorCtx(bool odd, FXP fxpTessFactor, + private TESS_FACTOR_CONTEXT *TessFactorCtx) +{ + // fxpHalfTessFactor == 1/2 if TessFactor is 1, + // but we're pretending we are even. 
/* Mirrors the file-wide fixed-point definitions so this unit stands alone. */
typedef unsigned int FXP;

#define FXP_FRACTION_BITS 16
#define FXP_ONE (1 << FXP_FRACTION_BITS)
#define FXP_ONE_HALF 0x00008000

typedef struct TESS_FACTOR_CONTEXT {
   FXP fxpInvNumSegmentsOnFloorTessFactor;
   FXP fxpInvNumSegmentsOnCeilTessFactor;
   FXP fxpHalfTessFactorFraction;
   int numHalfTessFactorPoints;
   int splitPointOnFloorHalfTessFactor;
} TESS_FACTOR_CONTEXT;

/*
 * Map a point index along one half-edge to its n.16 parametric location,
 * lerping between the floor and ceil half-tess-factor placements. The
 * second half of the edge mirrors the first (bFlip).
 */
static FXP
PlacePointIn1D(const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, int point)
{
   bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints;

   if (bFlip) {
      point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd;
   }

   /* Special-case the middle: 16-bit fixed math below can't hit 0.5 exactly */
   if (point == TessFactorCtx->numHalfTessFactorPoints)
      return FXP_ONE_HALF;

   unsigned int indexOnCeilHalfTessFactor = point;
   unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor;
   if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) {
      indexOnFloorHalfTessFactor -= 1;
   }

   /* Each product fits in 16 bits: locations on the half tess factor are at
    * most half the segment count, so index * (1/segments) <= 0.5 in n.16.
    */
   FXP fxpLocationOnFloorHalfTessFactor =
      indexOnFloorHalfTessFactor *
      TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor;
   FXP fxpLocationOnCeilHalfTessFactor =
      indexOnCeilHalfTessFactor *
      TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor;

   /* Lerp between the floor and ceil placements. Both operands are <= 0.5
    * in n.16, so the 32-bit intermediate cannot overflow before the shift.
    */
   FXP fxpLocation =
      fxpLocationOnFloorHalfTessFactor *
         (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) +
      fxpLocationOnCeilHalfTessFactor *
         (TessFactorCtx->fxpHalfTessFactorFraction);

   /* Round and shift back down to n.16 */
   fxpLocation = (fxpLocation + FXP_ONE_HALF) >> FXP_FRACTION_BITS;

   return bFlip ? (FXP_ONE - fxpLocation) : fxpLocation;
}
DefineClockwiseTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + p += 2; + + // Second half + for (; p < numInsideEdgePoints; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_MIRRORED: + // First half, diagonals pointing from outside of outside edge to inside of + // inside edge + for (p = 0; p < numInsideEdgePoints / 2; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + // Second half, diagonals pointing from inside of inside edge to outside of + // outside edge + for (; p < numInsideEdgePoints - 1; p++) { + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + } + if (bTrapezoid) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + } +} + +// loop_start and loop_end give optimal loop bounds for +// the stitching algorithm further below, for any given halfTssFactor. There +// is probably a better way to encode this... 
+// +// Return the FIRST entry in finalPointPositionTable awhich is less than +// halfTessFactor, except entry 0 and 1 which are set up to skip the loop. +static int +loop_start(int N) +{ + if (N < 2) + return 1; + else if (N == 2) + return 17; + else if (N < 5) + return 9; + else if (N < 9) + return 5; + else if (N < 17) + return 3; + else + return 2; +} + +// Return the LAST entry in finalPointPositionTable[] which is less than +// halfTessFactor, except entry 0 and 1 which are set up to skip the loop. +static int +loop_end(int N) +{ + if (N < 2) + return 0; + else if (N < 4) + return 17; + else if (N < 8) + return 25; + else if (N < 16) + return 29; + else if (N < 32) + return 31; + else + return 32; +} + +// Tables to assist in the stitching of 2 rows of points having arbitrary +// TessFactors. The stitching order is governed by Ruler Function vertex +// split ordering (see external documentation). +// +// The contents of the finalPointPositionTable are where vertex i [0..33] +// ends up on the half-edge at the max tessellation amount given +// ruler-function split order. Recall the other half of an edge is mirrored, +// so we only need to deal with one half. This table is used to decide when +// to advance a point on the interior or exterior. It supports odd TessFactor +// up to 65 and even TessFactor up to 64. + +/* TODO: Is this actually faster than a LUT? */ +static uint32_t +finalPointPositionTable(uint32_t x) +{ + if (x == 0) + return 0; + if (x == 1) + return 0x20; + + uint32_t shift; + if ((x & 1) == 0) { + shift = 1; + } else if ((x & 3) == 3) { + shift = 2; + } else if ((x & 7) == 5) { + shift = 3; + } else if (x != 17) { + shift = 4; + } else { + shift = 5; + } + + // SWAR vectorized right-shift of (0x20, x) + // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`. 
+ uint32_t items_to_shift = x | (0x20 << 16); + uint32_t shifted = items_to_shift >> shift; + + uint32_t bias = min(0xfu, shifted >> 16); + return bias + (shifted & 0xffff); +} + +static void +StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset, + int insideEdgePointBaseOffset, + int insideNumHalfTessFactorPoints, + bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset, + int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd) +{ + if (insideEdgeTessFactorOdd) { + insideNumHalfTessFactorPoints -= 1; + } + if (outsideTessFactorOdd) { + outsideNumHalfTessFactorPoints -= 1; + } + // Walk first half + int outsidePoint = outsideEdgePointBaseOffset; + int insidePoint = insideEdgePointBaseOffset; + + // iStart,iEnd are a small optimization so the loop below doesn't have to go + // from 0 up to 31 + int iStart = min(loop_start(insideNumHalfTessFactorPoints), + loop_start(outsideNumHalfTessFactorPoints)); + int iEnd = loop_end( + max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints)); + + // since we don't start the loop at 0 below, we need a special case. 
   if (0 < outsideNumHalfTessFactorPoints) {
      // Advance outside
      DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint,
                              baseIndexOffset);
      baseIndexOffset += 3;
      outsidePoint++;
   }

   // First half: advance inside and/or outside whenever the table entry is
   // within the respective half-factor point count.
   for (int i = iStart; i <= iEnd; i++) {
      int bound = finalPointPositionTable(i);

      if (bound < insideNumHalfTessFactorPoints) {
         // Advance inside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
      }
      if (bound < outsideNumHalfTessFactorPoints) {
         // Advance outside
         DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1,
                                 insidePoint, baseIndexOffset);
         baseIndexOffset += 3;
         outsidePoint++;
      }
   }

   // Middle of the edge: shape depends on the odd/even parity of the two
   // TessFactors (quad when both odd, single triangle when they differ).
   if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) ||
       insideEdgeTessFactorOdd) {
      if (insideEdgeTessFactorOdd == outsideTessFactorOdd) {
         // Quad in the middle
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         DefineClockwiseTriangle(ctx, insidePoint + 1, outsidePoint,
                                 outsidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
         outsidePoint++;
      } else if (!insideEdgeTessFactorOdd) {
         // Triangle pointing inside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 outsidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         outsidePoint++;
      } else {
         // Triangle pointing outside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
      }
   }

   // Walk second half.
   for (int i = iEnd; i >= iStart; i--) {
      int bound = finalPointPositionTable(i);

      if (bound < outsideNumHalfTessFactorPoints) {
         // Advance outside
         DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1,
                                 insidePoint, baseIndexOffset);
         baseIndexOffset += 3;
         outsidePoint++;
      }
      if (bound < insideNumHalfTessFactorPoints) {
         // Advance inside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
      }
   }
   // Below case is not needed if we didn't optimize loop above and made it run
   // from 31 down to 0.
   if (0 < outsideNumHalfTessFactorPoints) {
      DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint,
                              baseIndexOffset);
      baseIndexOffset += 3;
      outsidePoint++;
   }
}

// Tessellate one isoline patch: factors[0] is the line density (number of
// lines along V), factors[1] the line detail (points per line along U).
// In COUNT mode only the index count is recorded; otherwise points and
// (unless output is points) line-list indices are emitted.
void
libagx_tess_isoline(constant struct libagx_tess_args *p,
                    enum libagx_tess_mode mode,
                    enum libagx_tess_partitioning partitioning,
                    enum libagx_tess_output_primitive output_primitive,
                    uint patch)
{
   bool lineDensityOdd;
   bool lineDetailOdd;
   TESS_FACTOR_CONTEXT lineDensityTessFactorCtx;
   TESS_FACTOR_CONTEXT lineDetailTessFactorCtx;

   global float *factors = tess_factors(p, patch);
   float TessFactor_V_LineDensity = factors[0];
   float TessFactor_U_LineDetail = factors[1];

   // Is the patch culled? NaN will pass.
   if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) {
      libagx_draw_empty(p, mode, output_primitive, patch);
      return;
   }

   // Clamp edge TessFactors
   TessFactor_V_LineDensity =
      clamp(TessFactor_V_LineDensity,
            LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR,
            LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR);
   TessFactor_U_LineDetail =
      clamp_factor(TessFactor_U_LineDetail, partitioning, 0);

   // Process tessFactors
   if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) {
      lineDetailOdd = isOdd(TessFactor_U_LineDetail);
   } else {
      lineDetailOdd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD);
   }

   FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail);

   ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail,
                        &lineDetailTessFactorCtx);
   int numPointsPerLine =
      NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail);

   // Line density is always integer-partitioned, hence the ceil().
   TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity);
   lineDensityOdd = isOdd(TessFactor_V_LineDensity);
   FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity);
   ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity,
                        &lineDensityTessFactorCtx);

   // don't draw last line at V == 1.
   int numLines =
      NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1;

   /* Points */
   uint num_points = numPointsPerLine * numLines;
   if (mode != LIBAGX_TESS_MODE_COUNT) {
      global struct libagx_tess_point *points =
         libagx_heap_alloc_points(p, patch, num_points);

      for (int line = 0, pointOffset = 0; line < numLines; line++) {
         FXP fxpV =
            PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line);

         for (int point = 0; point < numPointsPerLine; point++) {
            FXP fxpU =
               PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point);

            DefinePoint(&points[pointOffset++], fxpU, fxpV);
         }
      }
   }

   struct CHWTessellator ctx;
   ctx.mode = mode;
   ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE;

   /* Connectivity: 2 indices per segment, (numPointsPerLine - 1) segments
    * per line, emitted as a line list. */
   if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) {
      uint num_indices = numLines * (numPointsPerLine - 1) * 2;
      ctx.Index = libagx_draw(p, mode, true, patch, num_indices);

      if (mode == LIBAGX_TESS_MODE_COUNT)
         return;

      for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines;
           line++) {
         pointOffset++;

         for (int point = 1; point < numPointsPerLine; point++) {
            DefineIndex(&ctx, pointOffset - 1, indexOffset++);
            DefineIndex(&ctx, pointOffset, indexOffset++);
            pointOffset++;
         }
      }
   } else {
      libagx_draw_points(&ctx, p, patch, num_points);
   }
}

// Tessellate one triangle patch: factors[0..2] are the outside edge
// TessFactors (Ueq0/Veq0/Weq0), factors[4] the inside factor.  In COUNT
// mode only sizes are recorded; otherwise barycentric points and triangle
// (or point) connectivity are emitted.
void
libagx_tess_tri(constant struct libagx_tess_args *p, enum libagx_tess_mode mode,
                enum libagx_tess_partitioning partitioning,
                enum libagx_tess_output_primitive output_primitive, uint patch)
{
   global float *factors = tess_factors(p, patch);
   float tessFactor_Ueq0 = factors[0];
   float tessFactor_Veq0 = factors[1];
   float tessFactor_Weq0 = factors[2];
   float insideTessFactor_f = factors[4];

   struct CHWTessellator ctx;
   ctx.outputPrimitive = output_primitive;
   ctx.Point = NULL;
   ctx.Index = NULL;
   ctx.mode = mode;
   ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE;
   ctx.bUsingPatchedIndices = false;
   ctx.bUsingPatchedIndices2 = false;

   // Is the patch culled? NaN will pass.
   if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) ||
       !(tessFactor_Weq0 > 0)) {

      libagx_draw_empty(p, mode, output_primitive, patch);

      return;
   }

   FXP outsideTessFactor[TRI_EDGES];
   FXP insideTessFactor;
   bool outsideTessFactorOdd[TRI_EDGES];
   bool insideTessFactorOdd;
   TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES];
   TESS_FACTOR_CONTEXT insideTessFactorCtx;
   // Stuff below is just specific to the traversal order
   // this code happens to use to generate points/lines
   int numPointsForOutsideEdge[TRI_EDGES];
   int numPointsForInsideTessFactor;
   int insideEdgePointBaseOffset;

   // Clamp TessFactors
   tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0);
   tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0);
   tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0);

   float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0);
   insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf);
   // Note the above clamps map NaN to the lower bound

   // Process tessFactors
   float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0,
                                           tessFactor_Weq0};
   if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) {
      for (int edge = 0; edge < TRI_EDGES; edge++) {
         outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]);
      }
      insideTessFactorOdd =
         isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f);
   } else {
      bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD);

      for (int edge = 0; edge < TRI_EDGES; edge++) {
         outsideTessFactorOdd[edge] = odd;
      }
      insideTessFactorOdd = odd;
   }

   // Save fixed point TessFactors
   for (int edge = 0; edge < TRI_EDGES; edge++) {
      outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]);
   }
   insideTessFactor = floatToFixed(insideTessFactor_f);

   if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) {
      // Special case if all TessFactors are 1
      if ((FXP_ONE == insideTessFactor) &&
          (FXP_ONE == outsideTessFactor[Ueq0]) &&
          (FXP_ONE == outsideTessFactor[Veq0]) &&
          (FXP_ONE == outsideTessFactor[Weq0])) {

         /* Just do minimum tess factor */
         if (mode == LIBAGX_TESS_MODE_COUNT) {
            p->counts[patch] = 3;
            return;
         }

         global struct libagx_tess_point *points =
            libagx_heap_alloc_points(p, patch, 3);

         DefinePoint(&points[0], 0,
                     FXP_ONE); // V=1 (beginning of Ueq0 edge VW)
         DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU)
         DefinePoint(&points[2], FXP_ONE,
                     0); // U=1 (beginning of Weq0 edge UV)

         if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) {
            ctx.Index = libagx_draw(p, mode, false, patch, 3);

            DefineClockwiseTriangle(&ctx, 0, 1, 2,
                                    /*indexStorageBaseOffset*/ 0);
         } else {
            libagx_draw_points(&ctx, p, patch, 3);
         }

         return;
      }
   }

   // Compute per-TessFactor metadata
   for (int edge = 0; edge < TRI_EDGES; edge++) {
      ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge],
                           &outsideTessFactorCtx[edge]);
   }
   ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor,
                        &insideTessFactorCtx);

   // Compute some initial data.
   int NumPoints = 0;

   // outside edge offsets and storage
   for (int edge = 0; edge < TRI_EDGES; edge++) {
      numPointsForOutsideEdge[edge] = NumPointsForTessFactor(
         outsideTessFactorOdd[edge], outsideTessFactor[edge]);
      NumPoints += numPointsForOutsideEdge[edge];
   }
   NumPoints -= 3; // each edge shares its endpoint with the next edge

   // inside edge offsets
   numPointsForInsideTessFactor =
      NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor);
   {
      int pointCountMin = insideTessFactorOdd ? 4 : 3;
      // max() allows degenerate transition regions when inside TessFactor == 1
      numPointsForInsideTessFactor =
         max(pointCountMin, numPointsForInsideTessFactor);
   }

   insideEdgePointBaseOffset = NumPoints;

   // inside storage, including interior edges above
   {
      int interiorRings = (numPointsForInsideTessFactor >> 1) - 1;
      int even = insideTessFactorOdd ? 0 : 1;
      NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even;
   }

   /* GENERATE POINTS */
   if (mode != LIBAGX_TESS_MODE_COUNT) {
      ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints);

      // Generate exterior ring edge points, clockwise starting from point V
      // (VW, the U==0 edge)
      int pointOffset = 0;
      for (int edge = 0; edge < TRI_EDGES; edge++) {
         int odd = edge & 0x1;
         int endPoint = numPointsForOutsideEdge[edge] - 1;
         // don't include end, since next edge starts with it.
         // NOTE(review): loop variable `p` shadows the `p` args parameter
         // here and in the ring loop below; inherited from the reference
         // tessellator, consider renaming.
         for (int p = 0; p < endPoint; p++, pointOffset++) {
            // whether to reverse point order given we are defining V or U (W
            // implicit): edge0, VW, has V decreasing, so reverse 1D points
            // below edge1, WU, has U increasing, so don't reverse 1D points
            // below edge2, UV, has U decreasing, so reverse 1D points below
            int q = odd ? p : endPoint - p;

            FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge],
                                          outsideTessFactorOdd[edge], q);
            if (edge == 0) {
               DefinePoint(&ctx.Point[pointOffset], 0, fxpParam);
            } else {
               DefinePoint(&ctx.Point[pointOffset], fxpParam,
                           (edge == 2) ? FXP_ONE - fxpParam : 0);
            }
         }
      }

      // Generate interior ring points, clockwise spiralling in
      int numRings = (numPointsForInsideTessFactor >> 1);
      for (int ring = 1; ring < numRings; ring++) {
         int startPoint = ring;
         int endPoint = numPointsForInsideTessFactor - 1 - startPoint;

         for (int edge = 0; edge < TRI_EDGES; edge++) {
            int odd = edge & 0x1;
            int perpendicularAxisPoint = startPoint;
            FXP fxpPerpParam =
               PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd,
                              perpendicularAxisPoint);

            // Map location to the right size in
            // barycentric space. We know this fixed
            // point math won't over/underflow
            fxpPerpParam *= FXP_TWO_THIRDS;
            fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >>
                           FXP_FRACTION_BITS; // get back to n.16

            // don't include end: next edge starts with it.
            for (int p = startPoint; p < endPoint; p++, pointOffset++) {
               // whether to reverse point given we are defining V or U (W
               // implicit): edge0, VW, has V decreasing, so reverse 1D points
               // below edge1, WU, has U increasing, so don't reverse 1D points
               // below edge2, UV, has U decreasing, so reverse 1D points below
               int q = odd ? p : endPoint - (p - startPoint);

               FXP fxpParam =
                  PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q);
               // edge0 VW, has perpendicular parameter U constant
               // edge1 WU, has perpendicular parameter V constant
               // edge2 UV, has perpendicular parameter W constant
               // reciprocal is the rate of change of edge-parallel parameters
               // as they are pushed into the triangle
               const unsigned int deriv = 2;

               // we know this fixed point math won't over/underflow
               FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv;

               DefinePoint(&ctx.Point[pointOffset],
                           edge > 0 ? tmp : fxpPerpParam,
                           edge == 0 ? tmp
                           : edge == 1 ? fxpPerpParam
                                       : FXP_ONE - tmp - fxpPerpParam);
            }
         }
      }
      if (!insideTessFactorOdd) {
         // Last point is the point at the center.
         DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD);
      }
   }

   if (output_primitive == LIBAGX_TESS_OUTPUT_POINT) {
      libagx_draw_points(&ctx, p, patch, NumPoints);
      return;
   }

   {
      // Generate primitives for all the concentric rings, one side at a time
      // for each ring. +1 is so even tess includes the center point, which we
      // want now
      int numRings = ((numPointsForInsideTessFactor + 1) >> 1);

      int NumIndices = 0;
      {
         assert(numRings >= 2 && "invariant");
         int OuterPoints = numPointsForOutsideEdge[0] +
                           numPointsForOutsideEdge[1] +
                           numPointsForOutsideEdge[2];

         // Closed-form index count over all rings (3 indices per triangle).
         int numRings18 = numRings * 18;
         NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) +
                      (3 * OuterPoints) - (numRings18 * (numRings - 1)) +
                      (insideTessFactorOdd ? 3 : 0);
      }

      // Generate the draw and allocate the index buffer now that we know the size
      ctx.Index = libagx_draw(p, mode, false, patch, NumIndices);

      if (mode == LIBAGX_TESS_MODE_COUNT)
         return;

      int insideOffset = insideEdgePointBaseOffset;
      int outsideEdgePointBaseOffset = 0;

      NumIndices = 0;
      for (int ring = 1; ring < numRings; ring++) {
         int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring;
         int edge0InsidePointBaseOffset = insideOffset;
         int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset;
         for (int edge = 0; edge < TRI_EDGES; edge++) {
            int outsidePoints = ring == 1 ? numPointsForOutsideEdge[edge]
                                          : (numPointsForInsideEdge + 2);

            int numTriangles = numPointsForInsideEdge + outsidePoints - 2;

            int insideBaseOffset;
            int outsideBaseOffset;
            if (edge == 2) {
               // The last edge wraps around to the start of the ring; set up
               // index patching so Stitch*() can pretend it sees two plain
               // sequentially-increasing rows of points.
               ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue =
                  insideOffset;
               ctx.IndexPatchCtx.insidePointIndexBadValue =
                  numPointsForInsideEdge - 1;
               ctx.IndexPatchCtx.insidePointIndexReplacementValue =
                  edge0InsidePointBaseOffset;
               ctx.IndexPatchCtx.outsidePointIndexPatchBase =
                  ctx.IndexPatchCtx.insidePointIndexBadValue +
                  1; // past inside patched index range
               ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue =
                  outsideEdgePointBaseOffset -
                  ctx.IndexPatchCtx.outsidePointIndexPatchBase;
               ctx.IndexPatchCtx.outsidePointIndexBadValue =
                  ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints -
                  1;
               ctx.IndexPatchCtx.outsidePointIndexReplacementValue =
                  edge0OutsidePointBaseOffset;
               ctx.bUsingPatchedIndices = true;
               insideBaseOffset = 0;
               outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase;
            } else {
               insideBaseOffset = insideOffset;
               outsideBaseOffset = outsideEdgePointBaseOffset;
            }
            if (ring == 1) {
               // Outermost ring: TessFactors may differ between outside
               // edge and inside ring, so use the transition stitch.
               StitchTransition(
                  &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset,
                  insideTessFactorCtx.numHalfTessFactorPoints,
                  insideTessFactorOdd, outsideBaseOffset,
                  outsideTessFactorCtx[edge].numHalfTessFactorPoints,
                  outsideTessFactorOdd[edge]);
            } else {
               StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED,
                             /*baseIndexOffset: */ NumIndices,
                             numPointsForInsideEdge, insideBaseOffset,
                             outsideBaseOffset);
            }
            if (2 == edge) {
               ctx.bUsingPatchedIndices = false;
            }
            NumIndices += numTriangles * 3;
            outsideEdgePointBaseOffset += outsidePoints - 1;
            insideOffset += numPointsForInsideEdge - 1;
         }
      }
      if (insideTessFactorOdd) {
         // Triangulate center (a single triangle)
         DefineClockwiseTriangle(&ctx, outsideEdgePointBaseOffset,
                                 outsideEdgePointBaseOffset + 1,
                                 outsideEdgePointBaseOffset + 2,
NumIndices); + NumIndices += 3; + } + } +} + +void +libagx_tess_quad(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_partitioning partitioning, + enum libagx_tess_output_primitive output_primitive, uint patch) +{ + global float *factors = tess_factors(p, patch); + + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Ueq1 = factors[2]; + float tessFactor_Veq1 = factors[3]; + + float insideTessFactor_U = factors[4]; + float insideTessFactor_V = factors[5]; + + // TODO: fix designated initializer optimization in NIR + struct CHWTessellator ctx; + ctx.outputPrimitive = output_primitive; + ctx.Point = NULL; + ctx.Index = NULL; + ctx.mode = mode; + ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE; + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + + // Is the patch culled? + if (!(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0)) { + libagx_draw_empty(p, mode, output_primitive, patch); + return; + } + + FXP outsideTessFactor[QUAD_EDGES]; + FXP insideTessFactor[QUAD_AXES]; + bool outsideTessFactorOdd[QUAD_EDGES]; + bool insideTessFactorOdd[QUAD_AXES]; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[QUAD_EDGES]; + int numPointsForInsideTessFactor[QUAD_AXES]; + int insideEdgePointBaseOffset; + + // Clamp edge TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); + tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); + + float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), + max(tessFactor_Ueq1, tessFactor_Veq1)), + 
max(insideTessFactor_U, insideTessFactor_V)); + + insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); + insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); + // Note the above clamps map NaN to lowerBound + + // Process tessFactors + float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Ueq1, tessFactor_Veq1}; + float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, + insideTessFactor_V}; + int edge, axis; + if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + for (axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && + (1.0f != insideTessFactor_f[axis]); + } + } else { + bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; + } + + // Save fixed point TessFactors + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + for (axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); + } + + if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor[U]) && + (FXP_ONE == insideTessFactor[V]) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Ueq1]) && + (FXP_ONE == outsideTessFactor[Veq1])) { + + /* Just do minimum tess factor */ + if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) { + ctx.Index = libagx_draw(p, mode, false, patch, 6); + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + DefineClockwiseTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); + DefineClockwiseTriangle(&ctx, 1, 2, 3, 
/*indexStorageOffset*/ 3); + } else { + libagx_draw_points(&ctx, p, patch, 4); + } + + global struct libagx_tess_point *points = + libagx_heap_alloc_points(p, patch, 4); + + DefinePoint(&points[0], 0, 0); + DefinePoint(&points[1], FXP_ONE, 0); + DefinePoint(&points[2], FXP_ONE, FXP_ONE); + DefinePoint(&points[3], 0, FXP_ONE); + return; + } + } + + // Compute TessFactor-specific metadata + for (int edge = 0; edge < QUAD_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + + for (int axis = 0; axis < QUAD_AXES; axis++) { + ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], + &insideTessFactorCtx[axis]); + } + + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < QUAD_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 4; + + // inside edge offsets + for (int axis = 0; axis < QUAD_AXES; axis++) { + numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( + insideTessFactorOdd[axis], insideTessFactor[axis]); + int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + numPointsForInsideTessFactor[axis] = + max(pointCountMin, numPointsForInsideTessFactor[axis]); + } + + insideEdgePointBaseOffset = NumPoints; + + // inside storage, including interior edges above + int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * + (numPointsForInsideTessFactor[V] - 2); + NumPoints += numInteriorPoints; + + if (mode != LIBAGX_TESS_MODE_COUNT) { + ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); + + // Generate exterior ring edge points, clockwise from top-left + int pointOffset = 0; + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = edge & 0x1; + // don't include end, since next edge starts with it. 
+ int endPoint = numPointsForOutsideEdge[edge] - 1; + for (int p = 0; p < endPoint; p++, pointOffset++) { + FXP fxpParam; + int q = + ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order + fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], + outsideTessFactorOdd[edge], q); + if (odd) { + DefinePoint(&ctx.Point[pointOffset], fxpParam, + (edge == 3) ? FXP_ONE : 0); + } else { + DefinePoint(&ctx.Point[pointOffset], (edge == 2) ? FXP_ONE : 0, + fxpParam); + } + } + } + + // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) + // spiralling toward center + int minNumPointsForTessFactor = + min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); + // note for even tess we aren't counting center point here. + int numRings = (minNumPointsForTessFactor >> 1); + + for (int ring = 1; ring < numRings; ring++) { + int startPoint = ring; + int endPoint[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 1 - startPoint, + numPointsForInsideTessFactor[V] - 1 - startPoint, + }; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; + int perpendicularAxisPoint = + (edge < 2) ? startPoint : endPoint[odd[0]]; + FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], + insideTessFactorOdd[odd[0]], + perpendicularAxisPoint); + + for (int p = startPoint; p < endPoint[odd[1]]; p++, + pointOffset++) // don't include end: next edge starts with + // it. + { + int q = ((edge == 1) || (edge == 2)) + ? 
p + : endPoint[odd[1]] - (p - startPoint); + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd[1]], + insideTessFactorOdd[odd[1]], q); + if (odd[1]) { + DefinePoint(&ctx.Point[pointOffset], fxpPerpParam, fxpParam); + } else { + DefinePoint(&ctx.Point[pointOffset], fxpParam, fxpPerpParam); + } + } + } + } + // For even tessellation, the inner "ring" is degenerate - a row of points + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + !insideTessFactorOdd[V]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; + for (int p = startPoint; p <= endPoint; p++, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], + insideTessFactorOdd[U], p); + DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); + } + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + !insideTessFactorOdd[U]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; + for (int p = endPoint; p >= startPoint; p--, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], + insideTessFactorOdd[V], p); + DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); + } + } + } + + if (output_primitive == LIBAGX_TESS_OUTPUT_POINT) { + libagx_draw_points(&ctx, p, patch, NumPoints); + return; + } + + /* CONNECTIVITY */ + { + // Generate primitives for all the concentric rings, one side at a time + // for each ring. 
+1 is so even tess includes the center point + int numPointRowsToCenter[QUAD_AXES] = { + (numPointsForInsideTessFactor[U] + 1) >> 1, + (numPointsForInsideTessFactor[V] + 1) >> 1, + }; + + int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); + + /* Calculate # of indices so we can allocate */ + { + /* numPointsForInsideTessFactor >= 3 so numRings >= 2 */ + assert(numRings >= 2); + + /* Handle main case */ + int OuterPoints = + numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + + numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; + + int InnerPoints = + numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; + + int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - + (InnerPoints * 18) - (24 * numRings * (numRings - 1)); + + /* Determine major/minor axes */ + bool U_major = + (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); + unsigned M = U_major ? U : V; + unsigned m = U_major ? V : U; + + /* Handle degenerate ring */ + if (insideTessFactorOdd[m]) { + assert(numPointsForInsideTessFactor[M] >= + numPointsForInsideTessFactor[m]); + + NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - + (numPointsForInsideTessFactor[m] >> 1)); + NumIndices += (insideTessFactorOdd[M] ? 6 : 12); + } + + // Generate the draw and allocate the index buffer with the size + ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); + } + + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + int degeneratePointRing[QUAD_AXES] = { + // Even partitioning causes degenerate row of points, + // which results in exceptions to the point ordering conventions + // when travelling around the rings counterclockwise. + !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, + !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, + }; + + int numPointsForOutsideEdge_[QUAD_EDGES] = { + numPointsForOutsideEdge[Ueq0], + numPointsForOutsideEdge[Veq0], + numPointsForOutsideEdge[Ueq1], + numPointsForOutsideEdge[Veq1], + }; + + int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + + int NumIndices = 0; + + for (int ring = 1; ring < numRings; ring++) { + int numPointsForInsideEdge[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 2 * ring, + numPointsForInsideTessFactor[V] - 2 * ring}; + + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = (edge + 1) & 0x1; + + int numTriangles = + numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; + int insideBaseOffset; + int outsideBaseOffset; + + // We need to patch the indexing so Stitch() can think it sees 2 + // sequentially increasing rows of points, even though we have + // wrapped around to the end of the inner and outer ring's points, + // so the last point is really the first point for the ring. We make + // it so that when Stitch() calls AddIndex(), that function will do + // any necessary index adjustment. 
+ if (edge == 3) { + if (ring == degeneratePointRing[odd]) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_ + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = + outsideEdgePointBaseOffset + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + edge0OutsidePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx.insidePointIndexBadValue = + numPointsForInsideEdge[odd] - 1; + ctx.IndexPatchCtx.insidePointIndexReplacementValue = + edge0InsidePointBaseOffset; + ctx.IndexPatchCtx.outsidePointIndexPatchBase = + ctx.IndexPatchCtx.insidePointIndexBadValue + + 1; // past inside patched index range + ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = + outsideEdgePointBaseOffset - + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.IndexPatchCtx.outsidePointIndexBadValue = + ctx.IndexPatchCtx.outsidePointIndexPatchBase + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx.outsidePointIndexReplacementValue = + edge0OutsidePointBaseOffset; + + insideBaseOffset = 0; + outsideBaseOffset = + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.bUsingPatchedIndices = true; + } + } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert << 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + insideBaseOffset = 
insideEdgePointBaseOffset_; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if (ring == 1) { + StitchTransition( + &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, + insideTessFactorCtx[odd].numHalfTessFactorPoints, + insideTessFactorOdd[odd], outsideBaseOffset, + outsideTessFactorCtx[edge].numHalfTessFactorPoints, + outsideTessFactorOdd[edge]); + } else { + StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */ NumIndices, + numPointsForInsideEdge[odd], insideBaseOffset, + outsideBaseOffset); + } + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + NumIndices += numTriangles * 3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; + if ((edge == 2) && (ring == degeneratePointRing[odd])) { + insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; + } else { + insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; + } + numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; + } + } + + // Triangulate center - a row of quads if odd + // This triangulation may be producing diagonals that are asymmetric about + // the center of the patch in this region. + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + insideTessFactorOdd[V]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - + (numPointsForInsideTessFactor[V] >> 1)) + << 1) + + (insideTessFactorOdd[U] ? 
1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 2; + ctx.IndexPatchCtx2.cornerCaseBadValue = + ctx.IndexPatchCtx2.baseIndexToInvert; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + outsideEdgePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + StitchRegular( + &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset + 1); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + insideTessFactorOdd[U]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - + (numPointsForInsideTessFactor[U] >> 1)) + << 1) + + (insideTessFactorOdd[V] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + DIAGONALS diag = insideTessFactorOdd[V] + ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE + : DIAGONALS_INSIDE_TO_OUTSIDE; + StitchRegular( + &ctx, /*bTrapezoid*/ false, diag, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } + } +} diff --git a/src/asahi/lib/shaders/tessellator.h b/src/asahi/lib/shaders/tessellator.h new file mode 100644 index 00000000000..ec674f43764 --- /dev/null +++ b/src/asahi/lib/shaders/tessellator.h @@ -0,0 +1,124 @@ +/* + * Copyright 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "libagx.h" + +enum libagx_tess_partitioning { + LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD, + LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN, + LIBAGX_TESS_PARTITIONING_INTEGER, +}; + +enum libagx_tess_output_primitive { + LIBAGX_TESS_OUTPUT_POINT, + LIBAGX_TESS_OUTPUT_TRIANGLE_CW, + LIBAGX_TESS_OUTPUT_TRIANGLE_CCW, +}; + +enum libagx_tess_mode { + /* Do not actually tessellate, just write the index counts */ + LIBAGX_TESS_MODE_COUNT, + + /* Tessellate using the count buffers to allocate indices */ + LIBAGX_TESS_MODE_WITH_COUNTS, + + /* Tessellate without count buffers by generating VDM index list words */ + LIBAGX_TESS_MODE_VDM, +}; + +struct libagx_tess_point { + float u; + float v; +}; +AGX_STATIC_ASSERT(sizeof(struct libagx_tess_point) == 8); + +struct libagx_tess_args { + /* Heap to allocate tessellator outputs in */ + GLOBAL(struct agx_geometry_state) heap; + + /* Patch coordinate buffer, indexed as: + * + * coord_allocs[patch_ID] + vertex_in_patch + */ + GLOBAL(struct libagx_tess_point) patch_coord_buffer; + + /* Per-patch index within the heap for the tess coords, written by the + * tessellator based on the allocated memory. + */ + GLOBAL(uint32_t) coord_allocs; + + /* Space for output draws from the tessellator. 
Either API draw calls or + * VDM control words, depending on the mode. */ + GLOBAL(uint32_t) out_draws; + + /* Tessellation control shader output buffer. */ + GLOBAL(float) tcs_buffer; + + /* Count buffer. # of indices per patch written here, then prefix summed. */ + GLOBAL(uint32_t) counts; + + /* Allocated index buffer for all patches, if we're prefix summing counts */ + GLOBAL(uint32_t) index_buffer; + + /* Address of the tess eval invocation counter for implementing pipeline + * statistics, if active. Zero if inactive. Incremented by tessellator. + */ + GLOBAL(uint32_t) statistic; + + /* Address of the tess control invocation counter for implementing pipeline + * statistics, if active. Zero if inactive. Incremented by indirect tess + * setup kernel. + */ + GLOBAL(uint32_t) tcs_statistic; + + /* For indirect draws with tessellation, the grid sizes. VS then TCS then + * tess. Allocated by the CPU and written by the tessellation + * setup indirect kernel. + */ + GLOBAL(uint32_t) grids; + + /* For indirect draws, the indirect draw descriptor. */ + GLOBAL(uint32_t) indirect; + + /* For indirect draws, the allocation for the vertex buffer. + * + * TODO: We could move these fields to an indirect setup kernel, not sure if + * it's worth it though... + */ + GLOBAL(uint64_t) vertex_output_buffer_ptr; + + /* For indirect draws, the bitfield of VS outputs */ + uint64_t vertex_outputs; + + /* Bitfield of TCS per-vertex outputs */ + uint64_t tcs_per_vertex_outputs; + + /* Default tess levels used in OpenGL when there is no TCS in the pipeline. + * Unused in Vulkan and OpenGL ES. 
+ */ + float tess_level_outer_default[4]; + float tess_level_inner_default[2]; + + /* Number of vertices in the input patch */ + uint32_t input_patch_size; + + /* Number of vertices in the TCS output patch */ + uint32_t output_patch_size; + + /* Number of patch constants written by TCS */ + uint32_t tcs_patch_constants; + + /* Number of input patches per instance of the VS/TCS */ + uint32_t patches_per_instance; + + /* Stride between tessellation factors in the TCS output buffer. */ + uint32_t tcs_stride_el; + + /* Number of patches being tessellated */ + uint32_t nr_patches; +} PACKED; +AGX_STATIC_ASSERT(sizeof(struct libagx_tess_args) == 40 * 4); diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c index 58835b77d8e..65fc3731e17 100644 --- a/src/gallium/drivers/asahi/agx_query.c +++ b/src/gallium/drivers/asahi/agx_query.c @@ -553,8 +553,8 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx, ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, false, &cb); - struct pipe_grid_info grid = {.block = {1, 1, 1}, .grid = {1, 1, 1}}; - agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, 0); /* take_ownership=true so do not unreference */ ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, true, diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 9b655713e7f..5e26b2696d7 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -16,6 +16,7 @@ #include "asahi/lib/agx_nir_passes.h" #include "asahi/lib/agx_ppp.h" #include "asahi/lib/agx_usc.h" +#include "asahi/lib/shaders/tessellator.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_serialize.h" #include "compiler/shader_enums.h" @@ -32,7 +33,6 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include 
"tessellator/p_tessellator.h" #include "util/bitscan.h" #include "util/bitset.h" #include "util/blend.h" @@ -3047,7 +3047,7 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs, static uint32_t agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs, - void *data, size_t data_size) + uint64_t data) { size_t usc_size = agx_usc_size(12); @@ -3056,8 +3056,7 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs, struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); - uint64_t ptr = agx_pool_upload(&batch->pool, data, data_size); - agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &ptr, 8)); + agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &data, 8)); agx_usc_immediates(&b, batch, cs); assert(cs->b.info.scratch_size == 0 && "internal kernels don't spill"); @@ -3093,16 +3092,26 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs, return t.gpu; } -void -agx_launch_with_data(struct agx_batch *batch, const struct pipe_grid_info *info, - meta_shader_builder_t builder, void *key, size_t key_size, - void *data, size_t data_size) +static void +agx_launch_with_uploaded_data(struct agx_batch *batch, + const struct agx_grid *grid, + meta_shader_builder_t builder, void *key, + size_t key_size, uint64_t data) { struct agx_compiled_shader *cs = agx_build_meta_shader_internal( batch->ctx, builder, key, key_size, false, false, 0, true); - uint32_t usc = agx_build_internal_usc(batch, cs, data, data_size); - agx_launch_internal(batch, info, cs, PIPE_SHADER_COMPUTE, usc); + uint32_t usc = agx_build_internal_usc(batch, cs, data); + agx_launch_internal(batch, grid, cs, PIPE_SHADER_COMPUTE, usc); +} + +void +agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid, + meta_shader_builder_t builder, void *key, size_t key_size, + void *data, size_t data_size) +{ + uint64_t upload = agx_pool_upload_aligned(&batch->pool, data, data_size, 4); + 
agx_launch_with_uploaded_data(batch, grid, builder, key, key_size, upload); } struct asahi_bg_eot @@ -4053,6 +4062,17 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, 8, &batch->geom_params_bo); } +static uint64_t +agx_indirect_buffer_ptr(struct agx_batch *batch, + const struct pipe_draw_indirect_info *indirect) +{ + assert(indirect->buffer && "drawauto already handled"); + + struct agx_resource *rsrc = agx_resource(indirect->buffer); + agx_batch_reads(batch, rsrc); + return rsrc->bo->ptr.gpu + indirect->offset; +} + static void agx_launch_gs_prerast(struct agx_batch *batch, const struct pipe_draw_info *info, @@ -4084,23 +4104,14 @@ agx_launch_gs_prerast(struct agx_batch *batch, assert(!info->primitive_restart && "should have been lowered"); - struct pipe_grid_info grid_vs = {.block = {1, 1, 1}}; - struct pipe_grid_info grid_gs = {.block = {1, 1, 1}}; - struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo}; + struct agx_grid grid_vs, grid_gs; /* Setup grids */ if (indirect) { - assert(indirect->buffer && "drawauto already handled"); - struct agx_gs_setup_indirect_key key = { .prim = info->mode, }; - const struct pipe_grid_info grid_setup = { - .block = {1, 1, 1}, - .grid = {1, 1, 1}, - }; - uint64_t ib = 0; size_t ib_extent = 0; @@ -4109,13 +4120,10 @@ agx_launch_gs_prerast(struct agx_batch *batch, &ib_extent); } - struct agx_resource *rsrc = agx_resource(indirect->buffer); - agx_batch_reads(batch, rsrc); - struct agx_gs_setup_indirect_params gsi = { .index_buffer = ib, .index_buffer_range_el = ib_extent / info->index_size, - .draw = rsrc->bo->ptr.gpu + indirect->offset, + .draw = agx_indirect_buffer_ptr(batch, indirect), .vertex_buffer = batch->uniforms.vertex_output_buffer_ptr, .ia = batch->uniforms.input_assembly, .geom = batch->uniforms.geometry_params, @@ -4123,62 +4131,50 @@ agx_launch_gs_prerast(struct agx_batch *batch, .index_size_B = info->index_size, }; + const struct agx_grid grid_setup = 
agx_grid_direct(1, 1, 1, 1, 1, 1); agx_launch_with_data(batch, &grid_setup, agx_nir_gs_setup_indirect, &key, sizeof(key), &gsi, sizeof(gsi)); - /* Wrap the pool allocation in a fake resource for meta-Gallium use */ - assert(batch->geom_params_bo != NULL); - grid_vs.indirect = &grid_indirect_rsrc.base; - grid_gs.indirect = &grid_indirect_rsrc.base; + uint64_t gp = batch->uniforms.geometry_params; - unsigned param_offs = - (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu); + grid_vs = agx_grid_indirect( + gp + offsetof(struct agx_geometry_params, vs_grid), 1, 1, 1); - grid_vs.indirect_offset = - param_offs + offsetof(struct agx_geometry_params, vs_grid); - - grid_gs.indirect_offset = - param_offs + offsetof(struct agx_geometry_params, gs_grid); + grid_gs = agx_grid_indirect( + gp + offsetof(struct agx_geometry_params, gs_grid), 1, 1, 1); } else { - grid_vs.grid[0] = draws->count; - grid_vs.grid[1] = info->instance_count; - grid_vs.grid[2] = 1; + grid_vs = + agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1); - grid_gs.grid[0] = - u_decomposed_prims_for_vertices(info->mode, draws->count); - grid_gs.grid[1] = info->instance_count; - grid_gs.grid[2] = 1; + grid_gs = agx_grid_direct( + u_decomposed_prims_for_vertices(info->mode, draws->count), + info->instance_count, 1, 64, 1, 1); } /* Launch the vertex shader first */ - agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage); + agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0); /* If there is a count shader, launch it and prefix sum the results. 
*/ if (gs->gs_count) { perf_debug(dev, "Geometry shader count"); - agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY, 0); unsigned words = gs->gs_count_words; - agx_launch(batch, - &(const struct pipe_grid_info){ - .block = {1024, 1, 1}, - .grid = {gs->gs_count_words, 1, 1}, - }, + struct agx_grid grid = + agx_grid_direct(1024 * gs->gs_count_words, 1, 1, 1024, 1, 1); + + agx_launch(batch, &grid, agx_build_meta_shader(ctx, agx_nir_prefix_sum_gs, &words, sizeof(words)), - NULL, PIPE_SHADER_COMPUTE); + NULL, PIPE_SHADER_COMPUTE, 0); } /* Pre-GS shader */ - agx_launch(batch, - &(const struct pipe_grid_info){ - .block = {1, 1, 1}, - .grid = {1, 1, 1}, - }, - gs->pre_gs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + agx_launch(batch, &grid, gs->pre_gs, NULL, PIPE_SHADER_COMPUTE, 0); /* Pre-rast geometry shader */ - agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY, 0); } static void @@ -4249,10 +4245,8 @@ agx_draw_without_restart(struct agx_batch *batch, }; /* Unroll the index buffer for each draw */ - const struct pipe_grid_info grid_setup = { - .block = {1024, 1, 1}, - .grid = {indirect->draw_count, 1, 1}, - }; + const struct agx_grid grid_setup = + agx_grid_direct(1024 * indirect->draw_count, 1, 1, 1024, 1, 1); agx_launch_with_data(batch, &grid_setup, agx_nir_unroll_restart, &key, sizeof(key), &unroll, sizeof(unroll)); @@ -4310,8 +4304,9 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return true; } - /* TODO: this is sloppy, we should add a VDM kernel for this. */ - if (indirect && ctx->active_queries && ctx->prims_generated[0]) { + /* TODO: this is really sloppy, we should add a VDM kernel for this. 
*/ + if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries && + ctx->prims_generated[0]) { perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query"); return true; } @@ -4333,8 +4328,11 @@ agx_needs_passthrough_gs(struct agx_context *ctx, /* Transform feedback is layered on geometry shaders, so if transform * feedback is used, we need a GS. */ - if (ctx->stage[PIPE_SHADER_VERTEX].shader->has_xfb_info && - ctx->streamout.num_targets) { + struct agx_uncompiled_shader *last_vtx = + ctx->stage[PIPE_SHADER_TESS_EVAL].shader + ?: ctx->stage[PIPE_SHADER_VERTEX].shader; + + if (last_vtx->has_xfb_info && ctx->streamout.num_targets) { *xfb_only = true; return true; } @@ -4343,6 +4341,20 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return false; } +static enum mesa_prim +agx_tess_output_prim(struct agx_uncompiled_shader *tcs, + struct agx_uncompiled_shader *tes) +{ + if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) { + return MESA_PRIM_POINTS; + } else if (TESS_PRIMITIVE_ISOLINES == + MAX2(tcs ? tcs->tess.primitive : 0, tes->tess.primitive)) { + return MESA_PRIM_LINES; + } else { + return MESA_PRIM_TRIANGLES; + } +} + static struct agx_uncompiled_shader * agx_get_passthrough_gs(struct agx_context *ctx, struct agx_uncompiled_shader *prev_cso, @@ -4350,11 +4362,16 @@ agx_get_passthrough_gs(struct agx_context *ctx, { bool edgeflags = has_edgeflags(ctx, mode); + if (mode == MESA_PRIM_PATCHES) { + mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader, + ctx->stage[MESA_SHADER_TESS_EVAL].shader); + } + /* Only handle the polygon mode when edge flags are in use, because * nir_passthrough_gs doesn't handle transform feedback + polygon mode - * properly. Technically this can break edge flags + transform feedback but - * that's firmly in "doctor, it hurts when I do this" territory, and I'm not - * sure that's even possible to hit. TODO: Reevaluate. + * properly. 
Technically this can break edge flags + transform feedback + * but that's firmly in "doctor, it hurts when I do this" territory, and + * I'm not sure that's even possible to hit. TODO: Reevaluate. */ unsigned poly_mode = edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL; @@ -4525,35 +4542,39 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, unbind_tcs_when_done = true; } - unsigned in_vertices = draws->count; - unsigned in_patches = in_vertices / patch_vertices; + enum tess_primitive_mode mode = + MAX2(tcs->tess.primitive, tes->tess.primitive); + enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing); - if (in_patches == 0) - return; + enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL + ? PIPE_TESS_SPACING_EQUAL + : spacing == TESS_SPACING_FRACTIONAL_ODD + ? PIPE_TESS_SPACING_FRACTIONAL_ODD + : PIPE_TESS_SPACING_FRACTIONAL_EVEN; - /* TCS invocation counter increments once per-patch */ - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS], - in_patches); + bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); + enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes); - struct agx_batch *batch = agx_get_compute_batch(ctx); + enum libagx_tess_partitioning partitioning = + (enum libagx_tess_partitioning)pspacing; + + enum libagx_tess_output_primitive prim = + point_mode ? LIBAGX_TESS_OUTPUT_POINT + : !tes->tess.ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW + : LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + struct agx_bo *draw_bo = NULL; + bool with_counts = + indirect || ctx->stage[MESA_SHADER_GEOMETRY].shader != NULL; + size_t draw_stride = + ((!with_counts && point_mode) ? 
4 : 6) * sizeof(uint32_t); + + struct agx_batch *batch = agx_get_batch(ctx); agx_batch_init_state(batch); - struct pipe_resource *heap = - pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL, PIPE_USAGE_DEFAULT, - 1024 * 1024 * 128); - - uint64_t heap_gpu = agx_resource(heap)->bo->ptr.gpu; - uint8_t *heap_cpu = agx_resource(heap)->bo->ptr.cpu; - - unsigned unrolled_patch_count = in_patches * info->instance_count; - - uint32_t heap_water = 0; - uint32_t tcs_out_offs = heap_water; - heap_water += ALIGN(unrolled_patch_count * tcs->tess.output_stride, 4); - - agx_batch_writes(batch, agx_resource(heap), 0); - batch->incoherent_writes = true; + if (!batch->cdm.bo) { + batch->cdm = agx_encoder_allocate(batch, dev); + } uint64_t ib = 0; size_t ib_extent = 0; @@ -4573,25 +4594,29 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, agx_upload_draw_params(batch, indirect, draws, info); /* Setup parameters */ - struct agx_tess_params tess_params = { - .tcs_buffer = heap_gpu + tcs_out_offs, + uint64_t geom_state = agx_batch_geometry_state(batch); + assert((tcs->tess.output_stride & 3) == 0 && "must be aligned"); + + struct libagx_tess_args args = { + .heap = geom_state, + .tcs_stride_el = tcs->tess.output_stride / 4, + .statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]), .input_patch_size = patch_vertices, .output_patch_size = tcs->tess.output_patch_size, .tcs_patch_constants = tcs->tess.nr_patch_outputs, .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs, - .patch_coord_buffer = heap_gpu, - .patches_per_instance = in_patches, + .patch_coord_buffer = agx_resource(ctx->heap)->bo->ptr.gpu, }; - memcpy(&tess_params.tess_level_outer_default, ctx->default_outer_level, + memcpy(&args.tess_level_outer_default, ctx->default_outer_level, sizeof(ctx->default_outer_level)); - memcpy(&tess_params.tess_level_inner_default, ctx->default_inner_level, + memcpy(&args.tess_level_inner_default, 
ctx->default_inner_level, sizeof(ctx->default_inner_level)); - batch->uniforms.tess_params = - agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params)); + struct agx_grid vs_grid, tcs_grid, tess_grid; + unsigned tess_wg_size = 64; - /* Run VS+TCS as compute */ agx_upload_vbos(batch); agx_update_vs(ctx, info->index_size); agx_update_tcs(ctx, info); @@ -4605,153 +4630,184 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs; - unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, - batch->uniforms.vertex_outputs); - uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; - batch->uniforms.vertex_output_buffer_ptr = - agx_pool_upload(&batch->pool, &addr, 8); + if (indirect == NULL) { + unsigned in_patches = draws->count / patch_vertices; + if (in_patches == 0) + return; - struct pipe_grid_info vs_grid = { - .block = {1, 1, 1}, - .grid = {draws->count, info->instance_count, 1}, - }; + /* TCS invocation counter increments once per-patch */ + agx_query_increment_cpu( + ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS], + in_patches); - agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX); + unsigned unrolled_patches = in_patches * info->instance_count; - struct pipe_grid_info tcs_grid = { - .block = {tcs->tess.output_patch_size, 1, 1}, - .grid = {in_patches, info->instance_count, 1}, - }; + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * tcs->tess.output_stride; - agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL); + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t); + + uint32_t draw_offs = alloc; + + if (with_counts) { + alloc += draw_stride; + } else { + /* Padding added because VDM overreads */ + alloc += + (draw_stride * unrolled_patches) + 
(AGX_VDM_BARRIER_LENGTH + 0x800); + } + + struct agx_ptr blob = + agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo); + + args.tcs_buffer = blob.gpu + tcs_out_offs; + args.patches_per_instance = in_patches; + args.coord_allocs = blob.gpu + patch_coord_offs; + args.nr_patches = unrolled_patches; + args.out_draws = blob.gpu + draw_offs; + + if (with_counts) { + args.counts = blob.gpu + count_offs; + } else { + /* Arrange so we return after all generated draws */ + uint8_t *ret = + (uint8_t *)blob.cpu + draw_offs + (draw_stride * unrolled_patches); + + agx_pack(ret, VDM_BARRIER, cfg) { + cfg.returns = true; + } + } + + unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, + batch->uniforms.vertex_outputs); + uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; + batch->uniforms.vertex_output_buffer_ptr = + agx_pool_upload(&batch->pool, &addr, 8); + + vs_grid = + agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1); + + tcs_grid = agx_grid_direct(in_patches * tcs->tess.output_patch_size, + info->instance_count, 1, + tcs->tess.output_patch_size, 1, 1); + + tess_grid = agx_grid_direct(unrolled_patches, 1, 1, tess_wg_size, 1, 1); + } else if (indirect) { + args.tcs_statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]); + + args.indirect = agx_indirect_buffer_ptr(batch, indirect); + + /* Allocate 3x indirect global+local grids for VS/TCS/tess */ + uint32_t grid_stride = sizeof(uint32_t) * 6; + args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu; + + vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride); + tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride); + tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride); + + args.vertex_outputs = ctx->vs->b.info.outputs; + args.vertex_output_buffer_ptr = + agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu; + + batch->uniforms.vertex_output_buffer_ptr = 
args.vertex_output_buffer_ptr; + + if (with_counts) { + args.out_draws = agx_pool_alloc_aligned_with_bo( + &batch->pool, draw_stride, 4, &draw_bo) + .gpu; + } else { + unreachable("need an extra indirection..."); + } + } + + uint64_t state = + agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4); + + if (indirect) { + const struct agx_grid indirect_grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + struct agx_tess_setup_indirect_key indirect_key = { + .point_mode = point_mode, + .with_counts = with_counts, + }; + + agx_launch_with_uploaded_data(batch, &indirect_grid, + agx_nir_tess_setup_indirect, &indirect_key, + sizeof(indirect_key), state); + } + + batch->uniforms.tess_params = state; + + agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX, 0); + agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0); batch->uniforms.vertex_output_buffer_ptr = 0; - agx_flush_all(ctx, "HACK"); - agx_sync_all(ctx, "HACK"); + struct agx_tessellator_key key = { + .prim = mode, + .output_primitive = prim, + .partitioning = partitioning, + }; - /* Setup batch */ - batch = agx_get_batch(ctx); + if (with_counts) { + /* Generate counts */ + key.mode = LIBAGX_TESS_MODE_COUNT; + agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key, + sizeof(key), state); - enum tess_primitive_mode mode = - MAX2(tcs->tess.primitive, tes->tess.primitive); - enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing); + /* Prefix sum counts, allocating index buffer space. */ + const struct agx_grid prefix_sum_grid = + agx_grid_direct(1024, 1, 1, 1024, 1, 1); - enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL - ? PIPE_TESS_SPACING_EQUAL - : spacing == TESS_SPACING_FRACTIONAL_ODD - ? 
PIPE_TESS_SPACING_FRACTIONAL_ODD - : PIPE_TESS_SPACING_FRACTIONAL_EVEN; + agx_launch_with_uploaded_data(batch, &prefix_sum_grid, + agx_nir_prefix_sum_tess, NULL, 0, state); - bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); - enum mesa_prim in_prim = mode == TESS_PRIMITIVE_ISOLINES ? MESA_PRIM_LINES - : mode == TESS_PRIMITIVE_QUADS - ? MESA_PRIM_QUADS - : MESA_PRIM_TRIANGLES; - enum mesa_prim out_prim = point_mode ? MESA_PRIM_POINTS - : mode == TESS_PRIMITIVE_ISOLINES - ? MESA_PRIM_LINES - : MESA_PRIM_TRIANGLES; - - struct pipe_tessellator *tess = - p_tess_init(in_prim, pspacing, tes->tess.ccw, point_mode); - - struct pipe_tessellator_data data = {0}; - - /* Mem allocate */ - uint32_t patch_coord_offs_offs = heap_water; - tess_params.patch_coord_offs = heap_gpu + heap_water; - heap_water += align(4 * unrolled_patch_count, 4); - - uint32_t draws_off = heap_water; - uint32_t *patch_draws = (uint32_t *)(heap_cpu + heap_water); - heap_water += align(sizeof(uint32_t) * 5 * unrolled_patch_count, 4); - - uint32_t *patch_offs = (uint32_t *)(heap_cpu + patch_coord_offs_offs); - - for (unsigned patch = 0; patch < unrolled_patch_count; ++patch) { - float *addr = - (float *)(heap_cpu + tcs_out_offs + tcs->tess.output_stride * patch); - - struct pipe_tessellation_factors factors = { - .outer_tf = {addr[0], addr[1], addr[2], addr[3]}, - .inner_tf = {addr[4], addr[5]}, - }; - p_tessellate(tess, &factors, &data); - - /* Mem allocate indices */ - uint32_t index_off = heap_water; - uint16_t *indices = (uint16_t *)(heap_cpu + heap_water); - heap_water += align(sizeof(*indices) * data.num_indices, 4); - - for (unsigned idx = 0; idx < data.num_indices; ++idx) { - indices[idx] = data.indices[idx]; - } - - /* Mem allocate patch coords */ - heap_water = align(heap_water, 8); - patch_offs[patch] = heap_water / 8; - float *patch_coords = (float *)(heap_cpu + heap_water); - heap_water += align(8 * data.num_domain_points, 4); - - for (unsigned p = 0; p < 
data.num_domain_points; ++p) { - patch_coords[2 * p + 0] = data.domain_points_u[p]; - patch_coords[2 * p + 1] = data.domain_points_v[p]; - } - assert(data.num_indices < 32768); - assert(data.num_domain_points < 8192); - - /* Generate a draw for the patch */ - uint32_t *desc = patch_draws + (patch * 5); - - desc[0] = data.num_indices; /* count */ - desc[1] = 1; /* instance_count */ - desc[2] = index_off / sizeof(*indices); /* start */ - desc[3] = patch * LIBAGX_TES_PATCH_ID_STRIDE; /* index_bias */ - desc[4] = 0; /* start_instance */ - - /* TES invocation counter increments once per tessellated vertex */ - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS], - data.num_domain_points); + key.mode = LIBAGX_TESS_MODE_WITH_COUNTS; + } else { + key.mode = LIBAGX_TESS_MODE_VDM; } - p_tess_destroy(tess); + + /* Now we can tessellate */ + agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key, + sizeof(key), state); /* Run TES as VS */ void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader; void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader; ctx->base.bind_vs_state(&ctx->base, tes_cso); ctx->in_tess = true; + ctx->in_generated_vdm = !with_counts; struct pipe_draw_info draw_info = { .mode = out_prim, - .index_size = 2, - .index.resource = heap, + .index_size = with_counts ? 4 : (point_mode ? 0 : 2), + .index.resource = (!with_counts && point_mode) ? NULL : ctx->heap, .instance_count = 1, .view_mask = info->view_mask, }; /* Wrap the pool allocation in a fake resource for meta-Gallium use */ - struct pipe_draw_indirect_info copy_indirect = { - .buffer = heap, - .offset = draws_off, - .stride = 5 * sizeof(uint32_t), - .draw_count = in_patches * info->instance_count, - }; + struct agx_resource indirect_rsrc = {.bo = draw_bo}; - /* Tess param upload is deferred to draw_vbo since the batch may change - * within draw_vbo for various reasons, so we can't upload it to the batch - * upfront. 
- */ - memcpy(&ctx->tess_params, &tess_params, sizeof(tess_params)); + struct pipe_draw_indirect_info copy_indirect = { + .buffer = &indirect_rsrc.base, + .offset = args.out_draws - draw_bo->ptr.gpu, + .stride = draw_stride, + .draw_count = 1, + }; ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1); /* Restore vertex state */ ctx->base.bind_vs_state(&ctx->base, vs_cso); + ctx->in_generated_vdm = false; ctx->in_tess = false; - pipe_resource_reference(&heap, NULL); - if (unbind_tcs_when_done) { ctx->base.bind_tcs_state(&ctx->base, NULL); } @@ -4860,13 +4916,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } - /* TODO: stop cheating */ - if (info->mode == MESA_PRIM_PATCHES && indirect) { - perf_debug_ctx(ctx, "indirect tessellation"); - util_draw_indirect(pctx, info, drawid_offset, indirect); - return; - } - /* TODO: stop cheating */ if (ctx->active_queries && !ctx->active_draw_without_restart && (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] || @@ -4878,11 +4927,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } - if (info->mode == MESA_PRIM_PATCHES) { - agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws); - return; - } - bool xfb_passthrough = false; if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) { agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
batch->uniforms.tess_params = agx_pool_upload( - &batch->pool, &ctx->tess_params, sizeof(ctx->tess_params)); - } - if (IS_DIRTY(VERTEX)) { agx_upload_vbos(batch); } @@ -5143,63 +5187,75 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, uint8_t *out = agx_encode_state(batch, batch->vdm.current); - if (info->index_size) { - agx_push(out, VDM_STATE, cfg) - cfg.restart_index_present = true; - - agx_push(out, VDM_STATE_RESTART_INDEX, cfg) - cfg.value = info->restart_index; - } - - agx_push(out, INDEX_LIST, cfg) { - cfg.primitive = agx_primitive_for_pipe(info->mode); - - if (indirect != NULL) { - cfg.indirect_buffer_present = true; - } else { - cfg.instance_count_present = true; - cfg.index_count_present = true; - cfg.start_present = true; - } - - if (info->index_size) { - cfg.restart_enable = info->primitive_restart; - cfg.index_buffer_hi = (ib >> 32); - cfg.index_size = agx_translate_index_size(info->index_size); - cfg.index_buffer_present = true; - cfg.index_buffer_size_present = true; - } - } - - if (info->index_size) { - agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { - cfg.buffer_lo = ib & BITFIELD_MASK(32); - } - } - - if (indirect) { + if (ctx->in_generated_vdm) { struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer); uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset; - agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { - cfg.address_hi = address >> 32; - cfg.address_lo = address & BITFIELD_MASK(32); + agx_push(out, VDM_STREAM_LINK, cfg) { + cfg.target_lo = address & BITFIELD_MASK(32); + cfg.target_hi = address >> 32; + cfg.with_return = true; } } else { - agx_push(out, INDEX_LIST_COUNT, cfg) - cfg.count = draws->count; - agx_push(out, INDEX_LIST_INSTANCES, cfg) - cfg.count = info->instance_count; + if (info->index_size && info->primitive_restart) { + agx_push(out, VDM_STATE, cfg) + cfg.restart_index_present = true; - agx_push(out, INDEX_LIST_START, cfg) { - cfg.start = info->index_size ? 
draws->index_bias : draws->start; + agx_push(out, VDM_STATE_RESTART_INDEX, cfg) + cfg.value = info->restart_index; } - } - if (info->index_size) { - agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { - cfg.size = ib_extent; + agx_push(out, INDEX_LIST, cfg) { + cfg.primitive = agx_primitive_for_pipe(info->mode); + + if (indirect != NULL) { + cfg.indirect_buffer_present = true; + } else { + cfg.instance_count_present = true; + cfg.index_count_present = true; + cfg.start_present = true; + } + + if (info->index_size) { + cfg.restart_enable = info->primitive_restart; + cfg.index_buffer_hi = (ib >> 32); + cfg.index_size = agx_translate_index_size(info->index_size); + cfg.index_buffer_present = true; + cfg.index_buffer_size_present = true; + } + } + + if (info->index_size) { + agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib & BITFIELD_MASK(32); + } + } + + if (indirect) { + struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer); + uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset; + + agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { + cfg.address_hi = address >> 32; + cfg.address_lo = address & BITFIELD_MASK(32); + } + } else { + agx_push(out, INDEX_LIST_COUNT, cfg) + cfg.count = draws->count; + + agx_push(out, INDEX_LIST_INSTANCES, cfg) + cfg.count = info->instance_count; + + agx_push(out, INDEX_LIST_START, cfg) { + cfg.start = info->index_size ? 
draws->index_bias : draws->start; + } + } + + if (info->index_size) { + agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = ib_extent; + } } } @@ -5242,7 +5298,7 @@ agx_texture_barrier(struct pipe_context *pipe, unsigned flags) } void -agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, +agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, enum pipe_shader_type stage, uint32_t usc) { @@ -5253,11 +5309,7 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, uint8_t *out = batch->cdm.current; agx_push(out, CDM_LAUNCH_WORD_0, cfg) { - if (info->indirect) - cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; - else - cfg.mode = AGX_CDM_MODE_DIRECT; - + cfg.mode = grid->mode; cfg.uniform_register_count = cs->b.info.push_count; cfg.preshader_register_count = cs->b.info.nr_preamble_gprs; cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, cs); @@ -5275,32 +5327,25 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, ; } - if (info->indirect) { - struct agx_resource *indirect = agx_resource(info->indirect); - uint64_t addr = indirect->bo->ptr.gpu + info->indirect_offset; - - agx_push(out, CDM_INDIRECT, cfg) { - cfg.address_hi = addr >> 32; - cfg.address_lo = addr & BITFIELD64_MASK(32); + if (grid->mode == AGX_CDM_MODE_DIRECT) { + agx_push(out, CDM_GLOBAL_SIZE, cfg) { + cfg.x = grid->global[0]; + cfg.y = grid->global[1]; + cfg.z = grid->global[2]; } } else { - uint32_t size[3]; - for (unsigned d = 0; d < 3; ++d) { - size[d] = ((info->grid[d] - 1) * info->block[d]) + - (info->last_block[d] ?: info->block[d]); - } - - agx_push(out, CDM_GLOBAL_SIZE, cfg) { - cfg.x = size[0]; - cfg.y = size[1]; - cfg.z = size[2]; + agx_push(out, CDM_INDIRECT, cfg) { + cfg.address_hi = grid->indirect >> 32; + cfg.address_lo = grid->indirect & BITFIELD64_MASK(32); } } - agx_push(out, CDM_LOCAL_SIZE, cfg) { - cfg.x = info->block[0]; - cfg.y = 
info->block[1]; - cfg.z = info->block[2]; + if (grid->mode != AGX_CDM_MODE_INDIRECT_LOCAL) { + agx_push(out, CDM_LOCAL_SIZE, cfg) { + cfg.x = grid->local[0]; + cfg.y = grid->local[1]; + cfg.z = grid->local[2]; + } } agx_push(out, CDM_BARRIER, cfg) { @@ -5352,9 +5397,9 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, } void -agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, +agx_launch(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, struct agx_linked_shader *linked, - enum pipe_shader_type stage) + enum pipe_shader_type stage, unsigned variable_shared_mem) { struct agx_context *ctx = batch->ctx; @@ -5362,18 +5407,17 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, * available in GPU memory. This is either the indirect buffer, or just a * buffer we upload ourselves if not indirect. */ - if (info->indirect) { - struct agx_resource *indirect = agx_resource(info->indirect); - agx_batch_reads(batch, indirect); + if (grid->mode == AGX_CDM_MODE_DIRECT) { + uint32_t groups[3] = { + grid->global[0] / grid->local[0], + grid->global[1] / grid->local[1], + grid->global[2] / grid->local[2], + }; batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = - indirect->bo->ptr.gpu + info->indirect_offset; + agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4); } else { - static_assert(sizeof(info->grid) == 12, - "matches indirect dispatch buffer"); - - batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = agx_pool_upload_aligned( - &batch->pool, info->grid, sizeof(info->grid), 4); + batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid->indirect; } util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) { @@ -5403,11 +5447,10 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, } #endif - uint32_t usc = - agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE, - info->variable_shared_mem, subgroups_per_core); + uint32_t usc = 
agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE, + variable_shared_mem, subgroups_per_core); - agx_launch_internal(batch, info, cs, stage, usc); + agx_launch_internal(batch, grid, cs, stage, usc); } static void @@ -5454,7 +5497,29 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) struct agx_compiled_shader *cs = _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data; - agx_launch(batch, info, cs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = { + .local[0] = info->block[0], + .local[1] = info->block[1], + .local[2] = info->block[2], + }; + + if (info->indirect) { + struct agx_resource *indirect = agx_resource(info->indirect); + agx_batch_reads(batch, indirect); + + grid.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; + grid.indirect = indirect->bo->ptr.gpu + info->indirect_offset; + } else { + grid.mode = AGX_CDM_MODE_DIRECT; + + for (unsigned d = 0; d < 3; ++d) { + grid.global[d] = ((info->grid[d] - 1) * info->block[d]) + + (info->last_block[d] ?: info->block[d]); + } + } + + agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, + info->variable_shared_mem); /* TODO: Dirty tracking? 
*/ agx_dirty_all(ctx); diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 3d139a3c71b..949e343cecc 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -682,8 +682,8 @@ struct agx_context { struct util_debug_callback debug; bool is_noop; - struct agx_tess_params tess_params; bool in_tess; + bool in_generated_vdm; struct blitter_context *blitter; struct asahi_blitter compute_blitter; @@ -781,19 +781,67 @@ struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder, void *data, size_t data_size); -void agx_launch_with_data(struct agx_batch *batch, - const struct pipe_grid_info *info, +struct agx_grid { + /* Tag for the union */ + enum agx_cdm_mode mode; + + /* If mode != INDIRECT_LOCAL, the local size */ + uint32_t local[3]; + + union { + /* If mode == DIRECT, the global size. This is *not* multiplied by the + * local size, differing from the API definition but matching AGX. 
+ */ + uint32_t global[3]; + + /* Address of the indirect buffer if mode != DIRECT */ + uint64_t indirect; + }; +}; + +static inline const struct agx_grid +agx_grid_direct(uint32_t global_x, uint32_t global_y, uint32_t global_z, + uint32_t local_x, uint32_t local_y, uint32_t local_z) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_DIRECT, + .global = {global_x, global_y, global_z}, + .local = {local_x, local_y, local_z}, + }; +} + +static inline const struct agx_grid +agx_grid_indirect(uint64_t indirect, uint32_t local_x, uint32_t local_y, + uint32_t local_z) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_INDIRECT_GLOBAL, + .local = {local_x, local_y, local_z}, + .indirect = indirect, + }; +} + +static inline const struct agx_grid +agx_grid_indirect_local(uint64_t indirect) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_INDIRECT_LOCAL, + .indirect = indirect, + }; +} + +void agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid, meta_shader_builder_t builder, void *key, size_t key_size, void *data, size_t data_size); -void agx_launch_internal(struct agx_batch *batch, - const struct pipe_grid_info *info, +void agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, enum pipe_shader_type stage, uint32_t usc); -void agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, +void agx_launch(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, - struct agx_linked_shader *linked, enum pipe_shader_type stage); + struct agx_linked_shader *linked, enum pipe_shader_type stage, + unsigned variable_shared_mem); void agx_init_query_functions(struct pipe_context *ctx);