diff --git a/src/nouveau/compiler/nak_nir_lower_vtg_io.c b/src/nouveau/compiler/nak_nir_lower_vtg_io.c
index e69352d75e8..34cf8db7aaf 100644
--- a/src/nouveau/compiler/nak_nir_lower_vtg_io.c
+++ b/src/nouveau/compiler/nak_nir_lower_vtg_io.c
@@ -6,11 +6,55 @@
 #include "nak_private.h"
 #include "nir_builder.h"
 
+static nir_def *
+tess_ctrl_output_vtx(nir_builder *b, nir_def *vtx)
+{
+   /* This is the pattern we see emitted by the blob driver:
+    *
+    *    S2R R0, SR_LANEID
+    *    S2R R6, SR_INVOCATION_ID
+    *    # R3 is our vertex index
+    *    SULD.P.2D.CTA.R.IGN R3, [R2], 0x1d, 0x0
+    *    IMAD.IADD R5, R0, 0x1, -R6
+    *    IMAD.SHL.U32 R0, R3, 0x4, RZ
+    *    LEA.HI.SX32 R4, R0, R5, 0x1e
+    *    ALD.O R4, a[0x88], R4
+    *
+    * Translating the MADs and re-naming registers, this is
+    *
+    *    %r0 = iadd %lane -%invoc
+    *    %r1 = imul %vtx 0x4
+    *    %r2 = lea.hi.sx32 %r1 %r0 0x1e
+    *    %out = ald.o a[%r2][0x88]
+    *
+    * But `lea.hi.sx32 %r1 %r0 0x1e` is just `(%r1 >> (32 - 0x1e)) + %r0`.
+    * Since %r1 is `%vtx * 4` and 0x1e is 30, the left-hand term is simply
+    * `(%vtx * 4) >> 2 = %vtx`, assuming no overflow.  So, this means
+    *
+    *    %r0 = iadd %lane -%invoc
+    *    %r2 = iadd %vtx %r0
+    *    %out = ald.o a[%r2][0x88]
+    *
+    * In other words, the hardware indexes TCS output vertices by lane, with
+    * all of the invocations for a given TCS dispatch occupying a sequential
+    * range of lanes.  The lane holding invocation %vtx is therefore our own
+    * lane plus (%vtx - %invoc), which is the value returned below.
+    */
+   nir_def *lane = nir_load_sysval_nv(b, 32, .base = NAK_SV_LANE_ID,
+                                      .access = ACCESS_CAN_REORDER);
+   nir_def *invoc = nir_load_sysval_nv(b, 32, .base = NAK_SV_INVOCATION_ID,
+                                       .access = ACCESS_CAN_REORDER);
+
+   return nir_iadd(b, lane, nir_iadd(b, vtx, nir_ineg(b, invoc)));
+}
+
 static bool
 lower_vtg_io_intrin(nir_builder *b,
                     nir_intrinsic_instr *intrin,
                     void *cb_data)
 {
+   b->cursor = nir_before_instr(&intrin->instr);
+
    nir_def *vtx = NULL, *offset = NULL, *data = NULL;
    switch (intrin->intrinsic) {
    case nir_intrinsic_load_input:
@@ -19,11 +63,18 @@ lower_vtg_io_intrin(nir_builder *b,
       break;
 
    case nir_intrinsic_load_per_vertex_input:
-   case nir_intrinsic_load_per_vertex_output:
       vtx = intrin->src[0].ssa;
       offset = intrin->src[1].ssa;
       break;
 
+   case nir_intrinsic_load_per_vertex_output:
+      if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
+         vtx = tess_ctrl_output_vtx(b, intrin->src[0].ssa);
+      else
+         vtx = intrin->src[0].ssa;
+      offset = intrin->src[1].ssa;
+      break;
+
    case nir_intrinsic_store_output:
       data = intrin->src[0].ssa;
       offset = intrin->src[1].ssa;
@@ -88,8 +139,6 @@ lower_vtg_io_intrin(nir_builder *b,
    else
       mask = nir_component_mask(intrin->num_components);
 
-   b->cursor = nir_before_instr(&intrin->instr);
-
    if (vtx != NULL && !is_output) {
       nir_def *info = nir_load_sysval_nv(b, 32,
                                          .base = NAK_SV_INVOCATION_INFO,