diff --git a/src/nouveau/compiler/nak_nir_lower_vtg_io.c b/src/nouveau/compiler/nak_nir_lower_vtg_io.c
index e69352d75e8..34cf8db7aaf 100644
--- a/src/nouveau/compiler/nak_nir_lower_vtg_io.c
+++ b/src/nouveau/compiler/nak_nir_lower_vtg_io.c
@@ -6,11 +6,55 @@
 #include "nak_private.h"
 #include "nir_builder.h"
 
+static nir_def *
+tess_ctrl_output_vtx(nir_builder *b, nir_def *vtx)
+{
+   /* This is the pattern we see emitted by the blob driver:
+    *
+    *    S2R R0, SR_LANEID
+    *    S2R R6, SR_INVOCATION_ID
+    *    # R3 is our vertex index
+    *    SULD.P.2D.CTA.R.IGN R3, [R2], 0x1d, 0x0
+    *    IMAD.IADD R5, R0, 0x1, -R6
+    *    IMAD.SHL.U32 R0, R3, 0x4, RZ
+    *    LEA.HI.SX32 R4, R0, R5, 0x1e
+    *    ALD.O R4, a[0x88], R4
+    *
+    * Translating the MADs and re-naming registers, this is
+    *
+    *    %r0 = iadd %lane -%invoc
+    *    %r1 = imul %vtx 0x4
+    *    %r2 = lea.hi.sx32 %r1 %r0 0x1e
+    *    %out = ald.o a[%r2][0x88]
+    *
+    * But `lea.hi.sx32 %r1 %r0 0x1e` is just `(%r1 >> (32 - 0x1e)) + %r0`.
+    * Since %r1 is just `%vtx * 4` and 0x1e is 30, the left-hand term
+    * is `(%vtx * 4) >> 2 = %vtx`, assuming no overflow. So, this means
+    *
+    *    %r0 = iadd %lane -%invoc
+    *    %r2 = iadd %vtx %r0
+    *    %out = ald.o a[%r2][0x88]
+    *
+    * In other words, the hardware indexes TCS per-vertex outputs by lane, with
+    * all of the invocations for a given TCS dispatch going in a sequential
+    * range of lanes. We have to compute the lane index of the requested
+    * invocation from its invocation index: lane + (vtx - invocation_id).
+    */
+   nir_def *lane = nir_load_sysval_nv(b, 32, .base = NAK_SV_LANE_ID,
+                                      .access = ACCESS_CAN_REORDER);
+   nir_def *invoc = nir_load_sysval_nv(b, 32, .base = NAK_SV_INVOCATION_ID,
+                                       .access = ACCESS_CAN_REORDER);
+
+   return nir_iadd(b, lane, nir_iadd(b, vtx, nir_ineg(b, invoc)));
+}
+
 static bool
 lower_vtg_io_intrin(nir_builder *b,
                     nir_intrinsic_instr *intrin,
                     void *cb_data)
 {
+   b->cursor = nir_before_instr(&intrin->instr);
+
    nir_def *vtx = NULL, *offset = NULL, *data = NULL;
    switch (intrin->intrinsic) {
    case nir_intrinsic_load_input:
@@ -19,11 +63,18 @@ lower_vtg_io_intrin(nir_builder *b,
       break;
 
    case nir_intrinsic_load_per_vertex_input:
-   case nir_intrinsic_load_per_vertex_output:
       vtx = intrin->src[0].ssa;
       offset = intrin->src[1].ssa;
       break;
 
+   case nir_intrinsic_load_per_vertex_output:
+      if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
+         vtx = tess_ctrl_output_vtx(b, intrin->src[0].ssa);
+      else
+         vtx = intrin->src[0].ssa;
+      offset = intrin->src[1].ssa;
+      break;
+
    case nir_intrinsic_store_output:
       data = intrin->src[0].ssa;
       offset = intrin->src[1].ssa;
@@ -88,8 +139,6 @@ lower_vtg_io_intrin(nir_builder *b,
    else
       mask = nir_component_mask(intrin->num_components);
 
-   b->cursor = nir_before_instr(&intrin->instr);
-
    if (vtx != NULL && !is_output) {
       nir_def *info = nir_load_sysval_nv(b, 32,
                                          .base = NAK_SV_INVOCATION_INFO,