nak: Fix TCS output reads

The hardware uses the lane index for per-vertex TCS output reads rather
than the vertex index.  Fortunately, it's a pretty easy calculation to
go from one to the other.

Fixes: abe9c1fea2 ("nak: Add NIR lowering for attribute I/O")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27284>
This commit is contained in:
Faith Ekstrand
2024-01-25 12:38:54 -06:00
committed by Marge Bot
parent e44e57b4e7
commit 99ef70d8aa

View File

@@ -6,11 +6,55 @@
#include "nak_private.h"
#include "nir_builder.h"
/**
 * Translate the vertex index of a per-vertex TCS output read into the index
 * the hardware actually wants, which is a lane index.
 *
 * \param b    builder used to emit the new instructions
 * \param vtx  vertex index requested by the shader
 * \return     lane_id + (vtx - invocation_id)
 */
static nir_def *
tess_ctrl_output_vtx(nir_builder *b, nir_def *vtx)
{
   /* Reference code sequence, as emitted by the blob driver:
    *
    *    S2R R0, SR_LANEID
    *    S2R R6, SR_INVOCATION_ID
    *    # R3 is our vertex index
    *    SULD.P.2D.CTA.R.IGN R3, [R2], 0x1d, 0x0
    *    IMAD.IADD R5, R0, 0x1, -R6
    *    IMAD.SHL.U32 R0, R3, 0x4, RZ
    *    LEA.HI.SX32 R4, R0, R5, 0x1e
    *    ALD.O R4, a[0x88], R4
    *
    * With the MADs translated and the registers renamed, that reads:
    *
    *    %r0 = iadd %lane -%invoc
    *    %r1 = imul %vtx 0x4
    *    %r2 = lea.hi.sx32 %r1 %r0 0x1e
    *    %out = ald.o a[%r2][0x88]
    *
    * `lea.hi.sx32 %r1 %r0 0x1e` evaluates `(%r1 >> (32 - 0x1e)) + %r0`.
    * 0x1e is 30 and %r1 is `%vtx * 4`, so the shifted term is
    * `(%vtx * 4) >> 2`, which is simply %vtx as long as the multiply does
    * not overflow.  The sequence therefore collapses to:
    *
    *    %r0 = iadd %lane -%invoc
    *    %r2 = iadd %vtx %r0
    *    %out = ald.o a[%r2][0x88]
    *
    * Put differently: the hardware indexes per-vertex TCS outputs by lane,
    * and the invocations of one TCS dispatch sit in a sequential range of
    * lanes.  The lane of the requested invocation is found by offsetting
    * our own lane by the distance between the requested invocation index
    * and our own invocation index.
    */
   nir_def *lane_id = nir_load_sysval_nv(b, 32, .base = NAK_SV_LANE_ID,
                                         .access = ACCESS_CAN_REORDER);
   nir_def *invoc_id = nir_load_sysval_nv(b, 32,
                                          .base = NAK_SV_INVOCATION_ID,
                                          .access = ACCESS_CAN_REORDER);

   /* vtx - invoc_id: distance from our invocation to the requested one */
   nir_def *neg_invoc = nir_ineg(b, invoc_id);
   nir_def *vtx_delta = nir_iadd(b, vtx, neg_invoc);

   /* Apply that distance to our own lane to reach the target lane */
   return nir_iadd(b, lane_id, vtx_delta);
}
static bool
lower_vtg_io_intrin(nir_builder *b,
nir_intrinsic_instr *intrin,
void *cb_data)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *vtx = NULL, *offset = NULL, *data = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_input:
@@ -19,11 +63,18 @@ lower_vtg_io_intrin(nir_builder *b,
break;
case nir_intrinsic_load_per_vertex_input:
case nir_intrinsic_load_per_vertex_output:
vtx = intrin->src[0].ssa;
offset = intrin->src[1].ssa;
break;
case nir_intrinsic_load_per_vertex_output:
if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
vtx = tess_ctrl_output_vtx(b, intrin->src[0].ssa);
else
vtx = intrin->src[0].ssa;
offset = intrin->src[1].ssa;
break;
case nir_intrinsic_store_output:
data = intrin->src[0].ssa;
offset = intrin->src[1].ssa;
@@ -88,8 +139,6 @@ lower_vtg_io_intrin(nir_builder *b,
else
mask = nir_component_mask(intrin->num_components);
b->cursor = nir_before_instr(&intrin->instr);
if (vtx != NULL && !is_output) {
nir_def *info = nir_load_sysval_nv(b, 32,
.base = NAK_SV_INVOCATION_INFO,