freedreno/ir3: rework setup_{input,output} to make struct varyings work

Rework setup_{input,output} to be called during emit_intrinsic, in a way
which allows struct/array/matrix type varyings to work.

This allows turnip to pass dEQP-VK.glsl.linkage.varying.struct.*

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6181>
This commit is contained in:
Jonathan Marek
2020-08-12 21:59:33 -04:00
committed by Marge Bot
parent c694af40bf
commit a6291b1b11
9 changed files with 110 additions and 169 deletions

View File

@@ -3,7 +3,6 @@ dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_neg_z_and_p
dEQP-VK.binding_model.descriptorset_random.sets4.constant.ubolimitlow.sbolimithigh.imglimithigh.noiub.uab.frag.ialimitlow.0 dEQP-VK.binding_model.descriptorset_random.sets4.constant.ubolimitlow.sbolimithigh.imglimithigh.noiub.uab.frag.ialimitlow.0
dEQP-VK.draw.output_location.array.b8g8r8a8-unorm-mediump-output-vec3 dEQP-VK.draw.output_location.array.b8g8r8a8-unorm-mediump-output-vec3
dEQP-VK.glsl.linkage.varying.struct.mat3x2
dEQP-VK.graphicsfuzz.mat-array-deep-control-flow dEQP-VK.graphicsfuzz.mat-array-deep-control-flow
dEQP-VK.spirv_assembly.instruction.compute.float_controls.fp32.input_args.negate_denorm_preserve dEQP-VK.spirv_assembly.instruction.compute.float_controls.fp32.input_args.negate_denorm_preserve
dEQP-VK.spirv_assembly.instruction.compute.float_controls.fp32.input_args.rounding_rtz_out_prod dEQP-VK.spirv_assembly.instruction.compute.float_controls.fp32.input_args.rounding_rtz_out_prod

View File

@@ -65,14 +65,16 @@ create_input(struct ir3_context *ctx, unsigned compmask)
} }
static struct ir3_instruction * static struct ir3_instruction *
create_frag_input(struct ir3_context *ctx, bool use_ldlv, unsigned n) create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord, unsigned n)
{ {
struct ir3_block *block = ctx->block; struct ir3_block *block = ctx->block;
struct ir3_instruction *instr; struct ir3_instruction *instr;
/* packed inloc is fixed up later: */ /* packed inloc is fixed up later: */
struct ir3_instruction *inloc = create_immed(block, n); struct ir3_instruction *inloc = create_immed(block, n);
if (use_ldlv) { if (coord) {
instr = ir3_BARY_F(block, inloc, 0, coord, 0);
} else if (ctx->compiler->flat_bypass) {
instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
instr->cat6.type = TYPE_U32; instr->cat6.type = TYPE_U32;
instr->cat6.iim_val = 1; instr->cat6.iim_val = 1;
@@ -1342,7 +1344,6 @@ static void add_sysval_input_compmask(struct ir3_context *ctx,
so->inputs[n].sysval = true; so->inputs[n].sysval = true;
so->inputs[n].slot = slot; so->inputs[n].slot = slot;
so->inputs[n].compmask = compmask; so->inputs[n].compmask = compmask;
so->inputs[n].interpolate = INTERP_MODE_FLAT;
so->total_in++; so->total_in++;
} }
@@ -1471,6 +1472,9 @@ get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
return ctx->frag_coord; return ctx->frag_coord;
} }
static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
static void static void
emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{ {
@@ -1479,7 +1483,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
struct ir3_instruction * const *src; struct ir3_instruction * const *src;
struct ir3_block *b = ctx->block; struct ir3_block *b = ctx->block;
unsigned dest_components = nir_intrinsic_dest_components(intr); unsigned dest_components = nir_intrinsic_dest_components(intr);
int idx, comp; int idx;
if (info->has_dest) { if (info->has_dest) {
dst = ir3_get_dst(ctx, &intr->dest, dest_components); dst = ir3_get_dst(ctx, &intr->dest, dest_components);
@@ -1658,43 +1662,8 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
emit_intrinsic_barycentric(ctx, intr, dst); emit_intrinsic_barycentric(ctx, intr, dst);
break; break;
case nir_intrinsic_load_interpolated_input: case nir_intrinsic_load_interpolated_input:
idx = nir_intrinsic_base(intr);
comp = nir_intrinsic_component(intr);
src = ir3_get_src(ctx, &intr->src[0]);
if (nir_src_is_const(intr->src[1])) {
struct ir3_instruction *coord = ir3_create_collect(ctx, src, 2);
idx += nir_src_as_uint(intr->src[1]);
for (int i = 0; i < dest_components; i++) {
unsigned inloc = idx * 4 + i + comp;
if (ctx->so->inputs[idx].bary &&
!ctx->so->inputs[idx].use_ldlv) {
dst[i] = ir3_BARY_F(b, create_immed(b, inloc), 0, coord, 0);
} else {
/* for non-varyings use the pre-setup input, since
* that is easier than mapping things back to a
* nir_variable to figure out what it is.
*/
dst[i] = ctx->inputs[inloc];
compile_assert(ctx, dst[i]);
}
}
} else {
ir3_context_error(ctx, "unhandled");
}
break;
case nir_intrinsic_load_input: case nir_intrinsic_load_input:
idx = nir_intrinsic_base(intr); setup_input(ctx, intr);
comp = nir_intrinsic_component(intr);
if (nir_src_is_const(intr->src[0])) {
idx += nir_src_as_uint(intr->src[0]);
for (int i = 0; i < dest_components; i++) {
unsigned n = idx * 4 + i + comp;
dst[i] = ctx->inputs[n];
compile_assert(ctx, ctx->inputs[n]);
}
} else {
ir3_context_error(ctx, "unhandled");
}
break; break;
/* All SSBO intrinsics should have been lowered by 'lower_io_offsets' /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
* pass and replaced by an ir3-specific version that adds the * pass and replaced by an ir3-specific version that adds the
@@ -1803,16 +1772,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
b = NULL; b = NULL;
break; break;
case nir_intrinsic_store_output: case nir_intrinsic_store_output:
idx = nir_intrinsic_base(intr); setup_output(ctx, intr);
comp = nir_intrinsic_component(intr);
compile_assert(ctx, nir_src_is_const(intr->src[1]));
idx += nir_src_as_uint(intr->src[1]);
src = ir3_get_src(ctx, &intr->src[0]);
for (int i = 0; i < nir_intrinsic_src_components(intr, 0); i++) {
unsigned n = idx * 4 + i + comp;
ctx->outputs[n] = src[i];
}
break; break;
case nir_intrinsic_load_base_vertex: case nir_intrinsic_load_base_vertex:
case nir_intrinsic_load_first_vertex: case nir_intrinsic_load_first_vertex:
@@ -2949,92 +2909,53 @@ emit_function(struct ir3_context *ctx, nir_function_impl *impl)
} }
static void static void
setup_input(struct ir3_context *ctx, nir_variable *in) setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{ {
struct ir3_shader_variant *so = ctx->so; struct ir3_shader_variant *so = ctx->so;
unsigned ncomp = glsl_get_components(in->type); struct ir3_instruction *coord = NULL;
unsigned n = in->data.driver_location;
unsigned frac = in->data.location_frac; if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
unsigned slot = in->data.location; coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2);
compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
unsigned frac = nir_intrinsic_component(intr);
unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
unsigned ncomp = nir_intrinsic_dest_components(intr);
unsigned n = nir_intrinsic_base(intr) + offset;
unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
unsigned compmask; unsigned compmask;
/* Inputs are loaded using ldlw or ldg for these stages. */ /* Inputs are loaded using ldlw or ldg for other stages. */
if (ctx->so->type == MESA_SHADER_TESS_CTRL || compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
ctx->so->type == MESA_SHADER_TESS_EVAL || ctx->so->type == MESA_SHADER_VERTEX);
ctx->so->type == MESA_SHADER_GEOMETRY)
return;
/* skip unread inputs, we could end up with (for example), unsplit
* matrix/etc inputs in the case they are not read, so just silently
* skip these.
*/
if (ncomp > 4)
return;
if (ctx->so->type == MESA_SHADER_FRAGMENT) if (ctx->so->type == MESA_SHADER_FRAGMENT)
compmask = BITFIELD_MASK(ncomp) << frac; compmask = BITFIELD_MASK(ncomp) << frac;
else else
compmask = BITFIELD_MASK(ncomp + frac); compmask = BITFIELD_MASK(ncomp + frac);
/* remove any already set components */ /* for a4xx+ rasterflat */
compmask &= ~so->inputs[n].compmask; if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
if (!compmask) coord = NULL;
return;
so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
so->inputs[n].slot = slot; so->inputs[n].slot = slot;
so->inputs[n].compmask |= compmask; so->inputs[n].compmask |= compmask;
so->inputs_count = MAX2(so->inputs_count, n + 1); so->inputs_count = MAX2(so->inputs_count, n + 1);
so->inputs[n].interpolate = in->data.interpolation; so->inputs[n].flat = !coord;
if (ctx->so->type == MESA_SHADER_FRAGMENT) { if (ctx->so->type == MESA_SHADER_FRAGMENT) {
compile_assert(ctx, slot != VARYING_SLOT_POS);
/* if any varyings have 'sample' qualifier, that triggers us so->inputs[n].bary = true;
* to run in per-sample mode:
*/
so->per_samp |= in->data.sample;
for (int i = 0; i < ncomp; i++) { for (int i = 0; i < ncomp; i++) {
struct ir3_instruction *instr = NULL;
unsigned idx = (n * 4) + i + frac; unsigned idx = (n * 4) + i + frac;
ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
if (!(compmask & (1 << (i + frac))))
continue;
if (slot == VARYING_SLOT_POS) {
ir3_context_error(ctx, "fragcoord should be a sysval!\n");
} else {
/* detect the special case for front/back colors where
* we need to do flat vs smooth shading depending on
* rast state:
*/
if (in->data.interpolation == INTERP_MODE_NONE) {
switch (slot) {
case VARYING_SLOT_COL0:
case VARYING_SLOT_COL1:
case VARYING_SLOT_BFC0:
case VARYING_SLOT_BFC1:
so->inputs[n].rasterflat = true;
break;
default:
break;
}
}
if (ctx->compiler->flat_bypass) {
if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
so->inputs[n].use_ldlv = true;
}
so->inputs[n].bary = true;
instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx);
}
compile_assert(ctx, idx < ctx->ninputs && !ctx->inputs[idx]);
ctx->inputs[idx] = instr;
} }
} else if (ctx->so->type == MESA_SHADER_VERTEX) { } else {
struct ir3_instruction *input = NULL; struct ir3_instruction *input = NULL;
foreach_input (in, ctx->ir) { foreach_input (in, ctx->ir) {
@@ -3067,10 +2988,11 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1); ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
} }
}
if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) { for (int i = 0; i < ncomp; i++) {
so->total_in += util_bitcount(compmask); unsigned idx = (n * 4) + i + frac;
ctx->last_dst[i] = ctx->inputs[idx];
}
} }
} }
@@ -3173,14 +3095,18 @@ pack_inlocs(struct ir3_context *ctx)
} }
static void static void
setup_output(struct ir3_context *ctx, nir_variable *out) setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{ {
struct ir3_shader_variant *so = ctx->so; struct ir3_shader_variant *so = ctx->so;
unsigned slots = glsl_count_vec4_slots(out->type, false, false); nir_io_semantics io = nir_intrinsic_io_semantics(intr);
unsigned ncomp = glsl_get_components(glsl_without_array(out->type));
unsigned n = out->data.driver_location; compile_assert(ctx, nir_src_is_const(intr->src[1]));
unsigned frac = out->data.location_frac;
unsigned slot = out->data.location; unsigned offset = nir_src_as_uint(intr->src[1]);
unsigned n = nir_intrinsic_base(intr) + offset;
unsigned frac = nir_intrinsic_component(intr);
unsigned ncomp = nir_intrinsic_src_components(intr, 0);
unsigned slot = io.location + offset;
if (ctx->so->type == MESA_SHADER_FRAGMENT) { if (ctx->so->type == MESA_SHADER_FRAGMENT) {
switch (slot) { switch (slot) {
@@ -3197,7 +3123,7 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
so->writes_stencilref = true; so->writes_stencilref = true;
break; break;
default: default:
slot += out->data.index; /* For dual-src blend */ slot += io.dual_source_blend_index; /* For dual-src blend */
if (slot >= FRAG_RESULT_DATA0) if (slot >= FRAG_RESULT_DATA0)
break; break;
ir3_context_error(ctx, "unknown FS output name: %s\n", ir3_context_error(ctx, "unknown FS output name: %s\n",
@@ -3236,41 +3162,41 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
_mesa_shader_stage_to_string(ctx->so->type), _mesa_shader_stage_to_string(ctx->so->type),
gl_varying_slot_name(slot)); gl_varying_slot_name(slot));
} }
} else if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
/* output lowered to buffer writes. */
return;
} else { } else {
ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type); ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
} }
so->outputs_count = out->data.driver_location + slots; so->outputs_count = MAX2(so->outputs_count, n + 1);
compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs)); compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));
for (int i = 0; i < slots; i++) { so->outputs[n].slot = slot;
int slot_base = n + i;
so->outputs[slot_base].slot = slot + i;
for (int i = 0; i < ncomp; i++) { for (int i = 0; i < ncomp; i++) {
unsigned idx = (slot_base * 4) + i + frac; unsigned idx = (n * 4) + i + frac;
compile_assert(ctx, idx < ctx->noutputs); compile_assert(ctx, idx < ctx->noutputs);
ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
}
/* if varying packing doesn't happen, we could end up in a situation
* with "holes" in the output, and since the per-generation code that
* sets up varying linkage registers doesn't expect to have more than
* one varying per vec4 slot, pad the holes.
*
* Note that this should probably generate a performance warning of
* some sort.
*/
for (int i = 0; i < frac; i++) {
unsigned idx = (n * 4) + i;
if (!ctx->outputs[idx]) {
ctx->outputs[idx] = create_immed(ctx->block, fui(0.0)); ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
} }
}
/* if varying packing doesn't happen, we could end up in a situation struct ir3_instruction * const *src = ir3_get_src(ctx, &intr->src[0]);
* with "holes" in the output, and since the per-generation code that for (int i = 0; i < ncomp; i++) {
* sets up varying linkage registers doesn't expect to have more than unsigned idx = (n * 4) + i + frac;
* one varying per vec4 slot, pad the holes. ctx->outputs[idx] = src[i];
*
* Note that this should probably generate a performance warning of
* some sort.
*/
for (int i = 0; i < frac; i++) {
unsigned idx = (slot_base * 4) + i;
if (!ctx->outputs[idx]) {
ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
}
}
} }
} }
@@ -3279,6 +3205,35 @@ emit_instructions(struct ir3_context *ctx)
{ {
nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
/* some varying setup which can't be done in setup_input(): */
if (ctx->so->type == MESA_SHADER_FRAGMENT) {
nir_foreach_shader_in_variable (var, ctx->s) {
/* if any varyings have 'sample' qualifier, that triggers us
* to run in per-sample mode:
*/
if (var->data.sample)
ctx->so->per_samp = true;
/* set rasterflat flag for front/back color */
if (var->data.interpolation == INTERP_MODE_NONE) {
switch (var->data.location) {
case VARYING_SLOT_COL0:
case VARYING_SLOT_COL1:
case VARYING_SLOT_BFC0:
case VARYING_SLOT_BFC1:
ctx->so->inputs[var->data.driver_location].rasterflat = true;
break;
default:
break;
}
}
}
}
/* TODO: for GS/HS/DS, load_input isn't used. but ctx->s->num_inputs is non-zero
* likely the same for num_outputs in cases where store_output isn't used
*/
ctx->so->inputs_count = ctx->s->num_inputs;
ctx->ninputs = ctx->s->num_inputs * 4; ctx->ninputs = ctx->s->num_inputs * 4;
ctx->noutputs = ctx->s->num_outputs * 4; ctx->noutputs = ctx->s->num_outputs * 4;
ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs); ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
@@ -3303,11 +3258,6 @@ emit_instructions(struct ir3_context *ctx)
ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3); ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
} }
/* Setup inputs: */
nir_foreach_shader_in_variable (var, ctx->s) {
setup_input(ctx, var);
}
/* Defer add_sysval_input() stuff until after setup_inputs(), /* Defer add_sysval_input() stuff until after setup_inputs(),
* because sysvals need to be appended after varyings: * because sysvals need to be appended after varyings:
*/ */
@@ -3351,11 +3301,6 @@ emit_instructions(struct ir3_context *ctx)
break; break;
} }
/* Setup outputs: */
nir_foreach_shader_out_variable (var, ctx->s) {
setup_output(ctx, var);
}
/* Find # of samplers. Just assume that we'll be reading from images.. if /* Find # of samplers. Just assume that we'll be reading from images.. if
* it is write-only we don't have to count it, but after lowering derefs * it is write-only we don't have to count it, but after lowering derefs
* is too late to compact indices for that. * is too late to compact indices for that.

View File

@@ -178,7 +178,6 @@ static void add_sysval(unsigned reg, unsigned compmask, gl_system_value sysval)
variant->inputs[n].sysval = true; variant->inputs[n].sysval = true;
variant->inputs[n].slot = sysval; variant->inputs[n].slot = sysval;
variant->inputs[n].compmask = compmask; variant->inputs[n].compmask = compmask;
variant->inputs[n].interpolate = INTERP_MODE_FLAT;
variant->total_in++; variant->total_in++;
} }

View File

@@ -588,9 +588,8 @@ struct ir3_shader_variant {
/* fragment shader specific: */ /* fragment shader specific: */
bool bary : 1; /* fetched varying (vs one loaded into reg) */ bool bary : 1; /* fetched varying (vs one loaded into reg) */
bool rasterflat : 1; /* special handling for emit->rasterflat */ bool rasterflat : 1; /* special handling for emit->rasterflat */
bool use_ldlv : 1; /* internal to ir3_compiler_nir */
bool half : 1; bool half : 1;
enum glsl_interp_mode interpolate; bool flat : 1;
} inputs[32 + 2]; /* +POSITION +FACE */ } inputs[32 + 2]; /* +POSITION +FACE */
/* sum of input components (scalar). For frag shaders, it only counts /* sum of input components (scalar). For frag shaders, it only counts

View File

@@ -1069,8 +1069,7 @@ tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
*interp_mode |= INTERP_ONE << 6; *interp_mode |= INTERP_ONE << 6;
shift += 2; shift += 2;
} }
} else if ((fs->inputs[index].interpolate == INTERP_MODE_FLAT) || } else if (fs->inputs[index].flat) {
fs->inputs[index].rasterflat) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
if (compmask & (1 << i)) { if (compmask & (1 << i)) {
*interp_mode |= INTERP_FLAT << shift; *interp_mode |= INTERP_FLAT << shift;

View File

@@ -361,7 +361,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
uint32_t inloc = fp->inputs[j].inloc; uint32_t inloc = fp->inputs[j].inloc;
if ((fp->inputs[j].interpolate == INTERP_MODE_FLAT) || if (fp->inputs[j].flat ||
(fp->inputs[j].rasterflat && emit->rasterflat)) { (fp->inputs[j].rasterflat && emit->rasterflat)) {
uint32_t loc = inloc; uint32_t loc = inloc;

View File

@@ -465,7 +465,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
uint32_t inloc = s[FS].v->inputs[j].inloc; uint32_t inloc = s[FS].v->inputs[j].inloc;
if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) || if (s[FS].v->inputs[j].flat ||
(s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
uint32_t loc = inloc; uint32_t loc = inloc;

View File

@@ -611,7 +611,7 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
uint32_t inloc = s[FS].v->inputs[j].inloc; uint32_t inloc = s[FS].v->inputs[j].inloc;
if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) || if (s[FS].v->inputs[j].flat ||
(s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
uint32_t loc = inloc; uint32_t loc = inloc;

View File

@@ -940,7 +940,7 @@ emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs,
uint32_t inloc = fs->inputs[j].inloc; uint32_t inloc = fs->inputs[j].inloc;
if ((fs->inputs[j].interpolate == INTERP_MODE_FLAT) || if (fs->inputs[j].flat ||
(fs->inputs[j].rasterflat && rasterflat)) { (fs->inputs[j].rasterflat && rasterflat)) {
uint32_t loc = inloc; uint32_t loc = inloc;