freedreno/ir3: Implement lowering passes for VS and GS
This introduces two new lowering passes: one that lowers the VS to explicit outputs using STLW, and one that lowers the GS to load its inputs using LDLW and to implement the GS-specific functionality. Signed-off-by: Kristian H. Kristensen <hoegsberg@google.com>
This commit is contained in:
@@ -1524,6 +1524,9 @@ typedef enum {
|
||||
NIR_INTRINSIC_SRC_ACCESS,
|
||||
NIR_INTRINSIC_DST_ACCESS,
|
||||
|
||||
/* Driver location for nir_load_patch_location_ir3 */
|
||||
NIR_INTRINSIC_DRIVER_LOCATION,
|
||||
|
||||
NIR_INTRINSIC_NUM_INDEX_FLAGS,
|
||||
|
||||
} nir_intrinsic_index_flag;
|
||||
@@ -1632,6 +1635,7 @@ INTRINSIC_IDX_ACCESSORS(align_offset, ALIGN_OFFSET, unsigned)
|
||||
INTRINSIC_IDX_ACCESSORS(desc_type, DESC_TYPE, unsigned)
|
||||
INTRINSIC_IDX_ACCESSORS(type, TYPE, nir_alu_type)
|
||||
INTRINSIC_IDX_ACCESSORS(swizzle_mask, SWIZZLE_MASK, unsigned)
|
||||
INTRINSIC_IDX_ACCESSORS(driver_location, DRIVER_LOCATION, unsigned)
|
||||
|
||||
static inline void
|
||||
nir_intrinsic_set_align(nir_intrinsic_instr *intrin,
|
||||
|
@@ -124,6 +124,8 @@ DESC_TYPE = "NIR_INTRINSIC_DESC_TYPE"
|
||||
TYPE = "NIR_INTRINSIC_TYPE"
|
||||
# The swizzle mask for quad_swizzle_amd & masked_swizzle_amd
|
||||
SWIZZLE_MASK = "NIR_INTRINSIC_SWIZZLE_MASK"
|
||||
# Driver location of attribute
|
||||
DRIVER_LOCATION = "NIR_INTRINSIC_DRIVER_LOCATION"
|
||||
|
||||
#
|
||||
# Possible flags:
|
||||
@@ -771,6 +773,12 @@ intrinsic("ssbo_atomic_xor_ir3", src_comp=[1, 1, 1, 1], dest_comp=1)
|
||||
intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1)
|
||||
intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1)
|
||||
|
||||
# System values for freedreno geometry shaders.
|
||||
system_value("vs_primitive_stride_ir3", 1)
|
||||
system_value("vs_vertex_stride_ir3", 1)
|
||||
system_value("gs_header_ir3", 1)
|
||||
system_value("primitive_location_ir3", 1, indices=[DRIVER_LOCATION])
|
||||
|
||||
# IR3-specific load/store intrinsics. These access a buffer used to pass data
|
||||
# between geometry stages - perhaps it's explicit access to the vertex cache.
|
||||
|
||||
|
@@ -800,6 +800,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
|
||||
[NIR_INTRINSIC_DESC_TYPE] = "desc_type",
|
||||
[NIR_INTRINSIC_TYPE] = "type",
|
||||
[NIR_INTRINSIC_SWIZZLE_MASK] = "swizzle_mask",
|
||||
[NIR_INTRINSIC_DRIVER_LOCATION] = "driver_location",
|
||||
};
|
||||
for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) {
|
||||
if (!info->index_map[idx])
|
||||
|
@@ -254,6 +254,7 @@ gl_system_value_name(gl_system_value sysval)
|
||||
ENUM(SYSTEM_VALUE_BARYCENTRIC_SAMPLE),
|
||||
ENUM(SYSTEM_VALUE_BARYCENTRIC_CENTROID),
|
||||
ENUM(SYSTEM_VALUE_BARYCENTRIC_SIZE),
|
||||
ENUM(SYSTEM_VALUE_GS_HEADER_IR3),
|
||||
};
|
||||
STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX);
|
||||
return NAME(sysval);
|
||||
|
@@ -641,6 +641,13 @@ typedef enum
|
||||
SYSTEM_VALUE_BARYCENTRIC_CENTROID,
|
||||
SYSTEM_VALUE_BARYCENTRIC_SIZE,
|
||||
|
||||
/**
|
||||
* IR3 specific geometry shader system value that packs invocation id,
|
||||
* thread id and vertex id. Having this as a nir level system value lets
|
||||
* us do the unpacking in nir.
|
||||
*/
|
||||
SYSTEM_VALUE_GS_HEADER_IR3,
|
||||
|
||||
SYSTEM_VALUE_MAX /**< Number of values */
|
||||
} gl_system_value;
|
||||
|
||||
|
@@ -38,6 +38,7 @@ ir3_SOURCES := \
|
||||
ir3/ir3_nir_lower_load_barycentric_at_sample.c \
|
||||
ir3/ir3_nir_lower_load_barycentric_at_offset.c \
|
||||
ir3/ir3_nir_lower_io_offsets.c \
|
||||
ir3/ir3_nir_lower_tess.c \
|
||||
ir3/ir3_nir_lower_tg4_to_tex.c \
|
||||
ir3/ir3_nir_move_varying_inputs.c \
|
||||
ir3/ir3_print.c \
|
||||
|
@@ -76,6 +76,10 @@ struct ir3_context {
|
||||
/* For fragment shaders: */
|
||||
struct ir3_instruction *samp_id, *samp_mask_in;
|
||||
|
||||
/* For geometry shaders: */
|
||||
struct ir3_instruction *primitive_id;
|
||||
struct ir3_instruction *gs_header;
|
||||
|
||||
/* Compute shader inputs: */
|
||||
struct ir3_instruction *local_invocation_id, *work_group_id;
|
||||
|
||||
|
@@ -101,7 +101,8 @@ ir3_key_lowers_nir(const struct ir3_shader_key *key)
|
||||
return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
|
||||
key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
|
||||
key->ucp_enables | key->color_two_side |
|
||||
key->fclamp_color | key->vclamp_color;
|
||||
key->fclamp_color | key->vclamp_color |
|
||||
key->has_gs;
|
||||
}
|
||||
|
||||
#define OPT(nir, pass, ...) ({ \
|
||||
@@ -186,6 +187,19 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
|
||||
.lower_tg4_offsets = true,
|
||||
};
|
||||
|
||||
if (key && key->has_gs) {
|
||||
switch (shader->type) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
NIR_PASS_V(s, ir3_nir_lower_vs_to_explicit_io, shader);
|
||||
break;
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
NIR_PASS_V(s, ir3_nir_lower_gs, shader);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (key) {
|
||||
switch (shader->type) {
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
|
@@ -41,6 +41,9 @@ bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader);
|
||||
bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader);
|
||||
bool ir3_nir_move_varying_inputs(nir_shader *shader);
|
||||
|
||||
void ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s);
|
||||
void ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s);
|
||||
|
||||
const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
|
||||
bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
|
||||
void ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
|
||||
|
455
src/freedreno/ir3/ir3_nir_lower_tess.c
Normal file
455
src/freedreno/ir3/ir3_nir_lower_tess.c
Normal file
@@ -0,0 +1,455 @@
|
||||
/*
|
||||
* Copyright © 2019 Google, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "ir3_compiler.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
|
||||
/* Scratch state shared by the VS and GS lowering passes in this file. */
struct state {
	/* Layout of the per-primitive storage shared between geometry
	 * stages, indexed by driver_location:
	 *   loc[]  - dword offset of each output within one vertex
	 *   size[] - dwords occupied (0 for unused/patch slots)
	 *   stride - total dwords per vertex (sum of all sizes)
	 */
	struct primitive_map {
		unsigned loc[32];
		unsigned size[32];
		unsigned stride;
	} map;

	/* The packed ir3 GS header sysval; invocation id, vertex id and
	 * local primitive id are bitfield-extracted from it below. */
	nir_ssa_def *header;

	/* GS only: local bookkeeping variables created by ir3_nir_lower_gs(). */
	nir_variable *vertex_count_var;    /* vertices emitted so far */
	nir_variable *emitted_vertex_var;  /* count used to discard empty invocations */
	nir_variable *vertex_flags_var;    /* pending flags (4 == end-of-primitive) */
	nir_variable *vertex_flags_out;    /* temp backing the vertex_flags output */

	/* GS only: one local temp per output driver_location; values are
	 * latched here at EmitVertex and flushed at end of shader. */
	nir_variable *output_vars[32];
};
|
||||
|
||||
static nir_ssa_def *
|
||||
bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
|
||||
{
|
||||
return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
|
||||
nir_imm_int(b, mask));
|
||||
}
|
||||
|
||||
/* Extract the GS invocation id from the packed ir3 header: bits [15:11]
 * (start 11, mask 31 => 5 bits). */
static nir_ssa_def *
build_invocation_id(nir_builder *b, struct state *state)
{
	return bitfield_extract(b, state->header, 11, 31);
}
|
||||
|
||||
/* Extract the vertex id from the packed ir3 header: bits [10:6]
 * (start 6, mask 31 => 5 bits). */
static nir_ssa_def *
build_vertex_id(nir_builder *b, struct state *state)
{
	return bitfield_extract(b, state->header, 6, 31);
}
|
||||
|
||||
/* Extract the local primitive id from the packed ir3 header: bits [5:0]
 * (start 0, mask 63 => 6 bits). */
static nir_ssa_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
	return bitfield_extract(b, state->header, 0, 63);
}
|
||||
|
||||
static nir_variable *
|
||||
get_var(struct exec_list *list, int driver_location)
|
||||
{
|
||||
nir_foreach_variable(v, list) {
|
||||
if (v->data.driver_location == driver_location) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Compute the byte offset into the shared (local) storage used to pass
 * data between geometry stages:
 *
 *   local_primitive_id * primitive_stride     (which primitive)
 *   + vertex * vertex_stride                  (which vertex in it)
 *   + attr_offset + offset                    (which attribute/component)
 *
 * In the VS the strides/locations are compile-time known from the
 * primitive map built over the VS outputs; in the GS they come in as
 * driver-supplied sysvals/intrinsics since the GS reads what the VS
 * wrote. The *4 converts dwords to bytes.
 */
static nir_ssa_def *
build_local_offset(nir_builder *b, struct state *state,
		nir_ssa_def *vertex, uint32_t base, nir_ssa_def *offset)
{
	nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
	nir_ssa_def *primitive_offset =
		nir_imul(b, build_local_primitive_id(b, state), primitive_stride);
	nir_ssa_def *attr_offset;
	nir_ssa_def *vertex_stride;

	if (b->shader->info.stage == MESA_SHADER_VERTEX) {
		/* VS writes: stride/location known from our own output map. */
		vertex_stride = nir_imm_int(b, state->map.stride * 4);
		attr_offset = nir_imm_int(b, state->map.loc[base] * 4);
	} else if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
		/* GS reads: stride/location of the previous stage's outputs
		 * are only known to the driver, so load them at runtime. */
		vertex_stride = nir_load_vs_vertex_stride_ir3(b);
		attr_offset = nir_load_primitive_location_ir3(b, base);
	} else {
		unreachable("bad shader stage");
	}

	nir_ssa_def *vertex_offset = nir_imul(b, vertex, vertex_stride);

	return nir_iadd(b, nir_iadd(b, primitive_offset, vertex_offset),
			nir_iadd(b, attr_offset, offset));
}
|
||||
|
||||
/* Replace @intr with a new intrinsic @op taking up to three SSA sources
 * (pass NULL for unused trailing sources). The new intrinsic inherits
 * num_components; if @op has a destination, all uses of the old dest are
 * rewritten to the new one. @intr is removed. Returns the new intrinsic
 * so callers can set additional indices on it.
 *
 * NOTE(review): the destination is always created as 32-bit — fine for
 * the callers in this file, which only move 32-bit values.
 */
static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
		nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1, nir_ssa_def *src2)
{
	nir_intrinsic_instr *new_intr =
		nir_intrinsic_instr_create(b->shader, op);

	new_intr->src[0] = nir_src_for_ssa(src0);
	if (src1)
		new_intr->src[1] = nir_src_for_ssa(src1);
	if (src2)
		new_intr->src[2] = nir_src_for_ssa(src2);

	new_intr->num_components = intr->num_components;

	if (nir_intrinsic_infos[op].has_dest)
		nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
				intr->num_components, 32, NULL);

	nir_builder_instr_insert(b, &new_intr->instr);

	if (nir_intrinsic_infos[op].has_dest)
		nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(&new_intr->dest.ssa));

	nir_instr_remove(&intr->instr);

	return new_intr;
}
|
||||
|
||||
/* Build the driver_location -> (loc, size) layout for the per-primitive
 * shared storage from the variables in @list.
 *
 * First pass: record each location's size in dwords (4 per attribute
 * slot), taking the max when several split variables share a location.
 * Tess levels are skipped — they don't live in this storage.
 *
 * Second pass: assign packed dword offsets, then convert size[] to a
 * per-vertex size: patch variables contribute 0, per-vertex (arrayed)
 * variables are divided by their array length.
 *
 * NOTE(review): glsl_get_length() is 0 for non-array types, so the
 * division assumes every non-patch variable here is arrayed (true for
 * GS inputs) — confirm for other callers/stages.
 */
static void
build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_list *list)
{
	nir_foreach_variable(var, list) {
		switch (var->data.location) {
		case VARYING_SLOT_TESS_LEVEL_OUTER:
		case VARYING_SLOT_TESS_LEVEL_INNER:
			continue;
		}

		/* size in dwords: 4 dwords per attribute slot */
		unsigned size = glsl_count_attribute_slots(var->type, false) * 4;

		assert(var->data.driver_location < ARRAY_SIZE(map->size));
		map->size[var->data.driver_location] =
			MAX2(map->size[var->data.driver_location], size);
	}

	unsigned loc = 0;
	for (uint32_t i = 0; i < ARRAY_SIZE(map->size); i++) {
		if (map->size[i] == 0)
			continue;
		/* size[i] != 0 guarantees a variable exists at this location */
		nir_variable *var = get_var(list, i);
		map->loc[i] = loc;
		loc += map->size[i];

		if (var->data.patch)
			map->size[i] = 0;
		else
			map->size[i] = map->size[i] / glsl_get_length(var->type);
	}

	map->stride = loc;
}
|
||||
|
||||
/* Lower one VS block: rewrite each store_output into a store_shared_ir3
 * at the offset computed for (local primitive, vertex id, attribute), so
 * VS outputs become explicit writes into the storage the GS reads back.
 * The original store_output is left in place — it is still needed for
 * the normal output path. */
static void
lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
{
	nir_foreach_instr_safe(instr, block) {
		if (instr->type != nir_instr_type_intrinsic)
			continue;

		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

		switch (intr->intrinsic) {
		case nir_intrinsic_store_output: {
			// src[] = { value, offset }.

			b->cursor = nir_before_instr(&intr->instr);

			/* which vertex of the primitive this invocation is */
			nir_ssa_def *vertex_id = build_vertex_id(b, state);
			nir_ssa_def *offset = build_local_offset(b, state, vertex_id, nir_intrinsic_base(intr),
					intr->src[1].ssa);
			nir_intrinsic_instr *store =
				nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);

			nir_intrinsic_set_write_mask(store, MASK(intr->num_components));
			store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
			store->src[1] = nir_src_for_ssa(offset);

			store->num_components = intr->num_components;

			nir_builder_instr_insert(b, &store->instr);
			break;
		}

		default:
			break;
		}
	}
}
|
||||
|
||||
/* Extract the local thread id from the GS header: bits [25:16]
 * (start 16, mask 1023 => 10 bits). Loads the header directly rather
 * than using state->header. */
static nir_ssa_def *
local_thread_id(nir_builder *b)
{
	return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}
|
||||
|
||||
/* Entry point: lower a VS (feeding a GS) to also write its outputs
 * explicitly into shared storage via STLW. Publishes the computed
 * output layout (output_loc/output_size) on the ir3_shader so the
 * driver and the GS side agree on it. */
void
ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
{
	struct state state = { };

	build_primitive_map(shader, &state.map, &shader->outputs);
	memcpy(s->output_loc, state.map.loc, sizeof(s->output_loc));

	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
	assert(impl);

	nir_builder b;
	nir_builder_init(&b, impl);
	b.cursor = nir_before_cf_list(&impl->body);

	/* NOTE(review): the VS also loads the gs_header sysval here — it
	 * carries the vertex/primitive ids used for the store offsets. */
	state.header = nir_load_gs_header_ir3(&b);

	nir_foreach_block_safe(block, impl)
		lower_vs_block(block, &b, &state);

	nir_metadata_preserve(impl, nir_metadata_block_index |
			nir_metadata_dominance);

	/* total dwords written per vertex */
	s->output_size = state.map.stride;
}
|
||||
|
||||
/* Lower one GS block:
 *
 *  - store_output: don't emit yet; just remember the last store per
 *    driver_location in outputs[] (latched at the next EmitVertex).
 *  - end_primitive: becomes vertex_flags = 4 (end-of-primitive marker
 *    attached to the *next* emitted vertex).
 *  - emit_vertex: wrapped in `if (vertex_count == local_thread_id)` so
 *    each hardware thread keeps exactly one vertex; inside the if, all
 *    latched outputs and the flags are copied to the local temps and
 *    emitted_vertex is incremented. Afterwards vertex_count++ and the
 *    flags reset, unconditionally.
 *  - load_per_vertex_input: becomes load_shared_ir3 at the computed
 *    local offset.
 *  - load_invocation_id: unpacked from the GS header.
 */
static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
	/* most recent pending store_output per driver_location */
	nir_intrinsic_instr *outputs[32] = {};

	nir_foreach_instr_safe(instr, block) {
		if (instr->type != nir_instr_type_intrinsic)
			continue;

		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

		switch (intr->intrinsic) {
		case nir_intrinsic_store_output: {
			// src[] = { value, offset }.

			uint32_t loc = nir_intrinsic_base(intr);
			outputs[loc] = intr;
			break;
		}

		case nir_intrinsic_end_primitive: {
			b->cursor = nir_before_instr(&intr->instr);
			/* flag value 4 marks end-of-primitive on the next vertex */
			nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 4), 0x1);
			nir_instr_remove(&intr->instr);
			break;
		}

		case nir_intrinsic_emit_vertex: {

			/* Load the vertex count */
			b->cursor = nir_before_instr(&intr->instr);
			nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

			/* only the thread whose id matches keeps this vertex */
			nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

			for (uint32_t i = 0; i < ARRAY_SIZE(outputs); i++) {
				if (outputs[i]) {
					/* latch the pending value into the local temp */
					nir_store_var(b, state->output_vars[i],
							outputs[i]->src[0].ssa,
							(1 << outputs[i]->num_components) - 1);

					nir_instr_remove(&outputs[i]->instr);
				}
				outputs[i] = NULL;
			}

			nir_instr_remove(&intr->instr);

			nir_store_var(b, state->emitted_vertex_var,
					nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);

			nir_store_var(b, state->vertex_flags_out,
					nir_load_var(b, state->vertex_flags_var), 0x1);

			nir_pop_if(b, NULL);

			/* Increment the vertex count by 1 */
			nir_store_var(b, state->vertex_count_var,
					nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
			nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 0), 0x1);

			break;
		}

		case nir_intrinsic_load_per_vertex_input: {
			// src[] = { vertex, offset }.

			b->cursor = nir_before_instr(&intr->instr);

			nir_ssa_def *offset = build_local_offset(b, state,
					intr->src[0].ssa, // this is typically gl_InvocationID
					nir_intrinsic_base(intr),
					intr->src[1].ssa);

			replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
			break;
		}

		case nir_intrinsic_load_invocation_id: {
			b->cursor = nir_before_instr(&intr->instr);

			nir_ssa_def *iid = build_invocation_id(b, state);
			nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
			nir_instr_remove(&intr->instr);
			break;
		}

		default:
			break;
		}
	}
}
|
||||
|
||||
/* At the current cursor (end of shader), turn each populated local
 * output temp back into a real store_output. This flushes the vertex
 * this thread kept, including the internally added vertex_flags
 * output. */
static void
emit_store_outputs(nir_builder *b, struct state *state)
{
	/* This also stores the internally added vertex_flags output. */

	for (uint32_t i = 0; i < ARRAY_SIZE(state->output_vars); i++) {
		if (!state->output_vars[i])
			continue;

		nir_intrinsic_instr *store =
			nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);

		/* base = driver_location, offset = 0 */
		nir_intrinsic_set_base(store, i);
		store->src[0] = nir_src_for_ssa(nir_load_var(b, state->output_vars[i]));
		store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
		store->num_components = store->src[0].ssa->num_components;

		nir_builder_instr_insert(b, &store->instr);
	}
}
|
||||
|
||||
/* Remove split variables that cover only part of the components used at
 * their driver_location. components[] accumulates, per location, the
 * union of the component masks of all variables there; a variable is
 * dropped when that union has components beyond its own mask, i.e. it
 * is a proper subset of what lives at that location.
 *
 * NOTE(review): if two variables each cover *different* partial
 * component sets of one location, both satisfy the subset test and both
 * get removed — presumably splitting never produces that case; verify.
 */
static void
clean_up_split_vars(nir_shader *shader, struct exec_list *list)
{
	uint32_t components[32] = {};

	nir_foreach_variable(var, list) {
		uint32_t mask =
			((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
		components[var->data.driver_location] |= mask;
	}

	nir_foreach_variable_safe(var, list) {
		uint32_t mask =
			((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
		/* true when other vars contribute components this one lacks */
		bool subset =
			(components[var->data.driver_location] | mask) != mask;
		if (subset)
			exec_node_remove(&var->node);
	}
}
|
||||
|
||||
/* Entry point: lower a GS to ir3's execution model, where every
 * potential output vertex runs as its own thread:
 *
 *  1. Clean up split in/out variables and build the input primitive map
 *     (matching the layout the VS pass stored).
 *  2. Add a vertex_flags output (placed after the last real output)
 *     carrying the end-of-primitive marker.
 *  3. Create one local temp per output plus vertex_count /
 *     emitted_vertex / vertex_flags locals, initialized at shader start
 *     (flags start at 4: the first vertex begins a primitive).
 *  4. Lower each block (see lower_gs_block).
 *  5. At every exit block: discard the invocation if it emitted no
 *     vertex, otherwise flush the latched outputs via emit_store_outputs.
 */
void
ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
{
	struct state state = { };

	if (shader_debug_enabled(shader->info.stage)) {
		fprintf(stderr, "NIR (before gs lowering):\n");
		nir_print_shader(shader, stderr);
	}

	clean_up_split_vars(shader, &shader->inputs);
	clean_up_split_vars(shader, &shader->outputs);

	build_primitive_map(shader, &state.map, &shader->inputs);

	/* find the first free driver_location past the existing outputs */
	uint32_t loc = 0;
	nir_foreach_variable(var, &shader->outputs) {
		uint32_t end = var->data.driver_location + glsl_count_attribute_slots(var->type, false);
		loc = MAX2(loc, end);
	}

	state.vertex_flags_out = nir_variable_create(shader, nir_var_shader_out,
			glsl_uint_type(), "vertex_flags");
	state.vertex_flags_out->data.driver_location = loc;
	state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;

	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
	assert(impl);

	nir_builder b;
	nir_builder_init(&b, impl);
	b.cursor = nir_before_cf_list(&impl->body);

	state.header = nir_load_gs_header_ir3(&b);

	/* one local temp per output (including the new vertex_flags) */
	nir_foreach_variable(var, &shader->outputs) {
		state.output_vars[var->data.driver_location] =
			nir_local_variable_create(impl, var->type,
					ralloc_asprintf(var, "%s:gs-temp", var->name));
	}

	state.vertex_count_var =
		nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
	state.emitted_vertex_var =
		nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
	state.vertex_flags_var =
		nir_local_variable_create(impl, glsl_uint_type(), "vertex_flags");
	/* repoint vertex_flags_out at its local temp; the real output is
	 * written by emit_store_outputs() like every other output */
	state.vertex_flags_out = state.output_vars[state.vertex_flags_out->data.driver_location];

	/* initialize to 0 */
	b.cursor = nir_before_cf_list(&impl->body);
	nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
	nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
	/* flags start at 4: the first emitted vertex starts a primitive */
	nir_store_var(&b, state.vertex_flags_var, nir_imm_int(&b, 4), 0x1);

	nir_foreach_block_safe(block, impl)
		lower_gs_block(block, &b, &state);

	set_foreach(impl->end_block->predecessors, block_entry) {
		struct nir_block *block = (void *)block_entry->key;
		b.cursor = nir_after_block_before_jump(block);

		/* threads that kept no vertex produce nothing */
		nir_intrinsic_instr *discard_if =
			nir_intrinsic_instr_create(b.shader, nir_intrinsic_discard_if);

		nir_ssa_def *cond = nir_ieq(&b, nir_load_var(&b, state.emitted_vertex_var), nir_imm_int(&b, 0));

		discard_if->src[0] = nir_src_for_ssa(cond);

		nir_builder_instr_insert(&b, &discard_if->instr);

		emit_store_outputs(&b, &state);
	}

	/* control flow was rewritten; preserve nothing */
	nir_metadata_preserve(impl, 0);

	if (shader_debug_enabled(shader->info.stage)) {
		fprintf(stderr, "NIR (after gs lowering):\n");
		nir_print_shader(shader, stderr);
	}
}
|
@@ -350,8 +350,15 @@ output_name(struct ir3_shader_variant *so, int i)
|
||||
if (so->type == MESA_SHADER_FRAGMENT) {
|
||||
return gl_frag_result_name(so->outputs[i].slot);
|
||||
} else {
|
||||
switch (so->outputs[i].slot) {
|
||||
case VARYING_SLOT_GS_HEADER_IR3:
|
||||
return "GS_HEADER";
|
||||
case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
|
||||
return "GS_VERTEX_FLAGS";
|
||||
default:
|
||||
return gl_varying_slot_name(so->outputs[i].slot);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@@ -554,6 +554,11 @@ struct ir3_shader {
|
||||
|
||||
struct ir3_shader_variant *variants;
|
||||
mtx_t variants_lock;
|
||||
|
||||
uint32_t output_size; /* Size in dwords of all outputs for VS, size of entire patch for HS. */
|
||||
|
||||
/* Map from driver_location to byte offset in per-primitive storage */
|
||||
unsigned output_loc[32];
|
||||
};
|
||||
|
||||
void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
|
||||
@@ -693,6 +698,10 @@ ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
|
||||
return regid(63, 0);
|
||||
}
|
||||
|
||||
#define VARYING_SLOT_GS_HEADER_IR3 (VARYING_SLOT_MAX + 0)
|
||||
#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
|
||||
|
||||
|
||||
static inline uint32_t
|
||||
ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
|
||||
{
|
||||
|
@@ -66,6 +66,7 @@ libfreedreno_ir3_files = files(
|
||||
'ir3_nir_lower_load_barycentric_at_sample.c',
|
||||
'ir3_nir_lower_load_barycentric_at_offset.c',
|
||||
'ir3_nir_lower_io_offsets.c',
|
||||
'ir3_nir_lower_tess.c',
|
||||
'ir3_nir_lower_tg4_to_tex.c',
|
||||
'ir3_nir_move_varying_inputs.c',
|
||||
'ir3_print.c',
|
||||
|
Reference in New Issue
Block a user