radv/gfx10: implement NGG support (VS only)
This needs to be cleaned up a bit, and it probably contains missing
stuff and/or bugs. This doesn't fix the "half of the triangles" issue.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
committed by Bas Nieuwenhuizen
parent 9e37609d0b
commit ee21bd7440
@@ -70,6 +70,13 @@ struct radv_shader_context {
 	LLVMValueRef tes_u;
 	LLVMValueRef tes_v;
 
 	/* HW GS */
+	/* On gfx10:
+	 *  - bits 0..10: ordered_wave_id
+	 *  - bits 12..20: number of vertices in group
+	 *  - bits 22..30: number of primitives in group
+	 */
+	LLVMValueRef gs_tg_info;
 	LLVMValueRef gs2vs_offset;
 	LLVMValueRef gs_wave_id;
 	LLVMValueRef gs_vtx_offset[6];
@@ -823,11 +830,18 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
 	if (ctx->options->key.vs.out.as_ls) {
 		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
 		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
 		add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
 	} else {
-		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
-		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
-		add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+		if (ctx->ac.chip_class >= GFX10) {
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+		} else {
+			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+		}
 	}
 }
 
@@ -969,6 +983,12 @@ static void set_llvm_calling_convention(LLVMValueRef func,
 	LLVMSetFunctionCallConv(func, calling_conv);
 }
 
+/* Returns whether the stage is a stage that can be directly before the GS */
+static bool is_pre_gs_stage(gl_shader_stage stage)
+{
+	return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
+}
+
 static void create_function(struct radv_shader_context *ctx,
 			    gl_shader_stage stage,
 			    bool has_previous_stage,
@@ -987,6 +1007,15 @@ static void create_function(struct radv_shader_context *ctx,
 				   &ctx->ring_offsets);
 	}
 
+	if (ctx->ac.chip_class >= GFX10) {
+		if (stage == MESA_SHADER_VERTEX && ctx->options->key.vs.out.as_ngg) {
+			/* On GFX10, VS is merged into GS for NGG. */
+			stage = MESA_SHADER_GEOMETRY;
+			has_previous_stage = true;
+			previous_stage = MESA_SHADER_VERTEX;
+		}
+	}
+
 	switch (stage) {
 	case MESA_SHADER_COMPUTE:
 		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
@@ -1101,8 +1130,14 @@ static void create_function(struct radv_shader_context *ctx,
 	case MESA_SHADER_GEOMETRY:
 		if (has_previous_stage) {
 			// First 6 system regs
-			add_arg(&args, ARG_SGPR, ctx->ac.i32,
-				&ctx->gs2vs_offset);
+			if (ctx->options->key.vs.out.as_ngg) {
+				add_arg(&args, ARG_SGPR, ctx->ac.i32,
+					&ctx->gs_tg_info);
+			} else {
+				add_arg(&args, ARG_SGPR, ctx->ac.i32,
+					&ctx->gs2vs_offset);
+			}
+
 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
 				&ctx->merged_wave_info);
 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
@@ -3194,6 +3229,168 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
 	}
 }
 
+static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
+{
+	return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
+}
+
+static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
+{
+	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+			    LLVMConstInt(ctx->ac.i32, 12, false),
+			    LLVMConstInt(ctx->ac.i32, 9, false),
+			    false);
+}
+
+static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
+{
+	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+			    LLVMConstInt(ctx->ac.i32, 22, false),
+			    LLVMConstInt(ctx->ac.i32, 9, false),
+			    false);
+}
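For reference, the two extractions above mirror the gs_tg_info bit layout documented in radv_shader_context earlier in this commit. A plain-C sketch of the same unpacking (illustrative only; the helper names are made up, and the shader performs this at runtime with ac_build_bfe):

#include <stdint.h>

/* Illustrative only: mirrors ngg_get_vtx_cnt()/ngg_get_prim_cnt() above
 * and the gs_tg_info layout documented in radv_shader_context. */
static unsigned tg_info_ordered_wave_id(uint32_t gs_tg_info)
{
	return gs_tg_info & 0x7ff;           /* bits 0..10, 11 bits */
}

static unsigned tg_info_vtx_cnt(uint32_t gs_tg_info)
{
	return (gs_tg_info >> 12) & 0x1ff;   /* bits 12..20, 9 bits */
}

static unsigned tg_info_prim_cnt(uint32_t gs_tg_info)
{
	return (gs_tg_info >> 22) & 0x1ff;   /* bits 22..30, 9 bits */
}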
+
+/* Send GS Alloc Req message from the first wave of the group to SPI.
+ * Message payload is:
+ *  - bits 0..10: vertices in group
+ *  - bits 12..22: primitives in group
+ */
+static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
+				       LLVMValueRef vtx_cnt,
+				       LLVMValueRef prim_cnt)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5020);
+
+	tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false), "");
+	tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
+	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
+
+	ac_build_endif(&ctx->ac, 5020);
+}
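The payload built above can be sanity-checked on the host. A minimal sketch of the same packing (illustrative only; the helper name is made up):

#include <assert.h>
#include <stdint.h>

/* Illustrative only: the GS_ALLOC_REQ payload assembled above is
 * (prim_cnt << 12) | vtx_cnt, sent from wave 0 of the threadgroup. */
static uint32_t gs_alloc_req_payload(uint32_t vtx_cnt, uint32_t prim_cnt)
{
	assert(vtx_cnt < (1u << 11) && prim_cnt < (1u << 11)); /* 11-bit fields */
	return (prim_cnt << 12) | vtx_cnt;
}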
+
+struct ngg_prim {
+	unsigned num_vertices;
+	LLVMValueRef isnull;
+	LLVMValueRef index[3];
+	LLVMValueRef edgeflag[3];
+};
+
+static void build_export_prim(struct radv_shader_context *ctx,
+			      const struct ngg_prim *prim)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	struct ac_export_args args;
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
+	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
+
+	for (unsigned i = 0; i < prim->num_vertices; ++i) {
+		tmp = LLVMBuildShl(builder, prim->index[i],
+				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
+		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
+		tmp = LLVMBuildShl(builder, tmp,
+				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
+		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+	}
+
+	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
+	args.out[1] = LLVMGetUndef(ctx->ac.f32);
+	args.out[2] = LLVMGetUndef(ctx->ac.f32);
+	args.out[3] = LLVMGetUndef(ctx->ac.f32);
+
+	args.target = V_008DFC_SQ_EXP_PRIM;
+	args.enabled_channels = 1;
+	args.done = true;
+	args.valid_mask = false;
+	args.compr = false;
+
+	ac_build_export(&ctx->ac, &args);
+}
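A host-side sketch of the dword that build_export_prim assembles with LLVM IR (illustrative only; the helper name is made up, and it matches the index-buffer format documented in handle_ngg_outputs_post below):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: pack one NGG primitive for the prim export.
 * Each vertex takes 10 bits: a 9-bit index plus one edge flag bit;
 * bit 31 marks a null (skipped) primitive. */
static uint32_t pack_ngg_prim(bool isnull, const uint32_t index[3],
			      const bool edgeflag[3])
{
	uint32_t out = (uint32_t)isnull << 31;
	for (unsigned i = 0; i < 3; ++i) {
		out |= (index[i] & 0x1ff) << (10 * i);        /* bits 10i..10i+8 */
		out |= (uint32_t)edgeflag[i] << (10 * i + 9); /* bit 10i+9 */
	}
	return out;
}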
+
+static void
+handle_ngg_outputs_post(struct radv_shader_context *ctx)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	struct ac_build_if_state if_state;
+	unsigned num_vertices = 3;
+	LLVMValueRef tmp;
+
+	assert(ctx->stage == MESA_SHADER_VERTEX && !ctx->is_gs_copy_shader);
+
+	LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
+	LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
+	LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
+						  ac_get_thread_id(&ctx->ac), prims_in_wave, "");
+	LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
+						  ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
+	LLVMValueRef vtxindex[] = {
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
+	};
+
+	/* TODO: streamout */
+
+	/* TODO: VS primitive ID */
+	if (ctx->options->key.vs.out.export_prim_id)
+		assert(0);
+
+	/* TODO: primitive culling */
+
+	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+
+	/* TODO: streamout queries */
+	/* Export primitive data to the index buffer. Format is:
+	 *  - bits 0..8: index 0
+	 *  - bit 9: edge flag 0
+	 *  - bits 10..18: index 1
+	 *  - bit 19: edge flag 1
+	 *  - bits 20..28: index 2
+	 *  - bit 29: edge flag 2
+	 *  - bit 31: null primitive (skip)
+	 *
+	 * For the first version, we will always build up all three indices
+	 * independent of the primitive type. The additional garbage data
+	 * shouldn't hurt.
+	 *
+	 * TODO: culling depends on the primitive type, so can have some
+	 * interaction here.
+	 */
+	ac_nir_build_if(&if_state, ctx, is_gs_thread);
+	{
+		struct ngg_prim prim = {};
+
+		prim.num_vertices = num_vertices;
+		prim.isnull = ctx->ac.i1false;
+		memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
+
+		for (unsigned i = 0; i < num_vertices; ++i) {
+			tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
+					    LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
+			prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+		}
+
+		build_export_prim(ctx, &prim);
+	}
+	ac_nir_build_endif(&if_state);
+
+	/* Export per-vertex data (positions and parameters). */
+	ac_nir_build_if(&if_state, ctx, is_es_thread);
+	{
+		handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
+				       ctx->options->key.vs.out.export_layer_id,
+				       ctx->options->key.vs.out.export_clip_dists,
+				       &ctx->shader_info->vs.outinfo);
+	}
+	ac_nir_build_endif(&if_state);
+}
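The is_es_thread/is_gs_thread predicates above rely on the merged_wave_info packing that the pre-GS merged-shader path already uses. A plain-C sketch inferred from the ac_unpack_param offsets in this function and in get_wave_id_in_tg (illustrative only; the helper names are made up):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: per the unpack offsets above, merged_wave_info
 * carries the ES/vertex count in bits 0..7, the GS/primitive count in
 * bits 8..15, and the wave id within the threadgroup in bits 24..27. */
static bool lane_is_es_thread(uint32_t merged_wave_info, unsigned lane)
{
	return lane < ((merged_wave_info >> 0) & 0xff);  /* vtx_in_wave */
}

static bool lane_is_gs_thread(uint32_t merged_wave_info, unsigned lane)
{
	return lane < ((merged_wave_info >> 8) & 0xff);  /* prims_in_wave */
}

static unsigned wave_id_in_tg(uint32_t merged_wave_info)
{
	return (merged_wave_info >> 24) & 0xf;
}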
 
 static void
 write_tess_factors(struct radv_shader_context *ctx)
 {
@@ -3452,6 +3649,8 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
 			handle_ls_outputs_post(ctx);
 		else if (ctx->options->key.vs.out.as_es)
 			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
+		else if (ctx->options->key.vs.out.as_ngg)
+			handle_ngg_outputs_post(ctx);
 		else
 			handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
 					       ctx->options->key.vs.out.export_layer_id,
@@ -3703,6 +3902,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 						    shaders[i]));
 	}
 
+	if (ctx.ac.chip_class >= GFX10) {
+		if (shaders[0]->info.stage == MESA_SHADER_VERTEX &&
+		    options->key.vs.out.as_ngg) {
+			ctx.max_workgroup_size = 128;
+		}
+	}
+
 	create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
 			shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);
 
@@ -3722,7 +3928,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 	 */
 	ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
 
-	if (shader_count >= 2)
+	bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs.out.as_ngg;
+	if (shader_count >= 2 || is_ngg)
 		ac_init_exec_full_mask(&ctx.ac);
 
 	if ((ctx.ac.family == CHIP_VEGA10 ||
@@ -3788,7 +3995,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 	ac_setup_rings(&ctx);
 
 	LLVMBasicBlockRef merge_block;
-	if (shader_count >= 2) {
+	if (shader_count >= 2 || is_ngg) {
 		LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 		LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
 		merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
@@ -3811,7 +4018,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
 		ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);
 
-		if (shader_count >= 2) {
+		if (shader_count >= 2 || is_ngg) {
 			LLVMBuildBr(ctx.ac.builder, merge_block);
 			LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
 		}
@@ -3955,6 +4162,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
 		shader_info->vs.as_es = options->key.vs.out.as_es;
 		shader_info->vs.as_ls = options->key.vs.out.as_ls;
 		shader_info->vs.export_prim_id = options->key.vs.out.export_prim_id;
+		shader_info->is_ngg = options->key.vs.out.as_ngg;
 		break;
 	default:
 		break;
@@ -96,6 +96,30 @@ struct radv_gs_state {
 	uint32_t lds_size;
 };
 
+struct radv_ngg_state {
+	uint16_t ngg_emit_size; /* in dwords */
+	uint32_t hw_max_esverts;
+	uint32_t max_gsprims;
+	uint32_t max_out_verts;
+	uint32_t prim_amp_factor;
+	uint32_t vgt_esgs_ring_itemsize;
+	bool max_vert_out_per_gs_instance;
+};
+
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
+{
+	struct radv_shader_variant *variant = NULL;
+	if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+		variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
+	else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
+		variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+	else if (pipeline->shaders[MESA_SHADER_VERTEX])
+		variant = pipeline->shaders[MESA_SHADER_VERTEX];
+	else
+		return false;
+	return variant->info.is_ngg;
+}
+
 static void
 radv_pipeline_destroy(struct radv_device *device,
 		      struct radv_pipeline *pipeline,
@@ -1583,6 +1607,203 @@ calculate_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
 	return gs;
 }
 
+static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
+				     unsigned min_verts_per_prim, bool use_adjacency)
+{
+	unsigned max_reuse = max_esverts - min_verts_per_prim;
+	if (use_adjacency)
+		max_reuse /= 2;
+	*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
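A usage sketch for the clamp above (illustrative numbers, not part of the patch; it assumes the function as added here):

#include <assert.h>

/* With 96 ES vertices and triangle lists (min_verts_per_prim = 3),
 * each primitive after the first can reuse earlier vertices, so at
 * most 1 + (96 - 3) primitives fit; adjacency halves the reuse. */
unsigned max_gsprims = 128;
clamp_gsprims_to_esverts(&max_gsprims, 96, 3, false);
assert(max_gsprims == 94);  /* 1 + (96 - 3) */

max_gsprims = 128;
clamp_gsprims_to_esverts(&max_gsprims, 96, 3, true);
assert(max_gsprims == 47);  /* 1 + (96 - 3) / 2 */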
+
+static struct radv_ngg_state
+calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+		   struct radv_pipeline *pipeline)
+{
+	struct radv_ngg_state ngg = {0};
+	struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+	struct radv_es_output_info *es_info =
+		radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info;
+	unsigned gs_type = MESA_SHADER_VERTEX;
+	unsigned max_verts_per_prim = 3; // triangles
+	unsigned min_verts_per_prim =
+		gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+	unsigned gs_num_invocations = 1;//MAX2(gs_info->gs.invocations, 1);
+	bool uses_adjacency;
+	switch(pCreateInfo->pInputAssemblyState->topology) {
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+		uses_adjacency = true;
+		break;
+	default:
+		uses_adjacency = false;
+		break;
+	}
+
+	/* All these are in dwords: */
+	/* We can't allow using the whole LDS, because GS waves compete with
+	 * other shader stages for LDS space.
+	 *
+	 * Streamout can increase the ESGS buffer size later on, so be more
+	 * conservative with streamout and use 4K dwords. This may be suboptimal.
+	 *
+	 * Otherwise, use the limit of 7K dwords. The reason is that we need
+	 * to leave some headroom for the max_esverts increase at the end.
+	 *
+	 * TODO: We should really take the shader's internal LDS use into
+	 *       account. The linker will fail if the size is greater than
+	 *       8K dwords.
+	 */
+	const unsigned max_lds_size = (0 /*gs_info->info.so.num_outputs*/ ? 4 : 7) * 1024 - 128;
+	const unsigned target_lds_size = max_lds_size;
+	unsigned esvert_lds_size = 0;
+	unsigned gsprim_lds_size = 0;
+
+	/* All these are per subgroup: */
+	bool max_vert_out_per_gs_instance = false;
+	unsigned max_esverts_base = 256;
+	unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+	/* Hardware has the following non-natural restrictions on the value
+	 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of the draw:
+	 *  - at most 252 for any line input primitive type
+	 *  - at most 251 for any quad input primitive type
+	 *  - at most 251 for triangle strips with adjacency (this happens to
+	 *    be the natural limit for triangle *lists* with adjacency)
+	 */
+	max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+	if (gs_type == MESA_SHADER_GEOMETRY) {
+		unsigned max_out_verts_per_gsprim =
+			gs_info->gs.vertices_out * gs_num_invocations;
+
+		if (max_out_verts_per_gsprim <= 256) {
+			if (max_out_verts_per_gsprim) {
+				max_gsprims_base = MIN2(max_gsprims_base,
+							256 / max_out_verts_per_gsprim);
+			}
+		} else {
+			/* Use special multi-cycling mode in which each GS
+			 * instance gets its own subgroup. Does not work with
+			 * tessellation. */
+			max_vert_out_per_gs_instance = true;
+			max_gsprims_base = 1;
+			max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+		}
+
+		esvert_lds_size = es_info->esgs_itemsize / 4;
+		gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+	} else {
+		/* TODO: This needs to be adjusted once LDS use for compaction
+		 * after culling is implemented. */
+		/*
+		if (es_info->info.so.num_outputs)
+			esvert_lds_size = 4 * es_info->info.so.num_outputs + 1;
+		*/
+	}
+
+	unsigned max_gsprims = max_gsprims_base;
+	unsigned max_esverts = max_esverts_base;
+
+	if (esvert_lds_size)
+		max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+	if (gsprim_lds_size)
+		max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+	max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+	clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+	assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+	if (esvert_lds_size || gsprim_lds_size) {
+		/* Now that we have a rough proportionality between esverts
+		 * and gsprims based on the primitive type, scale both of them
+		 * down simultaneously based on required LDS space.
+		 *
+		 * We could be smarter about this if we knew how much vertex
+		 * reuse to expect.
+		 */
+		unsigned lds_total = max_esverts * esvert_lds_size +
+				     max_gsprims * gsprim_lds_size;
+		if (lds_total > target_lds_size) {
+			max_esverts = max_esverts * target_lds_size / lds_total;
+			max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+						 min_verts_per_prim, uses_adjacency);
+			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+		}
+	}
+
+	/* Round up towards full wave sizes for better ALU utilization. */
+	if (!max_vert_out_per_gs_instance) {
+		const unsigned wavesize = 64;
+		unsigned orig_max_esverts;
+		unsigned orig_max_gsprims;
+		do {
+			orig_max_esverts = max_esverts;
+			orig_max_gsprims = max_gsprims;
+
+			max_esverts = align(max_esverts, wavesize);
+			max_esverts = MIN2(max_esverts, max_esverts_base);
+			if (esvert_lds_size)
+				max_esverts = MIN2(max_esverts,
+						   (max_lds_size - max_gsprims * gsprim_lds_size) /
+						   esvert_lds_size);
+			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+			max_gsprims = align(max_gsprims, wavesize);
+			max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+			if (gsprim_lds_size)
+				max_gsprims = MIN2(max_gsprims,
+						   (max_lds_size - max_esverts * esvert_lds_size) /
+						   gsprim_lds_size);
+			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+						 min_verts_per_prim, uses_adjacency);
+			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+		} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+	}
+
+	/* Hardware restriction: minimum value of max_esverts */
+	max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+	unsigned max_out_vertices =
+		max_vert_out_per_gs_instance ? gs_info->gs.vertices_out :
+		gs_type == MESA_SHADER_GEOMETRY ?
+		max_gsprims * gs_num_invocations * gs_info->gs.vertices_out :
+		max_esverts;
+	assert(max_out_vertices <= 256);
+
+	unsigned prim_amp_factor = 1;
+	if (gs_type == MESA_SHADER_GEOMETRY) {
+		/* Number of output primitives per GS input primitive after
+		 * GS instancing. */
+		prim_amp_factor = gs_info->gs.vertices_out;
+	}
+
+	/* The GE only checks against the maximum number of ES verts after
+	 * allocating a full GS primitive. So we need to ensure that whenever
+	 * this check passes, there is enough space for a full primitive without
+	 * vertex reuse.
+	 */
+	ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+	ngg.max_gsprims = max_gsprims;
+	ngg.max_out_verts = max_out_vertices;
+	ngg.prim_amp_factor = prim_amp_factor;
+	ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+	ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+	ngg.vgt_esgs_ring_itemsize = 1;
+
+	pipeline->graphics.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+
+	assert(ngg.hw_max_esverts >= 24); /* HW limitation */
+
+	return ngg;
+}
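To make the sizing math above concrete, a worked example for the VS-only path with no streamout (illustrative, derived from the constants above; not part of the patch):

#include <assert.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	/* VS-only NGG: esvert_lds_size and gsprim_lds_size are 0, so no
	 * LDS-based scaling applies and only the base limits matter. */
	unsigned max_verts_per_prim = 3; /* triangles */
	unsigned max_esverts = MIN2(256, 251 + max_verts_per_prim - 1);    /* 253 */
	unsigned max_gsprims = 128;      /* default prim group size clamp */

	max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); /* 253 */
	max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);          /* 253 */

	/* The GE checks the ES-vert limit only after allocating a full
	 * primitive, so leave room for one primitive without reuse. */
	unsigned hw_max_esverts = max_esverts - max_verts_per_prim + 1;
	assert(hw_max_esverts == 251 && hw_max_esverts >= 24);
	return 0;
}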
 
 static void
 calculate_gs_ring_sizes(struct radv_pipeline *pipeline, const struct radv_gs_state *gs)
 {
@@ -2000,7 +2221,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 }
 
 static void
-radv_fill_shader_keys(struct radv_shader_variant_key *keys,
+radv_fill_shader_keys(struct radv_device *device,
+		      struct radv_shader_variant_key *keys,
 		      const struct radv_pipeline_key *key,
 		      nir_shader **nir)
 {
@@ -2031,6 +2253,10 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
 		keys[MESA_SHADER_VERTEX].vs.out.as_es = true;
 	}
 
+	if (device->physical_device->rad_info.chip_class >= GFX10) {
+		keys[MESA_SHADER_VERTEX].vs.out.as_ngg = true;
+	}
+
 	for(int i = 0; i < MESA_SHADER_STAGES; ++i)
 		keys[i].has_multiview_view_index = key->has_multiview_view_index;
 
@@ -2221,7 +2447,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 			nir_print_shader(nir[i], stderr);
 	}
 
-	radv_fill_shader_keys(keys, key, nir);
+	radv_fill_shader_keys(device, keys, key, nir);
 
 	if (nir[MESA_SHADER_FRAGMENT]) {
 		if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
@@ -2356,6 +2582,8 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 {
 	bool has_gs = radv_pipeline_has_gs(pipeline);
 	bool has_tess = radv_pipeline_has_tess(pipeline);
+	bool has_ngg = radv_pipeline_has_ngg(pipeline);
 
 	switch (stage) {
 	case MESA_SHADER_FRAGMENT:
 		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
@@ -2379,6 +2607,9 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 			}
 		}
 
+		if (has_ngg)
+			return R_00B230_SPI_SHADER_USER_DATA_GS_0;
+
 		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
 	case MESA_SHADER_GEOMETRY:
 		return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
@@ -2968,8 +3199,7 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 				   struct radv_pipeline *pipeline)
 {
 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
-	uint32_t vgt_primitiveid_en = false;
+	unsigned vgt_primitiveid_en = 0;
 	uint32_t vgt_gs_mode = 0;
 
 	if (radv_pipeline_has_gs(pipeline)) {
@@ -2978,9 +3208,17 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 
 		vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
 					     pipeline->device->physical_device->rad_info.chip_class);
+	} else if (radv_pipeline_has_ngg(pipeline)) {
+		const struct radv_shader_variant *vs =
+			pipeline->shaders[MESA_SHADER_VERTEX];
+		bool enable_prim_id =
+			outinfo->export_prim_id || vs->info.info.uses_prim_id;
+
+		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(enable_prim_id) |
+				      S_028A84_NGG_DISABLE_PROVOK_REUSE(enable_prim_id);
 	} else if (outinfo->export_prim_id) {
 		vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
-		vgt_primitiveid_en = true;
+		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
 	}
 
 	radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
@@ -3084,6 +3322,105 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
 	radeon_emit(cs, rsrc2);
 }
 
+static void
+radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs,
+			      struct radeon_cmdbuf *cs,
+			      struct radv_pipeline *pipeline,
+			      struct radv_shader_variant *shader,
+			      const struct radv_ngg_state *ngg_state)
+{
+	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
+
+	radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
+	radeon_emit(cs, va >> 8);
+	radeon_emit(cs, va >> 40);
+	radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+	radeon_emit(cs, shader->config.rsrc1);
+	radeon_emit(cs, shader->config.rsrc2);
+
+	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
+	unsigned clip_dist_mask, cull_dist_mask, total_mask;
+	clip_dist_mask = outinfo->clip_dist_mask;
+	cull_dist_mask = outinfo->cull_dist_mask;
+	total_mask = clip_dist_mask | cull_dist_mask;
+	bool misc_vec_ena = outinfo->writes_pointsize ||
+			    outinfo->writes_layer ||
+			    outinfo->writes_viewport_index;
+	bool break_wave_at_eoi = false;
+
+	radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
+			       S_0286C4_VS_EXPORT_COUNT(MAX2(1, outinfo->param_exports) - 1));
+	radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
+			       S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
+	radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
+			       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+			       S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE) |
+			       S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE) |
+			       S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE));
+
+	radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL,
+			       S_028818_VTX_W0_FMT(1) |
+			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+	radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
+			       S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
+			       S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
+			       S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
+			       S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+			       S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
+			       S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
+			       S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
+			       cull_dist_mask << 8 |
+			       clip_dist_mask);
+
+	/* TODO: Correctly set REUSE_OFF */
+	radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
+			       S_028AB4_REUSE_OFF(0));
+	radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+			       ngg_state->vgt_esgs_ring_itemsize);
+
+	/* NGG specific registers. */
+	struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
+	uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
+
+	radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+			       S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
+			       S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
+			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
+	radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+			       S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
+	radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
+			       S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
+			       S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
+	radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
+			       S_028B90_CNT(gs_num_invocations) |
+			       S_028B90_ENABLE(gs_num_invocations > 1) |
+			       S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
+
+	/* User edge flags are set by the pos exports. If user edge flags are
+	 * not used, we must use hw-generated edge flags and pass them via
+	 * the prim export to prevent drawing lines on internal edges of
+	 * decomposed primitives (such as quads) with polygon mode = lines.
+	 *
+	 * TODO: We should combine hw-generated edge flags with user edge
+	 *       flags in the shader.
+	 */
+	radeon_set_context_reg(ctx_cs, R_028838_PA_CL_NGG_CNTL,
+			       S_028838_INDEX_BUF_EDGE_FLAG_ENA(1));
+
+	radeon_set_context_reg(ctx_cs, R_03096C_GE_CNTL,
+			       S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
+			       S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts) |
+			       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
+}
+
 static void
 radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
@@ -3127,7 +3464,8 @@ static void
 radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
 				     struct radeon_cmdbuf *cs,
 				     struct radv_pipeline *pipeline,
-				     const struct radv_tessellation_state *tess)
+				     const struct radv_tessellation_state *tess,
+				     const struct radv_ngg_state *ngg)
 {
 	struct radv_shader_variant *vs;
 
@@ -3140,6 +3478,8 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
 		radv_pipeline_generate_hw_ls(cs, pipeline, vs, tess);
 	else if (vs->info.vs.as_es)
 		radv_pipeline_generate_hw_es(cs, pipeline, vs);
+	else if (vs->info.is_ngg)
+		radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs, ngg);
 	else
 		radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
 }
@@ -3468,13 +3808,20 @@ radv_compute_vgt_shader_stages_en(const struct radv_pipeline *pipeline)
 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
 				  S_028B54_GS_EN(1) |
 				  S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+		else if (radv_pipeline_has_ngg(pipeline))
+			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+				  S_028B54_PRIMGEN_EN(1);
 		else
 			stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
 
-	} else if (radv_pipeline_has_gs(pipeline))
+	} else if (radv_pipeline_has_gs(pipeline)) {
 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 			  S_028B54_GS_EN(1) |
 			  S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+	} else if (radv_pipeline_has_ngg(pipeline)) {
+		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+			  S_028B54_PRIMGEN_EN(1);
+	}
 
 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
 		stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
@@ -3555,6 +3902,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 			   const struct radv_blend_state *blend,
 			   const struct radv_tessellation_state *tess,
 			   const struct radv_gs_state *gs,
+			   const struct radv_ngg_state *ngg,
 			   unsigned prim, unsigned gs_out)
 {
 	struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
@@ -3570,7 +3918,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 	radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
 	radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
 	radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
-	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess);
+	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess, ngg);
 	radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess);
 	radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs);
 	radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
@@ -3578,7 +3926,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 	radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
 	radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo);
 
-	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10)
+	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
 		gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess, gs);
 
 	radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
@@ -3848,8 +4196,12 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 		}
 	}
 
+	struct radv_ngg_state ngg = {0};
 	struct radv_gs_state gs = {0};
-	if (radv_pipeline_has_gs(pipeline)) {
+
+	if (radv_pipeline_has_ngg(pipeline)) {
+		ngg = calculate_ngg_info(pCreateInfo, pipeline);
+	} else if (radv_pipeline_has_gs(pipeline)) {
 		gs = calculate_gs_info(pCreateInfo, pipeline);
 		calculate_gs_ring_sizes(pipeline, &gs);
 	}
@@ -3885,7 +4237,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
 
 	result = radv_pipeline_scratch_init(device, pipeline);
-	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out);
+	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, &ngg, prim, gs_out);
 
 	return result;
 }
@@ -1510,6 +1510,8 @@ static inline bool radv_pipeline_has_tess(const struct radv_pipeline *pipeline)
 	return pipeline->shaders[MESA_SHADER_TESS_CTRL] ? true : false;
 }
 
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline);
+
 struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
 						 gl_shader_stage stage,
 						 int idx);
@@ -583,7 +583,9 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
 		break;
 	case MESA_SHADER_VERTEX:
-		if (info->vs.as_ls) {
+		if (info->is_ngg) {
+			config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+		} else if (info->vs.as_ls) {
 			assert(pdevice->rad_info.chip_class <= GFX8);
 			/* We need at least 2 components for LS.
 			 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
@@ -632,8 +634,19 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		break;
 	}
 
-	if (pdevice->rad_info.chip_class >= GFX9 &&
-	    stage == MESA_SHADER_GEOMETRY) {
+	if (pdevice->rad_info.chip_class >= GFX10 &&
+	    stage == MESA_SHADER_VERTEX) {
+		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
+
+		/* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
+		es_vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 0;
+		gs_vgpr_comp_cnt = 3;
+
+		config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+		config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+				     S_00B22C_LDS_SIZE(config_in->lds_size);
+	} else if (pdevice->rad_info.chip_class >= GFX9 &&
+		   stage == MESA_SHADER_GEOMETRY) {
 		unsigned es_type = info->gs.es_type;
 		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
 
@@ -65,6 +65,7 @@ enum {
 struct radv_vs_out_key {
 	uint32_t as_es:1;
 	uint32_t as_ls:1;
+	uint32_t as_ngg:1;
 	uint32_t export_prim_id:1;
 	uint32_t export_layer_id:1;
 	uint32_t export_clip_dists:1;
@@ -264,6 +265,7 @@ struct radv_shader_variant_info {
 	unsigned num_input_vgprs;
 	unsigned private_mem_vgprs;
 	bool need_indirect_descriptor_sets;
+	bool is_ngg;
 	struct {
 		struct {
 			struct radv_vs_output_info outinfo;
@@ -317,6 +317,17 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 	}
 
+	if (physical_device->rad_info.chip_class >= GFX10) {
+		/* Break up a pixel wave if it contains deallocs for more than
+		 * half the parameter cache.
+		 *
+		 * To avoid a deadlock where pixel waves aren't launched
+		 * because they're waiting for more pixels while the frontend
+		 * is stuck waiting for PC space, the maximum allowed value is
+		 * the size of the PC minus the largest possible allocation for
+		 * a single primitive shader subgroup.
+		 */
+		radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL,
+				       S_028C50_MAX_DEALLOCS_IN_WAVE(512));
+		radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+		radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
+				       physical_device->rad_info.pa_sc_tile_steering_override);