radv: Fill some tess shader info earlier.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9201>
2021-03-03 16:29:59 +01:00
parent 52219ad3a0
commit b3a16c0e19
5 changed files with 95 additions and 104 deletions
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -11053,7 +11053,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
   store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info());

   /* Store to offchip for TES to read - only if TES reads them */
-   if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
+   if (ctx->args->shader_info->tcs.tes_reads_tess_factors) {
      Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
      Temp oc_lds = get_arg(ctx, ctx->args->ac.tess_offchip_offset);

--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -448,14 +448,14 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs)
   ctx->tcs_num_inputs = ctx->program->info->tcs.num_linked_inputs;
   ctx->tcs_num_outputs = ctx->program->info->tcs.num_linked_outputs;
   ctx->tcs_num_patch_outputs = ctx->program->info->tcs.num_linked_patch_outputs;
-   ctx->tcs_num_patches = ctx->args->shader_info->tcs.num_patches;
+   ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches;
   ctx->program->config->lds_size = ctx->args->shader_info->tcs.num_lds_blocks;
 }

 void
 setup_tes_variables(isel_context *ctx, nir_shader *nir)
 {
-   ctx->tcs_num_patches = ctx->args->options->key.tes.num_patches;
+   ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches;
   ctx->tcs_num_outputs = ctx->program->info->tes.num_linked_inputs;

   if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) {
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3465,7 +3465,7 @@ write_tess_factors(struct radv_shader_context *ctx)
 					    16 + tf_offset, ac_glc);

 	//store to offchip for TES to read - only if TES reads them
-	if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
+	if (ctx->args->shader_info->tcs.tes_reads_tess_factors) {
 		LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
 		LLVMValueRef tf_inner_offset;

@@ -3986,12 +3986,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 			ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
 			ctx.abi.store_tcs_outputs = store_tcs_output;
 			ctx.tcs_num_inputs = ctx.args->shader_info->tcs.num_linked_inputs;
-			ctx.tcs_num_patches = args->shader_info->tcs.num_patches;
+			ctx.tcs_num_patches = args->shader_info->num_tess_patches;
 		} else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
 			ctx.abi.load_tess_varyings = load_tes_input;
 			ctx.abi.load_tess_coord = load_tess_coord;
 			ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
-			ctx.tcs_num_patches = args->options->key.tes.num_patches;
+			ctx.tcs_num_patches = args->shader_info->num_tess_patches;
 		} else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
 			ctx.abi.load_base_vertex = radv_load_base_vertex;
 		} else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1451,7 +1451,7 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline)
 	const struct radv_device *device = pipeline->device;

 	if (radv_pipeline_has_tess(pipeline))
-		ia_multi_vgt_param.primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
+		ia_multi_vgt_param.primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
 	else if (radv_pipeline_has_gs(pipeline))
 		ia_multi_vgt_param.primgroup_size = 64;
 	else
@@ -2733,8 +2733,6 @@ radv_fill_shader_keys(struct radv_device *device,
 		keys[MESA_SHADER_VERTEX].vs_common_out.as_ls = true;
 		keys[MESA_SHADER_TESS_CTRL].tcs.input_vertices = key->tess_input_vertices;
 		keys[MESA_SHADER_TESS_CTRL].tcs.primitive_mode = nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode;
-
-		keys[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors = !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read & (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
 	}

 	if (nir[MESA_SHADER_GEOMETRY]) {
@@ -2916,13 +2914,6 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
 		filled_stages |= (1 << MESA_SHADER_FRAGMENT);
 	}

-	if (nir[MESA_SHADER_TESS_CTRL]) {
-		infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read =
-			nir[MESA_SHADER_TESS_EVAL]->info.inputs_read;
-		infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read =
-			nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read;
-	}
-
 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
 	    nir[MESA_SHADER_TESS_CTRL]) {
 		struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
@@ -2937,9 +2928,6 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
 						  &infos[MESA_SHADER_TESS_CTRL]);
 		}

-		keys[MESA_SHADER_TESS_EVAL].tes.num_patches =
-			infos[MESA_SHADER_TESS_CTRL].tcs.num_patches;
-
 		filled_stages |= (1 << MESA_SHADER_VERTEX);
 		filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
 	}
@@ -2965,12 +2953,6 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
 	active_stages ^= filled_stages;
 	while (active_stages) {
 		int i = u_bit_scan(&active_stages);
-
-		if (i == MESA_SHADER_TESS_EVAL) {
-			keys[MESA_SHADER_TESS_EVAL].tes.num_patches =
-				infos[MESA_SHADER_TESS_CTRL].tcs.num_patches;
-		}
-
 		radv_nir_shader_info_init(&infos[i]);
 		radv_nir_shader_info_pass(nir[i], pipeline->layout,
 					  &keys[i], &infos[i]);
@@ -2991,7 +2973,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,

 static void
 merge_tess_info(struct shader_info *tes_info,
-                const struct shader_info *tcs_info)
+                struct shader_info *tcs_info)
 {
 	/* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
 	 *
@@ -3026,6 +3008,81 @@ merge_tess_info(struct shader_info *tes_info,
 	tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
 	tes_info->tess.ccw |= tcs_info->tess.ccw;
 	tes_info->tess.point_mode |= tcs_info->tess.point_mode;
+
+	/* Copy the merged info back to the TCS */
+	tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out;
+	tcs_info->tess.spacing = tes_info->tess.spacing;
+	tcs_info->tess.primitive_mode = tes_info->tess.primitive_mode;
+	tcs_info->tess.ccw = tes_info->tess.ccw;
+	tcs_info->tess.point_mode = tes_info->tess.point_mode;
+}
+
+static void
+gather_tess_info(struct radv_device *device,
+	             nir_shader **nir, struct radv_shader_info *infos,
+                 const struct radv_pipeline_key *pipeline_key)
+{
+	merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
+
+	/* Number of tessellation patches per workgroup processed by the current pipeline. */
+	unsigned num_patches =
+		get_tcs_num_patches(
+			pipeline_key->tess_input_vertices,
+			nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out,
+			infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs,
+			infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
+			infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs,
+			device->tess_offchip_block_dw_size,
+			device->physical_device->rad_info.chip_class,
+			device->physical_device->rad_info.family);
+
+	/* LDS size used by VS+TCS for storing TCS inputs and outputs. */
+	unsigned tcs_lds_size =
+		calculate_tess_lds_size(
+			device->physical_device->rad_info.chip_class,
+			pipeline_key->tess_input_vertices,
+			nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out,
+			infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs,
+			num_patches,
+			infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
+			infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs);
+
+	infos[MESA_SHADER_TESS_CTRL].num_tess_patches = num_patches;
+	infos[MESA_SHADER_TESS_CTRL].tcs.num_lds_blocks = tcs_lds_size;
+	infos[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors = !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read & (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
+	infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.inputs_read;
+	infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read;
+
+	infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches;
+	infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches;
+
+	if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
+		/* When the number of TCS input and output vertices are the same (typically 3):
+		 * - There is an equal amount of LS and HS invocations
+		 * - In case of merged LSHS shaders, the LS and HS halves of the shader
+		 *   always process the exact same vertex. We can use this knowledge to optimize them.
+		 *
+		 * We don't set tcs_in_out_eq if the float controls differ because that might
+		 * involve different float modes for the same block and our optimizer
+		 * doesn't handle an instruction dominating another with a different mode.
+		 */
+		infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq =
+			device->physical_device->rad_info.chip_class >= GFX9 &&
+			pipeline_key->tess_input_vertices == nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out &&
+			nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode == nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode;
+
+		if (infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq)
+			infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask =
+				nir[MESA_SHADER_TESS_CTRL]->info.inputs_read &
+				nir[MESA_SHADER_VERTEX]->info.outputs_written &
+				~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read &
+				~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly &
+				~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly;
+
+		/* Copy data to TCS so it can be accessed by the backend if they are merged. */
+		infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq;
+		infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask = infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask;
+	}
 }

 static
@@ -3314,11 +3371,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,
 		radv_stop_feedback(stage_feedbacks[i], false);
 	}

-	if (nir[MESA_SHADER_TESS_CTRL]) {
-		nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
-		merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
-	}
-
 	bool optimize_conservatively = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT;

 	radv_link_shaders(pipeline, nir, optimize_conservatively);
@@ -3337,6 +3389,14 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,
 		}
 	}

+	infos[MESA_SHADER_VERTEX].vs.as_ls = !!nir[MESA_SHADER_TESS_CTRL];
+	infos[MESA_SHADER_VERTEX].vs.as_es = !!nir[MESA_SHADER_GEOMETRY] && !nir[MESA_SHADER_TESS_CTRL];
+	infos[MESA_SHADER_TESS_EVAL].tes.as_es = !!nir[MESA_SHADER_GEOMETRY] && !!nir[MESA_SHADER_TESS_CTRL];
+
+	if (nir[MESA_SHADER_TESS_CTRL]) {
+		nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
+		gather_tess_info(device, nir, infos, pipeline_key);
+	}

 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
 		if (nir[i]) {
@@ -3351,68 +3411,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,
 			}
 			NIR_PASS_V(nir[i], nir_lower_memory_model);

-			if (i == MESA_SHADER_VERTEX) {
-				if (nir[MESA_SHADER_TESS_CTRL] && !radv_use_llvm_for_stage(device, i)) {
-					/* When the number of TCS input and output vertices are the same (typically 3):
-					 * - There is an equal amount of LS and HS invocations
-					 * - In case of merged LSHS shaders, the LS and HS halves of the shader
-					 *   always process the exact same vertex. We can use this knowledge to optimize them.
-					 *
-					 * We don't set tcs_in_out_eq if the float controls differ because that might
-					 * involve different float modes for the same block and our optimizer
-					 * doesn't handle a instruction dominating another with a different mode.
-					 */
-					infos[i].vs.tcs_in_out_eq =
-					 	device->physical_device->rad_info.chip_class >= GFX9 &&
-						pipeline_key->tess_input_vertices == nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out &&
-						nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode == nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode;
-
-					if (infos[i].vs.tcs_in_out_eq)
-						infos[i].vs.tcs_temp_only_input_mask =
-							nir[MESA_SHADER_TESS_CTRL]->info.inputs_read &
-							nir[MESA_SHADER_VERTEX]->info.outputs_written &
-							~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read &
-							~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly &
-							~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly;
-
-					/* Copy data to TCS so it can be accessed by the backend if they are merged. */
-					infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[i].vs.tcs_in_out_eq;
-					infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask = infos[i].vs.tcs_temp_only_input_mask;
-				}
-			} else if (i == MESA_SHADER_TESS_CTRL) {
-				/* Copy correct primitive mode from TES info. */
-				nir[i]->info.tess.primitive_mode = nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode;
-
-				/* Number of tessellation patches processed per workgroup in the current pipeline. */
-				unsigned tcs_num_patches =
-					get_tcs_num_patches(
-						pipeline_key->tess_input_vertices,
-						nir[i]->info.tess.tcs_vertices_out,
-						infos[i].tcs.num_linked_inputs,
-						infos[i].tcs.num_linked_outputs,
-						infos[i].tcs.num_linked_patch_outputs,
-						device->tess_offchip_block_dw_size,
-						device->physical_device->rad_info.chip_class,
-						device->physical_device->rad_info.family);
-
-				/* LDS size used by VS+TCS for storing TCS inputs and outputs. */
-				unsigned tcs_lds_size =
-					calculate_tess_lds_size(
-						device->physical_device->rad_info.chip_class,
-						pipeline_key->tess_input_vertices,
-						nir[i]->info.tess.tcs_vertices_out,
-						infos[i].tcs.num_linked_inputs,
-						tcs_num_patches,
-						infos[i].tcs.num_linked_outputs,
-						infos[i].tcs.num_linked_patch_outputs);
-
-				infos[i].tcs.num_patches = tcs_num_patches;
-				infos[i].tcs.num_lds_blocks = tcs_lds_size;
-			} else if (i == MESA_SHADER_TESS_EVAL) {
-				/* Copy num_patches from TCS info. */
-				keys[i].tes.num_patches = infos[MESA_SHADER_TESS_CTRL].tcs.num_patches;
-			}
-
 			bool lower_to_scalar = false;

 			nir_load_store_vectorize_options vectorize_opts = {
@@ -3613,7 +3611,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,
 			radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false);
 		}
 		modules[MESA_SHADER_VERTEX] = NULL;
-		keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
 	}

 	if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) {
@@ -3637,10 +3634,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,

 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
 		if(modules[i] && !pipeline->shaders[i]) {
-			if (i == MESA_SHADER_TESS_EVAL) {
-				keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
-			}
-
 			radv_start_feedback(stage_feedbacks[i]);

 			pipeline->shaders[i] = radv_shader_variant_compile(device, modules[i], &nir[i], 1,
@@ -4757,7 +4750,7 @@ radv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs,

 	num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints;
 	num_tcs_output_cp = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; //TCS VERTICES OUT
-	num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
+	num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;

 	ls_hs_config = S_028B58_NUM_PATCHES(num_patches) |
 		       S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
@@ -5254,7 +5247,7 @@ gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs,
 	unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */

 	if (radv_pipeline_has_tess(pipeline)) {
-		primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
+		primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
 	} else if (radv_pipeline_has_gs(pipeline)) {
 		const struct gfx9_gs_info *gs_state =
 			&pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -79,15 +79,12 @@ struct radv_vs_variant_key {

 struct radv_tes_variant_key {
 	struct radv_vs_out_key out;
-
-	uint8_t num_patches;
 };

 struct radv_tcs_variant_key {
 	struct radv_vs_variant_key vs_key;
 	unsigned primitive_mode;
 	unsigned input_vertices;
-	uint32_t tes_reads_tess_factors:1;
 };

 struct radv_fs_variant_key {
@@ -259,6 +256,7 @@ struct radv_shader_info {
 	bool need_indirect_descriptor_sets;
 	bool is_ngg;
 	bool is_ngg_passthrough;
+	uint32_t num_tess_patches;
 	struct {
 		uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX];
 		uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
@@ -337,11 +335,11 @@ struct radv_shader_info {
 		uint64_t tes_inputs_read;
 		uint64_t tes_patch_inputs_read;
 		unsigned tcs_vertices_out;
-		uint32_t num_patches;
 		uint32_t num_lds_blocks;
 		uint8_t num_linked_inputs;
 		uint8_t num_linked_outputs;
 		uint8_t num_linked_patch_outputs;
+		bool tes_reads_tess_factors:1;
 	} tcs;

 	struct radv_streamout_info so;