From 2f9b30f30c80dc5c6e60e6d023dee7a24ce4ec5a Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 26 Aug 2022 12:03:11 +0200 Subject: [PATCH] radv: move computing NGG info and GS info to radv_nir_shader_info_link() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's a link step somehow, except for VS only on GFX10+ but keep it there anyways. Signed-off-by: Samuel Pitoiset Reviewed-by: Timur Kristóf Part-of: --- src/amd/vulkan/radv_pipeline.c | 393 +---------------------------- src/amd/vulkan/radv_private.h | 2 +- src/amd/vulkan/radv_shader_info.c | 394 +++++++++++++++++++++++++++++- 3 files changed, 396 insertions(+), 393 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index b74a6abdb56..0452decda6e 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1893,142 +1893,6 @@ radv_pipeline_init_depth_stencil_state(struct radv_graphics_pipeline *pipeline, return ds_state; } -static void -gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage, - struct radv_pipeline_stage *gs_stage) -{ - const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; - struct radv_shader_info *gs_info = &gs_stage->info; - struct radv_shader_info *es_info = &es_stage->info; - struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info; - - const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1); - const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY || - gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY; - - /* All these are in dwords: */ - /* We can't allow using the whole LDS, because GS waves compete with - * other shader stages for LDS space. */ - const unsigned max_lds_size = 8 * 1024; - const unsigned esgs_itemsize = es_info->esgs_itemsize / 4; - unsigned esgs_lds_size; - - /* All these are per subgroup: */ - const unsigned max_out_prims = 32 * 1024; - const unsigned max_es_verts = 255; - const unsigned ideal_gs_prims = 64; - unsigned max_gs_prims, gs_prims; - unsigned min_es_verts, es_verts, worst_case_es_verts; - - if (uses_adjacency || gs_num_invocations > 1) - max_gs_prims = 127 / gs_num_invocations; - else - max_gs_prims = 255; - - /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. - * Make sure we don't go over the maximum value. - */ - if (gs_info->gs.vertices_out > 0) { - max_gs_prims = - MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations)); - } - assert(max_gs_prims > 0); - - /* If the primitive has adjacency, halve the number of vertices - * that will be reused in multiple primitives. - */ - min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1); - - gs_prims = MIN2(ideal_gs_prims, max_gs_prims); - worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); - - /* Compute ESGS LDS size based on the worst case number of ES vertices - * needed to create the target number of GS prims per subgroup. - */ - esgs_lds_size = esgs_itemsize * worst_case_es_verts; - - /* If total LDS usage is too big, refactor partitions based on ratio - * of ESGS item sizes. - */ - if (esgs_lds_size > max_lds_size) { - /* Our target GS Prims Per Subgroup was too large. Calculate - * the maximum number of GS Prims Per Subgroup that will fit - * into LDS, capped by the maximum that the hardware can support. - */ - gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims); - assert(gs_prims > 0); - worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); - - esgs_lds_size = esgs_itemsize * worst_case_es_verts; - assert(esgs_lds_size <= max_lds_size); - } - - /* Now calculate remaining ESGS information. */ - if (esgs_lds_size) - es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); - else - es_verts = max_es_verts; - - /* Vertices for adjacency primitives are not always reused, so restore - * it for ES_VERTS_PER_SUBGRP. - */ - min_es_verts = gs_info->gs.vertices_in; - - /* For normal primitives, the VGT only checks if they are past the ES - * verts per subgroup after allocating a full GS primitive and if they - * are, kick off a new subgroup. But if those additional ES verts are - * unique (e.g. not reused) we need to make sure there is enough LDS - * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. - */ - es_verts -= min_es_verts - 1; - - const uint32_t es_verts_per_subgroup = es_verts; - const uint32_t gs_prims_per_subgroup = gs_prims; - const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; - const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out; - out->lds_size = align(esgs_lds_size, 128) / 128; - out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) | - S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup); - out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup); - out->vgt_esgs_ring_itemsize = esgs_itemsize; - assert(max_prims_per_subgroup <= max_out_prims); - - unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size, - es_verts_per_subgroup, gs_inst_prims_in_subgroup); - es_info->workgroup_size = workgroup_size; - gs_info->workgroup_size = workgroup_size; -} - -static void -clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim, - bool use_adjacency) -{ - unsigned max_reuse = max_esverts - min_verts_per_prim; - if (use_adjacency) - max_reuse /= 2; - *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); -} - -static unsigned -radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage, - const struct radv_pipeline_stage *gs_stage) -{ - if (gs_stage) { - return gs_stage->nir->info.gs.vertices_in; - } - - if (es_stage->stage == MESA_SHADER_TESS_EVAL) { - if (es_stage->nir->info.tess.point_mode) - return 1; - if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) - return 2; - return 3; - } - - return 3; -} - static void gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level, uint32_t oversub_pc_lines) @@ -2038,245 +1902,6 @@ gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level, S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1)); } -static unsigned -radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage, - const struct radv_pipeline_stage *gs_stage) -{ - if (gs_stage) { - return gs_stage->nir->info.gs.input_primitive; - } - - if (es_stage->stage == MESA_SHADER_TESS_EVAL) { - if (es_stage->nir->info.tess.point_mode) - return SHADER_PRIM_POINTS; - if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) - return SHADER_PRIM_LINES; - return SHADER_PRIM_TRIANGLES; - } - - return SHADER_PRIM_TRIANGLES; -} - -static void -gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage, - struct radv_pipeline_stage *gs_stage) -{ - const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; - struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL; - struct radv_shader_info *es_info = &es_stage->info; - const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage); - const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1; - struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info; - - const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1; - - const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage); - const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY || - input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY; - - /* All these are in dwords: */ - /* We can't allow using the whole LDS, because GS waves compete with - * other shader stages for LDS space. - * - * TODO: We should really take the shader's internal LDS use into - * account. The linker will fail if the size is greater than - * 8K dwords. - */ - const unsigned max_lds_size = 8 * 1024 - 768; - const unsigned target_lds_size = max_lds_size; - unsigned esvert_lds_size = 0; - unsigned gsprim_lds_size = 0; - - /* All these are per subgroup: */ - const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24; - bool max_vert_out_per_gs_instance = false; - unsigned max_esverts_base = 128; - unsigned max_gsprims_base = 128; /* default prim group size clamp */ - - /* Hardware has the following non-natural restrictions on the value - * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of - * the draw: - * - at most 252 for any line input primitive type - * - at most 251 for any quad input primitive type - * - at most 251 for triangle strips with adjacency (this happens to - * be the natural limit for triangle *lists* with adjacency) - */ - max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); - - if (gs_stage) { - unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations; - - if (max_out_verts_per_gsprim <= 256) { - if (max_out_verts_per_gsprim) { - max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); - } - } else { - /* Use special multi-cycling mode in which each GS - * instance gets its own subgroup. Does not work with - * tessellation. */ - max_vert_out_per_gs_instance = true; - max_gsprims_base = 1; - max_out_verts_per_gsprim = gs_info->gs.vertices_out; - } - - esvert_lds_size = es_info->esgs_itemsize / 4; - gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; - } else { - /* VS and TES. */ - /* LDS size for passing data from GS to ES. */ - struct radv_streamout_info *so_info = &es_info->so; - - if (so_info->num_outputs) - esvert_lds_size = 4 * so_info->num_outputs + 1; - - /* GS stores Primitive IDs (one DWORD) into LDS at the address - * corresponding to the ES thread of the provoking vertex. All - * ES threads load and export PrimitiveID for their thread. - */ - if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id) - esvert_lds_size = MAX2(esvert_lds_size, 1); - } - - unsigned max_gsprims = max_gsprims_base; - unsigned max_esverts = max_esverts_base; - - if (esvert_lds_size) - max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); - if (gsprim_lds_size) - max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - - if (esvert_lds_size || gsprim_lds_size) { - /* Now that we have a rough proportionality between esverts - * and gsprims based on the primitive type, scale both of them - * down simultaneously based on required LDS space. - * - * We could be smarter about this if we knew how much vertex - * reuse to expect. - */ - unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; - if (lds_total > target_lds_size) { - max_esverts = max_esverts * target_lds_size / lds_total; - max_gsprims = max_gsprims * target_lds_size / lds_total; - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } - } - - /* Round up towards full wave sizes for better ALU utilization. */ - if (!max_vert_out_per_gs_instance) { - unsigned orig_max_esverts; - unsigned orig_max_gsprims; - unsigned wavesize; - - if (gs_stage) { - wavesize = gs_info->wave_size; - } else { - wavesize = es_info->wave_size; - } - - do { - orig_max_esverts = max_esverts; - orig_max_gsprims = max_gsprims; - - max_esverts = align(max_esverts, wavesize); - max_esverts = MIN2(max_esverts, max_esverts_base); - if (esvert_lds_size) - max_esverts = - MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - - /* Hardware restriction: minimum value of max_esverts */ - if (gfx_level == GFX10) - max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim); - else - max_esverts = MAX2(max_esverts, min_esverts); - - max_gsprims = align(max_gsprims, wavesize); - max_gsprims = MIN2(max_gsprims, max_gsprims_base); - if (gsprim_lds_size) { - /* Don't count unusable vertices to the LDS - * size. Those are vertices above the maximum - * number of vertices that can occur in the - * workgroup, which is e.g. max_gsprims * 3 - * for triangles. - */ - unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - max_gsprims = MIN2(max_gsprims, - (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size); - } - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); - - /* Verify the restriction. */ - if (gfx_level == GFX10) - assert(max_esverts >= min_esverts - 1 + max_verts_per_prim); - else - assert(max_esverts >= min_esverts); - } else { - /* Hardware restriction: minimum value of max_esverts */ - if (gfx_level == GFX10) - max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim); - else - max_esverts = MAX2(max_esverts, min_esverts); - } - - unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out - : gs_stage - ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out - : max_esverts; - assert(max_out_vertices <= 256); - - unsigned prim_amp_factor = 1; - if (gs_stage) { - /* Number of output primitives per GS input primitive after - * GS instancing. */ - prim_amp_factor = gs_info->gs.vertices_out; - } - - /* On Gfx10, the GE only checks against the maximum number of ES verts - * after allocating a full GS primitive. So we need to ensure that - * whenever this check passes, there is enough space for a full - * primitive without vertex reuse. - */ - if (gfx_level == GFX10) - out->hw_max_esverts = max_esverts - max_verts_per_prim + 1; - else - out->hw_max_esverts = max_esverts; - - out->max_gsprims = max_gsprims; - out->max_out_verts = max_out_vertices; - out->prim_amp_factor = prim_amp_factor; - out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; - out->ngg_emit_size = max_gsprims * gsprim_lds_size; - out->enable_vertex_grouping = true; - - /* Don't count unusable vertices. */ - out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4; - - if (gs_stage) { - out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4; - } else { - out->vgt_esgs_ring_itemsize = 1; - } - - assert(out->hw_max_esverts >= min_esverts); /* HW limitation */ - - unsigned workgroup_size = - ac_compute_ngg_workgroup_size( - max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor); - if (gs_stage) { - gs_info->workgroup_size = workgroup_size; - } - es_info->workgroup_size = workgroup_size; -} - static void radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs) { @@ -3328,7 +2953,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, &stages[i].info); } - radv_nir_shader_info_link(device, pipeline_key, stages, last_vgt_api_stage); + radv_nir_shader_info_link(device, pipeline_key, stages, pipeline_has_ngg, last_vgt_api_stage); if (stages[MESA_SHADER_TESS_CTRL].nir) { for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) { @@ -3359,21 +2984,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX); } - if (pipeline_has_ngg) { - if (last_vgt_api_stage != MESA_SHADER_MESH) { - struct radv_pipeline_stage *es_stage = - stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX]; - struct radv_pipeline_stage *gs_stage = - stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL; - - gfx10_get_ngg_info(device, es_stage, gs_stage); - } - } else if (stages[MESA_SHADER_GEOMETRY].nir) { - struct radv_pipeline_stage *es_stage = - stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX]; - - gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]); - } else { + if (!pipeline_has_ngg && !stages[MESA_SHADER_GEOMETRY].nir) { gl_shader_stage hw_vs_api_stage = stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 3c523faa687..c14eb519c17 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -2804,7 +2804,7 @@ void radv_nir_shader_info_init(struct radv_shader_info *info); void radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline_key *pipeline_key, - struct radv_pipeline_stage *stages, + struct radv_pipeline_stage *stages, bool pipeline_has_ngg, gl_shader_stage last_vgt_api_stage); bool radv_thread_trace_init(struct radv_device *device); diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index ae192f51880..c811df0415e 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -798,9 +798,385 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info); } +static void +gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage, + struct radv_pipeline_stage *gs_stage) +{ + const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; + struct radv_shader_info *gs_info = &gs_stage->info; + struct radv_shader_info *es_info = &es_stage->info; + struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info; + + const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1); + const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY || + gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. */ + const unsigned max_lds_size = 8 * 1024; + const unsigned esgs_itemsize = es_info->esgs_itemsize / 4; + unsigned esgs_lds_size; + + /* All these are per subgroup: */ + const unsigned max_out_prims = 32 * 1024; + const unsigned max_es_verts = 255; + const unsigned ideal_gs_prims = 64; + unsigned max_gs_prims, gs_prims; + unsigned min_es_verts, es_verts, worst_case_es_verts; + + if (uses_adjacency || gs_num_invocations > 1) + max_gs_prims = 127 / gs_num_invocations; + else + max_gs_prims = 255; + + /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. + * Make sure we don't go over the maximum value. + */ + if (gs_info->gs.vertices_out > 0) { + max_gs_prims = + MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations)); + } + assert(max_gs_prims > 0); + + /* If the primitive has adjacency, halve the number of vertices + * that will be reused in multiple primitives. + */ + min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1); + + gs_prims = MIN2(ideal_gs_prims, max_gs_prims); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + /* Compute ESGS LDS size based on the worst case number of ES vertices + * needed to create the target number of GS prims per subgroup. + */ + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + + /* If total LDS usage is too big, refactor partitions based on ratio + * of ESGS item sizes. + */ + if (esgs_lds_size > max_lds_size) { + /* Our target GS Prims Per Subgroup was too large. Calculate + * the maximum number of GS Prims Per Subgroup that will fit + * into LDS, capped by the maximum that the hardware can support. + */ + gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims); + assert(gs_prims > 0); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + assert(esgs_lds_size <= max_lds_size); + } + + /* Now calculate remaining ESGS information. */ + if (esgs_lds_size) + es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); + else + es_verts = max_es_verts; + + /* Vertices for adjacency primitives are not always reused, so restore + * it for ES_VERTS_PER_SUBGRP. + */ + min_es_verts = gs_info->gs.vertices_in; + + /* For normal primitives, the VGT only checks if they are past the ES + * verts per subgroup after allocating a full GS primitive and if they + * are, kick off a new subgroup. But if those additional ES verts are + * unique (e.g. not reused) we need to make sure there is enough LDS + * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. + */ + es_verts -= min_es_verts - 1; + + const uint32_t es_verts_per_subgroup = es_verts; + const uint32_t gs_prims_per_subgroup = gs_prims; + const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; + const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out; + out->lds_size = align(esgs_lds_size, 128) / 128; + out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) | + S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup); + out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup); + out->vgt_esgs_ring_itemsize = esgs_itemsize; + assert(max_prims_per_subgroup <= max_out_prims); + + unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size, + es_verts_per_subgroup, gs_inst_prims_in_subgroup); + es_info->workgroup_size = workgroup_size; + gs_info->workgroup_size = workgroup_size; +} + +static void +clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim, + bool use_adjacency) +{ + unsigned max_reuse = max_esverts - min_verts_per_prim; + if (use_adjacency) + max_reuse /= 2; + *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); +} + +static unsigned +radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage, + const struct radv_pipeline_stage *gs_stage) +{ + if (gs_stage) { + return gs_stage->nir->info.gs.vertices_in; + } + + if (es_stage->stage == MESA_SHADER_TESS_EVAL) { + if (es_stage->nir->info.tess.point_mode) + return 1; + if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) + return 2; + return 3; + } + + return 3; +} + +static unsigned +radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage, + const struct radv_pipeline_stage *gs_stage) +{ + if (gs_stage) { + return gs_stage->nir->info.gs.input_primitive; + } + + if (es_stage->stage == MESA_SHADER_TESS_EVAL) { + if (es_stage->nir->info.tess.point_mode) + return SHADER_PRIM_POINTS; + if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) + return SHADER_PRIM_LINES; + return SHADER_PRIM_TRIANGLES; + } + + return SHADER_PRIM_TRIANGLES; +} + +static void +gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage, + struct radv_pipeline_stage *gs_stage) +{ + const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; + struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL; + struct radv_shader_info *es_info = &es_stage->info; + const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage); + const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1; + struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info; + + const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1; + + const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage); + const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY || + input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. + * + * TODO: We should really take the shader's internal LDS use into + * account. The linker will fail if the size is greater than + * 8K dwords. + */ + const unsigned max_lds_size = 8 * 1024 - 768; + const unsigned target_lds_size = max_lds_size; + unsigned esvert_lds_size = 0; + unsigned gsprim_lds_size = 0; + + /* All these are per subgroup: */ + const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24; + bool max_vert_out_per_gs_instance = false; + unsigned max_esverts_base = 128; + unsigned max_gsprims_base = 128; /* default prim group size clamp */ + + /* Hardware has the following non-natural restrictions on the value + * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of + * the draw: + * - at most 252 for any line input primitive type + * - at most 251 for any quad input primitive type + * - at most 251 for triangle strips with adjacency (this happens to + * be the natural limit for triangle *lists* with adjacency) + */ + max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); + + if (gs_stage) { + unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations; + + if (max_out_verts_per_gsprim <= 256) { + if (max_out_verts_per_gsprim) { + max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); + } + } else { + /* Use special multi-cycling mode in which each GS + * instance gets its own subgroup. Does not work with + * tessellation. */ + max_vert_out_per_gs_instance = true; + max_gsprims_base = 1; + max_out_verts_per_gsprim = gs_info->gs.vertices_out; + } + + esvert_lds_size = es_info->esgs_itemsize / 4; + gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; + } else { + /* VS and TES. */ + /* LDS size for passing data from GS to ES. */ + struct radv_streamout_info *so_info = &es_info->so; + + if (so_info->num_outputs) + esvert_lds_size = 4 * so_info->num_outputs + 1; + + /* GS stores Primitive IDs (one DWORD) into LDS at the address + * corresponding to the ES thread of the provoking vertex. All + * ES threads load and export PrimitiveID for their thread. + */ + if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id) + esvert_lds_size = MAX2(esvert_lds_size, 1); + } + + unsigned max_gsprims = max_gsprims_base; + unsigned max_esverts = max_esverts_base; + + if (esvert_lds_size) + max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); + if (gsprim_lds_size) + max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + + if (esvert_lds_size || gsprim_lds_size) { + /* Now that we have a rough proportionality between esverts + * and gsprims based on the primitive type, scale both of them + * down simultaneously based on required LDS space. + * + * We could be smarter about this if we knew how much vertex + * reuse to expect. + */ + unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; + if (lds_total > target_lds_size) { + max_esverts = max_esverts * target_lds_size / lds_total; + max_gsprims = max_gsprims * target_lds_size / lds_total; + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } + } + + /* Round up towards full wave sizes for better ALU utilization. */ + if (!max_vert_out_per_gs_instance) { + unsigned orig_max_esverts; + unsigned orig_max_gsprims; + unsigned wavesize; + + if (gs_stage) { + wavesize = gs_info->wave_size; + } else { + wavesize = es_info->wave_size; + } + + do { + orig_max_esverts = max_esverts; + orig_max_gsprims = max_gsprims; + + max_esverts = align(max_esverts, wavesize); + max_esverts = MIN2(max_esverts, max_esverts_base); + if (esvert_lds_size) + max_esverts = + MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + + /* Hardware restriction: minimum value of max_esverts */ + if (gfx_level == GFX10) + max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim); + else + max_esverts = MAX2(max_esverts, min_esverts); + + max_gsprims = align(max_gsprims, wavesize); + max_gsprims = MIN2(max_gsprims, max_gsprims_base); + if (gsprim_lds_size) { + /* Don't count unusable vertices to the LDS + * size. Those are vertices above the maximum + * number of vertices that can occur in the + * workgroup, which is e.g. max_gsprims * 3 + * for triangles. + */ + unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + max_gsprims = MIN2(max_gsprims, + (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size); + } + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); + + /* Verify the restriction. */ + if (gfx_level == GFX10) + assert(max_esverts >= min_esverts - 1 + max_verts_per_prim); + else + assert(max_esverts >= min_esverts); + } else { + /* Hardware restriction: minimum value of max_esverts */ + if (gfx_level == GFX10) + max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim); + else + max_esverts = MAX2(max_esverts, min_esverts); + } + + unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out + : gs_stage + ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out + : max_esverts; + assert(max_out_vertices <= 256); + + unsigned prim_amp_factor = 1; + if (gs_stage) { + /* Number of output primitives per GS input primitive after + * GS instancing. */ + prim_amp_factor = gs_info->gs.vertices_out; + } + + /* On Gfx10, the GE only checks against the maximum number of ES verts + * after allocating a full GS primitive. So we need to ensure that + * whenever this check passes, there is enough space for a full + * primitive without vertex reuse. + */ + if (gfx_level == GFX10) + out->hw_max_esverts = max_esverts - max_verts_per_prim + 1; + else + out->hw_max_esverts = max_esverts; + + out->max_gsprims = max_gsprims; + out->max_out_verts = max_out_vertices; + out->prim_amp_factor = prim_amp_factor; + out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; + out->ngg_emit_size = max_gsprims * gsprim_lds_size; + out->enable_vertex_grouping = true; + + /* Don't count unusable vertices. */ + out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4; + + if (gs_stage) { + out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4; + } else { + out->vgt_esgs_ring_itemsize = 1; + } + + assert(out->hw_max_esverts >= min_esverts); /* HW limitation */ + + unsigned workgroup_size = + ac_compute_ngg_workgroup_size( + max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor); + if (gs_stage) { + gs_info->workgroup_size = workgroup_size; + } + es_info->workgroup_size = workgroup_size; +} + void radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline_key *pipeline_key, - struct radv_pipeline_stage *stages, gl_shader_stage last_vgt_api_stage) + struct radv_pipeline_stage *stages, bool pipeline_has_ngg, + gl_shader_stage last_vgt_api_stage) { if (stages[MESA_SHADER_FRAGMENT].nir) { assert(last_vgt_api_stage != MESA_SHADER_NONE); @@ -897,4 +1273,20 @@ radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline /* Task/mesh I/O uses the task ring buffers. */ stages[MESA_SHADER_MESH].info.ms.has_task = true; } + + if (pipeline_has_ngg) { + if (last_vgt_api_stage != MESA_SHADER_MESH) { + struct radv_pipeline_stage *es_stage = + stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX]; + struct radv_pipeline_stage *gs_stage = + stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL; + + gfx10_get_ngg_info(device, es_stage, gs_stage); + } + } else if (stages[MESA_SHADER_GEOMETRY].nir) { + struct radv_pipeline_stage *es_stage = + stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX]; + + gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]); + } }