radv: move computing NGG info and GS info to radv_nir_shader_info_link()

It's essentially a link step, except for VS-only pipelines on GFX10+, but
keep it there anyway.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18278>
commit 2f9b30f30c (parent 13af51b2f3)
Author: Samuel Pitoiset, 2022-08-26 12:03:11 +02:00 (committed by Marge Bot)
3 changed files with 396 additions and 393 deletions

@@ -1893,142 +1893,6 @@ radv_pipeline_init_depth_stencil_state(struct radv_graphics_pipeline *pipeline,
return ds_state;
}
static void
gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
struct radv_pipeline_stage *gs_stage)
{
const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
struct radv_shader_info *gs_info = &gs_stage->info;
struct radv_shader_info *es_info = &es_stage->info;
struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info;
const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
/* All these are in dwords: */
/* We can't allow using the whole LDS, because GS waves compete with
* other shader stages for LDS space. */
const unsigned max_lds_size = 8 * 1024;
const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
unsigned esgs_lds_size;
/* All these are per subgroup: */
const unsigned max_out_prims = 32 * 1024;
const unsigned max_es_verts = 255;
const unsigned ideal_gs_prims = 64;
unsigned max_gs_prims, gs_prims;
unsigned min_es_verts, es_verts, worst_case_es_verts;
if (uses_adjacency || gs_num_invocations > 1)
max_gs_prims = 127 / gs_num_invocations;
else
max_gs_prims = 255;
/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
* Make sure we don't go over the maximum value.
*/
if (gs_info->gs.vertices_out > 0) {
max_gs_prims =
MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
}
assert(max_gs_prims > 0);
/* If the primitive has adjacency, halve the number of vertices
* that will be reused in multiple primitives.
*/
min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
/* Compute ESGS LDS size based on the worst case number of ES vertices
* needed to create the target number of GS prims per subgroup.
*/
esgs_lds_size = esgs_itemsize * worst_case_es_verts;
/* If total LDS usage is too big, refactor partitions based on ratio
* of ESGS item sizes.
*/
if (esgs_lds_size > max_lds_size) {
/* Our target GS Prims Per Subgroup was too large. Calculate
* the maximum number of GS Prims Per Subgroup that will fit
* into LDS, capped by the maximum that the hardware can support.
*/
gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
assert(gs_prims > 0);
worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
esgs_lds_size = esgs_itemsize * worst_case_es_verts;
assert(esgs_lds_size <= max_lds_size);
}
/* Now calculate remaining ESGS information. */
if (esgs_lds_size)
es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
else
es_verts = max_es_verts;
/* Vertices for adjacency primitives are not always reused, so restore
* it for ES_VERTS_PER_SUBGRP.
*/
min_es_verts = gs_info->gs.vertices_in;
/* For normal primitives, the VGT only checks if they are past the ES
* verts per subgroup after allocating a full GS primitive and if they
* are, kick off a new subgroup. But if those additional ES verts are
* unique (e.g. not reused) we need to make sure there is enough LDS
* space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
*/
es_verts -= min_es_verts - 1;
const uint32_t es_verts_per_subgroup = es_verts;
const uint32_t gs_prims_per_subgroup = gs_prims;
const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
out->lds_size = align(esgs_lds_size, 128) / 128;
out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
out->vgt_esgs_ring_itemsize = esgs_itemsize;
assert(max_prims_per_subgroup <= max_out_prims);
unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
es_verts_per_subgroup, gs_inst_prims_in_subgroup);
es_info->workgroup_size = workgroup_size;
gs_info->workgroup_size = workgroup_size;
}
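
To make the sizing arithmetic above concrete, here is a small standalone sketch that traces the same steps for assumed inputs: a triangle-list GS (3 vertices in, no adjacency), one invocation, 4 output vertices, and a 64-byte (16-dword) ESGS item. The MIN2 macro and all constants are local stand-ins, not RADV code.

#include <assert.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   const unsigned esgs_itemsize = 16;      /* dwords per ES vertex */
   const unsigned max_lds_size = 8 * 1024; /* dwords */
   const unsigned min_es_verts = 3;        /* triangle list, no adjacency */
   const unsigned gs_prims = MIN2(64, 255);            /* ideal vs. max */
   const unsigned worst_case_es_verts = MIN2(min_es_verts * gs_prims, 255);
   const unsigned esgs_lds_size = esgs_itemsize * worst_case_es_verts;

   /* 16 * 192 = 3072 dwords fits in 8K, so no repartitioning is needed. */
   assert(esgs_lds_size <= max_lds_size);

   /* Subtract min_es_verts - 1 exactly as the function above does. */
   const unsigned es_verts = MIN2(esgs_lds_size / esgs_itemsize, 255) - (min_es_verts - 1);

   /* Prints: ES_VERTS_PER_SUBGRP=190 GS_PRIMS_PER_SUBGRP=64 LDS=24 */
   printf("ES_VERTS_PER_SUBGRP=%u GS_PRIMS_PER_SUBGRP=%u LDS=%u\n",
          es_verts, gs_prims, (esgs_lds_size + 127) / 128);
   return 0;
}
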
static void
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
bool use_adjacency)
{
unsigned max_reuse = max_esverts - min_verts_per_prim;
if (use_adjacency)
max_reuse /= 2;
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
}
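
As a quick check of this clamp: with max_esverts = 128 and triangle input (min_verts_per_prim = 3, no adjacency), max_reuse is 125 and max_gsprims is capped at 126; with adjacency the reuse budget halves to 62, capping max_gsprims at 63.
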
static unsigned
radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
const struct radv_pipeline_stage *gs_stage)
{
if (gs_stage) {
return gs_stage->nir->info.gs.vertices_in;
}
if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
if (es_stage->nir->info.tess.point_mode)
return 1;
if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
return 2;
return 3;
}
return 3;
}
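
For instance, a GS consuming triangles with adjacency reports 6 input vertices, while a tessellation evaluation shader running in isolines mode reports 2.
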
static void
gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint32_t oversub_pc_lines)
@@ -2038,245 +1902,6 @@ gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
}
static unsigned
radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
const struct radv_pipeline_stage *gs_stage)
{
if (gs_stage) {
return gs_stage->nir->info.gs.input_primitive;
}
if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
if (es_stage->nir->info.tess.point_mode)
return SHADER_PRIM_POINTS;
if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
return SHADER_PRIM_LINES;
return SHADER_PRIM_TRIANGLES;
}
return SHADER_PRIM_TRIANGLES;
}
static void
gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
struct radv_pipeline_stage *gs_stage)
{
const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL;
struct radv_shader_info *es_info = &es_stage->info;
const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage);
const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1;
struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info;
const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
/* All these are in dwords: */
/* We can't allow using the whole LDS, because GS waves compete with
* other shader stages for LDS space.
*
* TODO: We should really take the shader's internal LDS use into
* account. The linker will fail if the size is greater than
* 8K dwords.
*/
const unsigned max_lds_size = 8 * 1024 - 768;
const unsigned target_lds_size = max_lds_size;
unsigned esvert_lds_size = 0;
unsigned gsprim_lds_size = 0;
/* All these are per subgroup: */
const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24;
bool max_vert_out_per_gs_instance = false;
unsigned max_esverts_base = 128;
unsigned max_gsprims_base = 128; /* default prim group size clamp */
/* Hardware has the following non-natural restrictions on the value
* of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
* the draw:
* - at most 252 for any line input primitive type
* - at most 251 for any quad input primitive type
* - at most 251 for triangle strips with adjacency (this happens to
* be the natural limit for triangle *lists* with adjacency)
*/
max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
if (gs_stage) {
unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
if (max_out_verts_per_gsprim <= 256) {
if (max_out_verts_per_gsprim) {
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
}
} else {
/* Use special multi-cycling mode in which each GS
* instance gets its own subgroup. Does not work with
* tessellation. */
max_vert_out_per_gs_instance = true;
max_gsprims_base = 1;
max_out_verts_per_gsprim = gs_info->gs.vertices_out;
}
esvert_lds_size = es_info->esgs_itemsize / 4;
gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
} else {
/* VS and TES. */
/* LDS space used to buffer streamout outputs per ES vertex. */
struct radv_streamout_info *so_info = &es_info->so;
if (so_info->num_outputs)
esvert_lds_size = 4 * so_info->num_outputs + 1;
/* GS stores Primitive IDs (one DWORD) into LDS at the address
* corresponding to the ES thread of the provoking vertex. All
* ES threads load and export PrimitiveID for their thread.
*/
if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id)
esvert_lds_size = MAX2(esvert_lds_size, 1);
}
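/* Illustrative numbers (assumed, not from this commit): a GS with
 * vertices_out = 4, one invocation, a 64-byte esgs_itemsize and a 16-byte
 * gsvs_vertex_size gives max_gsprims_base = MIN2(128, 256 / 4) = 64,
 * esvert_lds_size = 64 / 4 = 16 dwords, and gsprim_lds_size =
 * (16 / 4 + 1) * 4 = 20 dwords. */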
unsigned max_gsprims = max_gsprims_base;
unsigned max_esverts = max_esverts_base;
if (esvert_lds_size)
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
if (gsprim_lds_size)
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
if (esvert_lds_size || gsprim_lds_size) {
/* Now that we have a rough proportionality between esverts
* and gsprims based on the primitive type, scale both of them
* down simultaneously based on required LDS space.
*
* We could be smarter about this if we knew how much vertex
* reuse to expect.
*/
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
if (lds_total > target_lds_size) {
max_esverts = max_esverts * target_lds_size / lds_total;
max_gsprims = max_gsprims * target_lds_size / lds_total;
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
}
}
/* Round up towards full wave sizes for better ALU utilization. */
if (!max_vert_out_per_gs_instance) {
unsigned orig_max_esverts;
unsigned orig_max_gsprims;
unsigned wavesize;
if (gs_stage) {
wavesize = gs_info->wave_size;
} else {
wavesize = es_info->wave_size;
}
do {
orig_max_esverts = max_esverts;
orig_max_gsprims = max_gsprims;
max_esverts = align(max_esverts, wavesize);
max_esverts = MIN2(max_esverts, max_esverts_base);
if (esvert_lds_size)
max_esverts =
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
/* Hardware restriction: minimum value of max_esverts */
if (gfx_level == GFX10)
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
else
max_esverts = MAX2(max_esverts, min_esverts);
max_gsprims = align(max_gsprims, wavesize);
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
if (gsprim_lds_size) {
/* Don't count unusable vertices to the LDS
* size. Those are vertices above the maximum
* number of vertices that can occur in the
* workgroup, which is e.g. max_gsprims * 3
* for triangles.
*/
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
max_gsprims = MIN2(max_gsprims,
(max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
}
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
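/* Example convergence (assumed Wave64, triangle input): max_esverts = 100
 * rounds up to 128, is re-clamped against max_esverts_base and the LDS
 * budget, max_gsprims is aligned and clamped the same way, and the loop
 * exits once neither value changes between iterations. */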
/* Verify the restriction. */
if (gfx_level == GFX10)
assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
else
assert(max_esverts >= min_esverts);
} else {
/* Hardware restriction: minimum value of max_esverts */
if (gfx_level == GFX10)
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
else
max_esverts = MAX2(max_esverts, min_esverts);
}
unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
: gs_stage
? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
: max_esverts;
assert(max_out_vertices <= 256);
unsigned prim_amp_factor = 1;
if (gs_stage) {
/* Number of output primitives per GS input primitive after
* GS instancing. */
prim_amp_factor = gs_info->gs.vertices_out;
}
/* On Gfx10, the GE only checks against the maximum number of ES verts
* after allocating a full GS primitive. So we need to ensure that
* whenever this check passes, there is enough space for a full
* primitive without vertex reuse.
*/
if (gfx_level == GFX10)
out->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
else
out->hw_max_esverts = max_esverts;
out->max_gsprims = max_gsprims;
out->max_out_verts = max_out_vertices;
out->prim_amp_factor = prim_amp_factor;
out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
out->ngg_emit_size = max_gsprims * gsprim_lds_size;
out->enable_vertex_grouping = true;
/* Don't count unusable vertices. */
out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
if (gs_stage) {
out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
} else {
out->vgt_esgs_ring_itemsize = 1;
}
assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
unsigned workgroup_size =
ac_compute_ngg_workgroup_size(
max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
if (gs_stage) {
gs_info->workgroup_size = workgroup_size;
}
es_info->workgroup_size = workgroup_size;
}
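
A quick sanity check of the GFX10 adjustment above, with illustrative values not taken from this commit: for max_esverts = 128 and triangle input (max_verts_per_prim = 3), hw_max_esverts is programmed as 128 - 3 + 1 = 126 so that a full primitive still fits after the GE's late check, while GFX10.3+ can program 128 directly.
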
static void
radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs)
{
@@ -3328,7 +2953,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
&stages[i].info);
}
-   radv_nir_shader_info_link(device, pipeline_key, stages, last_vgt_api_stage);
+   radv_nir_shader_info_link(device, pipeline_key, stages, pipeline_has_ngg, last_vgt_api_stage);
if (stages[MESA_SHADER_TESS_CTRL].nir) {
for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) {
@@ -3359,21 +2984,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX);
}
-   if (pipeline_has_ngg) {
-      if (last_vgt_api_stage != MESA_SHADER_MESH) {
-         struct radv_pipeline_stage *es_stage =
-            stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-         struct radv_pipeline_stage *gs_stage =
-            stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL;
-         gfx10_get_ngg_info(device, es_stage, gs_stage);
-      }
-   } else if (stages[MESA_SHADER_GEOMETRY].nir) {
-      struct radv_pipeline_stage *es_stage =
-         stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-      gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]);
-   } else {
+   if (!pipeline_has_ngg && !stages[MESA_SHADER_GEOMETRY].nir) {
gl_shader_stage hw_vs_api_stage =
stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size;