radv/gfx10: implement NGG support (VS only)
This needs to be cleaned up a bit, and it probably contains missing
stuff and/or bugs. This doesn't fix the "half of the triangles" issue.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
committed by Bas Nieuwenhuizen
parent 9e37609d0b
commit ee21bd7440
@@ -70,6 +70,13 @@ struct radv_shader_context {
 	LLVMValueRef tes_u;
 	LLVMValueRef tes_v;
 
 	/* HW GS */
+	/* On gfx10:
+	 *  - bits 0..10: ordered_wave_id
+	 *  - bits 12..20: number of vertices in group
+	 *  - bits 22..30: number of primitives in group
+	 */
+	LLVMValueRef gs_tg_info;
 	LLVMValueRef gs2vs_offset;
 	LLVMValueRef gs_wave_id;
 	LLVMValueRef gs_vtx_offset[6];
@@ -823,11 +830,18 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
 	if (ctx->options->key.vs.out.as_ls) {
 		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
 		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
 		add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
 	} else {
-		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
-		add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
-		add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+		if (ctx->ac.chip_class >= GFX10) {
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+		} else {
+			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+		}
 	}
 }
 
@@ -969,6 +983,12 @@ static void set_llvm_calling_convention(LLVMValueRef func,
 	LLVMSetFunctionCallConv(func, calling_conv);
 }
 
+/* Returns whether the stage is a stage that can be directly before the GS */
+static bool is_pre_gs_stage(gl_shader_stage stage)
+{
+	return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
+}
+
 static void create_function(struct radv_shader_context *ctx,
 			    gl_shader_stage stage,
 			    bool has_previous_stage,
@@ -987,6 +1007,15 @@ static void create_function(struct radv_shader_context *ctx,
 				   &ctx->ring_offsets);
 	}
 
+	if (ctx->ac.chip_class >= GFX10) {
+		if (stage == MESA_SHADER_VERTEX && ctx->options->key.vs.out.as_ngg) {
+			/* On GFX10, VS is merged into GS for NGG. */
+			stage = MESA_SHADER_GEOMETRY;
+			has_previous_stage = true;
+			previous_stage = MESA_SHADER_VERTEX;
+		}
+	}
+
 	switch (stage) {
 	case MESA_SHADER_COMPUTE:
 		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
@@ -1101,8 +1130,14 @@ static void create_function(struct radv_shader_context *ctx,
 	case MESA_SHADER_GEOMETRY:
 		if (has_previous_stage) {
 			// First 6 system regs
-			add_arg(&args, ARG_SGPR, ctx->ac.i32,
-				&ctx->gs2vs_offset);
+			if (ctx->options->key.vs.out.as_ngg) {
+				add_arg(&args, ARG_SGPR, ctx->ac.i32,
+					&ctx->gs_tg_info);
+			} else {
+				add_arg(&args, ARG_SGPR, ctx->ac.i32,
+					&ctx->gs2vs_offset);
+			}
+
 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
 				&ctx->merged_wave_info);
 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
@@ -3194,6 +3229,168 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
 	}
 }
 
+static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
+{
+	return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
+}
+
+static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
+{
+	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+			    LLVMConstInt(ctx->ac.i32, 12, false),
+			    LLVMConstInt(ctx->ac.i32, 9, false),
+			    false);
+}
+
+static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
+{
+	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+			    LLVMConstInt(ctx->ac.i32, 22, false),
+			    LLVMConstInt(ctx->ac.i32, 9, false),
+			    false);
+}
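For reference, the two extractions above mirror the gs_tg_info bit layout documented in radv_shader_context earlier in this commit. A plain-C sketch of the same unpacking (illustrative only; the helper names are made up, and the shader performs this at runtime with ac_build_bfe):

#include <stdint.h>

/* Illustrative only: mirrors ngg_get_vtx_cnt()/ngg_get_prim_cnt() above
 * and the gs_tg_info layout documented in radv_shader_context. */
static unsigned tg_info_ordered_wave_id(uint32_t gs_tg_info)
{
	return gs_tg_info & 0x7ff;           /* bits 0..10, 11 bits */
}

static unsigned tg_info_vtx_cnt(uint32_t gs_tg_info)
{
	return (gs_tg_info >> 12) & 0x1ff;   /* bits 12..20, 9 bits */
}

static unsigned tg_info_prim_cnt(uint32_t gs_tg_info)
{
	return (gs_tg_info >> 22) & 0x1ff;   /* bits 22..30, 9 bits */
}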
+
+/* Send GS Alloc Req message from the first wave of the group to SPI.
+ * Message payload is:
+ *  - bits 0..10: vertices in group
+ *  - bits 12..22: primitives in group
+ */
+static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
+				       LLVMValueRef vtx_cnt,
+				       LLVMValueRef prim_cnt)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5020);
+
+	tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false), "");
+	tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
+	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
+
+	ac_build_endif(&ctx->ac, 5020);
+}
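The payload built above can be sanity-checked on the host. A minimal sketch of the same packing (illustrative only; the helper name is made up):

#include <assert.h>
#include <stdint.h>

/* Illustrative only: the GS_ALLOC_REQ payload assembled above is
 * (prim_cnt << 12) | vtx_cnt, sent from wave 0 of the threadgroup. */
static uint32_t gs_alloc_req_payload(uint32_t vtx_cnt, uint32_t prim_cnt)
{
	assert(vtx_cnt < (1u << 11) && prim_cnt < (1u << 11)); /* 11-bit fields */
	return (prim_cnt << 12) | vtx_cnt;
}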
+
+struct ngg_prim {
+	unsigned num_vertices;
+	LLVMValueRef isnull;
+	LLVMValueRef index[3];
+	LLVMValueRef edgeflag[3];
+};
+
+static void build_export_prim(struct radv_shader_context *ctx,
+			      const struct ngg_prim *prim)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	struct ac_export_args args;
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
+	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
+
+	for (unsigned i = 0; i < prim->num_vertices; ++i) {
+		tmp = LLVMBuildShl(builder, prim->index[i],
+				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
+		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
+		tmp = LLVMBuildShl(builder, tmp,
+				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
+		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+	}
+
+	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
+	args.out[1] = LLVMGetUndef(ctx->ac.f32);
+	args.out[2] = LLVMGetUndef(ctx->ac.f32);
+	args.out[3] = LLVMGetUndef(ctx->ac.f32);
+
+	args.target = V_008DFC_SQ_EXP_PRIM;
+	args.enabled_channels = 1;
+	args.done = true;
+	args.valid_mask = false;
+	args.compr = false;
+
+	ac_build_export(&ctx->ac, &args);
+}
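A host-side sketch of the dword that build_export_prim assembles with LLVM IR (illustrative only; the helper name is made up, and it matches the index-buffer format documented in handle_ngg_outputs_post below):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: pack one NGG primitive for the prim export.
 * Each vertex takes 10 bits: a 9-bit index plus one edge flag bit;
 * bit 31 marks a null (skipped) primitive. */
static uint32_t pack_ngg_prim(bool isnull, const uint32_t index[3],
			      const bool edgeflag[3])
{
	uint32_t out = (uint32_t)isnull << 31;
	for (unsigned i = 0; i < 3; ++i) {
		out |= (index[i] & 0x1ff) << (10 * i);        /* bits 10i..10i+8 */
		out |= (uint32_t)edgeflag[i] << (10 * i + 9); /* bit 10i+9 */
	}
	return out;
}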
+
+static void
+handle_ngg_outputs_post(struct radv_shader_context *ctx)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	struct ac_build_if_state if_state;
+	unsigned num_vertices = 3;
+	LLVMValueRef tmp;
+
+	assert(ctx->stage == MESA_SHADER_VERTEX && !ctx->is_gs_copy_shader);
+
+	LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
+	LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
+	LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
+						  ac_get_thread_id(&ctx->ac), prims_in_wave, "");
+	LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
+						  ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
+	LLVMValueRef vtxindex[] = {
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
+	};
+
+	/* TODO: streamout */
+
+	/* TODO: VS primitive ID */
+	if (ctx->options->key.vs.out.export_prim_id)
+		assert(0);
+
+	/* TODO: primitive culling */
+
+	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+
+	/* TODO: streamout queries */
+	/* Export primitive data to the index buffer. Format is:
+	 *  - bits 0..8: index 0
+	 *  - bit 9: edge flag 0
+	 *  - bits 10..18: index 1
+	 *  - bit 19: edge flag 1
+	 *  - bits 20..28: index 2
+	 *  - bit 29: edge flag 2
+	 *  - bit 31: null primitive (skip)
+	 *
+	 * For the first version, we will always build up all three indices
+	 * independent of the primitive type. The additional garbage data
+	 * shouldn't hurt.
+	 *
+	 * TODO: culling depends on the primitive type, so can have some
+	 * interaction here.
+	 */
+	ac_nir_build_if(&if_state, ctx, is_gs_thread);
+	{
+		struct ngg_prim prim = {};
+
+		prim.num_vertices = num_vertices;
+		prim.isnull = ctx->ac.i1false;
+		memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
+
+		for (unsigned i = 0; i < num_vertices; ++i) {
+			tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
+					    LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
+			prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+		}
+
+		build_export_prim(ctx, &prim);
+	}
+	ac_nir_build_endif(&if_state);
+
+	/* Export per-vertex data (positions and parameters). */
+	ac_nir_build_if(&if_state, ctx, is_es_thread);
+	{
+		handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
+				       ctx->options->key.vs.out.export_layer_id,
+				       ctx->options->key.vs.out.export_clip_dists,
+				       &ctx->shader_info->vs.outinfo);
+	}
+	ac_nir_build_endif(&if_state);
+}
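The is_es_thread/is_gs_thread predicates above rely on the merged_wave_info packing that the pre-GS merged-shader path already uses. A plain-C sketch inferred from the ac_unpack_param offsets in this function and in get_wave_id_in_tg (illustrative only; the helper names are made up):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: per the unpack offsets above, merged_wave_info
 * carries the ES/vertex count in bits 0..7, the GS/primitive count in
 * bits 8..15, and the wave id within the threadgroup in bits 24..27. */
static bool lane_is_es_thread(uint32_t merged_wave_info, unsigned lane)
{
	return lane < ((merged_wave_info >> 0) & 0xff);  /* vtx_in_wave */
}

static bool lane_is_gs_thread(uint32_t merged_wave_info, unsigned lane)
{
	return lane < ((merged_wave_info >> 8) & 0xff);  /* prims_in_wave */
}

static unsigned wave_id_in_tg(uint32_t merged_wave_info)
{
	return (merged_wave_info >> 24) & 0xf;
}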
 
 static void
 write_tess_factors(struct radv_shader_context *ctx)
 {
@@ -3452,6 +3649,8 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
 			handle_ls_outputs_post(ctx);
 		else if (ctx->options->key.vs.out.as_es)
 			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
+		else if (ctx->options->key.vs.out.as_ngg)
+			handle_ngg_outputs_post(ctx);
 		else
 			handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
 					       ctx->options->key.vs.out.export_layer_id,
@@ -3703,6 +3902,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 						    shaders[i]));
 	}
 
+	if (ctx.ac.chip_class >= GFX10) {
+		if (shaders[0]->info.stage == MESA_SHADER_VERTEX &&
+		    options->key.vs.out.as_ngg) {
+			ctx.max_workgroup_size = 128;
+		}
+	}
+
 	create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
 			shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);
 
@@ -3722,7 +3928,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 	 */
 	ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
 
-	if (shader_count >= 2)
+	bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs.out.as_ngg;
+	if (shader_count >= 2 || is_ngg)
 		ac_init_exec_full_mask(&ctx.ac);
 
 	if ((ctx.ac.family == CHIP_VEGA10 ||
@@ -3788,7 +3995,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 	ac_setup_rings(&ctx);
 
 	LLVMBasicBlockRef merge_block;
-	if (shader_count >= 2) {
+	if (shader_count >= 2 || is_ngg) {
 		LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 		LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
 		merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
@@ -3811,7 +4018,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
 		ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);
 
-		if (shader_count >= 2) {
+		if (shader_count >= 2 || is_ngg) {
 			LLVMBuildBr(ctx.ac.builder, merge_block);
 			LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
 		}
@@ -3955,6 +4162,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
 		shader_info->vs.as_es = options->key.vs.out.as_es;
 		shader_info->vs.as_ls = options->key.vs.out.as_ls;
 		shader_info->vs.export_prim_id = options->key.vs.out.export_prim_id;
+		shader_info->is_ngg = options->key.vs.out.as_ngg;
 		break;
 	default:
 		break;
@@ -96,6 +96,30 @@ struct radv_gs_state {
 	uint32_t lds_size;
 };
 
+struct radv_ngg_state {
+	uint16_t ngg_emit_size; /* in dwords */
+	uint32_t hw_max_esverts;
+	uint32_t max_gsprims;
+	uint32_t max_out_verts;
+	uint32_t prim_amp_factor;
+	uint32_t vgt_esgs_ring_itemsize;
+	bool max_vert_out_per_gs_instance;
+};
+
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
+{
+	struct radv_shader_variant *variant = NULL;
+	if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+		variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
+	else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
+		variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+	else if (pipeline->shaders[MESA_SHADER_VERTEX])
+		variant = pipeline->shaders[MESA_SHADER_VERTEX];
+	else
+		return false;
+	return variant->info.is_ngg;
+}
+
 static void
 radv_pipeline_destroy(struct radv_device *device,
 		      struct radv_pipeline *pipeline,
@@ -1583,6 +1607,203 @@ calculate_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
 	return gs;
 }
 
+static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
+				     unsigned min_verts_per_prim, bool use_adjacency)
+{
+	unsigned max_reuse = max_esverts - min_verts_per_prim;
+	if (use_adjacency)
+		max_reuse /= 2;
+	*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
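A usage sketch for the clamp above (illustrative numbers, not part of the patch; it assumes the function as added here):

#include <assert.h>

/* With 96 ES vertices and triangle lists (min_verts_per_prim = 3),
 * each primitive after the first can reuse earlier vertices, so at
 * most 1 + (96 - 3) primitives fit; adjacency halves the reuse. */
unsigned max_gsprims = 128;
clamp_gsprims_to_esverts(&max_gsprims, 96, 3, false);
assert(max_gsprims == 94);  /* 1 + (96 - 3) */

max_gsprims = 128;
clamp_gsprims_to_esverts(&max_gsprims, 96, 3, true);
assert(max_gsprims == 47);  /* 1 + (96 - 3) / 2 */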
+
+static struct radv_ngg_state
+calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+		   struct radv_pipeline *pipeline)
+{
+	struct radv_ngg_state ngg = {0};
+	struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+	struct radv_es_output_info *es_info =
+		radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info;
+	unsigned gs_type = MESA_SHADER_VERTEX;
+	unsigned max_verts_per_prim = 3; // triangles
+	unsigned min_verts_per_prim =
+		gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+	unsigned gs_num_invocations = 1;//MAX2(gs_info->gs.invocations, 1);
+	bool uses_adjacency;
+	switch(pCreateInfo->pInputAssemblyState->topology) {
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+		uses_adjacency = true;
+		break;
+	default:
+		uses_adjacency = false;
+		break;
+	}
+
+	/* All these are in dwords: */
+	/* We can't allow using the whole LDS, because GS waves compete with
+	 * other shader stages for LDS space.
+	 *
+	 * Streamout can increase the ESGS buffer size later on, so be more
+	 * conservative with streamout and use 4K dwords. This may be suboptimal.
+	 *
+	 * Otherwise, use the limit of 7K dwords. The reason is that we need
+	 * to leave some headroom for the max_esverts increase at the end.
+	 *
+	 * TODO: We should really take the shader's internal LDS use into
+	 *       account. The linker will fail if the size is greater than
+	 *       8K dwords.
+	 */
+	const unsigned max_lds_size = (0 /*gs_info->info.so.num_outputs*/ ? 4 : 7) * 1024 - 128;
+	const unsigned target_lds_size = max_lds_size;
+	unsigned esvert_lds_size = 0;
+	unsigned gsprim_lds_size = 0;
+
+	/* All these are per subgroup: */
+	bool max_vert_out_per_gs_instance = false;
+	unsigned max_esverts_base = 256;
+	unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+	/* Hardware has the following non-natural restrictions on the value
+	 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of the draw:
+	 *  - at most 252 for any line input primitive type
+	 *  - at most 251 for any quad input primitive type
+	 *  - at most 251 for triangle strips with adjacency (this happens to
+	 *    be the natural limit for triangle *lists* with adjacency)
+	 */
+	max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+	if (gs_type == MESA_SHADER_GEOMETRY) {
+		unsigned max_out_verts_per_gsprim =
+			gs_info->gs.vertices_out * gs_num_invocations;
+
+		if (max_out_verts_per_gsprim <= 256) {
+			if (max_out_verts_per_gsprim) {
+				max_gsprims_base = MIN2(max_gsprims_base,
+							256 / max_out_verts_per_gsprim);
+			}
+		} else {
+			/* Use special multi-cycling mode in which each GS
+			 * instance gets its own subgroup. Does not work with
+			 * tessellation. */
+			max_vert_out_per_gs_instance = true;
+			max_gsprims_base = 1;
+			max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+		}
+
+		esvert_lds_size = es_info->esgs_itemsize / 4;
+		gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+	} else {
+		/* TODO: This needs to be adjusted once LDS use for compaction
+		 * after culling is implemented. */
+		/*
+		if (es_info->info.so.num_outputs)
+			esvert_lds_size = 4 * es_info->info.so.num_outputs + 1;
+		*/
+	}
+
+	unsigned max_gsprims = max_gsprims_base;
+	unsigned max_esverts = max_esverts_base;
+
+	if (esvert_lds_size)
+		max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+	if (gsprim_lds_size)
+		max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+	max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+	clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+	assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+	if (esvert_lds_size || gsprim_lds_size) {
+		/* Now that we have a rough proportionality between esverts
+		 * and gsprims based on the primitive type, scale both of them
+		 * down simultaneously based on required LDS space.
+		 *
+		 * We could be smarter about this if we knew how much vertex
+		 * reuse to expect.
+		 */
+		unsigned lds_total = max_esverts * esvert_lds_size +
+				     max_gsprims * gsprim_lds_size;
+		if (lds_total > target_lds_size) {
+			max_esverts = max_esverts * target_lds_size / lds_total;
+			max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+						 min_verts_per_prim, uses_adjacency);
+			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+		}
+	}
+
+	/* Round up towards full wave sizes for better ALU utilization. */
+	if (!max_vert_out_per_gs_instance) {
+		const unsigned wavesize = 64;
+		unsigned orig_max_esverts;
+		unsigned orig_max_gsprims;
+		do {
+			orig_max_esverts = max_esverts;
+			orig_max_gsprims = max_gsprims;
+
+			max_esverts = align(max_esverts, wavesize);
+			max_esverts = MIN2(max_esverts, max_esverts_base);
+			if (esvert_lds_size)
+				max_esverts = MIN2(max_esverts,
+						   (max_lds_size - max_gsprims * gsprim_lds_size) /
+						   esvert_lds_size);
+			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+			max_gsprims = align(max_gsprims, wavesize);
+			max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+			if (gsprim_lds_size)
+				max_gsprims = MIN2(max_gsprims,
+						   (max_lds_size - max_esverts * esvert_lds_size) /
+						   gsprim_lds_size);
+			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+						 min_verts_per_prim, uses_adjacency);
+			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+		} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+	}
+
+	/* Hardware restriction: minimum value of max_esverts */
+	max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+	unsigned max_out_vertices =
+		max_vert_out_per_gs_instance ? gs_info->gs.vertices_out :
+		gs_type == MESA_SHADER_GEOMETRY ?
+		max_gsprims * gs_num_invocations * gs_info->gs.vertices_out :
+		max_esverts;
+	assert(max_out_vertices <= 256);
+
+	unsigned prim_amp_factor = 1;
+	if (gs_type == MESA_SHADER_GEOMETRY) {
+		/* Number of output primitives per GS input primitive after
+		 * GS instancing. */
+		prim_amp_factor = gs_info->gs.vertices_out;
+	}
+
+	/* The GE only checks against the maximum number of ES verts after
+	 * allocating a full GS primitive. So we need to ensure that whenever
+	 * this check passes, there is enough space for a full primitive without
+	 * vertex reuse.
+	 */
+	ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+	ngg.max_gsprims = max_gsprims;
+	ngg.max_out_verts = max_out_vertices;
+	ngg.prim_amp_factor = prim_amp_factor;
+	ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+	ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+	ngg.vgt_esgs_ring_itemsize = 1;
+
+	pipeline->graphics.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+
+	assert(ngg.hw_max_esverts >= 24); /* HW limitation */
+
+	return ngg;
+}
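To make the sizing math above concrete, a worked example for the VS-only path with no streamout (illustrative, derived from the constants above; not part of the patch):

#include <assert.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	/* VS-only NGG: esvert_lds_size and gsprim_lds_size are 0, so no
	 * LDS-based scaling applies and only the base limits matter. */
	unsigned max_verts_per_prim = 3; /* triangles */
	unsigned max_esverts = MIN2(256, 251 + max_verts_per_prim - 1);    /* 253 */
	unsigned max_gsprims = 128;      /* default prim group size clamp */

	max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); /* 253 */
	max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);          /* 253 */

	/* The GE checks the ES-vert limit only after allocating a full
	 * primitive, so leave room for one primitive without reuse. */
	unsigned hw_max_esverts = max_esverts - max_verts_per_prim + 1;
	assert(hw_max_esverts == 251 && hw_max_esverts >= 24);
	return 0;
}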
 
 static void
 calculate_gs_ring_sizes(struct radv_pipeline *pipeline, const struct radv_gs_state *gs)
 {
@@ -2000,7 +2221,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 }
 
 static void
-radv_fill_shader_keys(struct radv_shader_variant_key *keys,
+radv_fill_shader_keys(struct radv_device *device,
+		      struct radv_shader_variant_key *keys,
 		      const struct radv_pipeline_key *key,
 		      nir_shader **nir)
 {
@@ -2031,6 +2253,10 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
 		keys[MESA_SHADER_VERTEX].vs.out.as_es = true;
 	}
 
+	if (device->physical_device->rad_info.chip_class >= GFX10) {
+		keys[MESA_SHADER_VERTEX].vs.out.as_ngg = true;
+	}
+
 	for(int i = 0; i < MESA_SHADER_STAGES; ++i)
 		keys[i].has_multiview_view_index = key->has_multiview_view_index;
 
@@ -2221,7 +2447,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 			nir_print_shader(nir[i], stderr);
 	}
 
-	radv_fill_shader_keys(keys, key, nir);
+	radv_fill_shader_keys(device, keys, key, nir);
 
 	if (nir[MESA_SHADER_FRAGMENT]) {
 		if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
@@ -2356,6 +2582,8 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 {
 	bool has_gs = radv_pipeline_has_gs(pipeline);
 	bool has_tess = radv_pipeline_has_tess(pipeline);
+	bool has_ngg = radv_pipeline_has_ngg(pipeline);
 
 	switch (stage) {
 	case MESA_SHADER_FRAGMENT:
 		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
@@ -2379,6 +2607,9 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 			}
 		}
 
+		if (has_ngg)
+			return R_00B230_SPI_SHADER_USER_DATA_GS_0;
+
 		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
 	case MESA_SHADER_GEOMETRY:
 		return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
@@ -2968,8 +3199,7 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 				   struct radv_pipeline *pipeline)
 {
 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
-	uint32_t vgt_primitiveid_en = false;
+	unsigned vgt_primitiveid_en = 0;
 	uint32_t vgt_gs_mode = 0;
 
 	if (radv_pipeline_has_gs(pipeline)) {
@@ -2978,9 +3208,17 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 
 		vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
 					     pipeline->device->physical_device->rad_info.chip_class);
+	} else if (radv_pipeline_has_ngg(pipeline)) {
+		const struct radv_shader_variant *vs =
+			pipeline->shaders[MESA_SHADER_VERTEX];
+		bool enable_prim_id =
+			outinfo->export_prim_id || vs->info.info.uses_prim_id;
+
+		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(enable_prim_id) |
+				      S_028A84_NGG_DISABLE_PROVOK_REUSE(enable_prim_id);
 	} else if (outinfo->export_prim_id) {
 		vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
-		vgt_primitiveid_en = true;
+		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
 	}
 
 	radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
@@ -3084,6 +3322,105 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
 	radeon_emit(cs, rsrc2);
 }
 
+static void
+radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs,
+			      struct radeon_cmdbuf *cs,
+			      struct radv_pipeline *pipeline,
+			      struct radv_shader_variant *shader,
+			      const struct radv_ngg_state *ngg_state)
+{
+	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
+
+	radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
+	radeon_emit(cs, va >> 8);
+	radeon_emit(cs, va >> 40);
+	radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+	radeon_emit(cs, shader->config.rsrc1);
+	radeon_emit(cs, shader->config.rsrc2);
+
+	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
+	unsigned clip_dist_mask, cull_dist_mask, total_mask;
+	clip_dist_mask = outinfo->clip_dist_mask;
+	cull_dist_mask = outinfo->cull_dist_mask;
+	total_mask = clip_dist_mask | cull_dist_mask;
+	bool misc_vec_ena = outinfo->writes_pointsize ||
+			    outinfo->writes_layer ||
+			    outinfo->writes_viewport_index;
+	bool break_wave_at_eoi = false;
+
+	radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
+			       S_0286C4_VS_EXPORT_COUNT(MAX2(1, outinfo->param_exports) - 1));
+	radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
+			       S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
+	radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
+			       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+			       S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE) |
+			       S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE) |
+			       S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE));
+
+	radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL,
+			       S_028818_VTX_W0_FMT(1) |
+			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+	radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
+			       S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
+			       S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
+			       S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
+			       S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+			       S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
+			       S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
+			       S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
+			       cull_dist_mask << 8 |
+			       clip_dist_mask);
+
+	/* TODO: Correctly set REUSE_OFF */
+	radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
+			       S_028AB4_REUSE_OFF(0));
+	radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+			       ngg_state->vgt_esgs_ring_itemsize);
+
+	/* NGG specific registers. */
+	struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
+	uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
+
+	radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+			       S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
+			       S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
+			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
+	radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+			       S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
+	radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
+			       S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
+			       S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
+	radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
+			       S_028B90_CNT(gs_num_invocations) |
+			       S_028B90_ENABLE(gs_num_invocations > 1) |
+			       S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
+
+	/* User edge flags are set by the pos exports. If user edge flags are
+	 * not used, we must use hw-generated edge flags and pass them via
+	 * the prim export to prevent drawing lines on internal edges of
+	 * decomposed primitives (such as quads) with polygon mode = lines.
+	 *
+	 * TODO: We should combine hw-generated edge flags with user edge
+	 *       flags in the shader.
+	 */
+	radeon_set_context_reg(ctx_cs, R_028838_PA_CL_NGG_CNTL,
+			       S_028838_INDEX_BUF_EDGE_FLAG_ENA(1));
+
+	radeon_set_context_reg(ctx_cs, R_03096C_GE_CNTL,
+			       S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
+			       S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts) |
+			       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
+}
+
 static void
 radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
@@ -3127,7 +3464,8 @@ static void
 radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
 				     struct radeon_cmdbuf *cs,
 				     struct radv_pipeline *pipeline,
-				     const struct radv_tessellation_state *tess)
+				     const struct radv_tessellation_state *tess,
+				     const struct radv_ngg_state *ngg)
 {
 	struct radv_shader_variant *vs;
 
@@ -3140,6 +3478,8 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
 		radv_pipeline_generate_hw_ls(cs, pipeline, vs, tess);
 	else if (vs->info.vs.as_es)
 		radv_pipeline_generate_hw_es(cs, pipeline, vs);
+	else if (vs->info.is_ngg)
+		radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs, ngg);
 	else
 		radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
 }
@@ -3468,13 +3808,20 @@ radv_compute_vgt_shader_stages_en(const struct radv_pipeline *pipeline)
 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
 				  S_028B54_GS_EN(1) |
 				  S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+		else if (radv_pipeline_has_ngg(pipeline))
+			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+				  S_028B54_PRIMGEN_EN(1);
 		else
 			stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
 
-	} else if (radv_pipeline_has_gs(pipeline))
+	} else if (radv_pipeline_has_gs(pipeline)) {
 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 			  S_028B54_GS_EN(1) |
 			  S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+	} else if (radv_pipeline_has_ngg(pipeline)) {
+		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+			  S_028B54_PRIMGEN_EN(1);
+	}
 
 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
 		stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
@@ -3555,6 +3902,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 			   const struct radv_blend_state *blend,
 			   const struct radv_tessellation_state *tess,
 			   const struct radv_gs_state *gs,
+			   const struct radv_ngg_state *ngg,
 			   unsigned prim, unsigned gs_out)
 {
 	struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
@@ -3570,7 +3918,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 	radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
 	radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
 	radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
-	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess);
+	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess, ngg);
 	radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess);
 	radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs);
 	radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
@@ -3578,7 +3926,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 	radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
 	radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo);
 
-	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10)
+	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
 		gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess, gs);
 
 	radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
@@ -3848,8 +4196,12 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 		}
 	}
 
+	struct radv_ngg_state ngg = {0};
 	struct radv_gs_state gs = {0};
-	if (radv_pipeline_has_gs(pipeline)) {
+
+	if (radv_pipeline_has_ngg(pipeline)) {
+		ngg = calculate_ngg_info(pCreateInfo, pipeline);
+	} else if (radv_pipeline_has_gs(pipeline)) {
 		gs = calculate_gs_info(pCreateInfo, pipeline);
 		calculate_gs_ring_sizes(pipeline, &gs);
 	}
@@ -3885,7 +4237,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
 
 	result = radv_pipeline_scratch_init(device, pipeline);
-	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out);
+	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, &ngg, prim, gs_out);
 
 	return result;
 }
@@ -1510,6 +1510,8 @@ static inline bool radv_pipeline_has_tess(const struct radv_pipeline *pipeline)
 	return pipeline->shaders[MESA_SHADER_TESS_CTRL] ? true : false;
 }
 
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline);
+
 struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
 						 gl_shader_stage stage,
 						 int idx);
@@ -583,7 +583,9 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
 		break;
 	case MESA_SHADER_VERTEX:
-		if (info->vs.as_ls) {
+		if (info->is_ngg) {
+			config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+		} else if (info->vs.as_ls) {
 			assert(pdevice->rad_info.chip_class <= GFX8);
 			/* We need at least 2 components for LS.
 			 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
@@ -632,8 +634,19 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		break;
 	}
 
-	if (pdevice->rad_info.chip_class >= GFX9 &&
-	    stage == MESA_SHADER_GEOMETRY) {
+	if (pdevice->rad_info.chip_class >= GFX10 &&
+	    stage == MESA_SHADER_VERTEX) {
+		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
+
+		/* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
+		es_vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 0;
+		gs_vgpr_comp_cnt = 3;
+
+		config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+		config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+				     S_00B22C_LDS_SIZE(config_in->lds_size);
+	} else if (pdevice->rad_info.chip_class >= GFX9 &&
+		   stage == MESA_SHADER_GEOMETRY) {
 		unsigned es_type = info->gs.es_type;
 		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
 
@@ -65,6 +65,7 @@ enum {
 struct radv_vs_out_key {
 	uint32_t as_es:1;
 	uint32_t as_ls:1;
+	uint32_t as_ngg:1;
 	uint32_t export_prim_id:1;
 	uint32_t export_layer_id:1;
 	uint32_t export_clip_dists:1;
@@ -264,6 +265,7 @@ struct radv_shader_variant_info {
 	unsigned num_input_vgprs;
 	unsigned private_mem_vgprs;
 	bool need_indirect_descriptor_sets;
+	bool is_ngg;
 	struct {
 		struct {
 			struct radv_vs_output_info outinfo;
@@ -317,6 +317,17 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 	}
 
+	if (physical_device->rad_info.chip_class >= GFX10) {
+		/* Break up a pixel wave if it contains deallocs for more than
+		 * half the parameter cache.
+		 *
+		 * To avoid a deadlock where pixel waves aren't launched
+		 * because they're waiting for more pixels while the frontend
+		 * is stuck waiting for PC space, the maximum allowed value is
+		 * the size of the PC minus the largest possible allocation for
+		 * a single primitive shader subgroup.
+		 */
+		radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL,
+				       S_028C50_MAX_DEALLOCS_IN_WAVE(512));
+		radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+		radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
+				       physical_device->rad_info.pa_sc_tile_steering_override);