radv: Use new NGG NIR lowering for VS/TES when ACO is used.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10740>
This commit is contained in:
Timur Kristóf
2021-04-15 17:21:57 +02:00
committed by Marge Bot
parent 3d589b8b46
commit 9732881729
6 changed files with 90 additions and 144 deletions

View File

@@ -4396,6 +4396,8 @@ bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp d
return true;
}
static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos);
void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
if (ctx->stage == vertex_vs ||
@@ -4413,6 +4415,11 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
} else {
unreachable("Shader stage not implemented");
}
/* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we have to emit an exp here manually */
if (ctx->stage.hw == HWStage::NGG && (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
}
void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
@@ -8496,8 +8503,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
default:
if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
/* This is actually the same as gs_prim_id, but we call it differently when there is no SW GS. */
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vs_prim_id));
/* In case of NGG, the GS threads always have the primitive ID even if there is no SW GS. */
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
break;
}
unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
@@ -11330,126 +11337,6 @@ void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive
false /* compressed */, true/* done */, false /* valid mask */);
}
void ngg_nogs_export_primitives(isel_context *ctx)
{
/* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
* These must always come before VS exports.
*
* It is recommended to do these as early as possible. They can be at the beginning when
* there is no SW GS and the shader doesn't write edge flags.
*/
if_context ic;
Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
begin_divergent_if_then(ctx, &ic, is_gs_thread);
Builder bld(ctx->program, ctx->block);
constexpr unsigned max_vertices_per_primitive = 3;
unsigned num_vertices_per_primitive = max_vertices_per_primitive;
assert(!ctx->stage.has(SWStage::GS));
if (ctx->stage == vertex_ngg) {
/* TODO: optimize for points & lines */
} else if (ctx->stage == tess_eval_ngg) {
if (ctx->shader->info.tess.point_mode)
num_vertices_per_primitive = 1;
else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
num_vertices_per_primitive = 2;
} else {
unreachable("Unsupported NGG non-GS shader stage");
}
Temp vtxindex[max_vertices_per_primitive];
if (!ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]), Operand(16u), Operand(16u));
vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
get_arg(ctx, ctx->args->ac.gs_vtx_offset[2]));
}
/* Export primitive data to the index buffer. */
ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
/* Export primitive ID. */
if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
/* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
unsigned provoking_vtx_in_prim = 0;
/* For provoking vertex last mode, use num_vtx_in_prim - 1. */
if (ctx->args->options->key.vs.provoking_vtx_last)
provoking_vtx_in_prim = ctx->args->options->key.vs.outprim;
Temp provoking_vtx_index = vtxindex[provoking_vtx_in_prim];
Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
}
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
}
void ngg_nogs_export_prim_id(isel_context *ctx)
{
assert(ctx->args->options->key.vs_common_out.export_prim_id);
Temp prim_id;
if (ctx->stage == vertex_ngg) {
/* Wait for GS threads to store primitive ID in LDS. */
Builder bld(ctx->program, ctx->block);
create_workgroup_barrier(bld);
/* Calculate LDS address where the GS threads stored the primitive ID. */
Temp thread_id_in_tg = thread_id_in_threadgroup(ctx);
Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
/* Load primitive ID from LDS. */
prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
} else if (ctx->stage == tess_eval_ngg) {
/* TES: Just use the patch ID as the primitive ID. */
prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
} else {
unreachable("unsupported NGG non-GS shader stage.");
}
ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
}
void ngg_nogs_prelude(isel_context *ctx)
{
ngg_emit_wave0_sendmsg_gs_alloc_req(ctx);
if (ctx->ngg_nogs_early_prim_export)
ngg_nogs_export_primitives(ctx);
}
void ngg_nogs_late_export_finale(isel_context *ctx)
{
assert(!ctx->ngg_nogs_early_prim_export);
/* Export VS/TES primitives. */
ngg_nogs_export_primitives(ctx);
/* Export the primitive ID for VS - needs to read LDS written by GS threads. */
if (ctx->args->options->key.vs_common_out.export_prim_id && ctx->stage.has(SWStage::VS)) {
if_context ic;
Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
begin_divergent_if_then(ctx, &ic, is_es_thread);
ngg_nogs_export_prim_id(ctx);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
}
}
std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask)
{
/* Workgroup scan for NGG GS.
@@ -11866,7 +11753,6 @@ void select_program(Program *program,
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
if_context ic_merged_wave_info;
bool ngg_no_gs = ctx.stage.hw == HWStage::NGG && !ctx.stage.has(SWStage::GS);
bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
for (unsigned i = 0; i < shader_count; i++) {
@@ -11890,9 +11776,7 @@ void select_program(Program *program,
}
}
if (ngg_no_gs)
ngg_nogs_prelude(&ctx);
else if (!i && ngg_gs)
if (!i && ngg_gs)
ngg_gs_prelude(&ctx);
/* In a merged VS+TCS HS, the VS implementation can be completely empty. */
@@ -11903,7 +11787,7 @@ void select_program(Program *program,
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
ctx.stage == tess_eval_geometry_gs));
bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
if (i && ngg_gs) {
@@ -11943,10 +11827,6 @@ void select_program(Program *program,
if (ctx.stage.hw == HWStage::VS) {
create_vs_exports(&ctx);
} else if (ngg_no_gs) {
create_vs_exports(&ctx);
if (ctx.args->options->key.vs_common_out.export_prim_id && (ctx.ngg_nogs_early_prim_export || ctx.stage.has(SWStage::TES)))
ngg_nogs_export_prim_id(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
Builder bld(ctx.program, ctx.block);
bld.barrier(aco_opcode::p_barrier,
@@ -11963,9 +11843,7 @@ void select_program(Program *program,
end_divergent_if(&ctx, &ic_merged_wave_info);
}
if (ngg_no_gs && !ctx.ngg_nogs_early_prim_export)
ngg_nogs_late_export_finale(&ctx);
else if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
ngg_gs_finale(&ctx);
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {

View File

@@ -93,7 +93,6 @@ struct isel_context {
Temp persp_centroid, linear_centroid;
/* GS inputs */
bool ngg_nogs_early_prim_export = false;
bool ngg_gs_early_alloc = false;
bool ngg_gs_known_vtxcnt[4] = {false, false, false, false};
Temp gs_wave_id;

View File

@@ -390,15 +390,11 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
/* TODO: NGG streamout */
if (ctx->stage.hw == HWStage::NGG)
assert(!ctx->args->shader_info->so.num_outputs);
/* TODO: check if the shader writes edge flags (not in Vulkan) */
ctx->ngg_nogs_early_prim_export = exec_list_is_singular(&nir_shader_get_entrypoint(nir)->body);
}
if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
/* We need to store the primitive IDs in LDS */
unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size;
ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->dev.lds_encoding_granule);
if (ctx->stage == vertex_ngg) {
ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
}
}
@@ -463,8 +459,11 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir)
/* TODO: NGG streamout */
if (ctx->stage.hw == HWStage::NGG)
assert(!ctx->args->shader_info->so.num_outputs);
}
ctx->ngg_nogs_early_prim_export = exec_list_is_singular(&nir_shader_get_entrypoint(nir)->body);
if (ctx->stage == tess_eval_ngg) {
ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
}
}

View File

@@ -3434,6 +3434,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
/* Lower I/O intrinsics to memory instructions. */
bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
bool lowered_ngg = radv_lower_ngg(device, nir[i], !!nir[MESA_SHADER_GEOMETRY], &infos[i], pipeline_key, &keys[i]);
/* optimize the lowered ALU operations */
bool more_algebraic = true;
@@ -3446,7 +3447,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
NIR_PASS(more_algebraic, nir[i], nir_opt_algebraic);
}
if (io_to_mem || i == MESA_SHADER_COMPUTE)
if (io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE)
NIR_PASS_V(nir[i], nir_opt_offsets);
/* Do late algebraic optimization to turn add(a,

View File

@@ -810,6 +810,70 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
return false;
}
bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has_gs,
struct radv_shader_info *info,
const struct radv_pipeline_key *pl_key,
struct radv_shader_variant_key *key)
{
/* TODO: support the LLVM backend with the NIR lowering */
if (radv_use_llvm_for_stage(device, nir->info.stage))
return false;
ac_nir_ngg_config out_conf = {0};
const struct gfx10_ngg_info *ngg_info = &info->ngg_info;
unsigned num_gs_invocations = (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance) ? 1 : info->gs.invocations;
unsigned max_workgroup_size = MAX4(ngg_info->hw_max_esverts, /* Invocations that process an input vertex */
ngg_info->max_out_verts, /* Invocations that export an output vertex */
ngg_info->max_gsprims * num_gs_invocations, /* Invocations that process an input primitive */
ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor /* Invocations that produce an output primitive */);
/* Maximum HW limit for NGG workgroups */
assert(max_workgroup_size <= 256);
if (nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL) {
if (has_gs || !key->vs_common_out.as_ngg)
return false;
unsigned num_vertices_per_prim = 3;
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
if (nir->info.tess.point_mode)
num_vertices_per_prim = 1;
else if (nir->info.tess.primitive_mode == GL_ISOLINES)
num_vertices_per_prim = 2;
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
/* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
num_vertices_per_prim = key->vs.outprim + 1;
}
out_conf =
ac_nir_lower_ngg_nogs(
nir,
ngg_info->hw_max_esverts,
num_vertices_per_prim,
max_workgroup_size,
info->wave_size,
false,
key->vs_common_out.as_ngg_passthrough,
key->vs_common_out.export_prim_id,
key->vs.provoking_vtx_last);
info->is_ngg_passthrough = out_conf.passthrough;
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
if (!key->vs_common_out.as_ngg)
return false;
/* TODO: lower NGG GS in NIR */
return false;
} else {
return false;
}
return true;
}
static void *
radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
{

View File

@@ -556,4 +556,9 @@ void radv_lower_io(struct radv_device *device, nir_shader *nir);
bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
struct radv_shader_info *info, const struct radv_pipeline_key *pl_key);
bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has_gs,
struct radv_shader_info *info,
const struct radv_pipeline_key *pl_key,
struct radv_shader_variant_key *key);
#endif