diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index a11976d23e3..4b93940831f 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -755,32 +755,34 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out assert(sel->info.stage == MESA_SHADER_VERTEX || (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es)); - LLVMValueRef position[4] = {}; + LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + unsigned pos_index = 0; + for (unsigned i = 0; i < info->num_outputs; i++) { + LLVMValueRef position[4]; + switch (info->output_semantic[i]) { case VARYING_SLOT_POS: + pos_index = i; for (unsigned j = 0; j < 4; j++) { position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], ""); } + + /* Store Position.W into LDS. */ + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, position[3]), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_w, 0))); + + /* Store Position.XY / W into LDS. */ + for (unsigned chan = 0; chan < 2; chan++) { + LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, val), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); + } break; } } - assert(position[0]); - - /* Store Position.XYZW into LDS. */ - LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - for (unsigned chan = 0; chan < 4; chan++) { - LLVMBuildStore( - builder, ac_to_integer(&ctx->ac, position[chan]), - ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); - } - /* Store Position.XY / W into LDS. */ - for (unsigned chan = 0; chan < 2; chan++) { - LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); - LLVMBuildStore( - builder, ac_to_integer(&ctx->ac, val), - ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); - } /* Store VertexID and InstanceID. ES threads will have to load them * from LDS after vertex compaction and use them instead of their own @@ -1001,12 +1003,20 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out { LLVMValueRef old_id = get_thread_id_in_tg(ctx); LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id); + LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id); LLVMBuildStore( builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""), - si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id)); + si_build_gep_i8(ctx, new_vtx, lds_byte0_old_thread_id)); LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""), si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id)); + + /* Store Position.XYZW into LDS. */ + for (unsigned chan = 0; chan < 4; chan++) { + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, LLVMBuildLoad(builder, addrs[4 * pos_index + chan], "")), + ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); + } } ac_build_endif(&ctx->ac, 16009); @@ -1187,9 +1197,6 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out if (num_vgprs == 3) vgpr++; } - /* Return the old thread ID. */ - val = LLVMBuildLoad(builder, old_thread_id, ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); /* These two also use LDS. */ if (sel->info.writes_edgeflag || @@ -1397,7 +1404,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL */ if (info->output_semantic[i] == VARYING_SLOT_POS && ctx->shader->key.opt.ngg_culling) { - vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id)); + vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); for (unsigned j = 0; j < 4; j++) { tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e83abc92eb7..ac85ec68cb5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -289,8 +289,7 @@ static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) } } -static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs, - bool ngg_cull_shader) +static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs) { struct si_shader *shader = ctx->shader; @@ -316,10 +315,6 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_ } if (!shader->is_gs_copy_shader) { - if (shader->key.opt.ngg_culling && !ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id); - } - /* Vertex load indices. */ if (shader->selector->info.num_inputs) { ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0); @@ -351,16 +346,12 @@ static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_bl } } -static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader) +static void declare_tes_input_vgprs(struct si_shader_context *ctx) { ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); - - if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id); - } } enum @@ -404,7 +395,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + declare_vs_input_vgprs(ctx, &num_prolog_vgprs); break; } @@ -423,7 +414,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) } /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + declare_vs_input_vgprs(ctx, &num_prolog_vgprs); /* Return values */ if (shader->key.opt.vs_as_prim_discard_cs) { @@ -480,7 +471,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); if (ctx->stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + declare_vs_input_vgprs(ctx, &num_prolog_vgprs); /* LS return values are inputs to the TCS main shader part. */ for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) @@ -548,9 +539,9 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); if (ctx->stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + declare_vs_input_vgprs(ctx, &num_prolog_vgprs); } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - declare_tes_input_vgprs(ctx, ngg_cull_shader); + declare_tes_input_vgprs(ctx); } if ((ctx->shader->key.as_es || ngg_cull_shader) && @@ -572,12 +563,12 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; } - /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. + /* The NGG cull shader has to return all 9 VGPRs. * * The normal merged ESGS shader only has to return the 5 VGPRs * for the GS stage. */ - num_vgprs = ngg_cull_shader ? 10 : 5; + num_vgprs = ngg_cull_shader ? 9 : 5; /* ES return values are inputs to GS. */ for (i = 0; i < 8 + num_user_sgprs; i++) @@ -604,7 +595,7 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) } /* VGPRs */ - declare_tes_input_vgprs(ctx, ngg_cull_shader); + declare_tes_input_vgprs(ctx); break; case MESA_SHADER_GEOMETRY: @@ -1560,8 +1551,6 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - } else { - key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; } if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 4985ce66259..dda56066615 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -555,7 +555,6 @@ union si_shader_part_key { unsigned as_es : 1; unsigned as_ngg : 1; unsigned as_prim_discard_cs : 1; - unsigned has_ngg_cull_inputs : 1; /* from the NGG cull shader */ unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ /* Prologs for monolithic shaders shouldn't set EXEC. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 85e86d8fe78..8649a78db5c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -109,7 +109,6 @@ struct si_shader_context { */ struct ac_arg vs_state_bits; struct ac_arg vs_blit_inputs; - struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */ /* HW VS */ struct ac_arg streamout_config; struct ac_arg streamout_write_index; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index d996ccc1b11..bafe964c84c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -779,7 +779,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part int num_returns, i; unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; unsigned num_input_vgprs = - key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); + key->vs_prolog.num_merged_next_stage_vgprs + 4; struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; struct ac_arg input_vgpr_param[10]; LLVMValueRef input_vgprs[10];