From 8de5b11b292cea4b3cd1da771573b96b511a033c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 4 Jan 2022 13:34:16 -0500 Subject: [PATCH] radeonsi: move most "info" fields from si_shader_selector into si_shader_info It's where they should be, and future commits might require this. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/gfx10_shader_ngg.c | 16 +- src/gallium/drivers/radeonsi/si_compute.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 5 +- src/gallium/drivers/radeonsi/si_shader.c | 16 +- src/gallium/drivers/radeonsi/si_shader.h | 66 +++--- src/gallium/drivers/radeonsi/si_shader_info.c | 158 +++++++++++++- src/gallium/drivers/radeonsi/si_shader_llvm.c | 2 +- .../drivers/radeonsi/si_shader_llvm_gs.c | 2 +- .../drivers/radeonsi/si_shader_llvm_tess.c | 10 +- .../drivers/radeonsi/si_shader_llvm_vs.c | 8 +- src/gallium/drivers/radeonsi/si_state.c | 4 +- .../drivers/radeonsi/si_state_draw.cpp | 14 +- .../drivers/radeonsi/si_state_shaders.cpp | 197 ++---------------- 13 files changed, 251 insertions(+), 249 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 6e2e5cff573..834e787d90e 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -934,7 +934,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) LLVMValueRef position[4] = {}; unsigned pos_index = 0; unsigned clip_plane_enable = SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(shader->key.ge.opt.ngg_culling); - unsigned clipdist_enable = (sel->clipdist_mask & clip_plane_enable) | sel->culldist_mask; + unsigned clipdist_enable = (sel->info.clipdist_mask & clip_plane_enable) | sel->info.culldist_mask; bool has_clipdist_mask = false; for (unsigned i = 0; i < info->num_outputs; i++) { @@ -999,7 +999,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) } } - if (clip_plane_enable && !sel->clipdist_mask) { + if (clip_plane_enable && !sel->info.clipdist_mask) { /* When clip planes are enabled and there are no clip distance outputs, * we should use user clip planes and cull against the position. */ @@ -1337,7 +1337,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE); ret = si_insert_input_ptr(ctx, ret, ctx->args.vertex_buffers, 8 + GFX9_GS_NUM_USER_SGPR); - for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { + for (unsigned i = 0; i < shader->selector->info.num_vbos_in_user_sgprs; i++) { ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); } @@ -1349,8 +1349,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) unsigned vgpr; if (ctx->stage == MESA_SHADER_VERTEX) { - if (shader->selector->num_vbos_in_user_sgprs) { - vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; + if (shader->selector->info.num_vbos_in_user_sgprs) { + vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4; } else { vgpr = 8 + GFX9_GS_NUM_USER_SGPR + 1; } @@ -1770,7 +1770,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } - assert(out_idx * 4 == sel->gsvs_vertex_size); + assert(out_idx * 4 == info->gsvs_vertex_size); /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); @@ -2227,8 +2227,8 @@ retry_select_mode: max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out; } - esvert_lds_size = es_sel->esgs_itemsize / 4; - gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; + esvert_lds_size = es_sel->info.esgs_itemsize / 4; + gsprim_lds_size = (gs_sel->info.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; if (gsprim_lds_size > target_lds_size && !force_multi_cycling) { if (gs_sel->tess_turns_off_ngg || es_sel->info.stage != MESA_SHADER_TESS_EVAL) { diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 0d4e4985a97..030da33f209 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -125,7 +125,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind si_init_compiler(sscreen, compiler); assert(program->ir_type == PIPE_SHADER_IR_NIR); - si_nir_scan_shader(sel->nir, &sel->info); + si_nir_scan_shader(sscreen, sel->nir, &sel->info); si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, &sel->active_samplers_and_images); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 50d8a4cc999..55e2336be20 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -122,7 +122,6 @@ extern "C" { #define SI_MAX_BORDER_COLORS 4096 #define SI_MAX_VIEWPORTS 16 -#define SI_USER_CLIP_PLANE_MASK 0x3F #define SI_MAP_BUFFER_ALIGNMENT 64 /* We only support the minimum allowed value (512), so that we can pack a 3D block size * in 1 SGPR. */ @@ -1882,8 +1881,8 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx) sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask; if (!ps->info.color0_writes_all_cbufs) - colormask &= ps->colors_written_4bit; - else if (!ps->colors_written_4bit) + colormask &= ps->info.colors_written_4bit; + else if (!ps->info.colors_written_4bit) colormask = 0; /* color0 writes all cbufs, but it's not written */ return colormask; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4e0784880ed..cefe63cfeb5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -281,7 +281,7 @@ static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) { ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers); - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs; if (num_vbos_in_user_sgprs) { unsigned user_sgprs = ctx->args.num_sgprs_used; @@ -496,14 +496,14 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* VS outputs passed via VGPRs to TCS. */ if (shader->key.ge.opt.same_patch_vertices) { - unsigned num_outputs = util_last_bit64(shader->selector->outputs_written); + unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written); for (i = 0; i < num_outputs * 4; i++) ac_add_return(&ctx->args, AC_ARG_VGPR); } } else { /* TCS inputs are passed via VGPRs from VS. */ if (shader->key.ge.opt.same_patch_vertices) { - unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written); + unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written); for (i = 0; i < num_inputs * 4; i++) ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); } @@ -592,10 +592,10 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) */ num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1; - if (shader->selector->num_vbos_in_user_sgprs) { + if (shader->selector->info.num_vbos_in_user_sgprs) { assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); num_user_sgprs = - SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4; } } else { num_user_sgprs = GFX9_GS_NUM_USER_SGPR; @@ -1319,7 +1319,7 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel, /* VGPR initialization fixup for Vega10 and Raven is always done in the * VS prolog. */ - return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || + return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix || /* The 2nd VS prolog loads input VGPRs from LDS */ (key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs); } @@ -1575,7 +1575,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir) { struct si_shader_info info; - si_nir_scan_shader(nir, &info); + si_nir_scan_shader(shader->selector->screen, nir, &info); shader->info.uses_vmem_load_other |= info.uses_vmem_load_other; shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh; @@ -1915,7 +1915,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke shader->info.uses_vmem_load_other = true; if (info->colors_read) { - ubyte *color = shader->selector->color_attr_index; + ubyte *color = shader->selector->info.color_attr_index; if (shader->key.ps.part.prolog.color_two_side) { /* BCOLORs are stored after the last input. */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 9e193d4b0c8..3c5ca4d70a2 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -156,6 +156,7 @@ struct si_context; #define SI_MAX_ATTRIBS 16 #define SI_MAX_VS_OUTPUTS 40 +#define SI_USER_CLIP_PLANE_MASK 0x3F #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) @@ -362,18 +363,43 @@ struct si_shader_info { ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */ - ubyte color_interpolate[2]; - ubyte color_interpolate_loc[2]; - - int constbuf0_num_slots; + ubyte num_vs_inputs; + ubyte num_vbos_in_user_sgprs; ubyte num_stream_output_components[4]; uint16_t enabled_streamout_buffer_mask; - uint num_memory_stores; + uint64_t inputs_read; /* "get_unique_index" bits */ + uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */ + uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ + uint64_t outputs_written; /* "get_unique_index" bits */ + uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ + + ubyte clipdist_mask; + ubyte culldist_mask; + + uint16_t lshs_vertex_stride; + uint16_t esgs_itemsize; /* vertex stride */ + uint16_t gsvs_vertex_size; + ubyte gs_input_verts_per_prim; + unsigned max_gsvs_emit_size; + + /* PS parameters */ + unsigned db_shader_control; + /* Set 0xf or 0x0 (4 bits) per each written output. + * ANDed with spi_shader_col_format. + */ + unsigned colors_written_4bit; + + int constbuf0_num_slots; + uint num_memory_stores; + ubyte color_attr_index[2]; + ubyte color_interpolate[2]; + ubyte color_interpolate_loc[2]; ubyte colors_read; /**< which color components are read by the FS */ ubyte colors_written; uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */ + bool vs_needs_prolog; bool color0_writes_all_cbufs; /**< gl_FragColor */ bool reads_samplemask; /**< does fragment shader read sample mask? */ bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ @@ -465,44 +491,17 @@ struct si_shader_selector { enum pipe_shader_type pipe_shader_type; ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; - bool vs_needs_prolog; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; ubyte cs_images_num_sgprs; ubyte cs_num_images_in_user_sgprs; - ubyte num_vs_inputs; - ubyte num_vbos_in_user_sgprs; unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */ - ubyte clipdist_mask; - ubyte culldist_mask; enum pipe_prim_type rast_prim; - /* ES parameters. */ - uint16_t esgs_itemsize; /* vertex stride */ - uint16_t lshs_vertex_stride; - /* GS parameters. */ - uint16_t gsvs_vertex_size; - ubyte gs_input_verts_per_prim; - unsigned max_gsvs_emit_size; bool tess_turns_off_ngg; - /* PS parameters. */ - ubyte color_attr_index[2]; - unsigned db_shader_control; - /* Set 0xf or 0x0 (4 bits) per each written output. - * ANDed with spi_shader_col_format. - */ - unsigned colors_written_4bit; - - uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ - uint64_t outputs_written; /* "get_unique_index" bits */ - uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ - - uint64_t inputs_read; /* "get_unique_index" bits */ - uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */ - /* bitmasks of used descriptor slots */ uint64_t active_const_and_shader_buffers; uint64_t active_samplers_and_images; @@ -952,7 +951,8 @@ const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); /* si_shader_info.c */ -void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); +void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, + struct si_shader_info *info); /* si_shader_llvm_gs.c */ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index b9a5c9a8d46..3d21ef9a44d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -22,8 +22,10 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader.h" +#include "si_pipe.h" #include "util/mesa-sha1.h" +#include "util/u_prim.h" +#include "sid.h" struct si_shader_profile { @@ -580,7 +582,8 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info } } -void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info) +void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, + struct si_shader_info *info) { memset(info, 0, sizeof(*info)); info->base = nir->info; @@ -729,4 +732,155 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf info->output_readmask[i] &= info->output_usagemask[i]; info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir); + + if (info->stage == MESA_SHADER_VERTEX || + info->stage == MESA_SHADER_TESS_CTRL || + info->stage == MESA_SHADER_TESS_EVAL || + info->stage == MESA_SHADER_GEOMETRY) { + if (info->stage == MESA_SHADER_TESS_CTRL) { + /* Always reserve space for these. */ + info->patch_outputs_written |= + (1ull << si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER)) | + (1ull << si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER)); + } + for (unsigned i = 0; i < info->num_outputs; i++) { + unsigned semantic = info->output_semantic[i]; + + if (semantic == VARYING_SLOT_TESS_LEVEL_INNER || + semantic == VARYING_SLOT_TESS_LEVEL_OUTER || + (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX)) { + info->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(semantic); + } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && + semantic != VARYING_SLOT_EDGE) { + info->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); + + /* Ignore outputs that are not passed from VS to PS. */ + if (semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX) { + info->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(semantic, true); + } + } + } + } + + if (nir->info.stage == MESA_SHADER_VERTEX) { + info->num_vs_inputs = + info->stage == MESA_SHADER_VERTEX && !info->base.vs.blit_sgprs_amd ? info->num_inputs : 0; + unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); + info->num_vbos_in_user_sgprs = MIN2(info->num_vs_inputs, num_vbos_in_sgprs); + + /* The prolog is a no-op if there are no inputs. */ + info->vs_needs_prolog = info->num_inputs && !info->base.vs.blit_sgprs_amd; + } + + if (nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_TESS_CTRL || + nir->info.stage == MESA_SHADER_TESS_EVAL) { + info->esgs_itemsize = util_last_bit64(info->outputs_written) * 16; + info->lshs_vertex_stride = info->esgs_itemsize; + + /* Add 1 dword to reduce LDS bank conflicts, so that each vertex + * will start on a different bank. (except for the maximum 32*16). + */ + if (info->lshs_vertex_stride < 32 * 16) + info->lshs_vertex_stride += 4; + + /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank + * conflicts, i.e. each vertex will start at a different bank. + */ + if (sscreen->info.chip_class >= GFX9) + info->esgs_itemsize += 4; + + assert(((info->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); + + info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read & + ~info->base.inputs_read_indirectly & + info->base.inputs_read; + } + + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + info->gsvs_vertex_size = info->num_outputs * 16; + info->max_gsvs_emit_size = info->gsvs_vertex_size * info->base.gs.vertices_out; + info->gs_input_verts_per_prim = + u_vertices_per_prim((enum pipe_prim_type)info->base.gs.input_primitive); + } + + info->clipdist_mask = info->writes_clipvertex ? SI_USER_CLIP_PLANE_MASK : + u_bit_consecutive(0, info->base.clip_distance_array_size); + info->culldist_mask = u_bit_consecutive(0, info->base.cull_distance_array_size) << + info->base.clip_distance_array_size; + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + for (unsigned i = 0; i < info->num_inputs; i++) { + unsigned semantic = info->input[i].semantic; + + if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && + semantic != VARYING_SLOT_PNTC) { + info->inputs_read |= 1ull << si_shader_io_get_unique_index(semantic, true); + } + } + + for (unsigned i = 0; i < 8; i++) + if (info->colors_written & (1 << i)) + info->colors_written_4bit |= 0xf << (4 * i); + + for (unsigned i = 0; i < info->num_inputs; i++) { + if (info->input[i].semantic == VARYING_SLOT_COL0) + info->color_attr_index[0] = i; + else if (info->input[i].semantic == VARYING_SLOT_COL1) + info->color_attr_index[1] = i; + } + + /* DB_SHADER_CONTROL */ + info->db_shader_control = S_02880C_Z_EXPORT_ENABLE(info->writes_z) | + S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->writes_stencil) | + S_02880C_MASK_EXPORT_ENABLE(info->writes_samplemask) | + S_02880C_KILL_ENABLE(info->base.fs.uses_discard); + + switch (info->base.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_GREATER: + info->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + break; + case FRAG_DEPTH_LAYOUT_LESS: + info->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + break; + default:; + } + + /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following: + * + * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP + * --|-----------|------------|------------|--------------------|-------------------|------------- + * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0 + * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0 + * 2 | false | true | n/a | LateZ | 1 | 0 + * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0 + * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1 + * + * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register. + * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense. + * + * Don't use ReZ without profiling !!! + * + * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex + * shaders. + */ + if (info->base.fs.early_fragment_tests) { + /* Cases 3, 4. */ + info->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) | + S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | + S_02880C_EXEC_ON_NOOP(info->base.writes_memory); + } else if (info->base.writes_memory) { + /* Case 2. */ + info->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1); + } else { + /* Case 1. */ + info->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); + } + + if (info->base.fs.post_depth_coverage) + info->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); + } } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 101d2fb116e..ee31e133ce8 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -1027,7 +1027,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* We need the barrier only if TCS inputs are read from LDS. */ if (!shader->key.ge.opt.same_patch_vertices || shader->selector->info.base.inputs_read & - ~shader->selector->tcs_vgpr_only_inputs) + ~shader->selector->info.tcs_vgpr_only_inputs) ac_build_s_barrier(&ctx->ac); } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 6a570ceff14..e20af7e1358 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -140,7 +140,7 @@ void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi) int i; if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { - unsigned itemsize_dw = es->selector->esgs_itemsize / 4; + unsigned itemsize_dw = es->selector->info.esgs_itemsize / 4; LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4); vertex_idx = diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 6acd2606045..18b203604c8 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -74,7 +74,7 @@ static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context * if (ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy) return util_last_bit64(ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy) * 4; - return util_last_bit64(ctx->shader->selector->outputs_written) * 4; + return util_last_bit64(ctx->shader->selector->info.outputs_written) * 4; } static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) @@ -92,7 +92,7 @@ static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) const struct si_shader_info *info = &ctx->shader->selector->info; unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out; unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); - unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); + unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->info.patch_outputs_written); unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4; return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0); } @@ -155,12 +155,12 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) switch (ctx->stage) { case MESA_SHADER_VERTEX: - stride = ctx->shader->selector->lshs_vertex_stride / 4; + stride = ctx->shader->selector->info.lshs_vertex_stride / 4; return LLVMConstInt(ctx->ac.i32, stride, 0); case MESA_SHADER_TESS_CTRL: if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) { - stride = ctx->shader->key.ge.part.tcs.ls->lshs_vertex_stride / 4; + stride = ctx->shader->key.ge.part.tcs.ls->info.lshs_vertex_stride / 4; return LLVMConstInt(ctx->ac.i32, stride, 0); } return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); @@ -980,7 +980,7 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi) LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); if (!shader->key.ge.opt.same_patch_vertices || - !(ctx->next_shader_sel->tcs_vgpr_only_inputs & (1ull << semantic))) + !(ctx->next_shader_sel->info.tcs_vgpr_only_inputs & (1ull << semantic))) lshs_lds_store(ctx, chan, dw_addr, value); if (shader->key.ge.opt.same_patch_vertices) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index d0c699cf924..ab984f2f7fb 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -111,7 +111,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32; LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32; LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32; - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs; union si_vs_fix_fetch fix_fetch; LLVMValueRef vb_desc; LLVMValueRef vertex_index; @@ -391,7 +391,7 @@ void si_llvm_clipvertex_to_clipdist(struct si_shader_context *ctx, LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings); LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0); LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); - unsigned clipdist_mask = ctx->shader->selector->clipdist_mask & + unsigned clipdist_mask = ctx->shader->selector->info.clipdist_mask & ~ctx->shader->key.ge.opt.kill_clip_distances; for (reg_index = 0; reg_index < 2; reg_index++) { @@ -569,9 +569,9 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; unsigned pos_idx, index; - unsigned clipdist_mask = (shader->selector->clipdist_mask & + unsigned clipdist_mask = (shader->selector->info.clipdist_mask & ~shader->key.ge.opt.kill_clip_distances) | - shader->selector->culldist_mask; + shader->selector->info.culldist_mask; int i; si_vertex_color_clamping(ctx, outputs, noutput); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index fb52d33ad19..cdae8f4b89e 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -834,9 +834,9 @@ static void si_emit_clip_regs(struct si_context *sctx) struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; bool window_space = info->stage == MESA_SHADER_VERTEX ? info->base.vs.window_space_position : 0; - unsigned clipdist_mask = vs_sel->clipdist_mask; + unsigned clipdist_mask = vs_sel->info.clipdist_mask; unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SI_USER_CLIP_PLANE_MASK; - unsigned culldist_mask = vs_sel->culldist_mask; + unsigned culldist_mask = vs_sel->info.culldist_mask; /* Clip distances on points have no effect, so need to be implemented * as cull distances. This applies for the clipvertex case as well. diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index ce1836af8fa..8bdf945ae90 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -559,13 +559,13 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned *num_pa /* This calculates how shader inputs and outputs among VS, TCS, and TES * are laid out in LDS. */ - unsigned num_tcs_inputs = util_last_bit64(ls->outputs_written); + unsigned num_tcs_inputs = util_last_bit64(ls->info.outputs_written); unsigned num_tcs_output_cp, num_tcs_outputs, num_tcs_patch_outputs; if (sctx->shader.tcs.cso) { - num_tcs_outputs = util_last_bit64(tcs->outputs_written); + num_tcs_outputs = util_last_bit64(tcs->info.outputs_written); num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out; - num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); + num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written); } else { /* No TCS. Route varyings from LS to TES. */ num_tcs_outputs = num_tcs_inputs; @@ -573,13 +573,13 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned *num_pa num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ } - unsigned input_vertex_size = ls->lshs_vertex_stride; + unsigned input_vertex_size = ls->info.lshs_vertex_stride; unsigned output_vertex_size = num_tcs_outputs * 16; unsigned input_patch_size; /* Allocate LDS for TCS inputs only if it's used. */ if (!ls_current->key.ge.opt.same_patch_vertices || - tcs->info.base.inputs_read & ~tcs->tcs_vgpr_only_inputs) + tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs) input_patch_size = num_tcs_input_cp * input_vertex_size; else input_patch_size = 0; @@ -2112,8 +2112,8 @@ static void si_draw(struct pipe_context *ctx, struct si_shader_selector *vs = sctx->shader.vs.cso; struct si_vertex_state *vstate = (struct si_vertex_state *)state; if (unlikely(!vs || - (!IS_DRAW_VERTEX_STATE && sctx->num_vertex_elements < vs->num_vs_inputs) || - (IS_DRAW_VERTEX_STATE && vstate->velems.count < vs->num_vs_inputs) || + (!IS_DRAW_VERTEX_STATE && sctx->num_vertex_elements < vs->info.num_vs_inputs) || + (IS_DRAW_VERTEX_STATE && vstate->velems.count < vs->info.num_vs_inputs) || !sctx->shader.ps.cso || (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) { assert(0); return; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 0ee4bf523da..b0700bf4a78 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -614,7 +614,7 @@ static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, { struct si_shader_selector *vs = shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; - unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; + unsigned num_vbos_in_user_sgprs = vs->info.num_vbos_in_user_sgprs; /* 1 SGPR is reserved for the vertex buffer pointer. */ assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); @@ -744,7 +744,7 @@ static void si_emit_shader_es(struct si_context *sctx) radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->selector->esgs_itemsize / 4); + shader->selector->info.esgs_itemsize / 4); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, @@ -815,7 +815,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * /* We can't allow using the whole LDS, because GS waves compete with * other shader stages for LDS space. */ const unsigned max_lds_size = 8 * 1024; - const unsigned esgs_itemsize = es->esgs_itemsize / 4; + const unsigned esgs_itemsize = es->info.esgs_itemsize / 4; unsigned esgs_lds_size; /* All these are per subgroup: */ @@ -842,7 +842,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * /* If the primitive has adjacency, halve the number of vertices * that will be reused in multiple primitives. */ - min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); + min_es_verts = gs->info.gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); gs_prims = MIN2(ideal_gs_prims, max_gs_prims); worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); @@ -877,7 +877,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * /* Vertices for adjacency primitives are not always reused, so restore * it for ES_VERTS_PER_SUBGRP. */ - min_es_verts = gs->gs_input_verts_per_prim; + min_es_verts = gs->info.gs_input_verts_per_prim; /* For normal primitives, the VGT only checks if they are past the ES * verts per subgroup after allocating a full GS primitive and if they @@ -1105,7 +1105,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); - shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.ge.part.gs.es->esgs_itemsize / 4; + shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.ge.part.gs.es->info.esgs_itemsize / 4; if (es_stage == MESA_SHADER_TESS_EVAL) si_set_tesseval_regs(sscreen, shader->key.ge.part.gs.es, shader); @@ -1286,8 +1286,8 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, const struct si_shader *shader, bool ngg) { /* Clip distances can be killed, but cull distances can't. */ - unsigned clipcull_mask = (sel->clipdist_mask & ~shader->key.ge.opt.kill_clip_distances) | - sel->culldist_mask; + unsigned clipcull_mask = (sel->info.clipdist_mask & ~shader->key.ge.opt.kill_clip_distances) | + sel->info.culldist_mask; bool writes_psize = sel->info.writes_psize && !shader->key.ge.opt.kill_pointsize; bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) || sel->screen->options.vrs2x2 || @@ -1427,7 +1427,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_sel->info.writes_primid); if (gs_stage == MESA_SHADER_GEOMETRY) { - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->info.esgs_itemsize / 4; shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out; } else { shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; @@ -2071,16 +2071,16 @@ void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); } - sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read; + sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->info.inputs_read; } static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, union si_shader_key *key) { - key->ge.opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; + key->ge.opt.kill_clip_distances = vs->info.clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; /* Find out which VS outputs aren't used by the PS. */ - uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t outputs_written = vs->info.outputs_written_before_ps; uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; key->ge.opt.kill_outputs = ~linked & outputs_written; @@ -2185,7 +2185,7 @@ void si_ps_key_update_framebuffer_blend(struct si_context *sctx) /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ if (!key->ps.part.epilog.last_cbuf) { - key->ps.part.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->ps.part.epilog.spi_shader_col_format &= sel->info.colors_written_4bit; key->ps.part.epilog.color_is_int8 &= sel->info.colors_written; key->ps.part.epilog.color_is_int10 &= sel->info.colors_written; } @@ -2196,7 +2196,7 @@ void si_ps_key_update_framebuffer_blend(struct si_context *sctx) * * Dual source blending never has color buffer 1 enabled, so ignore it. */ - if (sel->colors_written_4bit & + if (sel->info.colors_written_4bit & (blend->dual_src_blend ? 0xffffff0f : 0xffffffff) & ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) key->ps.opt.prefer_mono = 1; @@ -2944,7 +2944,7 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind semantic != VARYING_SLOT_CLIP_VERTEX && semantic != VARYING_SLOT_EDGE) { id = si_shader_io_get_unique_index(semantic, true); - sel->outputs_written_before_ps &= ~(1ull << id); + sel->info.outputs_written_before_ps &= ~(1ull << id); } } } @@ -3024,7 +3024,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, struct si_screen *sscreen = (struct si_screen *)ctx->screen; struct si_context *sctx = (struct si_context *)ctx; struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); - int i; if (!sel) return NULL; @@ -3040,7 +3039,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->nir = (nir_shader*)state->ir.nir; } - si_nir_scan_shader(sel->nir, &sel->info); + si_nir_scan_shader(sscreen, sel->nir, &sel->info); const enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->info.stage); sel->pipe_shader_type = type; @@ -3053,49 +3052,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, &sel->active_samplers_and_images); - sel->num_vs_inputs = - sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd - ? sel->info.num_inputs - : 0; - unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); - sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs); - - /* The prolog is a no-op if there are no inputs. */ - sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && - !sel->info.base.vs.blit_sgprs_amd; - - if (sel->info.stage == MESA_SHADER_VERTEX || - sel->info.stage == MESA_SHADER_TESS_CTRL || - sel->info.stage == MESA_SHADER_TESS_EVAL || - sel->info.stage == MESA_SHADER_GEOMETRY) { - if (sel->info.stage == MESA_SHADER_TESS_CTRL) { - /* Always reserve space for these. */ - sel->patch_outputs_written |= - (1ull << si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER)) | - (1ull << si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER)); - } - for (i = 0; i < sel->info.num_outputs; i++) { - unsigned semantic = sel->info.output_semantic[i]; - - if (semantic == VARYING_SLOT_TESS_LEVEL_INNER || - semantic == VARYING_SLOT_TESS_LEVEL_OUTER || - (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX)) { - sel->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(semantic); - } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && - semantic != VARYING_SLOT_EDGE) { - sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); - - /* Ignore outputs that are not passed from VS to PS. */ - if (semantic != VARYING_SLOT_POS && - semantic != VARYING_SLOT_PSIZ && - semantic != VARYING_SLOT_CLIP_VERTEX) { - sel->outputs_written_before_ps |= 1ull - << si_shader_io_get_unique_index(semantic, true); - } - } - } - } - switch (sel->info.stage) { case MESA_SHADER_GEOMETRY: /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ @@ -3103,11 +3059,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, if (util_rast_prim_is_triangles(sel->rast_prim)) sel->rast_prim = PIPE_PRIM_TRIANGLES; - sel->gsvs_vertex_size = sel->info.num_outputs * 16; - sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->info.base.gs.vertices_out; - sel->gs_input_verts_per_prim = - u_vertices_per_prim((enum pipe_prim_type)sel->info.base.gs.input_primitive); - /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation so * we can't split workgroups. Disable ngg if any of the following conditions is true: * - num_invocations * gs.vertices_out > 256 @@ -3120,30 +3071,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, break; case MESA_SHADER_VERTEX: - case MESA_SHADER_TESS_CTRL: case MESA_SHADER_TESS_EVAL: - sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; - sel->lshs_vertex_stride = sel->esgs_itemsize; - - /* Add 1 dword to reduce LDS bank conflicts, so that each vertex - * will start on a different bank. (except for the maximum 32*16). - */ - if (sel->lshs_vertex_stride < 32 * 16) - sel->lshs_vertex_stride += 4; - - /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank - * conflicts, i.e. each vertex will start at a different bank. - */ - if (sctx->chip_class >= GFX9) - sel->esgs_itemsize += 4; - - assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); - - sel->tcs_vgpr_only_inputs = ~sel->info.base.tess.tcs_cross_invocation_inputs_read & - ~sel->info.base.inputs_read_indirectly & - sel->info.base.inputs_read; - - /* Only for TES: */ if (sel->info.stage == MESA_SHADER_TESS_EVAL) { if (sel->info.base.tess.point_mode) sel->rast_prim = PIPE_PRIM_POINTS; @@ -3155,28 +3083,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->rast_prim = PIPE_PRIM_TRIANGLES; } break; - - case MESA_SHADER_FRAGMENT: - for (i = 0; i < sel->info.num_inputs; i++) { - unsigned semantic = sel->info.input[i].semantic; - - if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && - semantic != VARYING_SLOT_PNTC) { - sel->inputs_read |= 1ull << si_shader_io_get_unique_index(semantic, true); - } - } - - for (i = 0; i < 8; i++) - if (sel->info.colors_written & (1 << i)) - sel->colors_written_4bit |= 0xf << (4 * i); - - for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input[i].semantic == VARYING_SLOT_COL0) - sel->color_attr_index[0] = i; - else if (sel->info.input[i].semantic == VARYING_SLOT_COL1) - sel->color_attr_index[1] = i; - } - break; default:; } @@ -3208,63 +3114,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } } - sel->clipdist_mask = sel->info.writes_clipvertex ? SI_USER_CLIP_PLANE_MASK : - u_bit_consecutive(0, sel->info.base.clip_distance_array_size); - sel->culldist_mask = u_bit_consecutive(0, sel->info.base.cull_distance_array_size) << - sel->info.base.clip_distance_array_size; - - /* DB_SHADER_CONTROL */ - sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | - S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) | - S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) | - S_02880C_KILL_ENABLE(sel->info.base.fs.uses_discard); - - if (sel->info.stage == MESA_SHADER_FRAGMENT) { - switch (sel->info.base.fs.depth_layout) { - case FRAG_DEPTH_LAYOUT_GREATER: - sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); - break; - case FRAG_DEPTH_LAYOUT_LESS: - sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); - break; - default:; - } - - /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following: - * - * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP - * --|-----------|------------|------------|--------------------|-------------------|------------- - * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0 - * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0 - * 2 | false | true | n/a | LateZ | 1 | 0 - * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0 - * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1 - * - * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register. - * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense. - * - * Don't use ReZ without profiling !!! - * - * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex - * shaders. - */ - if (sel->info.base.fs.early_fragment_tests) { - /* Cases 3, 4. */ - sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) | - S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | - S_02880C_EXEC_ON_NOOP(sel->info.base.writes_memory); - } else if (sel->info.base.writes_memory) { - /* Case 2. */ - sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1); - } else { - /* Case 1. */ - sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); - } - - if (sel->info.base.fs.post_depth_coverage) - sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); - } - (void)simple_mtx_init(&sel->mutex, mtx_plain); si_schedule_initial_compile(sctx, sel->info.stage, &sel->ready, &sel->compiler_ctx_state, @@ -3315,8 +3164,8 @@ static void si_update_clip_regs(struct si_context *sctx, struct si_shader_select (!old_hw_vs || (old_hw_vs->info.stage == MESA_SHADER_VERTEX && old_hw_vs->info.base.vs.window_space_position) != (next_hw_vs->info.stage == MESA_SHADER_VERTEX && next_hw_vs->info.base.vs.window_space_position) || - old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || - old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant || + old_hw_vs->info.clipdist_mask != next_hw_vs->info.clipdist_mask || + old_hw_vs->info.culldist_mask != next_hw_vs->info.culldist_mask || !old_hw_vs_variant || !next_hw_vs_variant || old_hw_vs_variant->pa_cl_vs_out_cntl != next_hw_vs_variant->pa_cl_vs_out_cntl)) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); @@ -3383,7 +3232,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->shader.vs.current = sel ? sel->first_variant : NULL; sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0; sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false; - sctx->fixed_func_tcs_shader.key.ge.mono.u.ff_tcs_inputs_to_copy = sel ? sel->outputs_written : 0; + sctx->fixed_func_tcs_shader.key.ge.mono.u.ff_tcs_inputs_to_copy = sel ? sel->info.outputs_written : 0; if (si_update_ngg(sctx)) si_shader_change_notify(sctx); @@ -3556,7 +3405,7 @@ void si_update_ps_kill_enable(struct si_context *sctx) if (!sctx->shader.ps.cso) return; - unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + unsigned db_shader_control = sctx->shader.ps.cso->info.db_shader_control | S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); if (sctx->ps_db_shader_control != db_shader_control) { @@ -3801,12 +3650,12 @@ bool si_update_gs_ring_buffers(struct si_context *sctx) unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; /* Calculate the minimum size. */ - unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * wave_size, alignment); + unsigned min_esgs_ring_size = align(es->info.esgs_itemsize * gs_vertex_reuse * wave_size, alignment); /* These are recommended sizes, not minimum sizes. */ unsigned esgs_ring_size = - max_gs_waves * 2 * wave_size * es->esgs_itemsize * gs->gs_input_verts_per_prim; - unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->max_gsvs_emit_size; + max_gs_waves * 2 * wave_size * es->info.esgs_itemsize * gs->info.gs_input_verts_per_prim; + unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->info.max_gsvs_emit_size; min_esgs_ring_size = align(min_esgs_ring_size, alignment); esgs_ring_size = align(esgs_ring_size, alignment);