radeonsi: use si_shader::wave_size
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13878>
This commit is contained in:
@@ -2276,14 +2276,13 @@ retry_select_mode:
|
||||
|
||||
/* Round up towards full wave sizes for better ALU utilization. */
|
||||
if (!max_vert_out_per_gs_instance) {
|
||||
const unsigned wavesize = si_get_shader_wave_size(shader);
|
||||
unsigned orig_max_esverts;
|
||||
unsigned orig_max_gsprims;
|
||||
do {
|
||||
orig_max_esverts = max_esverts;
|
||||
orig_max_gsprims = max_gsprims;
|
||||
|
||||
max_esverts = align(max_esverts, wavesize);
|
||||
max_esverts = align(max_esverts, shader->wave_size);
|
||||
max_esverts = MIN2(max_esverts, max_esverts_base);
|
||||
if (esvert_lds_size)
|
||||
max_esverts =
|
||||
@@ -2293,7 +2292,7 @@ retry_select_mode:
|
||||
/* Hardware restriction: minimum value of max_esverts */
|
||||
max_esverts = MAX2(max_esverts, min_esverts);
|
||||
|
||||
max_gsprims = align(max_gsprims, wavesize);
|
||||
max_gsprims = align(max_gsprims, shader->wave_size);
|
||||
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
|
||||
if (gsprim_lds_size) {
|
||||
/* Don't count unusable vertices to the LDS size. Those are vertices above
|
||||
|
@@ -68,7 +68,7 @@ static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compu
|
||||
if (!ac_rtld_open(&rtld,
|
||||
(struct ac_rtld_open_info){.info = &sel->screen->info,
|
||||
.shader_type = MESA_SHADER_COMPUTE,
|
||||
.wave_size = sel->screen->compute_wave_size,
|
||||
.wave_size = program->shader.wave_size,
|
||||
.num_parts = 1,
|
||||
.elf_ptrs = &program->shader.binary.elf_buffer,
|
||||
.elf_sizes = &program->shader.binary.elf_size}))
|
||||
@@ -193,7 +193,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind
|
||||
bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
|
||||
|
||||
shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) /
|
||||
((sscreen->compute_wave_size == 32 ||
|
||||
((shader->wave_size == 32 ||
|
||||
sscreen->info.wave64_vgpr_alloc_granularity == 8) ? 8 : 4)) |
|
||||
S_00B848_DX10_CLAMP(1) |
|
||||
S_00B848_MEM_ORDERED(si_shader_mem_ordered(shader)) |
|
||||
@@ -770,7 +770,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
|
||||
bool render_cond_bit = sctx->render_cond_enabled;
|
||||
unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2];
|
||||
unsigned waves_per_threadgroup =
|
||||
DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
|
||||
DIV_ROUND_UP(threads_per_threadgroup, sctx->cs_shader_state.program->shader.wave_size);
|
||||
unsigned threadgroups_per_cu = 1;
|
||||
|
||||
if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
|
||||
@@ -792,7 +792,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
|
||||
/* If the KMD allows it (there is a KMD hw register for it),
|
||||
* allow launching waves out-of-order. (same as Vulkan) */
|
||||
S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
|
||||
S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);
|
||||
S_00B800_CS_W32_EN(sctx->cs_shader_state.program->shader.wave_size == 32);
|
||||
|
||||
const uint *last_block = info->last_block;
|
||||
bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
|
||||
|
@@ -919,28 +919,27 @@ static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_i
|
||||
*/
|
||||
unsigned num_inst = 0;
|
||||
uint64_t inst_addr = start_addr;
|
||||
unsigned wave_size = si_get_shader_wave_size(shader);
|
||||
struct ac_rtld_binary rtld_binaries[5] = {};
|
||||
struct si_shader_inst *instructions =
|
||||
calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
|
||||
|
||||
if (shader->prolog) {
|
||||
si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst,
|
||||
instructions, stage, wave_size);
|
||||
instructions, stage, shader->wave_size);
|
||||
}
|
||||
if (shader->previous_stage) {
|
||||
si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr,
|
||||
&num_inst, instructions, stage, wave_size);
|
||||
&num_inst, instructions, stage, shader->wave_size);
|
||||
}
|
||||
if (shader->prolog2) {
|
||||
si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr,
|
||||
&num_inst, instructions, stage, wave_size);
|
||||
&num_inst, instructions, stage, shader->wave_size);
|
||||
}
|
||||
si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst,
|
||||
instructions, stage, wave_size);
|
||||
instructions, stage, shader->wave_size);
|
||||
if (shader->epilog) {
|
||||
si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst,
|
||||
instructions, stage, wave_size);
|
||||
instructions, stage, shader->wave_size);
|
||||
}
|
||||
|
||||
fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
|
||||
|
@@ -819,7 +819,7 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh
|
||||
.halt_at_entry = screen->options.halt_shaders,
|
||||
},
|
||||
.shader_type = sel->info.stage,
|
||||
.wave_size = si_get_shader_wave_size(shader),
|
||||
.wave_size = shader->wave_size,
|
||||
.num_parts = num_parts,
|
||||
.elf_ptrs = part_elfs,
|
||||
.elf_sizes = part_sizes,
|
||||
@@ -992,7 +992,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
|
||||
case MESA_SHADER_COMPUTE: {
|
||||
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
|
||||
lds_per_wave = (conf->lds_size * lds_increment) /
|
||||
DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
|
||||
DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
|
||||
}
|
||||
break;
|
||||
default:;
|
||||
@@ -1025,7 +1025,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
|
||||
|
||||
if (screen->options.debug_disassembly)
|
||||
si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
|
||||
si_get_shader_wave_size(shader), debug, "main", NULL);
|
||||
shader->wave_size, debug, "main", NULL);
|
||||
|
||||
pipe_debug_message(debug, SHADER_INFO,
|
||||
"Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
|
||||
@@ -1123,25 +1123,24 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
|
||||
|
||||
if (!check_debug_option ||
|
||||
(si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
|
||||
unsigned wave_size = si_get_shader_wave_size(shader);
|
||||
|
||||
fprintf(file, "\n%s:\n", si_get_shader_name(shader));
|
||||
|
||||
if (shader->prolog)
|
||||
si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
|
||||
si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
|
||||
"prolog", file);
|
||||
if (shader->previous_stage)
|
||||
si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
|
||||
wave_size, debug, "previous stage", file);
|
||||
shader->wave_size, debug, "previous stage", file);
|
||||
if (shader->prolog2)
|
||||
si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
|
||||
si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, shader->wave_size,
|
||||
debug, "prolog2", file);
|
||||
|
||||
si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
|
||||
si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
|
||||
file);
|
||||
|
||||
if (shader->epilog)
|
||||
si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
|
||||
si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
|
||||
"epilog", file);
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
@@ -1330,7 +1329,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
|
||||
{
|
||||
memset(key, 0, sizeof(*key));
|
||||
key->vs_prolog.states = *prolog_key;
|
||||
key->vs_prolog.wave32 = si_get_shader_wave_size(shader_out) == 32;
|
||||
key->vs_prolog.wave32 = shader_out->wave_size == 32;
|
||||
key->vs_prolog.num_input_sgprs = num_input_sgprs;
|
||||
key->vs_prolog.num_inputs = info->num_inputs;
|
||||
key->vs_prolog.as_ls = shader_out->key.ge.as_ls;
|
||||
@@ -1522,14 +1521,13 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
|
||||
|
||||
/* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
|
||||
if (sel->info.stage == MESA_SHADER_COMPUTE) {
|
||||
unsigned wave_size = sscreen->compute_wave_size;
|
||||
unsigned max_vgprs =
|
||||
sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
|
||||
sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
|
||||
unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
|
||||
unsigned max_sgprs_per_wave = 128;
|
||||
unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
|
||||
unsigned threads_per_tg = si_get_max_workgroup_size(shader);
|
||||
unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
|
||||
unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
|
||||
unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
|
||||
|
||||
max_vgprs = max_vgprs / waves_per_simd;
|
||||
@@ -1709,7 +1707,7 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm
|
||||
/* Get the epilog. */
|
||||
union si_shader_part_key epilog_key;
|
||||
memset(&epilog_key, 0, sizeof(epilog_key));
|
||||
epilog_key.tcs_epilog.wave32 = si_get_shader_wave_size(shader) == 32;
|
||||
epilog_key.tcs_epilog.wave32 = shader->wave_size == 32;
|
||||
epilog_key.tcs_epilog.states = shader->key.ge.part.tcs.epilog;
|
||||
|
||||
shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
|
||||
@@ -1754,7 +1752,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke
|
||||
|
||||
memset(key, 0, sizeof(*key));
|
||||
key->ps_prolog.states = shader->key.ps.part.prolog;
|
||||
key->ps_prolog.wave32 = si_get_shader_wave_size(shader) == 32;
|
||||
key->ps_prolog.wave32 = shader->wave_size == 32;
|
||||
key->ps_prolog.colors_read = info->colors_read;
|
||||
key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
|
||||
key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
|
||||
@@ -1888,7 +1886,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke
|
||||
{
|
||||
struct si_shader_info *info = &shader->selector->info;
|
||||
memset(key, 0, sizeof(*key));
|
||||
key->ps_epilog.wave32 = si_get_shader_wave_size(shader) == 32;
|
||||
key->ps_epilog.wave32 = shader->wave_size == 32;
|
||||
key->ps_epilog.colors_written = info->colors_written;
|
||||
key->ps_epilog.color_types = info->output_color_types;
|
||||
key->ps_epilog.writes_z = info->writes_z;
|
||||
@@ -2013,7 +2011,7 @@ void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
|
||||
shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
|
||||
|
||||
if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
|
||||
si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
|
||||
si_get_max_workgroup_size(shader) > shader->wave_size) {
|
||||
si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
|
||||
}
|
||||
}
|
||||
|
@@ -1090,7 +1090,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
||||
struct si_shader_selector *sel = shader->selector;
|
||||
struct si_shader_context ctx;
|
||||
|
||||
si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
|
||||
si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
|
||||
|
||||
LLVMValueRef ngg_cull_main_fn = NULL;
|
||||
if (sel->info.stage <= MESA_SHADER_TESS_EVAL && shader->key.ge.opt.ngg_culling) {
|
||||
|
@@ -427,9 +427,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
|
||||
shader->is_gs_copy_shader = true;
|
||||
shader->wave_size = si_get_shader_wave_size(shader);
|
||||
|
||||
si_llvm_context_init(&ctx, sscreen, compiler,
|
||||
si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
|
||||
false, false));
|
||||
si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
|
||||
ctx.shader = shader;
|
||||
ctx.stage = MESA_SHADER_VERTEX;
|
||||
|
||||
|
@@ -1014,7 +1014,7 @@ si_sqtt_add_code_object(struct si_context* sctx,
|
||||
record->shader_data[gl_shader_stage].hw_stage = hw_stage;
|
||||
record->shader_data[gl_shader_stage].is_combined = false;
|
||||
record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
|
||||
record->shader_data[gl_shader_stage].wavefront_size = si_get_shader_wave_size(shader);
|
||||
record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size;
|
||||
|
||||
record->shader_stages_mask |= 1 << gl_shader_stage;
|
||||
record->num_shaders_combined++;
|
||||
|
@@ -631,7 +631,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned *num_pa
|
||||
* if it's only partially filled.
|
||||
*/
|
||||
unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
|
||||
unsigned wave_size = sctx->screen->ge_wave_size;
|
||||
unsigned wave_size = ls_current->wave_size;
|
||||
|
||||
if (temp_verts_per_tg > wave_size &&
|
||||
(wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
|
||||
|
@@ -586,7 +586,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
|
||||
|
||||
si_pm4_set_reg(
|
||||
pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
|
||||
S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
|
||||
S_00B428_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
|
||||
(sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8)
|
||||
: 0) |
|
||||
S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) |
|
||||
@@ -1207,7 +1207,6 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
||||
else
|
||||
gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
|
||||
|
||||
unsigned wave_size = si_get_shader_wave_size(shader);
|
||||
unsigned late_alloc_wave64, cu_mask;
|
||||
|
||||
ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling,
|
||||
@@ -1217,7 +1216,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
||||
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
|
||||
si_pm4_set_reg(
|
||||
pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
|
||||
S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
|
||||
S_00B228_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
|
||||
S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
|
||||
S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
|
||||
/* Disable the WGP mode on gfx10.3 because it can hang. (it happened on VanGogh)
|
||||
@@ -1511,7 +1510,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
|
||||
S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
|
||||
|
||||
uint32_t rsrc1 =
|
||||
S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
|
||||
S_00B128_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
|
||||
S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) |
|
||||
S_00B128_MEM_ORDERED(si_shader_mem_ordered(shader)) |
|
||||
S_00B128_FLOAT_MODE(shader->config.float_mode);
|
||||
@@ -1715,7 +1714,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
|
||||
|
||||
/* Set interpolation controls. */
|
||||
spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) |
|
||||
S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
|
||||
S_0286D8_PS_W32_EN(shader->wave_size == 32);
|
||||
|
||||
shader->ctx_reg.ps.num_interp = num_interp;
|
||||
shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
|
||||
@@ -1731,7 +1730,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
|
||||
S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
|
||||
|
||||
uint32_t rsrc1 =
|
||||
S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) |
|
||||
S_00B028_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
|
||||
S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
|
||||
S_00B028_FLOAT_MODE(shader->config.float_mode);
|
||||
|
||||
|
Reference in New Issue
Block a user