radeonsi/gfx10: rewrite late alloc computation
- Use conservative late alloc when the number of CUs <= 6.
- Move the late alloc GS register to the GS shader state, so that it can be tuned for NGG culling.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
This commit is contained in:
@@ -5565,46 +5565,46 @@ static void si_init_config(struct si_context *sctx)
|
|||||||
|
|
||||||
/* Compute LATE_ALLOC_VS.LIMIT. */
|
/* Compute LATE_ALLOC_VS.LIMIT. */
|
||||||
unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
|
unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
|
||||||
unsigned late_alloc_limit; /* The limit is per SH. */
|
unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
|
||||||
|
|
||||||
if (sctx->family == CHIP_KABINI) {
|
|
||||||
late_alloc_limit = 0; /* Potential hang on Kabini. */
|
|
||||||
} else if (num_cu_per_sh <= 4) {
|
|
||||||
/* Too few available compute units per SH. Disallowing
|
|
||||||
* VS to run on one CU could hurt us more than late VS
|
|
||||||
* allocation would help.
|
|
||||||
*
|
|
||||||
* 2 is the highest safe number that allows us to keep
|
|
||||||
* all CUs enabled.
|
|
||||||
*/
|
|
||||||
late_alloc_limit = 2;
|
|
||||||
} else {
|
|
||||||
/* This is a good initial value, allowing 1 late_alloc
|
|
||||||
* wave per SIMD on num_cu - 2.
|
|
||||||
*/
|
|
||||||
late_alloc_limit = (num_cu_per_sh - 2) * 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned late_alloc_limit_gs = late_alloc_limit;
|
|
||||||
unsigned cu_mask_vs = 0xffff;
|
unsigned cu_mask_vs = 0xffff;
|
||||||
unsigned cu_mask_gs = 0xffff;
|
unsigned cu_mask_gs = 0xffff;
|
||||||
|
|
||||||
if (late_alloc_limit > 2) {
|
if (sctx->chip_class >= GFX10) {
|
||||||
if (sctx->chip_class >= GFX10) {
|
/* For Wave32, the hw will launch twice the number of late
|
||||||
/* CU2 & CU3 disabled because of the dual CU design */
|
* alloc waves, so 1 == 2x wave32.
|
||||||
cu_mask_vs = 0xfff3;
|
*/
|
||||||
cu_mask_gs = 0xfff3; /* NGG only */
|
if (num_cu_per_sh <= 6) {
|
||||||
|
late_alloc_wave64 = num_cu_per_sh - 2;
|
||||||
} else {
|
} else {
|
||||||
cu_mask_vs = 0xfffe; /* 1 CU disabled */
|
late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Don't use late alloc for NGG on Navi14 due to a hw bug.
|
/* CU2 & CU3 disabled because of the dual CU design */
|
||||||
* If NGG is never used, enable all CUs.
|
/* Late alloc is not used for NGG on Navi14 due to a hw bug. */
|
||||||
*/
|
cu_mask_vs = 0xfff3;
|
||||||
if (!sscreen->use_ngg || sctx->family == CHIP_NAVI14) {
|
cu_mask_gs = sscreen->use_ngg &&
|
||||||
late_alloc_limit_gs = 0;
|
sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
|
||||||
cu_mask_gs = 0xffff;
|
}
|
||||||
|
} else {
|
||||||
|
if (sctx->family == CHIP_KABINI) {
|
||||||
|
late_alloc_wave64 = 0; /* Potential hang on Kabini. */
|
||||||
|
} else if (num_cu_per_sh <= 4) {
|
||||||
|
/* Too few available compute units per SH. Disallowing
|
||||||
|
* VS to run on one CU could hurt us more than late VS
|
||||||
|
* allocation would help.
|
||||||
|
*
|
||||||
|
* 2 is the highest safe number that allows us to keep
|
||||||
|
* all CUs enabled.
|
||||||
|
*/
|
||||||
|
late_alloc_wave64 = 2;
|
||||||
|
} else {
|
||||||
|
/* This is a good initial value, allowing 1 late_alloc
|
||||||
|
* wave per SIMD on num_cu - 2.
|
||||||
|
*/
|
||||||
|
late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (late_alloc_wave64 > 2)
|
||||||
|
cu_mask_vs = 0xfffe; /* 1 CU disabled */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* VS can't execute on one CU if the limit is > 2. */
|
/* VS can't execute on one CU if the limit is > 2. */
|
||||||
@@ -5612,17 +5612,11 @@ static void si_init_config(struct si_context *sctx)
|
|||||||
S_00B118_CU_EN(cu_mask_vs) |
|
S_00B118_CU_EN(cu_mask_vs) |
|
||||||
S_00B118_WAVE_LIMIT(0x3F));
|
S_00B118_WAVE_LIMIT(0x3F));
|
||||||
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
|
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
|
||||||
S_00B11C_LIMIT(late_alloc_limit));
|
S_00B11C_LIMIT(late_alloc_wave64));
|
||||||
|
|
||||||
si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
|
si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
|
||||||
S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
|
S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
|
||||||
|
|
||||||
if (sctx->chip_class >= GFX10) {
|
|
||||||
si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
|
|
||||||
S_00B204_CU_EN(0xffff) |
|
|
||||||
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs));
|
|
||||||
}
|
|
||||||
|
|
||||||
si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
|
si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
|
||||||
S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
|
S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
|
||||||
}
|
}
|
||||||
|
@@ -934,6 +934,12 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
|
|||||||
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
|
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
|
||||||
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
|
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
|
||||||
|
|
||||||
|
if (sscreen->info.chip_class >= GFX10) {
|
||||||
|
si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
|
||||||
|
S_00B204_CU_EN(0xffff) |
|
||||||
|
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
|
||||||
|
}
|
||||||
|
|
||||||
shader->ctx_reg.gs.vgt_gs_onchip_cntl =
|
shader->ctx_reg.gs.vgt_gs_onchip_cntl =
|
||||||
S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
|
S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
|
||||||
S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
|
S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
|
||||||
@@ -1215,6 +1221,26 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||||||
S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
|
S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
|
||||||
S_00B22C_LDS_SIZE(shader->config.lds_size));
|
S_00B22C_LDS_SIZE(shader->config.lds_size));
|
||||||
|
|
||||||
|
/* Determine LATE_ALLOC_GS. */
|
||||||
|
unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
|
||||||
|
unsigned late_alloc_wave64; /* The limit is per SH. */
|
||||||
|
|
||||||
|
/* For Wave32, the hw will launch twice the number of late
|
||||||
|
* alloc waves, so 1 == 2x wave32.
|
||||||
|
*
|
||||||
|
* Don't use late alloc for NGG on Navi14 due to a hw bug.
|
||||||
|
*/
|
||||||
|
if (sscreen->info.family == CHIP_NAVI14)
|
||||||
|
late_alloc_wave64 = 0;
|
||||||
|
else if (num_cu_per_sh <= 6)
|
||||||
|
late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
|
||||||
|
else
|
||||||
|
late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
|
||||||
|
|
||||||
|
si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
|
||||||
|
S_00B204_CU_EN(0xffff) |
|
||||||
|
S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
|
||||||
|
|
||||||
nparams = MAX2(shader->info.nr_param_exports, 1);
|
nparams = MAX2(shader->info.nr_param_exports, 1);
|
||||||
shader->ctx_reg.ngg.spi_vs_out_config =
|
shader->ctx_reg.ngg.spi_vs_out_config =
|
||||||
S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
|
S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
|
||||||
|
Reference in New Issue
Block a user