ac,radeonsi: rework how scratch_waves is used and move it to ac_gpu_info.c
The addition of the "compute" parameter is for a future change.

Reviewed-by: Mihai Preda <mhpreda@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15966>
This commit is contained in:
@@ -1270,6 +1270,15 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
|
|||||||
info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256;
|
info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256;
|
||||||
info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4;
|
info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4;
|
||||||
|
|
||||||
|
/* The maximum number of scratch waves. The number is only a function of the number of CUs.
|
||||||
|
* It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count.
|
||||||
|
*
|
||||||
|
* We can decrease the number to make it fit into the infinity cache.
|
||||||
|
*/
|
||||||
|
const unsigned max_waves_per_tg = 32; /* 1024 threads in Wave32 */
|
||||||
|
info->max_scratch_waves = MAX2(32 * info->min_good_cu_per_sa * info->max_sa_per_se * info->num_se,
|
||||||
|
max_waves_per_tg);
|
||||||
|
|
||||||
set_custom_cu_en_mask(info);
|
set_custom_cu_en_mask(info);
|
||||||
|
|
||||||
const char *ib_filename = debug_get_option("AMD_PARSE_IB", NULL);
|
const char *ib_filename = debug_get_option("AMD_PARSE_IB", NULL);
|
||||||
@@ -1480,6 +1489,7 @@ void ac_print_gpu_info(struct radeon_info *info, FILE *f)
|
|||||||
fprintf(f, " min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
|
fprintf(f, " min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
|
||||||
fprintf(f, " max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
|
fprintf(f, " max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
|
||||||
fprintf(f, " wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
|
fprintf(f, " wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
|
||||||
|
fprintf(f, " max_scratch_waves = %i\n", info->max_scratch_waves);
|
||||||
|
|
||||||
fprintf(f, "Render backend info:\n");
|
fprintf(f, "Render backend info:\n");
|
||||||
fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
|
fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
|
||||||
|
@@ -217,6 +217,7 @@ struct radeon_info {
|
|||||||
uint32_t min_wave64_vgpr_alloc;
|
uint32_t min_wave64_vgpr_alloc;
|
||||||
uint32_t max_vgpr_alloc;
|
uint32_t max_vgpr_alloc;
|
||||||
uint32_t wave64_vgpr_alloc_granularity;
|
uint32_t wave64_vgpr_alloc_granularity;
|
||||||
|
uint32_t max_scratch_waves;
|
||||||
|
|
||||||
/* Render backends (color + depth blocks). */
|
/* Render backends (color + depth blocks). */
|
||||||
uint32_t r300_num_gb_pipes;
|
uint32_t r300_num_gb_pipes;
|
||||||
|
@@ -607,7 +607,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Return the register value and tune bytes_per_wave to increase scratch performance. */
|
/* Return the register value and tune bytes_per_wave to increase scratch performance. */
|
||||||
void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves,
|
void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute,
|
||||||
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
|
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
|
||||||
uint32_t *tmpring_size)
|
uint32_t *tmpring_size)
|
||||||
{
|
{
|
||||||
@@ -640,6 +640,6 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_sc
|
|||||||
*max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
|
*max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
|
||||||
|
|
||||||
/* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
|
/* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
|
||||||
*tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
|
*tmpring_size = S_0286E8_WAVES(info->max_scratch_waves) |
|
||||||
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
|
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
|
||||||
}
|
}
|
||||||
|
@@ -134,7 +134,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
|
|||||||
unsigned value_shift, const struct radeon_info *info,
|
unsigned value_shift, const struct radeon_info *info,
|
||||||
void set_sh_reg(void*, unsigned, uint32_t));
|
void set_sh_reg(void*, unsigned, uint32_t));
|
||||||
|
|
||||||
void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves,
|
void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute,
|
||||||
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
|
unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
|
||||||
uint32_t *tmpring_size);
|
uint32_t *tmpring_size);
|
||||||
|
|
||||||
|
@@ -455,7 +455,7 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_s
|
|||||||
{
|
{
|
||||||
uint64_t scratch_bo_size, scratch_needed;
|
uint64_t scratch_bo_size, scratch_needed;
|
||||||
scratch_bo_size = 0;
|
scratch_bo_size = 0;
|
||||||
scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->scratch_waves;
|
scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->screen->info.max_scratch_waves;
|
||||||
if (sctx->compute_scratch_buffer)
|
if (sctx->compute_scratch_buffer)
|
||||||
scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
|
scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
|
||||||
|
|
||||||
@@ -526,7 +526,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
|
|||||||
}
|
}
|
||||||
|
|
||||||
unsigned tmpring_size;
|
unsigned tmpring_size;
|
||||||
ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves,
|
ac_get_scratch_tmpring_size(&sctx->screen->info, true,
|
||||||
config->scratch_bytes_per_wave,
|
config->scratch_bytes_per_wave,
|
||||||
&sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size);
|
&sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size);
|
||||||
|
|
||||||
@@ -534,12 +534,6 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (shader->scratch_bo) {
|
if (shader->scratch_bo) {
|
||||||
COMPUTE_DBG(sctx->screen,
|
|
||||||
"Waves: %u; Scratch per wave: %u bytes; "
|
|
||||||
"Total Scratch: %u bytes\n",
|
|
||||||
sctx->scratch_waves, config->scratch_bytes_per_wave,
|
|
||||||
config->scratch_bytes_per_wave * sctx->scratch_waves);
|
|
||||||
|
|
||||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->scratch_bo,
|
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->scratch_bo,
|
||||||
RADEON_USAGE_READWRITE | RADEON_PRIO_SCRATCH_BUFFER);
|
RADEON_USAGE_READWRITE | RADEON_PRIO_SCRATCH_BUFFER);
|
||||||
}
|
}
|
||||||
|
@@ -682,21 +682,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
|
|||||||
si_set_internal_const_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf);
|
si_set_internal_const_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t max_threads_per_block;
|
|
||||||
screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
|
|
||||||
&max_threads_per_block);
|
|
||||||
|
|
||||||
/* The maximum number of scratch waves. Scratch space isn't divided
|
|
||||||
* evenly between CUs. The number is only a function of the number of CUs.
|
|
||||||
* We can decrease the constant to decrease the scratch buffer size.
|
|
||||||
*
|
|
||||||
* sctx->scratch_waves must be >= the maximum possible size of
|
|
||||||
* 1 threadgroup, so that the hw doesn't hang from being unable
|
|
||||||
* to start any.
|
|
||||||
*/
|
|
||||||
sctx->scratch_waves =
|
|
||||||
MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64);
|
|
||||||
|
|
||||||
/* Bindless handles. */
|
/* Bindless handles. */
|
||||||
sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
||||||
sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
||||||
|
@@ -1156,7 +1156,6 @@ struct si_context {
|
|||||||
|
|
||||||
/* Scratch buffer */
|
/* Scratch buffer */
|
||||||
struct si_resource *scratch_buffer;
|
struct si_resource *scratch_buffer;
|
||||||
unsigned scratch_waves;
|
|
||||||
unsigned spi_tmpring_size;
|
unsigned spi_tmpring_size;
|
||||||
unsigned max_seen_scratch_bytes_per_wave;
|
unsigned max_seen_scratch_bytes_per_wave;
|
||||||
unsigned max_seen_compute_scratch_bytes_per_wave;
|
unsigned max_seen_compute_scratch_bytes_per_wave;
|
||||||
|
@@ -3902,10 +3902,11 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
|
|||||||
bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
|
bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
|
||||||
{
|
{
|
||||||
unsigned spi_tmpring_size;
|
unsigned spi_tmpring_size;
|
||||||
ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, bytes,
|
ac_get_scratch_tmpring_size(&sctx->screen->info, false, bytes,
|
||||||
&sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
|
&sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
|
||||||
|
|
||||||
unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
|
unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *
|
||||||
|
sctx->screen->info.max_scratch_waves;
|
||||||
|
|
||||||
if (scratch_needed_size > 0) {
|
if (scratch_needed_size > 0) {
|
||||||
if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
|
if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
|
||||||
|
Reference in New Issue
Block a user