From 1bf39b1f9d115d69aa7b192beb7cde5eea31dffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 2 Apr 2022 00:45:24 -0400 Subject: [PATCH] ac,radeonsi: rework how scratch_waves is used and move it to ac_gpu_info.c The addition of the "compute" parameter is for a future change. Reviewed-by: Mihai Preda Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_gpu_info.c | 10 ++++++++++ src/amd/common/ac_gpu_info.h | 1 + src/amd/common/ac_shader_util.c | 4 ++-- src/amd/common/ac_shader_util.h | 2 +- src/gallium/drivers/radeonsi/si_compute.c | 10 ++-------- src/gallium/drivers/radeonsi/si_pipe.c | 15 --------------- src/gallium/drivers/radeonsi/si_pipe.h | 1 - src/gallium/drivers/radeonsi/si_state_shaders.cpp | 5 +++-- 8 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index d0a0403657b..29d997d8bc5 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1270,6 +1270,15 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256; info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4; + /* The maximum number of scratch waves. The number is only a function of the number of CUs. + * It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count. + * + * We can decrease the number to make it fit into the infinity cache. + */ + const unsigned max_waves_per_tg = 32; /* 1024 threads in Wave32 */ + info->max_scratch_waves = MAX2(32 * info->min_good_cu_per_sa * info->max_sa_per_se * info->num_se, + max_waves_per_tg); + set_custom_cu_en_mask(info); const char *ib_filename = debug_get_option("AMD_PARSE_IB", NULL); @@ -1480,6 +1489,7 @@ void ac_print_gpu_info(struct radeon_info *info, FILE *f) fprintf(f, " min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc); fprintf(f, " max_vgpr_alloc = %i\n", info->max_vgpr_alloc); fprintf(f, " wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity); + fprintf(f, " max_scratch_waves = %i\n", info->max_scratch_waves); fprintf(f, "Render backend info:\n"); fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override); diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index e8dd0d8504a..a3f3421e053 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -217,6 +217,7 @@ struct radeon_info { uint32_t min_wave64_vgpr_alloc; uint32_t max_vgpr_alloc; uint32_t wave64_vgpr_alloc_granularity; + uint32_t max_scratch_waves; /* Render backends (color + depth blocks). */ uint32_t r300_num_gb_pipes; diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 7ecea15bc08..224c54cde35 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -607,7 +607,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl } /* Return the register value and tune bytes_per_wave to increase scratch performance. */ -void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves, +void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute, unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, uint32_t *tmpring_size) { @@ -640,6 +640,6 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_sc *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave); /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ - *tmpring_size = S_0286E8_WAVES(max_scratch_waves) | + *tmpring_size = S_0286E8_WAVES(info->max_scratch_waves) | S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift); } diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 129f6ebe93d..f1d8f3ca958 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -134,7 +134,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl unsigned value_shift, const struct radeon_info *info, void set_sh_reg(void*, unsigned, uint32_t)); -void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves, +void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute, unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, uint32_t *tmpring_size); diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 989914f085e..e63bc5e038c 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -455,7 +455,7 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_s { uint64_t scratch_bo_size, scratch_needed; scratch_bo_size = 0; - scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->scratch_waves; + scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->screen->info.max_scratch_waves; if (sctx->compute_scratch_buffer) scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; @@ -526,7 +526,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute } unsigned tmpring_size; - ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, + ac_get_scratch_tmpring_size(&sctx->screen->info, true, config->scratch_bytes_per_wave, &sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size); @@ -534,12 +534,6 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute return false; if (shader->scratch_bo) { - COMPUTE_DBG(sctx->screen, - "Waves: %u; Scratch per wave: %u bytes; " - "Total Scratch: %u bytes\n", - sctx->scratch_waves, config->scratch_bytes_per_wave, - config->scratch_bytes_per_wave * sctx->scratch_waves); - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->scratch_bo, RADEON_USAGE_READWRITE | RADEON_PRIO_SCRATCH_BUFFER); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 0ddb3bfa14c..a849ef4b705 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -682,21 +682,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_set_internal_const_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf); } - uint64_t max_threads_per_block; - screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, - &max_threads_per_block); - - /* The maximum number of scratch waves. Scratch space isn't divided - * evenly between CUs. The number is only a function of the number of CUs. - * We can decrease the constant to decrease the scratch buffer size. - * - * sctx->scratch_waves must be >= the maximum possible size of - * 1 threadgroup, so that the hw doesn't hang from being unable - * to start any. - */ - sctx->scratch_waves = - MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64); - /* Bindless handles. */ sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index cce699a5dad..fe5f9d81a1c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1156,7 +1156,6 @@ struct si_context { /* Scratch buffer */ struct si_resource *scratch_buffer; - unsigned scratch_waves; unsigned spi_tmpring_size; unsigned max_seen_scratch_bytes_per_wave; unsigned max_seen_compute_scratch_bytes_per_wave; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 8c48da2261f..cd47c2dd143 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3902,10 +3902,11 @@ static bool si_update_scratch_relocs(struct si_context *sctx) bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { unsigned spi_tmpring_size; - ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, bytes, + ac_get_scratch_tmpring_size(&sctx->screen->info, false, bytes, &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size); - unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; + unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * + sctx->screen->info.max_scratch_waves; if (scratch_needed_size > 0) { if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {