radeonsi: don't use threadID.yz/blockID.yz for copy_image if those are always 0

This can improve performance because fewer VGPRs and SGPRs need to be
initialized.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
This commit is contained in:
Marek Olšák
2023-08-06 22:15:42 -04:00
committed by Marge Bot
parent 47a57416cf
commit 3952b89ebb
4 changed files with 24 additions and 12 deletions

View File

@@ -537,7 +537,7 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
}
}
static void
static unsigned
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
unsigned work_x, unsigned work_y, unsigned work_z)
{
@@ -550,6 +550,8 @@ set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, u
info->last_block[i] = work[i] % info->block[i];
info->grid[i] = DIV_ROUND_UP(work[i], info->block[i]);
}
return work_z > 1 ? 3 : (work_y > 1 ? 2 : 1);
}
static void si_launch_grid_internal_images(struct si_context *sctx,
@@ -764,12 +766,13 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
sctx->cs_user_data[1] = src_box->y | (dsty << 16);
sctx->cs_user_data[2] = src_box->z | (dstz << 16);
set_work_size(&info, block_x, block_y, block_z,
src_box->width, src_box->height, src_box->depth);
unsigned wg_dim =
set_work_size(&info, block_x, block_y, block_z,
src_box->width, src_box->height, src_box->depth);
void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d];
void **copy_image_cs_ptr = &sctx->cs_copy_image[wg_dim - 1][src_is_1d][dst_is_1d];
if (!*copy_image_cs_ptr)
*copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d);
*copy_image_cs_ptr = si_create_copy_image_cs(sctx, wg_dim, src_is_1d, dst_is_1d);
assert(*copy_image_cs_ptr);

View File

@@ -259,8 +259,10 @@ static void si_destroy_context(struct pipe_context *context)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
if (sctx->cs_copy_image[i][j])
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j]);
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {
if (sctx->cs_copy_image[i][j][k])
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j][k]);
}
}
}
if (sctx->cs_clear_render_target)

View File

@@ -1006,7 +1006,7 @@ struct si_context {
void *cs_clear_buffer;
void *cs_clear_buffer_rmw;
void *cs_copy_buffer;
void *cs_copy_image[2][2]; /* [src_is_1d][dst_is_1d] */
void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
void *cs_clear_render_target;
void *cs_clear_render_target_1d_array;
void *cs_clear_12bytes_buffer;
@@ -1582,7 +1582,8 @@ void si_suspend_queries(struct si_context *sctx);
void si_resume_queries(struct si_context *sctx);
/* si_shaderlib_nir.c */
void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, bool dst_is_1d_array);
void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
bool src_is_1d_array, bool dst_is_1d_array);
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
void *si_create_passthrough_tcs(struct si_context *sctx);

View File

@@ -73,7 +73,8 @@ deref_ssa(nir_builder *b, nir_variable *var)
* It expects the source and destination (x,y,z) coords as user_data_amd,
* packed into 3 SGPRs as 2x16bits per component.
*/
void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, bool dst_is_1d_array)
void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
bool src_is_1d_array, bool dst_is_1d_array)
{
const nir_shader_compiler_options *options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
@@ -87,14 +88,19 @@ void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, boo
b.shader->info.workgroup_size_variable = true;
b.shader->info.cs.user_data_components_amd = 3;
nir_def *ids = get_global_ids(&b, 3);
nir_def *ids = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);
nir_def *coord_src = NULL, *coord_dst = NULL;
unpack_2x16(&b, nir_load_user_data_amd(&b), &coord_src, &coord_dst);
unpack_2x16(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
&coord_src, &coord_dst);
coord_src = nir_iadd(&b, coord_src, ids);
coord_dst = nir_iadd(&b, coord_dst, ids);
/* Coordinates must have 4 channels in NIR. */
coord_src = nir_pad_vector(&b, coord_src, 4);
coord_dst = nir_pad_vector(&b, coord_dst, 4);
static unsigned swizzle_xz[] = {0, 2, 0, 0};
if (src_is_1d_array)