radeonsi: don't use threadID.yz/blockID.yz for copy_image if those are always 0
This can improve performance because fewer VGPRs and SGPRs need to be initialized. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
This commit is contained in:
@@ -537,7 +537,7 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
static unsigned
|
||||
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
|
||||
unsigned work_x, unsigned work_y, unsigned work_z)
|
||||
{
|
||||
@@ -550,6 +550,8 @@ set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, u
|
||||
info->last_block[i] = work[i] % info->block[i];
|
||||
info->grid[i] = DIV_ROUND_UP(work[i], info->block[i]);
|
||||
}
|
||||
|
||||
return work_z > 1 ? 3 : (work_y > 1 ? 2 : 1);
|
||||
}
|
||||
|
||||
static void si_launch_grid_internal_images(struct si_context *sctx,
|
||||
@@ -764,12 +766,13 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
|
||||
sctx->cs_user_data[1] = src_box->y | (dsty << 16);
|
||||
sctx->cs_user_data[2] = src_box->z | (dstz << 16);
|
||||
|
||||
set_work_size(&info, block_x, block_y, block_z,
|
||||
src_box->width, src_box->height, src_box->depth);
|
||||
unsigned wg_dim =
|
||||
set_work_size(&info, block_x, block_y, block_z,
|
||||
src_box->width, src_box->height, src_box->depth);
|
||||
|
||||
void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d];
|
||||
void **copy_image_cs_ptr = &sctx->cs_copy_image[wg_dim - 1][src_is_1d][dst_is_1d];
|
||||
if (!*copy_image_cs_ptr)
|
||||
*copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d);
|
||||
*copy_image_cs_ptr = si_create_copy_image_cs(sctx, wg_dim, src_is_1d, dst_is_1d);
|
||||
|
||||
assert(*copy_image_cs_ptr);
|
||||
|
||||
|
@@ -259,8 +259,10 @@ static void si_destroy_context(struct pipe_context *context)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
|
||||
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
|
||||
if (sctx->cs_copy_image[i][j])
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j]);
|
||||
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {
|
||||
if (sctx->cs_copy_image[i][j][k])
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j][k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (sctx->cs_clear_render_target)
|
||||
|
@@ -1006,7 +1006,7 @@ struct si_context {
|
||||
void *cs_clear_buffer;
|
||||
void *cs_clear_buffer_rmw;
|
||||
void *cs_copy_buffer;
|
||||
void *cs_copy_image[2][2]; /* [src_is_1d][dst_is_1d] */
|
||||
void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
|
||||
void *cs_clear_render_target;
|
||||
void *cs_clear_render_target_1d_array;
|
||||
void *cs_clear_12bytes_buffer;
|
||||
@@ -1582,7 +1582,8 @@ void si_suspend_queries(struct si_context *sctx);
|
||||
void si_resume_queries(struct si_context *sctx);
|
||||
|
||||
/* si_shaderlib_nir.c */
|
||||
void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, bool dst_is_1d_array);
|
||||
void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
|
||||
bool src_is_1d_array, bool dst_is_1d_array);
|
||||
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
|
||||
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
|
||||
void *si_create_passthrough_tcs(struct si_context *sctx);
|
||||
|
@@ -73,7 +73,8 @@ deref_ssa(nir_builder *b, nir_variable *var)
|
||||
* It expects the source and destination (x,y,z) coords as user_data_amd,
|
||||
* packed into 3 SGPRs as 2x16bits per component.
|
||||
*/
|
||||
void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, bool dst_is_1d_array)
|
||||
void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
|
||||
bool src_is_1d_array, bool dst_is_1d_array)
|
||||
{
|
||||
const nir_shader_compiler_options *options =
|
||||
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
|
||||
@@ -87,14 +88,19 @@ void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, boo
|
||||
b.shader->info.workgroup_size_variable = true;
|
||||
|
||||
b.shader->info.cs.user_data_components_amd = 3;
|
||||
nir_def *ids = get_global_ids(&b, 3);
|
||||
nir_def *ids = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);
|
||||
|
||||
nir_def *coord_src = NULL, *coord_dst = NULL;
|
||||
unpack_2x16(&b, nir_load_user_data_amd(&b), &coord_src, &coord_dst);
|
||||
unpack_2x16(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
|
||||
&coord_src, &coord_dst);
|
||||
|
||||
coord_src = nir_iadd(&b, coord_src, ids);
|
||||
coord_dst = nir_iadd(&b, coord_dst, ids);
|
||||
|
||||
/* Coordinates must have 4 channels in NIR. */
|
||||
coord_src = nir_pad_vector(&b, coord_src, 4);
|
||||
coord_dst = nir_pad_vector(&b, coord_dst, 4);
|
||||
|
||||
static unsigned swizzle_xz[] = {0, 2, 0, 0};
|
||||
|
||||
if (src_is_1d_array)
|
||||
|
Reference in New Issue
Block a user