radeonsi: pass at most 3 images and/or shader buffers via user SGPRs for compute
This should slightly decrease shader lifetime.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5209>
This commit is contained in:
@@ -141,6 +141,38 @@ static void si_create_compute_state_async(void *job, int thread_index)
|
||||
program->num_cs_user_data_dwords =
|
||||
sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
|
||||
|
||||
unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
|
||||
(program->reads_variable_block_size ? 3 : 0) +
|
||||
program->num_cs_user_data_dwords;
|
||||
|
||||
/* Fast path for compute shaders - some descriptors passed via user SGPRs. */
|
||||
/* Shader buffers in user SGPRs. */
|
||||
for (unsigned i = 0; i < 3 && user_sgprs <= 12 && sel->info.shader_buffers_declared & (1 << i); i++) {
|
||||
user_sgprs = align(user_sgprs, 4);
|
||||
if (i == 0)
|
||||
sel->cs_shaderbufs_sgpr_index = user_sgprs;
|
||||
user_sgprs += 4;
|
||||
sel->cs_num_shaderbufs_in_user_sgprs++;
|
||||
}
|
||||
|
||||
/* Images in user SGPRs. */
|
||||
unsigned non_msaa_images = sel->info.images_declared & ~sel->info.msaa_images_declared;
|
||||
|
||||
for (unsigned i = 0; i < 3 && non_msaa_images & (1 << i); i++) {
|
||||
unsigned num_sgprs = sel->info.image_buffers & (1 << i) ? 4 : 8;
|
||||
|
||||
if (align(user_sgprs, num_sgprs) + num_sgprs > 16)
|
||||
break;
|
||||
|
||||
user_sgprs = align(user_sgprs, num_sgprs);
|
||||
if (i == 0)
|
||||
sel->cs_images_sgpr_index = user_sgprs;
|
||||
user_sgprs += num_sgprs;
|
||||
sel->cs_num_images_in_user_sgprs++;
|
||||
}
|
||||
sel->cs_images_num_sgprs = user_sgprs - sel->cs_images_sgpr_index;
|
||||
assert(user_sgprs <= 16);
|
||||
|
||||
unsigned char ir_sha1_cache_key[20];
|
||||
si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key);
|
||||
|
||||
@@ -164,9 +196,6 @@ static void si_create_compute_state_async(void *job, int thread_index)
|
||||
}
|
||||
|
||||
bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
|
||||
unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
|
||||
(program->reads_variable_block_size ? 3 : 0) +
|
||||
program->num_cs_user_data_dwords;
|
||||
|
||||
shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) /
|
||||
(sscreen->compute_wave_size == 32 ? 8 : 4)) |
|
||||
@@ -275,6 +304,9 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state)
|
||||
sel->active_const_and_shader_buffers);
|
||||
si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
|
||||
sel->active_samplers_and_images);
|
||||
|
||||
sctx->compute_shaderbuf_sgprs_dirty = true;
|
||||
sctx->compute_image_sgprs_dirty = true;
|
||||
}
|
||||
|
||||
static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n,
|
||||
|
@@ -812,6 +812,11 @@ static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_typ
|
||||
si_set_shader_image(ctx, shader, slot, NULL, false);
|
||||
}
|
||||
|
||||
if (shader == PIPE_SHADER_COMPUTE &&
|
||||
ctx->cs_shader_state.program &&
|
||||
start_slot < ctx->cs_shader_state.program->sel.cs_num_images_in_user_sgprs)
|
||||
ctx->compute_image_sgprs_dirty = true;
|
||||
|
||||
si_update_shader_needs_decompress_mask(ctx, shader);
|
||||
}
|
||||
|
||||
@@ -1338,6 +1343,11 @@ static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_typ
|
||||
|
||||
assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
|
||||
|
||||
if (shader == PIPE_SHADER_COMPUTE &&
|
||||
sctx->cs_shader_state.program &&
|
||||
start_slot < sctx->cs_shader_state.program->sel.cs_num_shaderbufs_in_user_sgprs)
|
||||
sctx->compute_shaderbuf_sgprs_dirty = true;
|
||||
|
||||
for (i = 0; i < count; ++i) {
|
||||
const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
|
||||
unsigned slot = si_get_shaderbuf_slot(start_slot + i);
|
||||
@@ -1939,6 +1949,8 @@ void si_shader_pointers_mark_dirty(struct si_context *sctx)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
|
||||
sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
|
||||
sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
|
||||
sctx->compute_shaderbuf_sgprs_dirty = true;
|
||||
sctx->compute_image_sgprs_dirty = true;
|
||||
}
|
||||
|
||||
/* Set a base register address for user data constants in the given shader.
|
||||
@@ -2137,6 +2149,8 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
|
||||
|
||||
void si_emit_compute_shader_pointers(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_shader_selector *shader = &sctx->cs_shader_state.program->sel;
|
||||
unsigned base = R_00B900_COMPUTE_USER_DATA_0;
|
||||
|
||||
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
|
||||
@@ -2147,6 +2161,46 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
|
||||
si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
|
||||
sctx->compute_bindless_pointer_dirty = false;
|
||||
}
|
||||
|
||||
/* Set shader buffer descriptors in user SGPRs. */
|
||||
unsigned num_shaderbufs = shader->cs_num_shaderbufs_in_user_sgprs;
|
||||
if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) {
|
||||
struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE);
|
||||
|
||||
si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 +
|
||||
shader->cs_shaderbufs_sgpr_index * 4,
|
||||
num_shaderbufs * 4);
|
||||
|
||||
for (unsigned i = 0; i < num_shaderbufs; i++)
|
||||
radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4);
|
||||
|
||||
sctx->compute_shaderbuf_sgprs_dirty = false;
|
||||
}
|
||||
|
||||
/* Set image descriptors in user SGPRs. */
|
||||
unsigned num_images = shader->cs_num_images_in_user_sgprs;
|
||||
if (num_images && sctx->compute_image_sgprs_dirty) {
|
||||
struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE);
|
||||
|
||||
si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 +
|
||||
shader->cs_images_sgpr_index * 4,
|
||||
shader->cs_images_num_sgprs);
|
||||
|
||||
for (unsigned i = 0; i < num_images; i++) {
|
||||
unsigned desc_offset = si_get_image_slot(i) * 8;
|
||||
unsigned num_sgprs = 8;
|
||||
|
||||
/* Image buffers are in desc[4..7]. */
|
||||
if (shader->info.image_buffers & (1 << i)) {
|
||||
desc_offset += 4;
|
||||
num_sgprs = 4;
|
||||
}
|
||||
|
||||
radeon_emit_array(cs, &desc->list[desc_offset], num_sgprs);
|
||||
}
|
||||
|
||||
sctx->compute_image_sgprs_dirty = false;
|
||||
}
|
||||
}
|
||||
|
||||
/* BINDLESS */
|
||||
|
@@ -1030,6 +1030,8 @@ struct si_context {
|
||||
unsigned cs_max_waves_per_sh;
|
||||
bool flatshade;
|
||||
bool do_update_shaders;
|
||||
bool compute_shaderbuf_sgprs_dirty;
|
||||
bool compute_image_sgprs_dirty;
|
||||
|
||||
/* shader descriptors */
|
||||
struct si_descriptors descriptors[SI_NUM_DESCS];
|
||||
|
@@ -698,6 +698,24 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
|
||||
}
|
||||
|
||||
/* Some descriptors can be in user SGPRs. */
|
||||
/* Shader buffers in user SGPRs. */
|
||||
for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
|
||||
while (ctx->args.num_sgprs_used % 4 != 0)
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
|
||||
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
|
||||
}
|
||||
/* Images in user SGPRs. */
|
||||
for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
|
||||
unsigned num_sgprs = shader->selector->info.image_buffers & (1 << i) ? 4 : 8;
|
||||
|
||||
while (ctx->args.num_sgprs_used % num_sgprs != 0)
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
|
||||
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
|
||||
}
|
||||
|
||||
/* Hardware SGPRs. */
|
||||
for (i = 0; i < 3; i++) {
|
||||
if (shader->selector->info.uses_block_id[i]) {
|
||||
|
@@ -394,6 +394,7 @@ struct si_shader_info {
|
||||
unsigned num_written_clipdistance;
|
||||
|
||||
unsigned images_declared; /**< bitmask of declared images */
|
||||
unsigned image_buffers; /**< bitmask of images that are buffers */
|
||||
unsigned msaa_images_declared; /**< bitmask of declared MSAA images */
|
||||
unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */
|
||||
|
||||
@@ -439,6 +440,11 @@ struct si_shader_selector {
|
||||
bool vs_needs_prolog;
|
||||
bool prim_discard_cs_allowed;
|
||||
bool ngg_culling_allowed;
|
||||
ubyte cs_shaderbufs_sgpr_index;
|
||||
ubyte cs_num_shaderbufs_in_user_sgprs;
|
||||
ubyte cs_images_sgpr_index;
|
||||
ubyte cs_images_num_sgprs;
|
||||
ubyte cs_num_images_in_user_sgprs;
|
||||
unsigned num_vs_inputs;
|
||||
unsigned num_vbos_in_user_sgprs;
|
||||
unsigned pa_cl_vs_out_cntl;
|
||||
|
@@ -171,6 +171,8 @@ struct si_shader_context {
|
||||
/* CS */
|
||||
struct ac_arg block_size;
|
||||
struct ac_arg cs_user_data;
|
||||
struct ac_arg cs_shaderbuf[3];
|
||||
struct ac_arg cs_image[3];
|
||||
|
||||
struct ac_llvm_compiler *compiler;
|
||||
|
||||
|
@@ -107,6 +107,12 @@ static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
|
||||
static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
|
||||
{
|
||||
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
|
||||
|
||||
/* Fast path if the shader buffer is in user SGPRs. */
|
||||
if (LLVMIsConstant(index) &&
|
||||
LLVMConstIntGetZExtValue(index) < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs)
|
||||
return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[LLVMConstIntGetZExtValue(index)]);
|
||||
|
||||
LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
|
||||
|
||||
index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
|
||||
@@ -270,6 +276,12 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned
|
||||
}
|
||||
|
||||
if (image) {
|
||||
/* Fast path if the image is in user SGPRs. */
|
||||
if (!dynamic_index &&
|
||||
const_index < ctx->shader->selector->cs_num_images_in_user_sgprs &&
|
||||
(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER))
|
||||
return ac_get_arg(&ctx->ac, ctx->cs_image[const_index]);
|
||||
|
||||
/* FMASKs are separate from images. */
|
||||
if (desc_type == AC_DESC_FMASK) {
|
||||
index =
|
||||
|
@@ -717,6 +717,7 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
|
||||
info->const_buffers_declared = u_bit_consecutive(0, nir->info.num_ubos);
|
||||
info->images_declared = u_bit_consecutive(0, nir->info.num_images);
|
||||
info->msaa_images_declared = nir->info.msaa_images;
|
||||
info->image_buffers = nir->info.image_buffers;
|
||||
info->samplers_declared = nir->info.textures_used;
|
||||
|
||||
info->num_written_clipdistance = nir->info.clip_distance_array_size;
|
||||
|
Reference in New Issue
Block a user