radv: add support for push constants inlining when possible
This removes some scalar loads from shaders, but it increases the number of SET_SH_REG packets. This is currently basic but it could be improved if needed. Inlining dynamic offsets might also help.

Original idea from Dave Airlie.

29077 shaders in 15096 tests
Totals:
SGPRS: 1321325 -> 1357101 (2.71 %)
VGPRS: 936000 -> 932576 (-0.37 %)
Spilled SGPRs: 24804 -> 24791 (-0.05 %)
Code Size: 49827960 -> 49642232 (-0.37 %) bytes
Max Waves: 242007 -> 242700 (0.29 %)

Totals from affected shaders:
SGPRS: 290989 -> 326765 (12.29 %)
VGPRS: 244680 -> 241256 (-1.40 %)
Spilled SGPRs: 1442 -> 1429 (-0.90 %)
Code Size: 8126688 -> 7940960 (-2.29 %) bytes
Max Waves: 80952 -> 81645 (0.86 %)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
This commit is contained in:
@@ -1392,10 +1392,31 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
|
|||||||
nir_intrinsic_instr *instr)
|
nir_intrinsic_instr *instr)
|
||||||
{
|
{
|
||||||
LLVMValueRef ptr, addr;
|
LLVMValueRef ptr, addr;
|
||||||
|
LLVMValueRef src0 = get_src(ctx, instr->src[0]);
|
||||||
|
unsigned index = nir_intrinsic_base(instr);
|
||||||
|
|
||||||
addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
|
addr = LLVMConstInt(ctx->ac.i32, index, 0);
|
||||||
addr = LLVMBuildAdd(ctx->ac.builder, addr,
|
addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
|
||||||
get_src(ctx, instr->src[0]), "");
|
|
||||||
|
/* Load constant values from user SGPRS when possible, otherwise
|
||||||
|
* fallback to the default path that loads directly from memory.
|
||||||
|
*/
|
||||||
|
if (LLVMIsConstant(src0) &&
|
||||||
|
instr->dest.ssa.bit_size == 32) {
|
||||||
|
unsigned count = instr->dest.ssa.num_components;
|
||||||
|
unsigned offset = index;
|
||||||
|
|
||||||
|
offset += LLVMConstIntGetZExtValue(src0);
|
||||||
|
offset /= 4;
|
||||||
|
|
||||||
|
offset -= ctx->abi->base_inline_push_consts;
|
||||||
|
|
||||||
|
if (offset + count <= ctx->abi->num_inline_push_consts) {
|
||||||
|
return ac_build_gather_values(&ctx->ac,
|
||||||
|
ctx->abi->inline_push_consts + offset,
|
||||||
|
count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
|
ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
|
||||||
|
|
||||||
|
@@ -32,6 +32,8 @@ struct nir_variable;
|
|||||||
|
|
||||||
#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
|
#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
|
||||||
|
|
||||||
|
#define AC_MAX_INLINE_PUSH_CONSTS 8
|
||||||
|
|
||||||
enum ac_descriptor_type {
|
enum ac_descriptor_type {
|
||||||
AC_DESC_IMAGE,
|
AC_DESC_IMAGE,
|
||||||
AC_DESC_FMASK,
|
AC_DESC_FMASK,
|
||||||
@@ -66,6 +68,9 @@ struct ac_shader_abi {
|
|||||||
|
|
||||||
/* Vulkan only */
|
/* Vulkan only */
|
||||||
LLVMValueRef push_constants;
|
LLVMValueRef push_constants;
|
||||||
|
LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
|
||||||
|
unsigned num_inline_push_consts;
|
||||||
|
unsigned base_inline_push_consts;
|
||||||
LLVMValueRef view_index;
|
LLVMValueRef view_index;
|
||||||
|
|
||||||
LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
|
LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
|
||||||
|
@@ -628,6 +628,23 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Write `count` 32-bit values from `values` into the user SGPR range that
 * radv_lookup_user_sgpr() reports for user-data slot `idx` of `stage`.
 *
 * Nothing is emitted when the looked-up location has sgpr_idx == -1.
 * Otherwise the values are emitted to cmd_buffer->cs starting at register
 * pipeline->user_data_0[stage] + sgpr_idx * 4.
 */
static void
|
||||||
|
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
|
||||||
|
struct radv_pipeline *pipeline,
|
||||||
|
gl_shader_stage stage,
|
||||||
|
int idx, int count, uint32_t *values)
|
||||||
|
{
|
||||||
|
struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
|
||||||
|
uint32_t base_reg = pipeline->user_data_0[stage];
|
||||||
|
/* sgpr_idx == -1: no location recorded for this slot (presumably the stage does not use it), so skip. */
if (loc->sgpr_idx == -1)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* The caller's count must match the number of SGPRs recorded for the slot. */
assert(loc->num_sgprs == count);
|
||||||
|
|
||||||
|
/* Register address is base_reg plus sgpr_idx * 4 (presumably 4 bytes per SGPR). */
radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
|
||||||
|
radeon_emit_array(cmd_buffer->cs, values, count);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
|
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
|
||||||
struct radv_pipeline *pipeline)
|
struct radv_pipeline *pipeline)
|
||||||
@@ -1901,6 +1918,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
|
|||||||
radv_get_descriptors_state(cmd_buffer, bind_point);
|
radv_get_descriptors_state(cmd_buffer, bind_point);
|
||||||
struct radv_pipeline_layout *layout = pipeline->layout;
|
struct radv_pipeline_layout *layout = pipeline->layout;
|
||||||
struct radv_shader_variant *shader, *prev_shader;
|
struct radv_shader_variant *shader, *prev_shader;
|
||||||
|
bool need_push_constants = false;
|
||||||
unsigned offset;
|
unsigned offset;
|
||||||
void *ptr;
|
void *ptr;
|
||||||
uint64_t va;
|
uint64_t va;
|
||||||
@@ -1910,37 +1928,56 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
|
|||||||
(!layout->push_constant_size && !layout->dynamic_offset_count))
|
(!layout->push_constant_size && !layout->dynamic_offset_count))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
|
|
||||||
16 * layout->dynamic_offset_count,
|
|
||||||
256, &offset, &ptr))
|
|
||||||
return;
|
|
||||||
|
|
||||||
memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
|
|
||||||
memcpy((char*)ptr + layout->push_constant_size,
|
|
||||||
descriptors_state->dynamic_buffers,
|
|
||||||
16 * layout->dynamic_offset_count);
|
|
||||||
|
|
||||||
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
|
|
||||||
va += offset;
|
|
||||||
|
|
||||||
MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
|
|
||||||
cmd_buffer->cs, MESA_SHADER_STAGES * 4);
|
|
||||||
|
|
||||||
prev_shader = NULL;
|
|
||||||
radv_foreach_stage(stage, stages) {
|
radv_foreach_stage(stage, stages) {
|
||||||
shader = radv_get_shader(pipeline, stage);
|
if (!pipeline->shaders[stage])
|
||||||
|
continue;
|
||||||
|
|
||||||
/* Avoid redundantly emitting the address for merged stages. */
|
need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
|
||||||
if (shader && shader != prev_shader) {
|
need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
|
||||||
radv_emit_userdata_address(cmd_buffer, pipeline, stage,
|
|
||||||
AC_UD_PUSH_CONSTANTS, va);
|
|
||||||
|
|
||||||
prev_shader = shader;
|
uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts;
|
||||||
|
uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;
|
||||||
|
|
||||||
|
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
|
||||||
|
AC_UD_INLINE_PUSH_CONSTANTS,
|
||||||
|
count,
|
||||||
|
(uint32_t *)&cmd_buffer->push_constants[base * 4]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (need_push_constants) {
|
||||||
|
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
|
||||||
|
16 * layout->dynamic_offset_count,
|
||||||
|
256, &offset, &ptr))
|
||||||
|
return;
|
||||||
|
|
||||||
|
memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
|
||||||
|
memcpy((char*)ptr + layout->push_constant_size,
|
||||||
|
descriptors_state->dynamic_buffers,
|
||||||
|
16 * layout->dynamic_offset_count);
|
||||||
|
|
||||||
|
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
|
||||||
|
va += offset;
|
||||||
|
|
||||||
|
MAYBE_UNUSED unsigned cdw_max =
|
||||||
|
radeon_check_space(cmd_buffer->device->ws,
|
||||||
|
cmd_buffer->cs, MESA_SHADER_STAGES * 4);
|
||||||
|
|
||||||
|
prev_shader = NULL;
|
||||||
|
radv_foreach_stage(stage, stages) {
|
||||||
|
shader = radv_get_shader(pipeline, stage);
|
||||||
|
|
||||||
|
/* Avoid redundantly emitting the address for merged stages. */
|
||||||
|
if (shader && shader != prev_shader) {
|
||||||
|
radv_emit_userdata_address(cmd_buffer, pipeline, stage,
|
||||||
|
AC_UD_PUSH_CONSTANTS, va);
|
||||||
|
|
||||||
|
prev_shader = shader;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
assert(cmd_buffer->cs->cdw <= cdw_max);
|
||||||
}
|
}
|
||||||
|
|
||||||
cmd_buffer->push_constant_stages &= ~stages;
|
cmd_buffer->push_constant_stages &= ~stages;
|
||||||
assert(cmd_buffer->cs->cdw <= cdw_max);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@@ -627,6 +627,50 @@ count_vs_user_sgprs(struct radv_shader_context *ctx)
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Decide how many push constants of the current shader can be passed
 * through the user SGPRs still available in `user_sgpr_info`.
 *
 * Results are stored in ctx->shader_info->info:
 *  - num_inline_push_consts: min(used range / 4, remaining_sgprs,
 *    AC_MAX_INLINE_PUSH_CONSTS).
 *  - base_inline_push_consts: min_push_constant_used / 4.
 *  - loads_push_constants: cleared when the whole used range is inlined
 *    and the shader does not load dynamic offsets.
 *
 * The function returns early, leaving those fields untouched, when the
 * shader uses no push constants, uses indirect push constants, or uses
 * push constants that are not all 32-bit.
 */
static void allocate_inline_push_consts(struct radv_shader_context *ctx,
|
||||||
|
struct user_sgpr_info *user_sgpr_info)
|
||||||
|
{
|
||||||
|
uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
|
||||||
|
|
||||||
|
/* Only supported if shaders use push constants. */
|
||||||
|
if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Only supported if shaders don't have indirect push constants. */
|
||||||
|
if (ctx->shader_info->info.has_indirect_push_constants)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Only supported for 32-bit push constants. */
|
||||||
|
if (!ctx->shader_info->info.has_only_32bit_push_constants)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Size of the used range in units of 4 (presumably bytes -> dwords; confirm against the info pass). */
uint8_t num_push_consts =
|
||||||
|
(ctx->shader_info->info.max_push_constant_used -
|
||||||
|
ctx->shader_info->info.min_push_constant_used) / 4;
|
||||||
|
|
||||||
|
/* Check if the number of user SGPRs is large enough. */
|
||||||
|
if (num_push_consts < remaining_sgprs) {
|
||||||
|
ctx->shader_info->info.num_inline_push_consts = num_push_consts;
|
||||||
|
} else {
|
||||||
|
ctx->shader_info->info.num_inline_push_consts = remaining_sgprs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Clamp to the maximum number of allowed inlined push constants. */
|
||||||
|
if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
|
||||||
|
ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
|
||||||
|
|
||||||
|
if (ctx->shader_info->info.num_inline_push_consts == num_push_consts &&
|
||||||
|
!ctx->shader_info->info.loads_dynamic_offsets) {
|
||||||
|
/* Disable the default push constants path if all constants are
|
||||||
|
* inlined and if shaders don't use dynamic descriptors.
|
||||||
|
*/
|
||||||
|
ctx->shader_info->info.loads_push_constants = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Index of the first inlined constant, in the same units of 4 as num_push_consts. */
ctx->shader_info->info.base_inline_push_consts =
|
||||||
|
ctx->shader_info->info.min_push_constant_used / 4;
|
||||||
|
}
|
||||||
|
|
||||||
static void allocate_user_sgprs(struct radv_shader_context *ctx,
|
static void allocate_user_sgprs(struct radv_shader_context *ctx,
|
||||||
gl_shader_stage stage,
|
gl_shader_stage stage,
|
||||||
bool has_previous_stage,
|
bool has_previous_stage,
|
||||||
@@ -706,6 +750,8 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
|
|||||||
} else {
|
} else {
|
||||||
user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
|
user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
allocate_inline_push_consts(ctx, user_sgpr_info);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@@ -735,6 +781,13 @@ declare_global_input_sgprs(struct radv_shader_context *ctx,
|
|||||||
add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
|
add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) {
|
||||||
|
add_arg(args, ARG_SGPR, ctx->ac.i32,
|
||||||
|
&ctx->abi.inline_push_consts[i]);
|
||||||
|
}
|
||||||
|
ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts;
|
||||||
|
ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts;
|
||||||
|
|
||||||
if (ctx->shader_info->info.so.num_outputs) {
|
if (ctx->shader_info->info.so.num_outputs) {
|
||||||
add_arg(args, ARG_SGPR,
|
add_arg(args, ARG_SGPR,
|
||||||
ac_array_in_const32_addr_space(ctx->ac.v4i32),
|
ac_array_in_const32_addr_space(ctx->ac.v4i32),
|
||||||
@@ -853,6 +906,11 @@ set_global_input_locs(struct radv_shader_context *ctx,
|
|||||||
set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
|
set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx->shader_info->info.num_inline_push_consts) {
|
||||||
|
set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
|
||||||
|
ctx->shader_info->info.num_inline_push_consts);
|
||||||
|
}
|
||||||
|
|
||||||
if (ctx->streamout_buffers) {
|
if (ctx->streamout_buffers) {
|
||||||
set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
|
set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
|
||||||
user_sgpr_idx);
|
user_sgpr_idx);
|
||||||
|
@@ -129,10 +129,11 @@ struct radv_nir_compiler_options {
|
|||||||
enum radv_ud_index {
|
enum radv_ud_index {
|
||||||
AC_UD_SCRATCH_RING_OFFSETS = 0,
|
AC_UD_SCRATCH_RING_OFFSETS = 0,
|
||||||
AC_UD_PUSH_CONSTANTS = 1,
|
AC_UD_PUSH_CONSTANTS = 1,
|
||||||
AC_UD_INDIRECT_DESCRIPTOR_SETS = 2,
|
AC_UD_INLINE_PUSH_CONSTANTS = 2,
|
||||||
AC_UD_VIEW_INDEX = 3,
|
AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
|
||||||
AC_UD_STREAMOUT_BUFFERS = 4,
|
AC_UD_VIEW_INDEX = 4,
|
||||||
AC_UD_SHADER_START = 5,
|
AC_UD_STREAMOUT_BUFFERS = 5,
|
||||||
|
AC_UD_SHADER_START = 6,
|
||||||
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
|
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
|
||||||
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
|
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
|
||||||
AC_UD_VS_MAX_UD,
|
AC_UD_VS_MAX_UD,
|
||||||
@@ -167,6 +168,8 @@ struct radv_shader_info {
|
|||||||
uint8_t max_push_constant_used;
|
uint8_t max_push_constant_used;
|
||||||
bool has_only_32bit_push_constants;
|
bool has_only_32bit_push_constants;
|
||||||
bool has_indirect_push_constants;
|
bool has_indirect_push_constants;
|
||||||
|
uint8_t num_inline_push_consts;
|
||||||
|
uint8_t base_inline_push_consts;
|
||||||
uint32_t desc_set_used_mask;
|
uint32_t desc_set_used_mask;
|
||||||
bool needs_multiview_view_index;
|
bool needs_multiview_view_index;
|
||||||
bool uses_invocation_id;
|
bool uses_invocation_id;
|
||||||
|
Reference in New Issue
Block a user