radv: fix dynamic RT stack size with VGPR spilling
VGPR spilling might cause VGPRs to be spilled at scratch offset 0, so the dynamic RT callable stack can't start there.

fossil-db (Sienna Cichlid, Q2RTX and Control):
Totals from 4 (0.26% of 1524) affected shaders:
Instrs: 8734 -> 8737 (+0.03%)
CodeSize: 48492 -> 48504 (+0.02%)
Latency: 384375 -> 384369 (-0.00%)
InvThroughput: 256250 -> 256246 (-0.00%)
Copies: 1312 -> 1313 (+0.08%)
Branches: 256 -> 258 (+0.78%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18541>
This commit is contained in:
@@ -152,6 +152,7 @@ struct ac_shader_args {
|
||||
struct ac_arg sbt_descriptors;
|
||||
struct ac_arg ray_launch_size_addr;
|
||||
struct ac_arg force_vrs_rates;
|
||||
struct ac_arg rt_dynamic_callable_stack_base;
|
||||
};
|
||||
|
||||
void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers,
|
||||
|
@@ -9167,6 +9167,10 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
|
||||
case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
|
||||
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
|
||||
get_arg(ctx, ctx->args->ac.rt_dynamic_callable_stack_base));
|
||||
break;
|
||||
case nir_intrinsic_overwrite_vs_arguments_amd: {
|
||||
ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
|
||||
ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
|
||||
|
@@ -8559,6 +8559,14 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCom
|
||||
base_reg + size_loc->sgpr_idx * 4, launch_size_va, true);
|
||||
}
|
||||
|
||||
struct radv_userdata_info *base_loc = radv_lookup_user_sgpr(
|
||||
&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE);
|
||||
if (base_loc->sgpr_idx != -1) {
|
||||
struct radv_shader_info *cs_info = &pipeline->base.shaders[MESA_SHADER_COMPUTE]->info;
|
||||
radeon_set_sh_reg(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + base_loc->sgpr_idx * 4,
|
||||
pipeline->base.scratch_bytes_per_wave / cs_info->wave_size);
|
||||
}
|
||||
|
||||
radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
|
||||
}
|
||||
|
||||
|
@@ -1769,7 +1769,10 @@ create_rt_shader(struct radv_device *device, const VkRayTracingPipelineCreateInf
|
||||
|
||||
struct rt_variables vars = create_rt_variables(b.shader, pCreateInfo, stack_sizes);
|
||||
load_sbt_entry(&b, &vars, nir_imm_int(&b, 0), SBT_RAYGEN, 0);
|
||||
nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
|
||||
if (radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo))
|
||||
nir_store_var(&b, vars.stack_ptr, nir_load_rt_dynamic_callable_stack_base_amd(&b), 0x1);
|
||||
else
|
||||
nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
|
||||
|
||||
nir_store_var(&b, vars.main_loop_case_visited, nir_imm_bool(&b, true), 1);
|
||||
|
||||
|
@@ -153,6 +153,7 @@ enum radv_ud_index {
|
||||
AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
|
||||
AC_UD_CS_SBT_DESCRIPTORS,
|
||||
AC_UD_CS_RAY_LAUNCH_SIZE_ADDR,
|
||||
AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE,
|
||||
AC_UD_CS_TASK_RING_OFFSETS,
|
||||
AC_UD_CS_TASK_DRAW_ID,
|
||||
AC_UD_CS_TASK_IB,
|
||||
@@ -345,6 +346,7 @@ struct radv_shader_info {
|
||||
|
||||
bool uses_sbt;
|
||||
bool uses_ray_launch_size;
|
||||
bool uses_dynamic_rt_callable_stack;
|
||||
} cs;
|
||||
struct {
|
||||
uint64_t tes_inputs_read;
|
||||
|
@@ -189,6 +189,8 @@ allocate_user_sgprs(enum amd_gfx_level gfx_level, const struct radv_shader_info
|
||||
user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2;
|
||||
if (info->cs.uses_ray_launch_size)
|
||||
user_sgpr_count += 2;
|
||||
if (info->cs.uses_dynamic_rt_callable_stack)
|
||||
user_sgpr_count += 1;
|
||||
if (info->vs.needs_draw_id)
|
||||
user_sgpr_count += 1;
|
||||
if (stage == MESA_SHADER_TASK)
|
||||
@@ -605,6 +607,11 @@ radv_declare_shader_args(enum amd_gfx_level gfx_level, const struct radv_pipelin
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.ray_launch_size_addr);
|
||||
}
|
||||
|
||||
if (info->cs.uses_dynamic_rt_callable_stack) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
|
||||
&args->ac.rt_dynamic_callable_stack_base);
|
||||
}
|
||||
|
||||
if (info->vs.needs_draw_id) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
|
||||
}
|
||||
@@ -872,6 +879,9 @@ radv_declare_shader_args(enum amd_gfx_level gfx_level, const struct radv_pipelin
|
||||
if (args->ac.ray_launch_size_addr.used) {
|
||||
set_loc_shader_ptr(args, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR, &user_sgpr_idx);
|
||||
}
|
||||
if (args->ac.rt_dynamic_callable_stack_base.used) {
|
||||
set_loc_shader(args, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE, &user_sgpr_idx, 1);
|
||||
}
|
||||
if (args->ac.draw_id.used) {
|
||||
set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1);
|
||||
}
|
||||
|
@@ -210,6 +210,9 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
|
||||
case nir_intrinsic_load_force_vrs_rates_amd:
|
||||
info->force_vrs_per_vertex = true;
|
||||
break;
|
||||
case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
|
||||
info->cs.uses_dynamic_rt_callable_stack = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@@ -173,6 +173,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
|
||||
case nir_intrinsic_load_tess_level_outer_default:
|
||||
case nir_intrinsic_load_scalar_arg_amd:
|
||||
case nir_intrinsic_load_smem_amd:
|
||||
case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
|
||||
case nir_intrinsic_load_global_const_block_intel:
|
||||
case nir_intrinsic_load_reloc_const_intel:
|
||||
case nir_intrinsic_load_global_block_intel:
|
||||
|
@@ -1398,6 +1398,9 @@ system_value("intersection_opaque_amd", 1, bit_sizes=[1])
|
||||
# Used for indirect ray tracing.
|
||||
system_value("ray_launch_size_addr_amd", 1, bit_sizes=[64])
|
||||
|
||||
# Scratch base of callable stack for ray tracing.
|
||||
system_value("rt_dynamic_callable_stack_base_amd", 1)
|
||||
|
||||
# Load forced VRS rates.
|
||||
intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||
|
||||
|
Reference in New Issue
Block a user