From a8abdc0d89386bc48cc63f48598fa6b18a3a7ed1 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Sat, 24 Sep 2022 14:56:06 +0200
Subject: [PATCH] radv: Add traversal backtracking with a short stack.

So we can now work with arbitrarily deep BVHs.

Reviewed-by: Konstantin Seurer
Reviewed-by: Friedrich Vock
Part-of:
---
 src/amd/vulkan/radv_nir_lower_ray_queries.c |  30 ++++-
 src/amd/vulkan/radv_pipeline_rt.c           |  95 ++++++---------
 src/amd/vulkan/radv_rt_common.c             | 122 ++++++++++++++++----
 src/amd/vulkan/radv_rt_common.h             |  35 +++---
 4 files changed, 178 insertions(+), 104 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_lower_ray_queries.c b/src/amd/vulkan/radv_nir_lower_ray_queries.c
index 672766b77fe..6526d528e4d 100644
--- a/src/amd/vulkan/radv_nir_lower_ray_queries.c
+++ b/src/amd/vulkan/radv_nir_lower_ray_queries.c
@@ -31,6 +31,11 @@
 #include "radv_rt_common.h"
 #include "radv_shader.h"
 
+/* Traversal stack size. Traversal supports backtracking so we can go deeper than this size if
+ * needed. However, we keep a large stack size to avoid it being put into registers, which hurts
+ * occupancy. */
+#define MAX_STACK_ENTRY_COUNT 76
+
 typedef struct {
    nir_variable *variable;
    unsigned array_length;
@@ -140,7 +145,11 @@ struct ray_query_traversal_vars {
    rq_variable *bvh_base;
    rq_variable *stack;
    rq_variable *top_stack;
+   rq_variable *stack_base;
    rq_variable *current_node;
+   rq_variable *previous_node;
+   rq_variable *instance_top_node;
+   rq_variable *instance_bottom_node;
 };
 
 struct ray_query_intersection_vars {
@@ -197,9 +206,16 @@ init_ray_query_traversal_vars(nir_shader *shader, nir_function_impl *impl, unsig
       rq_variable_create(shader, impl, array_length, glsl_uint_type(), VAR_NAME("_stack"));
    result.top_stack =
       rq_variable_create(shader, impl, array_length, glsl_uint_type(), VAR_NAME("_top_stack"));
+   result.stack_base =
+      rq_variable_create(shader, impl, array_length, glsl_uint_type(), VAR_NAME("_stack_base"));
    result.current_node =
       rq_variable_create(shader, impl, array_length, glsl_uint_type(), VAR_NAME("_current_node"));
-
+   result.previous_node =
+      rq_variable_create(shader, impl, array_length, glsl_uint_type(), VAR_NAME("_previous_node"));
+   result.instance_top_node = rq_variable_create(shader, impl, array_length, glsl_uint_type(),
+                                                 VAR_NAME("_instance_top_node"));
+   result.instance_bottom_node = rq_variable_create(shader, impl, array_length, glsl_uint_type(),
+                                                    VAR_NAME("_instance_bottom_node"));
    return result;
 }
 
@@ -387,8 +403,13 @@ lower_rq_initialize(nir_builder *b, nir_ssa_def *index, nir_intrinsic_instr *ins
 
       rq_store_var(b, index, vars->trav.stack, nir_imm_int(b, 0), 0x1);
       rq_store_var(b, index, vars->trav.current_node, nir_imm_int(b, RADV_BVH_ROOT_NODE), 0x1);
+      rq_store_var(b, index, vars->trav.previous_node, nir_imm_int(b, -1), 0x1);
+      rq_store_var(b, index, vars->trav.instance_top_node, nir_imm_int(b, -1), 0x1);
+      rq_store_var(b, index, vars->trav.instance_bottom_node,
+                   nir_imm_int(b, RADV_BVH_NO_INSTANCE_ROOT), 0x1);
 
-      rq_store_var(b, index, vars->trav.top_stack, nir_imm_int(b, 0), 1);
+      rq_store_var(b, index, vars->trav.top_stack, nir_imm_int(b, -1), 1);
+      rq_store_var(b, index, vars->trav.stack_base, nir_imm_int(b, 0), 1);
    }
    nir_push_else(b, NULL);
    {
@@ -621,7 +642,11 @@ lower_rq_proceed(nir_builder *b, nir_ssa_def *index, struct ray_query_vars *vars
       .bvh_base = rq_deref_var(b, index, vars->trav.bvh_base),
       .stack = rq_deref_var(b, index, vars->trav.stack),
       .top_stack = rq_deref_var(b, index, vars->trav.top_stack),
+      .stack_base = rq_deref_var(b, index, vars->trav.stack_base),
       .current_node = rq_deref_var(b, index, vars->trav.current_node),
+      .previous_node = rq_deref_var(b, index, vars->trav.previous_node),
+      .instance_top_node = rq_deref_var(b, index, vars->trav.instance_top_node),
+      .instance_bottom_node = rq_deref_var(b, index, vars->trav.instance_bottom_node),
       .instance_id = rq_deref_var(b, index, vars->candidate.instance_id),
       .instance_addr = rq_deref_var(b, index, vars->candidate.instance_addr),
       .custom_instance_and_mask = rq_deref_var(b, index, vars->candidate.custom_instance_and_mask),
@@ -642,6 +667,7 @@ lower_rq_proceed(nir_builder *b, nir_ssa_def *index, struct ray_query_vars *vars
       .dir = rq_load_var(b, index, vars->direction),
       .vars = trav_vars,
       .stack_stride = 1,
+      .stack_entries = MAX_STACK_ENTRY_COUNT,
       .stack_store_cb = store_stack_entry,
       .stack_load_cb = load_stack_entry,
       .aabb_cb = handle_candidate_aabb,
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
index c3be5f81ce6..05194baafe4 100644
--- a/src/amd/vulkan/radv_pipeline_rt.c
+++ b/src/amd/vulkan/radv_pipeline_rt.c
@@ -32,6 +32,10 @@
 #include "nir/nir_builder.h"
 #include "nir/nir_builtin_builder.h"
 
+/* Traversal stack size. This stack is put in LDS and experimentally 16 entries results in best
+ * performance. */
+#define MAX_STACK_ENTRY_COUNT 16
+
 static VkRayTracingPipelineCreateInfoKHR
 radv_create_merged_rt_create_info(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo)
 {
@@ -1029,9 +1033,12 @@ struct rt_traversal_vars {
    nir_variable *hit;
    nir_variable *bvh_base;
    nir_variable *stack;
-   nir_variable *lds_stack_base;
    nir_variable *top_stack;
+   nir_variable *stack_base;
    nir_variable *current_node;
+   nir_variable *previous_node;
+   nir_variable *instance_top_node;
+   nir_variable *instance_bottom_node;
 };
 
 static struct rt_traversal_vars
@@ -1057,12 +1064,18 @@ init_traversal_vars(nir_builder *b)
                                       "traversal_bvh_base");
    ret.stack = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(),
                                    "traversal_stack_ptr");
-   ret.lds_stack_base = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(),
-                                            "traversal_lds_stack_base");
    ret.top_stack = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(),
                                        "traversal_top_stack_ptr");
+   ret.stack_base =
+      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "traversal_stack_base");
    ret.current_node =
      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "current_node;");
+   ret.previous_node =
+      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "previous_node");
+   ret.instance_top_node =
+      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "instance_top_node");
+   ret.instance_bottom_node =
+      nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "instance_bottom_node");
    return ret;
 }
 
@@ -1313,64 +1326,13 @@
 static void
 store_stack_entry(nir_builder *b, nir_ssa_def *index, nir_ssa_def *value,
                   const struct radv_ray_traversal_args *args)
 {
-   index = nir_umod(b, index, nir_imm_int(b, args->stack_stride * MAX_STACK_LDS_ENTRY_COUNT));
    nir_store_shared(b, value, index, .base = 0, .align_mul = 4);
 }
 
 static nir_ssa_def *
 load_stack_entry(nir_builder *b, nir_ssa_def *index, const struct radv_ray_traversal_args *args)
 {
-   nir_variable *ret = nir_local_variable_create(b->impl, glsl_uint_type(), "load_stack_result");
-   struct traversal_data *data = args->data;
-   nir_push_if(b, nir_ilt(b, index, nir_load_var(b, data->trav_vars->lds_stack_base)));
-   {
-      nir_ssa_def *scratch_addr =
-         nir_imul_imm(b, nir_udiv_imm(b, index, args->stack_stride), sizeof(uint32_t));
-      nir_store_var(b, ret, nir_load_scratch(b, 1, 32, scratch_addr), 0x1);
-      nir_store_var(b, data->trav_vars->lds_stack_base, index, 0x1);
-   }
-   nir_push_else(b, NULL);
-   {
-      nir_ssa_def *stack_ptr =
-         nir_umod(b, index, nir_imm_int(b, args->stack_stride * MAX_STACK_LDS_ENTRY_COUNT));
-      nir_store_var(b, ret, nir_load_shared(b, 1, 32, stack_ptr, .base = 0, .align_mul = 4), 0x1);
-   }
-   nir_pop_if(b, NULL);
-
-   return nir_load_var(b, ret);
-}
-
-static void
-check_stack_overflow(nir_builder *b, const struct radv_ray_traversal_args *args)
-{
-   struct traversal_data *data = args->data;
-
-   nir_ssa_def *might_overflow =
-      nir_ige(b,
-              nir_isub(b, nir_load_deref(b, args->vars.stack),
-                       nir_load_var(b, data->trav_vars->lds_stack_base)),
-              nir_imm_int(b, args->stack_stride * (MAX_STACK_LDS_ENTRY_COUNT - 2)));
-   nir_push_if(b, might_overflow);
-   {
-      nir_ssa_def *scratch_addr = nir_imul_imm(
-         b, nir_udiv_imm(b, nir_load_var(b, data->trav_vars->lds_stack_base), args->stack_stride),
-         sizeof(uint32_t));
-      for (int i = 0; i < 4; ++i) {
-         nir_ssa_def *lds_stack_ptr =
-            nir_umod(b, nir_load_var(b, data->trav_vars->lds_stack_base),
-                     nir_imm_int(b, args->stack_stride * MAX_STACK_LDS_ENTRY_COUNT));
-
-         nir_ssa_def *node = nir_load_shared(b, 1, 32, lds_stack_ptr, .base = 0, .align_mul = 4);
-         nir_store_scratch(b, node, scratch_addr);
-
-         nir_store_var(
-            b, data->trav_vars->lds_stack_base,
-            nir_iadd_imm(b, nir_load_var(b, data->trav_vars->lds_stack_base), args->stack_stride),
-            1);
-         scratch_addr = nir_iadd_imm(b, scratch_addr, sizeof(uint32_t));
-      }
-   }
-   nir_pop_if(b, NULL);
+   return nir_load_shared(b, 1, 32, index, .base = 0, .align_mul = 4);
 }
 
 static nir_shader *
@@ -1383,7 +1345,7 @@ build_traversal_shader(struct radv_device *device,
    b.shader->info.workgroup_size[0] = 8;
    b.shader->info.workgroup_size[1] = device->physical_device->rt_wave_size == 64 ? 8 : 4;
    b.shader->info.shared_size =
-      device->physical_device->rt_wave_size * MAX_STACK_LDS_ENTRY_COUNT * sizeof(uint32_t);
+      device->physical_device->rt_wave_size * MAX_STACK_ENTRY_COUNT * sizeof(uint32_t);
 
    struct rt_variables vars = create_rt_variables(b.shader, pCreateInfo, dst_vars->stack_sizes);
    map_rt_variables(var_remap, &vars, dst_vars);
@@ -1414,10 +1376,14 @@ build_traversal_shader(struct radv_device *device,
    nir_store_var(&b, trav_vars.instance_addr, nir_imm_int64(&b, 0), 1);
    nir_store_var(&b, trav_vars.stack,
                  nir_imul_imm(&b, nir_load_local_invocation_index(&b), sizeof(uint32_t)), 1);
-   nir_store_var(&b, trav_vars.lds_stack_base, nir_load_var(&b, trav_vars.stack), 1);
+   nir_store_var(&b, trav_vars.stack_base, nir_load_var(&b, trav_vars.stack), 1);
    nir_store_var(&b, trav_vars.current_node, nir_imm_int(&b, RADV_BVH_ROOT_NODE), 0x1);
+   nir_store_var(&b, trav_vars.previous_node, nir_imm_int(&b, -1), 0x1);
+   nir_store_var(&b, trav_vars.instance_top_node, nir_imm_int(&b, -1), 0x1);
+   nir_store_var(&b, trav_vars.instance_bottom_node, nir_imm_int(&b, RADV_BVH_NO_INSTANCE_ROOT),
+                 0x1);
 
-   nir_store_var(&b, trav_vars.top_stack, nir_imm_int(&b, 0), 1);
+   nir_store_var(&b, trav_vars.top_stack, nir_imm_int(&b, -1), 1);
 
    struct radv_ray_traversal_vars trav_vars_args = {
       .tmax = nir_build_deref_var(&b, vars.tmax),
@@ -1427,7 +1393,11 @@ build_traversal_shader(struct radv_device *device,
       .bvh_base = nir_build_deref_var(&b, trav_vars.bvh_base),
       .stack = nir_build_deref_var(&b, trav_vars.stack),
       .top_stack = nir_build_deref_var(&b, trav_vars.top_stack),
+      .stack_base = nir_build_deref_var(&b, trav_vars.stack_base),
       .current_node = nir_build_deref_var(&b, trav_vars.current_node),
+      .previous_node = nir_build_deref_var(&b, trav_vars.previous_node),
+      .instance_top_node = nir_build_deref_var(&b, trav_vars.instance_top_node),
+      .instance_bottom_node = nir_build_deref_var(&b, trav_vars.instance_bottom_node),
       .instance_id = nir_build_deref_var(&b, trav_vars.instance_id),
       .instance_addr = nir_build_deref_var(&b, trav_vars.instance_addr),
       .custom_instance_and_mask = nir_build_deref_var(&b, trav_vars.custom_instance_and_mask),
@@ -1450,11 +1420,11 @@ build_traversal_shader(struct radv_device *device,
       .dir = nir_load_var(&b, vars.direction),
       .vars = trav_vars_args,
       .stack_stride = device->physical_device->rt_wave_size * sizeof(uint32_t),
+      .stack_entries = MAX_STACK_ENTRY_COUNT,
       .stack_store_cb = store_stack_entry,
       .stack_load_cb = load_stack_entry,
       .aabb_cb = handle_candidate_aabb,
       .triangle_cb = handle_candidate_triangle,
-      .check_stack_overflow_cb = check_stack_overflow,
       .data = &data,
    };
 
@@ -1643,7 +1613,7 @@ create_rt_shader(struct radv_device *device, const VkRayTracingPipelineCreateInf
    if (radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo))
      nir_store_var(&b, vars.stack_ptr, nir_load_rt_dynamic_callable_stack_base_amd(&b), 0x1);
    else
-      nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, MAX_STACK_SCRATCH_ENTRY_COUNT * 4), 0x1);
+      nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
 
    nir_loop *loop = nir_push_loop(&b);
 
@@ -1688,8 +1658,9 @@ create_rt_shader(struct radv_device *device, const VkRayTracingPipelineCreateInf
    nir_pop_loop(&b, loop);
 
-   b.shader->scratch_size = MAX2(16, MAX_STACK_SCRATCH_ENTRY_COUNT * 4);
-   if (!radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo))
+   if (radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo))
+      b.shader->scratch_size = 16; /* To enable scratch. */
+   else
       b.shader->scratch_size += compute_rt_stack_size(pCreateInfo, stack_sizes);
 
    /* Deal with all the inline functions. */
diff --git a/src/amd/vulkan/radv_rt_common.c b/src/amd/vulkan/radv_rt_common.c
index bd303a98836..58ef5933d29 100644
--- a/src/amd/vulkan/radv_rt_common.c
+++ b/src/amd/vulkan/radv_rt_common.c
@@ -522,6 +522,14 @@ insert_traversal_aabb_case(struct radv_device *device, nir_builder *b,
    nir_pop_if(b, NULL);
 }
 
+static nir_ssa_def *
+fetch_parent_node(nir_builder *b, nir_ssa_def *bvh, nir_ssa_def *node)
+{
+   nir_ssa_def *offset = nir_iadd_imm(b, nir_imul_imm(b, nir_udiv_imm(b, node, 8), 4), 4);
+
+   return nir_build_load_global(b, 1, 32, nir_isub(b, bvh, nir_u2u64(b, offset)), .align_mul = 4);
+}
+
 nir_ssa_def *
 radv_build_ray_traversal(struct radv_device *device, nir_builder *b,
                          const struct radv_ray_traversal_args *args)
@@ -538,19 +546,31 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b,
    {
      nir_push_if(b, nir_ieq_imm(b, nir_load_deref(b, args->vars.current_node), -1));
      {
-         nir_push_if(b, nir_ilt(b, nir_load_deref(b, args->vars.stack), nir_imm_int(b, args->stack_stride)));
+         /* Early exit if we never overflowed the stack, to avoid having to backtrack to
+          * the root for no reason. */
+         nir_push_if(b, nir_ilt(b, nir_load_deref(b, args->vars.stack),
+                                nir_imm_int(b, args->stack_stride)));
         {
            nir_store_var(b, incomplete, nir_imm_bool(b, false), 0x1);
            nir_jump(b, nir_jump_break);
         }
         nir_pop_if(b, NULL);
 
+         nir_ssa_def *stack_instance_exit = nir_ige(b, nir_load_deref(b, args->vars.top_stack),
+                                                    nir_load_deref(b, args->vars.stack));
+         nir_ssa_def *root_instance_exit =
+            nir_ieq(b, nir_load_deref(b, args->vars.previous_node),
+                    nir_load_deref(b, args->vars.instance_bottom_node));
         nir_if *instance_exit =
-            nir_push_if(b, nir_uge(b, nir_load_deref(b, args->vars.top_stack),
-                                   nir_load_deref(b, args->vars.stack)));
+            nir_push_if(b, nir_ior(b, stack_instance_exit, root_instance_exit));
         instance_exit->control = nir_selection_control_dont_flatten;
         {
-            nir_store_deref(b, args->vars.top_stack, nir_imm_int(b, 0), 1);
+            nir_store_deref(b, args->vars.top_stack, nir_imm_int(b, -1), 1);
+            nir_store_deref(b, args->vars.previous_node,
+                            nir_load_deref(b, args->vars.instance_top_node), 1);
+            nir_store_deref(b, args->vars.instance_bottom_node,
+                            nir_imm_int(b, RADV_BVH_NO_INSTANCE_ROOT), 1);
+
            nir_store_deref(b, args->vars.bvh_base, args->root_bvh_base, 1);
            nir_store_deref(b, args->vars.origin, args->origin, 7);
            nir_store_deref(b, args->vars.dir, args->dir, 7);
@@ -558,20 +578,47 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b,
         }
         nir_pop_if(b, NULL);
 
-         nir_store_deref(b, args->vars.stack,
-                         nir_iadd_imm(b, nir_load_deref(b, args->vars.stack), -args->stack_stride), 1);
+         nir_push_if(b, nir_ige(b, nir_load_deref(b, args->vars.stack_base),
+                                nir_load_deref(b, args->vars.stack)));
+         {
+            nir_ssa_def *prev = nir_load_deref(b, args->vars.previous_node);
+            nir_ssa_def *bvh_addr =
+               build_node_to_addr(device, b, nir_load_deref(b, args->vars.bvh_base));
 
-         nir_ssa_def *bvh_node =
-            args->stack_load_cb(b, nir_load_deref(b, args->vars.stack), args);
-         nir_store_deref(b, args->vars.current_node, bvh_node, 0x1);
+            nir_ssa_def *parent = fetch_parent_node(b, bvh_addr, prev);
+            nir_push_if(b, nir_ieq(b, parent, nir_imm_int(b, -1)));
+            {
+               nir_store_var(b, incomplete, nir_imm_bool(b, false), 0x1);
+               nir_jump(b, nir_jump_break);
+            }
+            nir_pop_if(b, NULL);
+            nir_store_deref(b, args->vars.current_node, parent, 0x1);
+         }
+         nir_push_else(b, NULL);
+         {
+            nir_store_deref(
+               b, args->vars.stack,
+               nir_iadd_imm(b, nir_load_deref(b, args->vars.stack), -args->stack_stride), 1);
+
+            nir_ssa_def *stack_ptr =
+               nir_umod(b, nir_load_deref(b, args->vars.stack),
+                        nir_imm_int(b, args->stack_stride * args->stack_entries));
+            nir_ssa_def *bvh_node = args->stack_load_cb(b, stack_ptr, args);
+            nir_store_deref(b, args->vars.current_node, bvh_node, 0x1);
+            nir_store_deref(b, args->vars.previous_node, nir_imm_int(b, -1), 0x1);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_push_else(b, NULL);
+      {
+         nir_store_deref(b, args->vars.previous_node, nir_imm_int(b, -1), 0x1);
      }
      nir_pop_if(b, NULL);
 
-      if (args->check_stack_overflow_cb)
-         args->check_stack_overflow_cb(b, args);
-
      nir_ssa_def *bvh_node = nir_load_deref(b, args->vars.current_node);
+      nir_ssa_def *prev_node = nir_load_deref(b, args->vars.previous_node);
+      nir_store_deref(b, args->vars.previous_node, bvh_node, 0x1);
      nir_store_deref(b, args->vars.current_node, nir_imm_int(b, -1), 0x1);
 
      nir_ssa_def *global_bvh_node =
@@ -625,6 +672,9 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b,
            /* Push the instance root node onto the stack */
            nir_store_deref(b, args->vars.current_node, nir_imm_int(b, RADV_BVH_ROOT_NODE), 0x1);
+            nir_store_deref(b, args->vars.instance_bottom_node,
+                            nir_imm_int(b, RADV_BVH_ROOT_NODE), 1);
+            nir_store_deref(b, args->vars.instance_top_node, bvh_node, 1);
 
            /* Transform the ray into object space */
            nir_store_deref(b, args->vars.origin,
@@ -654,20 +704,46 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b,
                                nir_load_deref(b, args->vars.inv_dir));
            }
 
-            nir_ssa_def *new_nodes[4];
-            for (unsigned i = 0; i < 4; ++i)
-               new_nodes[i] = nir_channel(b, result, i);
+            /* box */
+            nir_push_if(b, nir_ieq_imm(b, prev_node, -1));
+            {
+               nir_ssa_def *new_nodes[4];
+               for (unsigned i = 0; i < 4; ++i)
+                  new_nodes[i] = nir_channel(b, result, i);
 
-            for (unsigned i = 1; i < 4; ++i)
-               nir_push_if(b, nir_ine_imm(b, new_nodes[i], -1));
+               for (unsigned i = 1; i < 4; ++i)
+                  nir_push_if(b, nir_ine_imm(b, new_nodes[i], -1));
 
-            for (unsigned i = 4; i-- > 1;) {
-               nir_ssa_def *stack = nir_load_deref(b, args->vars.stack);
-               args->stack_store_cb(b, stack, new_nodes[i], args);
-               nir_store_deref(b, args->vars.stack, nir_iadd_imm(b, stack, args->stack_stride), 1);
-               nir_pop_if(b, NULL);
+               for (unsigned i = 4; i-- > 1;) {
+                  nir_ssa_def *stack = nir_load_deref(b, args->vars.stack);
+                  nir_ssa_def *stack_ptr = nir_umod(
+                     b, stack, nir_imm_int(b, args->stack_entries * args->stack_stride));
+                  args->stack_store_cb(b, stack_ptr, new_nodes[i], args);
+                  nir_store_deref(b, args->vars.stack,
+                                  nir_iadd_imm(b, stack, args->stack_stride), 1);
+
+                  if (i == 1) {
+                     nir_ssa_def *new_base =
+                        nir_iadd_imm(b, nir_load_deref(b, args->vars.stack),
+                                     -args->stack_entries * args->stack_stride);
+                     new_base = nir_imax(b, nir_load_deref(b, args->vars.stack_base), new_base);
+                     nir_store_deref(b, args->vars.stack_base, new_base, 0x1);
+                  }
+
+                  nir_pop_if(b, NULL);
+               }
+               nir_store_deref(b, args->vars.current_node, new_nodes[0], 0x1);
            }
-            nir_store_deref(b, args->vars.current_node, new_nodes[0], 0x1);
+            nir_push_else(b, NULL);
+            {
+               nir_ssa_def *next = nir_imm_int(b, -1);
+               for (unsigned i = 0; i < 3; ++i) {
+                  next = nir_bcsel(b, nir_ieq(b, prev_node, nir_channel(b, result, i)),
+                                   nir_channel(b, result, i + 1), next);
+               }
+               nir_store_deref(b, args->vars.current_node, next, 0x1);
+            }
+            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
diff --git a/src/amd/vulkan/radv_rt_common.h b/src/amd/vulkan/radv_rt_common.h
index 91aa06d9ae5..accf7c03690 100644
--- a/src/amd/vulkan/radv_rt_common.h
+++ b/src/amd/vulkan/radv_rt_common.h
@@ -59,17 +59,6 @@ nir_ssa_def *hit_is_opaque(nir_builder *b, nir_ssa_def *sbt_offset_and_flags, ni
 
 nir_ssa_def *create_bvh_descriptor(nir_builder *b);
 
-/*
- * A top-level AS can contain 2^24 children and a bottom-level AS can contain 2^24
- * triangles. At a branching factor of 4, that means we may need up to 24 levels of box
- * nodes + 1 triangle node
- * + 1 instance node. Furthermore, when processing a box node, worst case we actually
- * push all 4 children and remove one, so the DFS stack depth is box nodes * 3 + 2.
- */
-#define MAX_STACK_ENTRY_COUNT 76
-#define MAX_STACK_LDS_ENTRY_COUNT 16
-#define MAX_STACK_SCRATCH_ENTRY_COUNT (MAX_STACK_ENTRY_COUNT - MAX_STACK_LDS_ENTRY_COUNT)
-
 struct radv_ray_traversal_args;
 
 struct radv_leaf_intersection {
@@ -101,9 +90,6 @@ typedef void (*radv_rt_stack_store_cb)(nir_builder *b, nir_ssa_def *index, nir_s
 typedef nir_ssa_def *(*radv_rt_stack_load_cb)(nir_builder *b, nir_ssa_def *index,
                                               const struct radv_ray_traversal_args *args);
 
-typedef void (*radv_rt_check_stack_overflow_cb)(nir_builder *b,
-                                                const struct radv_ray_traversal_args *args);
-
 struct radv_ray_traversal_vars {
    /* For each accepted hit, tmax will be set to the t value. This allows for automatic intersection
     * culling.
     */
    nir_deref_instr *tmax;
@@ -119,13 +105,23 @@ struct radv_ray_traversal_vars {
    nir_deref_instr *bvh_base;
 
    /* stack is the current stack pointer/index. top_stack is the pointer/index that marks the end of
-    * traversal for the current BLAS/TLAS.
+    * traversal for the current BLAS/TLAS. stack_base is the low watermark of the short stack.
     */
    nir_deref_instr *stack;
    nir_deref_instr *top_stack;
+   nir_deref_instr *stack_base;
    nir_deref_instr *current_node;
 
+   /* The node visited in the previous iteration. This is used in backtracking to jump to its parent
+    * and then find the child after the previously visited node.
+    */
+   nir_deref_instr *previous_node;
+
+   /* When entering an instance these are the instance node and the root node of the BLAS */
+   nir_deref_instr *instance_top_node;
+   nir_deref_instr *instance_bottom_node;
+
    /* Information about the current instance used for culling. */
    nir_deref_instr *instance_id;
    nir_deref_instr *instance_addr;
@@ -143,12 +139,13 @@ struct radv_ray_traversal_args {
 
    struct radv_ray_traversal_vars vars;
 
-   /* The increment/decrement used for radv_ray_traversal_vars::stack */
+   /* The increment/decrement used for radv_ray_traversal_vars::stack, and how many entries are
+    * available. */
    uint32_t stack_stride;
+   uint32_t stack_entries;
 
    radv_rt_stack_store_cb stack_store_cb;
    radv_rt_stack_load_cb stack_load_cb;
-   radv_rt_check_stack_overflow_cb check_stack_overflow_cb;
 
    radv_aabb_intersection_cb aabb_cb;
    radv_triangle_intersection_cb triangle_cb;
@@ -156,6 +153,10 @@ struct radv_ray_traversal_args {
    void *data;
 };
 
+/* For the initialization of instance_bottom_node. Explicitly different than RADV_BVH_INVALID_NODE
+ * or any real node, to ensure we never exit an instance when we're not in one. */
+#define RADV_BVH_NO_INSTANCE_ROOT 0xfffffffeu
+
 /* Builds the ray traversal loop and returns whether traversal is incomplete, similar to
  * rayQueryProceedEXT. Traversal will only be considered incomplete, if one of the specified
  * callbacks breaks out of the traversal loop.
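
The traversal above treats the stack as a wrap-around window of stack_entries slots:
pushes and pops use stack % (stack_stride * stack_entries), and stack_base records how far
the window may already have been overwritten. Once popping would reach an overwritten
entry (stack_base >= stack), the shader backtracks instead: it fetches the parent of
previous_node via fetch_parent_node() and, when a box node is revisited with
prev_node != -1, continues with the child that follows prev_node. The standalone C model
below mirrors that control flow on the CPU for a toy tree. It is an illustrative sketch
under simplifying assumptions (a made-up node layout with an explicit parent index, a tiny
STACK_ENTRIES, no ray/box tests and no TLAS/BLAS instance transitions), not the driver's
NIR implementation.

/* CPU-side model of the short-stack + backtracking traversal. Visits every leaf of a toy
 * 4-wide tree, recovering entries lost to the wrap-around stack via parent links. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STACK_ENTRIES 2 /* deliberately tiny so the backtracking path is exercised */
#define INVALID       (-1)

struct bvh_node {
   bool leaf;
   int32_t parent;      /* INVALID for the root */
   int32_t children[4]; /* INVALID-padded; only meaningful for box nodes */
};

/* Child that follows 'prev' in the fixed child order; mirrors the nir_bcsel chain used
 * when a box node is revisited after backtracking. */
static int32_t
next_sibling(const struct bvh_node *box, int32_t prev)
{
   for (unsigned i = 0; i < 3; ++i)
      if (box->children[i] == prev)
         return box->children[i + 1];
   return INVALID; /* 'prev' was the last child */
}

static void
traverse(const struct bvh_node *bvh, int32_t root)
{
   int32_t stack[STACK_ENTRIES]; /* wrap-around window of the virtual stack */
   int32_t stack_ptr = 0;        /* virtual stack pointer (like vars.stack) */
   int32_t stack_base = 0;       /* entries below this may have been overwritten */
   int32_t current = root;
   int32_t previous = INVALID;

   while (true) {
      if (current == INVALID) {
         if (stack_ptr == 0)
            return; /* the stack never overflowed and is empty: done */

         if (stack_base >= stack_ptr) {
            /* Short stack ran dry: back up through the parent link. */
            int32_t parent = bvh[previous].parent;
            if (parent == INVALID)
               return; /* backtracked past the root: done */
            current = parent; /* keep 'previous' so we resume after it */
         } else {
            current = stack[--stack_ptr % STACK_ENTRIES];
            previous = INVALID;
         }
      } else {
         previous = INVALID;
      }

      const int32_t node = current;
      const int32_t prev = previous;
      previous = node;
      current = INVALID;

      if (bvh[node].leaf) {
         printf("visit leaf %d\n", (int)node);
         continue;
      }

      if (prev == INVALID) {
         /* First visit of this box node: push children 1..3, descend into child 0. */
         for (int i = 3; i >= 1; --i) {
            if (bvh[node].children[i] != INVALID)
               stack[stack_ptr++ % STACK_ENTRIES] = bvh[node].children[i];
         }
         if (stack_ptr - STACK_ENTRIES > stack_base)
            stack_base = stack_ptr - STACK_ENTRIES; /* low watermark */
         current = bvh[node].children[0];
      } else {
         /* Revisited while backtracking: continue with the next sibling of 'prev'. */
         current = next_sibling(&bvh[node], prev);
      }
   }
}

int main(void)
{
#define LEAF(p) {true, (p), {INVALID, INVALID, INVALID, INVALID}}
   /* Root box 0 -> {box 1, leaves 2..4}; box 1 -> {leaves 5..8}. Deeper than the
    * 2-entry window, so pushed entries get overwritten and must be recovered by
    * backtracking. */
   const struct bvh_node bvh[] = {
      {false, INVALID, {1, 2, 3, 4}},
      {false, 0, {5, 6, 7, 8}},
      LEAF(0), LEAF(0), LEAF(0),
      LEAF(1), LEAF(1), LEAF(1), LEAF(1),
   };
   traverse(bvh, 0);
   return 0;
}

In the shader the same decision is keyed off stack_base >= stack, and the
instance_top_node/instance_bottom_node pair makes the backtracking path hand control back
to the TLAS (previous_node is reset to the instance node) when it climbs out of a BLAS.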