radv: Build acceleration structures using LBVH

This sorts the leaf nodes along a Morton curve before
creating the internal nodes. For reference:
https://developer.nvidia.com/blog/thinking-parallel-part-iii-tree-construction-gpu/
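
In outline, the device build added here runs these passes (a schematic
sketch, not driver code):

  1. leaf pass:     write leaf nodes; atomically accumulate the total AABB
  2. morton pass:   quantize each leaf centroid against the total AABB
                    into a 24-bit Morton code
  3. radix sort:    sort the (node id, Morton code) scratch entries
  4. internal pass: repeatedly collapse up to 4 sorted nodes into a parent
                    until a single root remains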

Ray query CTS:
Test run totals:
  Passed:        22418/23426 (95.7%)
  Failed:        0/23426 (0.0%)
  Not supported: 1008/23426 (4.3%)
  Warnings:      0/23426 (0.0%)
  Waived:        0/23426 (0.0%)

Signed-off-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15648>
Commit: be57b085be (parent: 5d9ef0efb5)
Author: Konstantin Seurer
Date: 2022-03-30 14:09:36 +02:00
2 changed files with 494 additions and 147 deletions


@@ -29,12 +29,48 @@
#include "radv_cs.h"
#include "radv_meta.h"
#include "radix_sort/radv_radix_sort.h"
/* Min and max bounds of the BVH, used to compute Morton codes */
#define SCRATCH_TOTAL_BOUNDS_SIZE (6 * sizeof(float))
enum accel_struct_build {
accel_struct_build_unoptimized,
accel_struct_build_lbvh,
};
static enum accel_struct_build
get_accel_struct_build(const struct radv_physical_device *pdevice,
VkAccelerationStructureBuildTypeKHR buildType)
{
if (buildType != VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR)
return accel_struct_build_unoptimized;
return (pdevice->rad_info.chip_class < GFX10) ? accel_struct_build_unoptimized
: accel_struct_build_lbvh;
}
static uint32_t
get_node_id_stride(enum accel_struct_build build_mode)
{
switch (build_mode) {
case accel_struct_build_unoptimized:
return 4;
case accel_struct_build_lbvh:
return 8;
default:
unreachable("Unhandled accel_struct_build!");
}
}
VKAPI_ATTR void VKAPI_CALL
radv_GetAccelerationStructureBuildSizesKHR(
VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,
const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
const uint32_t *pMaxPrimitiveCounts, VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo)
{
RADV_FROM_HANDLE(radv_device, device, _device);
uint64_t triangles = 0, boxes = 0, instances = 0;
STATIC_ASSERT(sizeof(struct radv_bvh_triangle_node) == 64);
@@ -79,9 +115,30 @@ radv_GetAccelerationStructureBuildSizesKHR(
pSizeInfo->accelerationStructureSize = size;
/* 2x the max number of nodes in a BVH layer (one uint32_t each) */
pSizeInfo->updateScratchSize = pSizeInfo->buildScratchSize =
MAX2(4096, 2 * (boxes + instances + triangles) * sizeof(uint32_t));
/* 2x the max number of nodes in a BVH layer and order information for sorting when using
* LBVH (one uint32_t each, two buffers) plus space to store the bounds.
* LBVH is only supported for device builds and hardware that supports global atomics.
*/
enum accel_struct_build build_mode = get_accel_struct_build(device->physical_device, buildType);
uint32_t node_id_stride = get_node_id_stride(build_mode);
uint32_t leaf_count = boxes + instances + triangles;
VkDeviceSize scratchSize = 2 * leaf_count * node_id_stride;
if (build_mode == accel_struct_build_lbvh) {
radix_sort_vk_memory_requirements_t requirements;
radix_sort_vk_get_memory_requirements(device->meta_state.accel_struct_build.radix_sort,
leaf_count, &requirements);
/* Make sure we have the space required by the radix sort. */
scratchSize = MAX2(scratchSize, requirements.keyvals_size * 2);
scratchSize += requirements.internal_size + SCRATCH_TOTAL_BOUNDS_SIZE;
}
scratchSize = MAX2(4096, scratchSize);
pSizeInfo->updateScratchSize = scratchSize;
pSizeInfo->buildScratchSize = scratchSize;
}
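Sketch of the resulting LBVH scratch layout (the offsets follow from the build
code later in this change; the labels are illustrative, not from the driver):

/*
 *   [0, 24)                  total bounds (vec3 min, vec3 max), NaN-cleared
 *   [24, 24 + keyvals_size)  keyvals buffer A: (node id, Morton code) pairs
 *   [.., + keyvals_size)     keyvals buffer B: radix sort ping-pong space
 *   [.., + internal_size)    radix sort internal state
 */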
VKAPI_ATTR VkResult VKAPI_CALL
@@ -745,6 +802,19 @@ radv_CopyAccelerationStructureKHR(VkDevice _device, VkDeferredOperationKHR defer
return VK_SUCCESS;
}
static nir_builder
create_accel_build_shader(struct radv_device *device, const char *name)
{
nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "%s", name);
b.shader->info.workgroup_size[0] = 64;
assert(b.shader->info.workgroup_size[1] == 1);
assert(b.shader->info.workgroup_size[2] == 1);
assert(!b.shader->info.workgroup_size_variable);
return b;
}
static nir_ssa_def *
get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id)
{
@@ -935,6 +1005,21 @@ struct build_primitive_constants {
};
};
struct bounds_constants {
uint64_t node_addr;
uint64_t scratch_addr;
};
struct morton_constants {
uint64_t node_addr;
uint64_t scratch_addr;
};
struct fill_constants {
uint64_t addr;
uint32_t value;
};
struct build_internal_constants {
uint64_t node_dst_addr;
uint64_t scratch_addr;
@@ -972,6 +1057,29 @@ nir_invert_3x3(nir_builder *b, nir_ssa_def *in[3][3], nir_ssa_def *out[3][3])
}
}
static nir_ssa_def *
id_to_node_id_offset(nir_builder *b, nir_ssa_def *global_id,
const struct radv_physical_device *pdevice)
{
uint32_t stride = get_node_id_stride(
get_accel_struct_build(pdevice, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR));
return nir_umul24(b, global_id, nir_imm_int(b, stride));
}
static nir_ssa_def *
id_to_morton_offset(nir_builder *b, nir_ssa_def *global_id,
const struct radv_physical_device *pdevice)
{
enum accel_struct_build build_mode =
get_accel_struct_build(pdevice, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR);
assert(build_mode == accel_struct_build_lbvh);
uint32_t stride = get_node_id_stride(build_mode);
return nir_iadd_imm(b, nir_umul24(b, global_id, nir_imm_int(b, stride)), sizeof(uint32_t));
}
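Together, these two helpers define the 8-byte scratch record used for LBVH
builds; a hypothetical struct view (the u64 radix sort created below consumes
each record as one 64-bit keyval):

struct lbvh_scratch_entry {    /* hypothetical; stride from get_node_id_stride() */
   uint32_t node_id;     /* offset 0, written by the leaf/internal shaders */
   uint32_t morton_code; /* offset 4, written by the morton shader */
};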
static nir_shader *
build_leaf_shader(struct radv_device *dev)
{
@@ -1003,9 +1111,15 @@ build_leaf_shader(struct radv_device *dev)
nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),
nir_imm_int(&b, b.shader->info.workgroup_size[0])),
nir_channels(&b, nir_load_local_invocation_id(&b), 1));
scratch_addr = nir_iadd(
&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, scratch_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 4)))));
nir_ssa_def *scratch_dst_addr =
nir_iadd(&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, scratch_offset,
id_to_node_id_offset(&b, global_id, dev->physical_device))));
nir_variable *bounds[2] = {
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
};
nir_push_if(&b, nir_ieq_imm(&b, geom_type, VK_GEOMETRY_TYPE_TRIANGLES_KHR));
{ /* Triangles */
@@ -1053,6 +1167,22 @@ build_leaf_shader(struct radv_device *dev)
for (unsigned j = 0; j < 3; ++j)
node_data[i * 3 + j] = nir_fdph(&b, positions[i], nir_load_var(&b, transform[j]));
nir_ssa_def *min_bound = NULL;
nir_ssa_def *max_bound = NULL;
for (unsigned i = 0; i < 3; ++i) {
nir_ssa_def *position = nir_vec(&b, node_data + i * 3, 3);
if (min_bound) {
min_bound = nir_fmin(&b, min_bound, position);
max_bound = nir_fmax(&b, max_bound, position);
} else {
min_bound = position;
max_bound = position;
}
}
nir_store_var(&b, bounds[0], min_bound, 7);
nir_store_var(&b, bounds[1], max_bound, 7);
node_data[12] = global_id;
node_data[13] = geometry_id;
node_data[15] = nir_imm_int(&b, 9);
@@ -1066,7 +1196,7 @@ build_leaf_shader(struct radv_device *dev)
}
nir_ssa_def *node_id = nir_ushr_imm(&b, node_offset, 3);
nir_build_store_global(&b, node_id, scratch_addr);
nir_build_store_global(&b, node_id, scratch_dst_addr);
}
nir_push_else(&b, NULL);
nir_push_if(&b, nir_ieq_imm(&b, geom_type, VK_GEOMETRY_TYPE_AABBS_KHR));
@@ -1077,14 +1207,18 @@ build_leaf_shader(struct radv_device *dev)
nir_ssa_def *node_offset =
nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));
nir_ssa_def *aabb_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *node_id = nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), 7);
nir_build_store_global(&b, node_id, scratch_addr);
nir_build_store_global(&b, node_id, scratch_dst_addr);
aabb_addr = nir_iadd(&b, aabb_addr, nir_u2u64(&b, nir_imul(&b, aabb_stride, global_id)));
nir_ssa_def *min_bound = nir_build_load_global(&b, 3, 32, nir_iadd_imm(&b, aabb_addr, 0));
nir_ssa_def *max_bound = nir_build_load_global(&b, 3, 32, nir_iadd_imm(&b, aabb_addr, 12));
nir_store_var(&b, bounds[0], min_bound, 7);
nir_store_var(&b, bounds[1], max_bound, 7);
nir_ssa_def *values[] = {nir_channel(&b, min_bound, 0),
nir_channel(&b, min_bound, 1),
nir_channel(&b, min_bound, 2),
@@ -1130,16 +1264,9 @@ build_leaf_shader(struct radv_device *dev)
nir_ssa_def *node_offset =
nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 128)));
node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *node_id = nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), 6);
nir_build_store_global(&b, node_id, scratch_addr);
nir_variable *bounds[2] = {
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
};
nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_build_store_global(&b, node_id, scratch_dst_addr);
nir_ssa_def *header_addr = nir_pack_64_2x32(&b, nir_channels(&b, inst3, 12));
nir_push_if(&b, nir_ine_imm(&b, header_addr, 0));
@@ -1204,6 +1331,32 @@ build_leaf_shader(struct radv_device *dev)
nir_pop_if(&b, NULL);
nir_pop_if(&b, NULL);
if (get_accel_struct_build(dev->physical_device,
VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR) !=
accel_struct_build_unoptimized) {
nir_ssa_def *min = nir_load_var(&b, bounds[0]);
nir_ssa_def *max = nir_load_var(&b, bounds[1]);
nir_ssa_def *min_reduced = nir_reduce(&b, min, .reduction_op = nir_op_fmin);
nir_ssa_def *max_reduced = nir_reduce(&b, max, .reduction_op = nir_op_fmax);
nir_push_if(&b, nir_elect(&b, 1));
nir_global_atomic_fmin(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 24)),
nir_channel(&b, min_reduced, 0));
nir_global_atomic_fmin(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 20)),
nir_channel(&b, min_reduced, 1));
nir_global_atomic_fmin(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 16)),
nir_channel(&b, min_reduced, 2));
nir_global_atomic_fmax(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 12)),
nir_channel(&b, max_reduced, 0));
nir_global_atomic_fmax(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 8)),
nir_channel(&b, max_reduced, 1));
nir_global_atomic_fmax(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 4)),
nir_channel(&b, max_reduced, 2));
}
return b.shader;
}
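The negative offsets in the atomics above reach back into the bounds prefix at
the start of the scratch buffer: the shader's scratch_addr is biased past it by
SCRATCH_TOTAL_BOUNDS_SIZE (see the build entry point below). A hypothetical
struct view of that prefix:

struct total_bounds { /* hypothetical; sits at scratch buffer offset 0 */
   float min_x, min_y, min_z; /* scratch_addr - 24, -20, -16 */
   float max_x, max_y, max_z; /* scratch_addr - 12, -8, -4 */
};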
@@ -1267,6 +1420,89 @@ determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def *node_id,
nir_pop_if(b, NULL);
}
/* https://developer.nvidia.com/blog/thinking-parallel-part-iii-tree-construction-gpu/ */
static nir_ssa_def *
build_morton_component(nir_builder *b, nir_ssa_def *x)
{
x = nir_iand_imm(b, nir_imul_imm(b, x, 0x00000101u), 0x0F00F00Fu);
x = nir_iand_imm(b, nir_imul_imm(b, x, 0x00000011u), 0xC30C30C3u);
x = nir_iand_imm(b, nir_imul_imm(b, x, 0x00000005u), 0x49249249u);
return x;
}
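For reference, a plain-C transcription of the three multiply-and-mask steps
above (an illustrative sketch; it matches the NVIDIA reference with the first,
10-bit step dropped, since only 8 input bits are used here):

static uint32_t
expand_bits_8(uint32_t x) /* hypothetical helper; input in [0, 255] */
{
   x = (x * 0x00000101u) & 0x0F00F00Fu; /* nibbles land at bits [3:0], [15:12] */
   x = (x * 0x00000011u) & 0xC30C30C3u; /* bit pairs, spaced 6 bits apart */
   x = (x * 0x00000005u) & 0x49249249u; /* input bit i ends up at bit 3*i */
   return x;
}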
static nir_shader *
build_morton_shader(struct radv_device *dev)
{
const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
nir_builder b = create_accel_build_shader(dev, "accel_build_morton_shader");
/*
* push constants:
* i32 x 2: node address
* i32 x 2: scratch address
*/
nir_ssa_def *pconst0 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b0011));
nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b1100));
nir_ssa_def *global_id =
nir_iadd(&b,
nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b, 32), 0),
b.shader->info.workgroup_size[0]),
nir_load_local_invocation_index(&b));
nir_ssa_def *node_id_addr = nir_iadd(
&b, scratch_addr, nir_u2u64(&b, id_to_node_id_offset(&b, global_id, dev->physical_device)));
nir_ssa_def *node_id =
nir_build_load_global(&b, 1, 32, node_id_addr, .align_mul = 4, .align_offset = 0);
nir_variable *node_bounds[2] = {
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
};
determine_bounds(&b, node_addr, node_id, node_bounds);
nir_ssa_def *node_min = nir_load_var(&b, node_bounds[0]);
nir_ssa_def *node_max = nir_load_var(&b, node_bounds[1]);
nir_ssa_def *node_pos =
nir_fmul(&b, nir_fadd(&b, node_min, node_max), nir_imm_vec3(&b, 0.5, 0.5, 0.5));
nir_ssa_def *bvh_min =
nir_build_load_global(&b, 3, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 24)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *bvh_max =
nir_build_load_global(&b, 3, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 12)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *bvh_size = nir_fsub(&b, bvh_max, bvh_min);
nir_ssa_def *normalized_node_pos = nir_fdiv(&b, nir_fsub(&b, node_pos, bvh_min), bvh_size);
nir_ssa_def *x_int =
nir_f2u32(&b, nir_fmul_imm(&b, nir_channel(&b, normalized_node_pos, 0), 255.0));
nir_ssa_def *x_morton = build_morton_component(&b, x_int);
nir_ssa_def *y_int =
nir_f2u32(&b, nir_fmul_imm(&b, nir_channel(&b, normalized_node_pos, 1), 255.0));
nir_ssa_def *y_morton = build_morton_component(&b, y_int);
nir_ssa_def *z_int =
nir_f2u32(&b, nir_fmul_imm(&b, nir_channel(&b, normalized_node_pos, 2), 255.0));
nir_ssa_def *z_morton = build_morton_component(&b, z_int);
nir_ssa_def *morton_code = nir_iadd(
&b, nir_iadd(&b, nir_ishl_imm(&b, x_morton, 2), nir_ishl_imm(&b, y_morton, 1)), z_morton);
nir_ssa_def *dst_addr = nir_iadd(
&b, scratch_addr, nir_u2u64(&b, id_to_morton_offset(&b, global_id, dev->physical_device)));
nir_build_store_global(&b, morton_code, dst_addr, .align_mul = 4);
return b.shader;
}
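End to end, the shader computes the equivalent of this host-side sketch
(illustrative only; expand_bits_8 is the helper sketched above). The result is
a 24-bit key, which is why the radix sort later in this change is configured
with key_bits = 24:

static uint32_t
morton_code_24(float nx, float ny, float nz) /* centroid normalized to [0, 1] */
{
   uint32_t x = expand_bits_8((uint32_t)(nx * 255.0f));
   uint32_t y = expand_bits_8((uint32_t)(ny * 255.0f));
   uint32_t z = expand_bits_8((uint32_t)(nz * 255.0f));
   /* the three expansions occupy disjoint bits, so add == or here */
   return (x << 2) | (y << 1) | z;
}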
static nir_shader *
build_internal_shader(struct radv_device *dev)
{
@@ -1308,12 +1544,22 @@ build_internal_shader(struct radv_device *dev)
nir_ssa_def *node_offset = nir_iadd(&b, node_dst_offset, nir_ishl_imm(&b, global_id, 7));
nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *src_nodes = nir_build_load_global(
&b, 4, 32,
nir_iadd(&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, src_scratch_offset, nir_ishl_imm(&b, global_id, 4)))));
nir_build_store_global(&b, src_nodes, nir_iadd_imm(&b, node_dst_addr, 0));
nir_ssa_def *src_base_addr =
nir_iadd(&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, src_scratch_offset,
id_to_node_id_offset(&b, src_idx, dev->physical_device))));
enum accel_struct_build build_mode =
get_accel_struct_build(dev->physical_device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR);
uint32_t node_id_stride = get_node_id_stride(build_mode);
nir_ssa_def *src_nodes[4];
for (uint32_t i = 0; i < 4; i++) {
src_nodes[i] =
nir_build_load_global(&b, 1, 32, nir_iadd_imm(&b, src_base_addr, i * node_id_stride));
nir_build_store_global(&b, src_nodes[i], nir_iadd_imm(&b, node_dst_addr, i * 4));
}
nir_ssa_def *total_bounds[2] = {
nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
@@ -1329,7 +1575,7 @@ build_internal_shader(struct radv_device *dev)
nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count));
determine_bounds(&b, node_addr, nir_channel(&b, src_nodes, i), bounds);
determine_bounds(&b, node_addr, src_nodes[i], bounds);
nir_pop_if(&b, NULL);
nir_build_store_global(&b, nir_load_var(&b, bounds[0]),
nir_iadd_imm(&b, node_dst_addr, 16 + 24 * i));
@@ -1342,7 +1588,8 @@ build_internal_shader(struct radv_device *dev)
nir_ssa_def *node_id = nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), 5);
nir_ssa_def *dst_scratch_addr =
nir_iadd(&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, nir_ishl_imm(&b, global_id, 2))));
nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset,
id_to_node_id_offset(&b, global_id, dev->physical_device))));
nir_build_store_global(&b, node_id, dst_scratch_addr);
nir_push_if(&b, fill_header);
@@ -1586,12 +1833,82 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
&state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline,
&state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.morton_pipeline,
&state->alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device),
state->accel_struct_build.copy_p_layout, &state->alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device),
state->accel_struct_build.internal_p_layout, &state->alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device),
state->accel_struct_build.leaf_p_layout, &state->alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device),
state->accel_struct_build.morton_p_layout, &state->alloc);
if (state->accel_struct_build.radix_sort)
radix_sort_vk_destroy(state->accel_struct_build.radix_sort, radv_device_to_handle(device),
&state->alloc);
}
static VkResult
create_build_pipeline(struct radv_device *device, nir_shader *shader, unsigned push_constant_size,
VkPipeline *pipeline, VkPipelineLayout *layout)
{
const VkPipelineLayoutCreateInfo pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges =
&(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size},
};
VkResult result = radv_CreatePipelineLayout(radv_device_to_handle(device), &pl_create_info,
&device->meta_state.alloc, layout);
if (result != VK_SUCCESS) {
radv_device_finish_accel_struct_build_state(device);
ralloc_free(shader);
return result;
}
VkPipelineShaderStageCreateInfo shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(shader),
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = shader_stage,
.flags = 0,
.layout = *layout,
};
result = radv_CreateComputePipelines(radv_device_to_handle(device),
radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
&pipeline_info, &device->meta_state.alloc, pipeline);
if (result != VK_SUCCESS) {
radv_device_finish_accel_struct_build_state(device);
ralloc_free(shader);
return result;
}
return VK_SUCCESS;
}
static void
radix_sort_fill_buffer(VkCommandBuffer commandBuffer,
radix_sort_vk_buffer_info_t const *buffer_info, VkDeviceSize offset,
VkDeviceSize size, uint32_t data)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
assert(size % 4 == 0);
assert(size != VK_WHOLE_SIZE);
radv_fill_buffer_shader(cmd_buffer, buffer_info->devaddr + buffer_info->offset + offset, size,
data);
}
VkResult
@@ -1602,122 +1919,47 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
nir_shader *internal_cs = build_internal_shader(device);
nir_shader *copy_cs = build_copy_shader(device);
const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
sizeof(struct build_primitive_constants)},
};
result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
&device->meta_state.alloc,
&device->meta_state.accel_struct_build.leaf_p_layout);
result = create_build_pipeline(device, leaf_cs, sizeof(struct build_primitive_constants),
&device->meta_state.accel_struct_build.leaf_pipeline,
&device->meta_state.accel_struct_build.leaf_p_layout);
if (result != VK_SUCCESS)
goto fail;
return result;
VkPipelineShaderStageCreateInfo leaf_shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(leaf_cs),
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo leaf_pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = leaf_shader_stage,
.flags = 0,
.layout = device->meta_state.accel_struct_build.leaf_p_layout,
};
result = radv_CreateComputePipelines(
radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
&leaf_pipeline_info, NULL, &device->meta_state.accel_struct_build.leaf_pipeline);
result = create_build_pipeline(device, internal_cs, sizeof(struct build_internal_constants),
&device->meta_state.accel_struct_build.internal_pipeline,
&device->meta_state.accel_struct_build.internal_p_layout);
if (result != VK_SUCCESS)
goto fail;
return result;
const VkPipelineLayoutCreateInfo internal_pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
sizeof(struct build_internal_constants)},
};
result = create_build_pipeline(device, copy_cs, sizeof(struct copy_constants),
&device->meta_state.accel_struct_build.copy_pipeline,
&device->meta_state.accel_struct_build.copy_p_layout);
result = radv_CreatePipelineLayout(radv_device_to_handle(device), &internal_pl_create_info,
&device->meta_state.alloc,
&device->meta_state.accel_struct_build.internal_p_layout);
if (result != VK_SUCCESS)
goto fail;
return result;
VkPipelineShaderStageCreateInfo internal_shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(internal_cs),
.pName = "main",
.pSpecializationInfo = NULL,
};
if (get_accel_struct_build(device->physical_device,
VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR) ==
accel_struct_build_lbvh) {
nir_shader *morton_cs = build_morton_shader(device);
VkComputePipelineCreateInfo internal_pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = internal_shader_stage,
.flags = 0,
.layout = device->meta_state.accel_struct_build.internal_p_layout,
};
result = create_build_pipeline(device, morton_cs, sizeof(struct morton_constants),
&device->meta_state.accel_struct_build.morton_pipeline,
&device->meta_state.accel_struct_build.morton_p_layout);
if (result != VK_SUCCESS)
return result;
result = radv_CreateComputePipelines(
radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
&internal_pipeline_info, NULL, &device->meta_state.accel_struct_build.internal_pipeline);
if (result != VK_SUCCESS)
goto fail;
device->meta_state.accel_struct_build.radix_sort =
radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc,
radv_pipeline_cache_to_handle(&device->meta_state.cache));
const VkPipelineLayoutCreateInfo copy_pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges =
&(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct copy_constants)},
};
struct radix_sort_vk_sort_devaddr_info *radix_sort_info =
&device->meta_state.accel_struct_build.radix_sort_info;
radix_sort_info->ext = NULL;
radix_sort_info->key_bits = 24;
radix_sort_info->fill_buffer = radix_sort_fill_buffer;
}
result = radv_CreatePipelineLayout(radv_device_to_handle(device), &copy_pl_create_info,
&device->meta_state.alloc,
&device->meta_state.accel_struct_build.copy_p_layout);
if (result != VK_SUCCESS)
goto fail;
VkPipelineShaderStageCreateInfo copy_shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(copy_cs),
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo copy_pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = copy_shader_stage,
.flags = 0,
.layout = device->meta_state.accel_struct_build.copy_p_layout,
};
result = radv_CreateComputePipelines(
radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
&copy_pipeline_info, NULL, &device->meta_state.accel_struct_build.copy_pipeline);
if (result != VK_SUCCESS)
goto fail;
ralloc_free(copy_cs);
ralloc_free(internal_cs);
ralloc_free(leaf_cs);
return VK_SUCCESS;
fail:
radv_device_finish_accel_struct_build_state(device);
ralloc_free(copy_cs);
ralloc_free(internal_cs);
ralloc_free(leaf_cs);
return result;
}
@@ -1725,6 +1967,8 @@ struct bvh_state {
uint32_t node_offset;
uint32_t node_count;
uint32_t scratch_offset;
uint32_t buffer_1_offset;
uint32_t buffer_2_offset;
uint32_t instance_offset;
uint32_t instance_count;
@@ -1739,12 +1983,35 @@ radv_CmdBuildAccelerationStructuresKHR(
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
struct radv_meta_saved_state saved_state;
enum radv_cmd_flush_bits flush_bits =
RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT,
NULL) |
radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT,
NULL);
enum accel_struct_build build_mode = get_accel_struct_build(
cmd_buffer->device->physical_device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR);
uint32_t node_id_stride = get_node_id_stride(build_mode);
uint32_t scratch_offset =
(build_mode != accel_struct_build_unoptimized) ? SCRATCH_TOTAL_BOUNDS_SIZE : 0;
radv_meta_save(
&saved_state, cmd_buffer,
RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
if (build_mode != accel_struct_build_unoptimized) {
for (uint32_t i = 0; i < infoCount; ++i) {
/* Clear the BVH bounds with NaN. */
radv_fill_buffer_shader(cmd_buffer, pInfos[i].scratchData.deviceAddress, 6 * sizeof(float),
0x7FC00000);
}
cmd_buffer->state.flush_bits |= flush_bits;
}
radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
cmd_buffer->device->meta_state.accel_struct_build.leaf_pipeline);
for (uint32_t i = 0; i < infoCount; ++i) {
@@ -1753,7 +2020,7 @@ radv_CmdBuildAccelerationStructuresKHR(
struct build_primitive_constants prim_consts = {
.node_dst_addr = radv_accel_struct_get_va(accel_struct),
.scratch_addr = pInfos[i].scratchData.deviceAddress,
.scratch_addr = pInfos[i].scratchData.deviceAddress + scratch_offset,
.dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128,
.dst_scratch_offset = 0,
};
@@ -1805,20 +2072,91 @@ radv_CmdBuildAccelerationStructuresKHR(
unreachable("Unknown geometryType");
}
radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts),
&prim_consts);
radv_CmdPushConstants(
commandBuffer, cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), &prim_consts);
radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1);
prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount;
prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount;
prim_consts.dst_scratch_offset +=
node_id_stride * ppBuildRangeInfos[i][j].primitiveCount;
}
}
bvh_states[i].node_offset = prim_consts.dst_offset;
bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4;
bvh_states[i].node_count = prim_consts.dst_scratch_offset / node_id_stride;
}
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
if (build_mode == accel_struct_build_lbvh) {
cmd_buffer->state.flush_bits |= flush_bits;
radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
cmd_buffer->device->meta_state.accel_struct_build.morton_pipeline);
for (uint32_t i = 0; i < infoCount; ++i) {
RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,
pInfos[i].dstAccelerationStructure);
const struct morton_constants consts = {
.node_addr = radv_accel_struct_get_va(accel_struct),
.scratch_addr = pInfos[i].scratchData.deviceAddress + SCRATCH_TOTAL_BOUNDS_SIZE,
};
radv_CmdPushConstants(commandBuffer,
cmd_buffer->device->meta_state.accel_struct_build.morton_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
radv_unaligned_dispatch(cmd_buffer, bvh_states[i].node_count, 1, 1);
}
cmd_buffer->state.flush_bits |= flush_bits;
for (uint32_t i = 0; i < infoCount; ++i) {
struct radix_sort_vk_memory_requirements requirements;
radix_sort_vk_get_memory_requirements(
cmd_buffer->device->meta_state.accel_struct_build.radix_sort, bvh_states[i].node_count,
&requirements);
struct radix_sort_vk_sort_devaddr_info info =
cmd_buffer->device->meta_state.accel_struct_build.radix_sort_info;
info.count = bvh_states[i].node_count;
VkDeviceAddress base_addr =
pInfos[i].scratchData.deviceAddress + SCRATCH_TOTAL_BOUNDS_SIZE;
info.keyvals_even.buffer = VK_NULL_HANDLE;
info.keyvals_even.offset = 0;
info.keyvals_even.devaddr = base_addr;
info.keyvals_odd = base_addr + requirements.keyvals_size;
info.internal.buffer = VK_NULL_HANDLE;
info.internal.offset = 0;
info.internal.devaddr = base_addr + requirements.keyvals_size * 2;
VkDeviceAddress result_addr;
radix_sort_vk_sort_devaddr(cmd_buffer->device->meta_state.accel_struct_build.radix_sort,
&info, radv_device_to_handle(cmd_buffer->device), commandBuffer,
&result_addr);
assert(result_addr == info.keyvals_even.devaddr || result_addr == info.keyvals_odd);
if (result_addr == info.keyvals_even.devaddr) {
bvh_states[i].buffer_1_offset = SCRATCH_TOTAL_BOUNDS_SIZE;
bvh_states[i].buffer_2_offset = SCRATCH_TOTAL_BOUNDS_SIZE + requirements.keyvals_size;
} else {
bvh_states[i].buffer_1_offset = SCRATCH_TOTAL_BOUNDS_SIZE + requirements.keyvals_size;
bvh_states[i].buffer_2_offset = SCRATCH_TOTAL_BOUNDS_SIZE;
}
bvh_states[i].scratch_offset = bvh_states[i].buffer_1_offset;
}
cmd_buffer->state.flush_bits |= flush_bits;
} else {
for (uint32_t i = 0; i < infoCount; ++i) {
bvh_states[i].buffer_1_offset = 0;
bvh_states[i].buffer_2_offset = bvh_states[i].node_count * 4;
}
}
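The loop below then collapses the leaf-level ids bottom-up, ping-ponging
between the two scratch regions; schematically (an illustrative sketch, not
driver code):

uint32_t src = buffer_1_offset;
uint32_t node_count = leaf_count;
while (node_count > 1) {
   uint32_t dst = (src == buffer_1_offset) ? buffer_2_offset : buffer_1_offset;
   node_count = DIV_ROUND_UP(node_count, 4); /* BVH4: up to 4 children/parent */
   /* dispatch the internal-node shader: read child ids at src,
    * write the parents' ids at dst */
   src = dst;
}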
radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
cmd_buffer->device->meta_state.accel_struct_build.internal_pipeline);
bool progress = true;
for (unsigned iter = 0; progress; ++iter) {
@@ -1830,18 +2168,20 @@ radv_CmdBuildAccelerationStructuresKHR(
if (iter && bvh_states[i].node_count == 1)
continue;
if (!progress) {
cmd_buffer->state.flush_bits |=
RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
radv_dst_access_flush(cmd_buffer,
VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL);
}
if (!progress)
cmd_buffer->state.flush_bits |= flush_bits;
progress = true;
uint32_t dst_node_count = MAX2(1, DIV_ROUND_UP(bvh_states[i].node_count, 4));
bool final_iter = dst_node_count == 1;
uint32_t src_scratch_offset = bvh_states[i].scratch_offset;
uint32_t dst_scratch_offset = src_scratch_offset ? 0 : bvh_states[i].node_count * 4;
uint32_t buffer_1_offset = bvh_states[i].buffer_1_offset;
uint32_t buffer_2_offset = bvh_states[i].buffer_2_offset;
uint32_t dst_scratch_offset =
(src_scratch_offset == buffer_1_offset) ? buffer_2_offset : buffer_1_offset;
uint32_t dst_node_offset = bvh_states[i].node_offset;
if (final_iter)
dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);
@@ -1855,7 +2195,7 @@ radv_CmdBuildAccelerationStructuresKHR(
.fill_header = bvh_states[i].node_count | (final_iter ? 0x80000000U : 0),
};
radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
radv_CmdPushConstants(commandBuffer,
cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1);


@@ -82,6 +82,8 @@
#include "radv_shader_args.h"
#include "sid.h"
#include "radix_sort/radix_sort_vk_devaddr.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
struct wl_display;
@@ -661,10 +663,15 @@ struct radv_meta_state {
struct {
VkPipelineLayout leaf_p_layout;
VkPipeline leaf_pipeline;
VkPipelineLayout morton_p_layout;
VkPipeline morton_pipeline;
VkPipelineLayout internal_p_layout;
VkPipeline internal_pipeline;
VkPipelineLayout copy_p_layout;
VkPipeline copy_pipeline;
struct radix_sort_vk *radix_sort;
struct radix_sort_vk_sort_devaddr_info radix_sort_info;
} accel_struct_build;
struct {