diff --git a/meson.build b/meson.build index db2e404a3fc..766883d84e7 100644 --- a/meson.build +++ b/meson.build @@ -316,6 +316,10 @@ with_intel_vk_rt = get_option('intel-rt') \ .disable_if(host_machine.cpu_family() != 'x86_64', error_message : 'Intel Ray Tracing is only supported on x86_64') \ .allowed() +if with_intel_vk_rt + with_intel_bvh_grl = get_option('intel-bvh-grl') +endif + with_any_intel = [ with_gallium_crocus, with_gallium_i915, diff --git a/meson_options.txt b/meson_options.txt index 42e819dacd2..cfc5862c478 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -657,6 +657,13 @@ option( description : 'Build the intel-clc compiler or use a system version.' ) +option( + 'intel-bvh-grl', + type : 'boolean', + value : true, + description : 'Build the BVH structure using GRL.' +) + option( 'install-intel-clc', type : 'boolean', diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index df4769e22c1..40bf5ed6f05 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -985,14 +985,22 @@ VkResult anv_CreateDevice( anv_device_utrace_init(device); - result = anv_genX(device->info, init_device_state)(device); + result = vk_meta_device_init(&device->vk, &device->meta_device); if (result != VK_SUCCESS) goto fail_utrace; + result = anv_genX(device->info, init_device_state)(device); + if (result != VK_SUCCESS) + goto fail_meta_device; + + simple_mtx_init(&device->accel_struct_build.mutex, mtx_plain); + *pDevice = anv_device_to_handle(device); return VK_SUCCESS; + fail_meta_device: + vk_meta_device_finish(&device->vk, &device->meta_device); fail_utrace: anv_device_utrace_finish(device); fail_queues: @@ -1118,6 +1126,12 @@ void anv_DestroyDevice( /* Do TRTT batch garbage collection before destroying queues. 
*/ anv_device_finish_trtt(device); + if (device->accel_struct_build.radix_sort) { + radix_sort_vk_destroy(device->accel_struct_build.radix_sort, + _device, &device->vk.alloc); + } + vk_meta_device_finish(&device->vk, &device->meta_device); + anv_device_utrace_finish(device); for (uint32_t i = 0; i < device->queue_count; i++) @@ -1218,6 +1232,8 @@ void anv_DestroyDevice( pthread_cond_destroy(&device->queue_submit); pthread_mutex_destroy(&device->mutex); + simple_mtx_destroy(&device->accel_struct_build.mutex); + ralloc_free(device->fp64_nir); anv_device_destroy_context_or_vm(device); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 1eb2c5a1fde..4b78fd2e61a 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -102,6 +102,7 @@ #include "vk_log.h" #include "vk_ycbcr_conversion.h" #include "vk_video.h" +#include "vk_meta.h" #ifdef __cplusplus extern "C" { @@ -1828,6 +1829,11 @@ enum anv_rt_bvh_build_method { ANV_BVH_BUILD_METHOD_NEW_SAH, }; +/* If serialization-breaking or algorithm-breaking changes are made, + * increment the digits at the end + */ +#define ANV_RT_UUID_MACRO "ANV_RT_BVH_0001" + struct anv_device_astc_emu { struct vk_texcompress_astc_state *texcompress; @@ -2102,6 +2108,14 @@ struct anv_device { */ struct util_dynarray prints; } printf; + + struct { + simple_mtx_t mutex; + struct radix_sort_vk *radix_sort; + struct vk_acceleration_structure_build_args build_args; + } accel_struct_build; + + struct vk_meta_device meta_device; }; static inline uint32_t diff --git a/src/intel/vulkan/bvh/meson.build b/src/intel/vulkan/bvh/meson.build new file mode 100644 index 00000000000..68c0ebb3222 --- /dev/null +++ b/src/intel/vulkan/bvh/meson.build @@ -0,0 +1,50 @@ +# Copyright © 2022 Konstantin Seurer +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +# source file, output name, defines +bvh_shaders = [ + [ + 'encode.comp', + 'encode', + [], + ], + [ + 'header.comp', + 'header', + [], + ], + [ + 'copy.comp', + 'copy', + [] + ], +] + +anv_bvh_include_dir = dir_source_root + '/src/intel/vulkan/bvh' + +anv_bvh_includes = files( + 'anv_build_helpers.h', + 'anv_build_interface.h', + 'anv_bvh.h', +) + +bvh_spv = [] +foreach s : bvh_shaders + command = [ + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + anv_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + command += glslang_quiet + + foreach define : s[2] + command += '-D' + define + endforeach + + bvh_spv += custom_target( + s[1] + '.spv.h', + input : s[0], + output : s[1] + '.spv.h', + command : command, + depend_files: [vk_bvh_includes, anv_bvh_includes], + ) +endforeach diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c index e06142308c1..45eb2c03d0d 100644 --- a/src/intel/vulkan/genX_acceleration_structure.c +++ b/src/intel/vulkan/genX_acceleration_structure.c @@ -1,24 +1,5 @@ -/* - * Copyright © 2020 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * 
paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. +/* Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT */ #include "anv_private.h" @@ -35,8 +16,410 @@ #include "ds/intel_tracepoints.h" +#include "bvh/anv_build_interface.h" +#include "vk_acceleration_structure.h" +#include "radix_sort/radix_sort_u64.h" +#include "radix_sort/common/vk/barrier.h" + +#include "vk_common_entrypoints.h" +#include "genX_mi_builder.h" + #if GFX_VERx10 >= 125 +static const uint32_t encode_spv[] = { +#include "bvh/encode.spv.h" +}; + +static const uint32_t header_spv[] = { +#include "bvh/header.spv.h" +}; + +static const uint32_t copy_spv[] = { +#include "bvh/copy.spv.h" +}; + +static VkResult +get_pipeline_spv(struct anv_device *device, + const char *name, const uint32_t *spv, uint32_t spv_size, + unsigned push_constant_size, VkPipeline *pipeline, + VkPipelineLayout *layout) +{ + + size_t key_size = strlen(name); + + const VkPushConstantRange pc_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_constant_size, + }; + + VkResult result = vk_meta_get_pipeline_layout(&device->vk, + &device->meta_device, NULL, + &pc_range, name, key_size, + layout); + + if (result != VK_SUCCESS) + return result; + + VkPipeline pipeline_from_cache = + vk_meta_lookup_pipeline(&device->meta_device, name, key_size); + if (pipeline_from_cache != VK_NULL_HANDLE) { + *pipeline = pipeline_from_cache; + return VK_SUCCESS; + } + + VkShaderModuleCreateInfo module_info = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .codeSize = spv_size, + .pCode = spv, + }; + + VkPipelineShaderStageCreateInfo shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &module_info, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .flags = 0, + .stage = shader_stage, + .layout = *layout, + }; + + return vk_meta_create_compute_pipeline(&device->vk, &device->meta_device, + &pipeline_info, name, key_size, pipeline); +} + +static void +get_bvh_layout(VkGeometryTypeKHR geometry_type, uint32_t leaf_count, + struct bvh_layout *layout) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + + uint64_t offset = ANV_RT_BVH_HEADER_SIZE; + + /* For a TLAS, we store the address of anv_instance_leaf after header + * This is for quick access in the copy.comp + */ + if (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) { + offset += leaf_count * sizeof(uint64_t); + } + /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. 
*/ + offset = ALIGN(offset, 64); + + /* This is where internal_nodes/leaves start to be encoded */ + layout->bvh_offset = offset; + + offset += internal_count * ANV_RT_INTERNAL_NODE_SIZE; + + switch (geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + /* Currently we encode one triangle within one quad leaf */ + offset += leaf_count * ANV_RT_QUAD_LEAF_SIZE; + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + offset += leaf_count * ANV_RT_PROCEDURAL_LEAF_SIZE; + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + offset += leaf_count * ANV_RT_INSTANCE_LEAF_SIZE; + break; + default: + unreachable("Unknown VkGeometryTypeKHR"); + } + + layout->size = offset; +} + +static VkDeviceSize +anv_get_as_size(VkDevice device, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + uint32_t leaf_count) +{ + struct bvh_layout layout; + get_bvh_layout(vk_get_as_geometry_type(pBuildInfo), leaf_count, &layout); + return layout.size; +} + +static uint32_t +anv_get_encode_key(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags) +{ + return 0; +} + +static VkResult +anv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_device *device = cmd_buffer->device; + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = get_pipeline_spv(device, "encode", encode_spv, + sizeof(encode_spv), + sizeof(struct encode_args), &pipeline, + &layout); + if (result != VK_SUCCESS) + return result; + + anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + + return VK_SUCCESS; +} + +static void +anv_encode_as(VkCommandBuffer commandBuffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_device *device = cmd_buffer->device; + + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); + + VkPipeline pipeline; + VkPipelineLayout layout; + get_pipeline_spv(device, "encode", encode_spv, sizeof(encode_spv), + sizeof(struct encode_args), &pipeline, &layout); + + STATIC_ASSERT(sizeof(struct anv_accel_struct_header) == ANV_RT_BVH_HEADER_SIZE); + STATIC_ASSERT(sizeof(struct anv_instance_leaf) == ANV_RT_INSTANCE_LEAF_SIZE); + STATIC_ASSERT(sizeof(struct anv_quad_leaf_node) == ANV_RT_QUAD_LEAF_SIZE); + STATIC_ASSERT(sizeof(struct anv_procedural_leaf_node) == ANV_RT_PROCEDURAL_LEAF_SIZE); + STATIC_ASSERT(sizeof(struct anv_internal_node) == ANV_RT_INTERNAL_NODE_SIZE); + + struct bvh_layout bvh_layout; + get_bvh_layout(geometry_type, leaf_count, &bvh_layout); + + const struct encode_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_bvh = vk_acceleration_structure_get_va(dst) + + bvh_layout.bvh_offset, + .header = intermediate_header_addr, + .output_bvh_offset = bvh_layout.bvh_offset, + .leaf_node_count = leaf_count, + .geometry_type = geometry_type, + }; + + VkPushConstantsInfoKHR push_info = { + .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR, + .layout = layout, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(args), + .pValues = &args, + }; + + anv_CmdPushConstants2KHR(commandBuffer, &push_info); + + struct anv_address indirect_addr = + anv_address_from_u64(intermediate_header_addr + + 
offsetof(struct vk_ir_header, ir_internal_node_count)); + anv_genX(cmd_buffer->device->info, cmd_buffer_dispatch_indirect) + (cmd_buffer, indirect_addr, true /* is_unaligned_size_x */); +} + +static uint32_t +anv_get_header_key(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags) +{ + return (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) ? + 1 : 0; +} + +static VkResult +anv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + if (key == 1) { + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = get_pipeline_spv(cmd_buffer->device, "header", + header_spv, sizeof(header_spv), + sizeof(struct header_args), &pipeline, + &layout); + if (result != VK_SUCCESS) + return result; + + anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + } + + return VK_SUCCESS; +} + +static void +anv_init_header(VkCommandBuffer commandBuffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_device *device = cmd_buffer->device; + + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); + + struct bvh_layout bvh_layout; + get_bvh_layout(geometry_type, leaf_count, &bvh_layout); + + VkDeviceAddress header_addr = vk_acceleration_structure_get_va(dst); + + UNUSED size_t base = offsetof(struct anv_accel_struct_header, + copy_dispatch_size); + + uint32_t instance_count = geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ? + leaf_count : 0; + + if (key == 1) { + /* Add a barrier to ensure the writes from encode.comp are ready to be + * read by header.comp + */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + /* VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR is set, so we + * want to populate header.compacted_size with the compacted size, which + * needs to be calculated by using ir_header.dst_node_offset, which we'll + * access in header.comp.
+ */ + base = offsetof(struct anv_accel_struct_header, instance_count); + + VkPipeline pipeline; + VkPipelineLayout layout; + get_pipeline_spv(device, "header", header_spv, sizeof(header_spv), + sizeof(struct header_args), &pipeline, &layout); + + struct header_args args = { + .src = intermediate_header_addr, + .dst = vk_acceleration_structure_get_va(dst), + .bvh_offset = bvh_layout.bvh_offset, + .instance_count = instance_count, + }; + + VkPushConstantsInfoKHR push_info = { + .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR, + .layout = layout, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(args), + .pValues = &args, + }; + + anv_CmdPushConstants2KHR(commandBuffer, &push_info); + vk_common_CmdDispatch(commandBuffer, 1, 1, 1); + } else { + vk_barrier_compute_w_to_host_r(commandBuffer); + + struct anv_accel_struct_header header = {}; + + header.instance_count = instance_count; + header.self_ptr = header_addr; + header.compacted_size = bvh_layout.size; + + /* 128 is local_size_x in copy.comp shader, 8 is the amount of data + * copied by each iteration of that shader's loop + */ + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, + 8 * 128); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + + sizeof(struct vk_accel_struct_serialization_header) + + sizeof(uint64_t) * header.instance_count; + + header.size = header.compacted_size; + + size_t header_size = sizeof(struct anv_accel_struct_header) - base; + assert(base % sizeof(uint32_t) == 0); + assert(header_size % sizeof(uint32_t) == 0); + uint32_t *header_ptr = (uint32_t *)((char *)&header + base); + + struct anv_address addr = anv_address_from_u64(header_addr + base); + anv_cmd_buffer_update_addr(cmd_buffer, addr, 0, header_size, + header_ptr, false); + } +} + +static const struct vk_acceleration_structure_build_ops anv_build_ops = { + .get_as_size = anv_get_as_size, + .get_encode_key = { anv_get_encode_key, anv_get_header_key }, + .encode_bind_pipeline = { anv_encode_bind_pipeline, + anv_init_header_bind_pipeline }, + .encode_as = { anv_encode_as, anv_init_header }, +}; + +static VkResult +anv_device_init_accel_struct_build_state(struct anv_device *device) +{ + VkResult result = VK_SUCCESS; + simple_mtx_lock(&device->accel_struct_build.mutex); + + if (device->accel_struct_build.radix_sort) + goto exit; + + const struct radix_sort_vk_target_config radix_sort_config = { + .keyval_dwords = 2, + .init = { .workgroup_size_log2 = 8, }, + .fill = { .workgroup_size_log2 = 8, .block_rows = 8 }, + .histogram = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = device->info->ver >= 20 ? 4 : 3, + .block_rows = 14, + }, + .prefix = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = device->info->ver >= 20 ? 4 : 3, + }, + .scatter = { + .workgroup_size_log2 = 8, + .subgroup_size_log2 = device->info->ver >= 20 ? 
4 : 3, + .block_rows = 14, + }, + }; + + device->accel_struct_build.radix_sort = + vk_create_radix_sort_u64(anv_device_to_handle(device), + &device->vk.alloc, + VK_NULL_HANDLE, radix_sort_config); + + device->vk.as_build_ops = &anv_build_ops; + device->vk.write_buffer_cp = anv_cmd_write_buffer_cp; + device->vk.flush_buffer_write_cp = anv_cmd_flush_buffer_write_cp; + device->vk.cmd_dispatch_unaligned = anv_cmd_dispatch_unaligned; + device->vk.cmd_fill_buffer_addr = anv_cmd_fill_buffer_addr; + + device->accel_struct_build.build_args = + (struct vk_acceleration_structure_build_args) { + .subgroup_size = device->info->ver >= 20 ? 16 : 8, + .radix_sort = device->accel_struct_build.radix_sort, + /* See struct anv_accel_struct_header from anv_bvh.h + * + * Root pointer starts at offset 0 and bounding box starts at offset 8. + */ + .bvh_bounds_offset = 8, + }; + +exit: + simple_mtx_unlock(&device->accel_struct_build.mutex); + return result; +} + void genX(GetAccelerationStructureBuildSizesKHR)( VkDevice _device, @@ -45,6 +428,12 @@ genX(GetAccelerationStructureBuildSizesKHR)( const uint32_t* pMaxPrimitiveCounts, VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) { + ANV_FROM_HANDLE(anv_device, device, _device); + if (anv_device_init_accel_struct_build_state(device) != VK_SUCCESS) + return; + + vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts, + pSizeInfo, &device->accel_struct_build.build_args); } void @@ -53,6 +442,18 @@ genX(GetDeviceAccelerationStructureCompatibilityKHR)( const VkAccelerationStructureVersionInfoKHR* pVersionInfo, VkAccelerationStructureCompatibilityKHR* pCompatibility) { + ANV_FROM_HANDLE(anv_device, device, _device); + struct vk_accel_struct_serialization_header* ser_header = + (struct vk_accel_struct_serialization_header*)(pVersionInfo->pVersionData); + + if (memcmp(ser_header->accel_struct_compat, + device->physical->rt_uuid, + sizeof(device->physical->rt_uuid)) == 0) { + *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR; + } else { + *pCompatibility = + VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; + } } void @@ -62,6 +463,27 @@ genX(CmdBuildAccelerationStructuresKHR)( const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) { + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_device *device = cmd_buffer->device; + + VkResult result = anv_device_init_accel_struct_build_state(device); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd_buffer->vk, result); + return; + } + + struct anv_cmd_saved_state saved; + anv_cmd_buffer_save_state(cmd_buffer, + ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE | + ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL | + ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved); + + vk_cmd_build_acceleration_structures(commandBuffer, &device->vk, + &device->meta_device, infoCount, + pInfos, ppBuildRangeInfos, + &device->accel_struct_build.build_args); + + anv_cmd_buffer_restore_state(cmd_buffer, &saved); } void @@ -81,6 +503,61 @@ genX(CmdCopyAccelerationStructureKHR)( VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureInfoKHR* pInfo) { + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); + VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = get_pipeline_spv(cmd_buffer->device, "copy", copy_spv, + sizeof(copy_spv), sizeof(struct copy_args),
&pipeline, &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd_buffer->vk, result); + return; + } + + struct anv_cmd_saved_state saved; + anv_cmd_buffer_save_state(cmd_buffer, + ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE | + ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL | + ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved); + + anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + + struct copy_args consts = { + .src_addr = vk_acceleration_structure_get_va(src), + .dst_addr = vk_acceleration_structure_get_va(dst), + .mode = ANV_COPY_MODE_COPY, + }; + + VkPushConstantsInfoKHR push_info = { + .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR, + .layout = layout, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(consts), + .pValues = &consts, + }; + + anv_CmdPushConstants2KHR(commandBuffer, &push_info); + + /* L1/L2 cache flushes should have been dealt with by pipeline barriers. + * Unfortunately some platforms require L3 flush because CS (reading the + * dispatch parameters) is not L3 coherent. + */ + if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) { + anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_DATA_CACHE_FLUSH_BIT, + "bvh size read for dispatch"); + } + + anv_genX(cmd_buffer->device->info, CmdDispatchIndirect)( + commandBuffer, src->buffer, + src->offset + offsetof(struct anv_accel_struct_header, + copy_dispatch_size)); + + anv_cmd_buffer_restore_state(cmd_buffer, &saved); } void @@ -88,6 +565,66 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)( VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) { + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); + + struct anv_device *device = cmd_buffer->device; + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = get_pipeline_spv(device, "copy", copy_spv, + sizeof(copy_spv), + sizeof(struct copy_args), &pipeline, + &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd_buffer->vk, result); + return; + } + + struct anv_cmd_saved_state saved; + anv_cmd_buffer_save_state(cmd_buffer, + ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE | + ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL | + ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved); + + anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + + struct copy_args consts = { + .src_addr = vk_acceleration_structure_get_va(src), + .dst_addr = pInfo->dst.deviceAddress, + .mode = ANV_COPY_MODE_SERIALIZE, + }; + + memcpy(consts.driver_uuid, device->physical->driver_uuid, VK_UUID_SIZE); + memcpy(consts.accel_struct_compat, device->physical->rt_uuid, VK_UUID_SIZE); + + VkPushConstantsInfoKHR push_info = { + .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR, + .layout = layout, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(consts), + .pValues = &consts, + }; + + anv_CmdPushConstants2KHR(commandBuffer, &push_info); + + /* L1/L2 cache flushes should have been dealt with by pipeline barriers. + * Unfortunately some platforms require L3 flush because CS (reading the + * dispatch parameters) is not L3 coherent.
+ */ + if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DATA_CACHE_FLUSH_BIT, + "bvh size read for dispatch"); + } + + anv_genX(device->info, CmdDispatchIndirect)( + commandBuffer, src->buffer, + src->offset + offsetof(struct anv_accel_struct_header, + copy_dispatch_size)); + + anv_cmd_buffer_restore_state(cmd_buffer, &saved); } void @@ -95,6 +632,48 @@ genX(CmdCopyMemoryToAccelerationStructureKHR)( VkCommandBuffer commandBuffer, const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) { + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); + + VkPipeline pipeline; + VkPipelineLayout layout; + VkResult result = get_pipeline_spv(cmd_buffer->device, "copy", copy_spv, + sizeof(copy_spv), + sizeof(struct copy_args), &pipeline, + &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd_buffer->vk, result); + return; + } + + struct anv_cmd_saved_state saved; + anv_cmd_buffer_save_state(cmd_buffer, + ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE | + ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL | + ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved); + + anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + + const struct copy_args consts = { + .src_addr = pInfo->src.deviceAddress, + .dst_addr = vk_acceleration_structure_get_va(dst), + .mode = ANV_COPY_MODE_DESERIALIZE, + }; + + VkPushConstantsInfoKHR push_info = { + .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR, + .layout = layout, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(consts), + .pValues = &consts, + }; + + anv_CmdPushConstants2KHR(commandBuffer, &push_info); + + vk_common_CmdDispatch(commandBuffer, 512, 1, 1); + anv_cmd_buffer_restore_state(cmd_buffer, &saved); } /* TODO: Host commands */ diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index d22147149ba..ab093f202e2 100644 --- a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -31,7 +31,7 @@ #include "vk_standard_sample_locations.h" -#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT +#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL #include "grl/genX_grl.h" #endif @@ -863,9 +863,15 @@ void genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice) { assert(pdevice->info.verx10 == GFX_VERx10); + #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT +#if ANV_SUPPORT_RT_GRL genX(grl_load_rt_uuid)(pdevice->rt_uuid); pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)(); +#else + STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE); + memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE); +#endif #endif pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp); diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index a084a61b09e..a69139cc56c 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -1052,19 +1052,19 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, } } -static void -emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer, - struct anv_query_pool *pool, - const char *reason) +static bool +append_query_clear_flush(struct anv_cmd_buffer *cmd_buffer, + struct anv_query_pool *pool, + const char *reason) { if (cmd_buffer->state.queries.clear_bits == 0) - return; + return false; anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_QUERY_BITS( cmd_buffer->state.queries.clear_bits), reason); - genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + return true; } @@ -1079,7 
+1079,9 @@ void genX(CmdBeginQueryIndexedEXT)( ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); struct anv_address query_addr = anv_query_address(pool, query); - emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears"); + if (append_query_clear_flush(cmd_buffer, pool, + "CmdBeginQuery* flush query clears")) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); struct mi_builder b; mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); @@ -1532,8 +1534,9 @@ void genX(CmdWriteTimestamp2)( assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP); - emit_query_clear_flush(cmd_buffer, pool, - "CmdWriteTimestamp flush query clears"); + if (append_query_clear_flush(cmd_buffer, pool, + "CmdWriteTimestamp flush query clears")) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); struct mi_builder b; mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); @@ -2014,11 +2017,15 @@ void genX(CmdCopyQueryPoolResults)( #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT +#if ANV_SUPPORT_RT_GRL #include "grl/include/GRLRTASCommon.h" #include "grl/grl_metakernel_postbuild_info.h" +#else +#include "bvh/anv_bvh.h" +#endif -static void -anv_write_acceleration_structure_properties_grl( +void +genX(CmdWriteAccelerationStructuresPropertiesKHR)( VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount, const VkAccelerationStructureKHR* pAccelerationStructures, @@ -2026,15 +2033,31 @@ anv_write_acceleration_structure_properties_grl( VkQueryPool queryPool, uint32_t firstQuery) { + assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR); + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - emit_query_clear_flush(cmd_buffer, pool, - "CmdWriteAccelerationStructuresPropertiesKHR flush query clears"); +#if !ANV_SUPPORT_RT_GRL + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_END_OF_PIPE_SYNC_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT, + "read BVH data using CS"); +#endif + + if (append_query_clear_flush( + cmd_buffer, pool, + "CmdWriteAccelerationStructuresPropertiesKHR flush query clears") || + !ANV_SUPPORT_RT_GRL) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); struct mi_builder b; mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); +#if ANV_SUPPORT_RT_GRL for (uint32_t i = 0; i < accelerationStructureCount; i++) { ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]); struct anv_address query_addr = @@ -2076,26 +2099,47 @@ anv_write_acceleration_structure_properties_grl( for (uint32_t i = 0; i < accelerationStructureCount; i++) emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true); -} -void -genX(CmdWriteAccelerationStructuresPropertiesKHR)( - VkCommandBuffer commandBuffer, - uint32_t accelerationStructureCount, - const VkAccelerationStructureKHR* pAccelerationStructures, - VkQueryType queryType, - VkQueryPool queryPool, - uint32_t firstQuery) -{ - assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || - queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || - queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR || - queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR); +#else + for (uint32_t i = 0; i < 
accelerationStructureCount; i++) { + ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]); + struct anv_address query_addr = + anv_address_add(anv_query_address(pool, firstQuery + i), 8); + uint64_t va = vk_acceleration_structure_get_va(accel); - anv_write_acceleration_structure_properties_grl(commandBuffer, - accelerationStructureCount, - pAccelerationStructures, - queryType, queryPool, - firstQuery); + mi_builder_set_write_check(&b, (i == (accelerationStructureCount - 1))); + + switch (queryType) { + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + va += offsetof(struct anv_accel_struct_header, compacted_size); + break; + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + va += offsetof(struct anv_accel_struct_header, size); + break; + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + va += offsetof(struct anv_accel_struct_header, serialization_size); + break; + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: + va += offsetof(struct anv_accel_struct_header, instance_count); + /* To respect current set up tailored for GRL, the numBlasPtrs are + * stored at the second slot (third slot, if you count availability) + */ + query_addr = anv_address_add(query_addr, 8); + break; + default: + unreachable("unhandled query type"); + } + + mi_store(&b, mi_mem64(query_addr), mi_mem64(anv_address_from_u64(va))); + } + + struct mi_builder b1; + mi_builder_init(&b1, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < accelerationStructureCount; i++) { + mi_builder_set_write_check(&b1, (i == (accelerationStructureCount - 1))); + emit_query_mi_availability(&b1, anv_query_address(pool, firstQuery + i), true); + } +#endif /* ANV_SUPPORT_RT_GRL */ } -#endif +#endif /* GFX_VERx10 >= 125 && ANV_SUPPORT_RT */ diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index db36748a7b7..4dd08297851 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -38,13 +38,22 @@ idep_anv_headers = declare_dependency( ) if with_intel_vk_rt - subdir('grl') - optional_libgrl = [libgrl] + if with_intel_bvh_grl + subdir('grl') + optional_libgrl = [libgrl] + anv_flags += '-DANV_SUPPORT_RT_GRL=1' + else + subdir('bvh') + idep_grl = null_dep + optional_libgrl = [] + anv_flags += '-DANV_SUPPORT_RT_GRL=0' + endif anv_flags += '-DANV_SUPPORT_RT=1' else idep_grl = null_dep optional_libgrl = [] anv_flags += '-DANV_SUPPORT_RT=0' + anv_flags += '-DANV_SUPPORT_RT_GRL=0' endif intel_icd = custom_target( @@ -101,9 +110,15 @@ anv_per_hw_ver_files = files( 'genX_simple_shader.c', ) if with_intel_vk_rt - anv_per_hw_ver_files += files( - 'genX_acceleration_structure_grl.c', - ) + if with_intel_bvh_grl + anv_per_hw_ver_files += files( + 'genX_acceleration_structure_grl.c', + ) + else + anv_per_hw_ver_files += files( + 'genX_acceleration_structure.c', + ) + endif endif foreach _gfx_ver : ['90', '110', '120', '125', '200', '300'] @@ -194,6 +209,8 @@ libanv_files = files( 'anv_va.c', 'anv_video.c', 'anv_wsi.c', + 'bvh/anv_bvh.h', + 'bvh/anv_build_interface.h', ) anv_deps = [ @@ -248,7 +265,7 @@ libanv_common = static_library( libvulkan_intel = shared_library( 'vulkan_intel', - [files('anv_gem.c'), anv_entrypoints[0]], + [files('anv_gem.c'), anv_entrypoints[0], bvh_spv], include_directories : [ inc_include, inc_src, inc_intel, ],
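
For reference, below is a minimal, standalone C sketch of the offset arithmetic that get_bvh_layout() in this patch performs for the TLAS (instance) case. It is illustrative only: HEADER_SIZE, INTERNAL_NODE_SIZE and INSTANCE_LEAF_SIZE are placeholder values standing in for the real ANV_RT_* constants from bvh/anv_bvh.h (which the STATIC_ASSERTs in anv_encode_as() check against the driver structs).

#include <inttypes.h>
#include <stdio.h>

#define HEADER_SIZE        128u /* placeholder for ANV_RT_BVH_HEADER_SIZE */
#define INTERNAL_NODE_SIZE  64u /* placeholder for ANV_RT_INTERNAL_NODE_SIZE */
#define INSTANCE_LEAF_SIZE 128u /* placeholder for ANV_RT_INSTANCE_LEAF_SIZE */

static uint64_t align64(uint64_t v) { return (v + 63u) & ~UINT64_C(63); }

/* Mirrors get_bvh_layout() for VK_GEOMETRY_TYPE_INSTANCES_KHR. */
static uint64_t tlas_size(uint32_t leaf_count, uint64_t *bvh_offset)
{
   /* A binary tree over N leaves has at most MAX2(N, 2) - 1 internal nodes. */
   uint32_t internal_count = (leaf_count > 2 ? leaf_count : 2) - 1;

   uint64_t offset = HEADER_SIZE;
   offset += leaf_count * sizeof(uint64_t); /* per-instance leaf addresses */
   offset = align64(offset);                /* RT nodes need 64-byte alignment */
   *bvh_offset = offset;

   offset += (uint64_t)internal_count * INTERNAL_NODE_SIZE;
   offset += (uint64_t)leaf_count * INSTANCE_LEAF_SIZE;
   return offset;
}

int main(void)
{
   uint64_t bvh_offset;
   uint64_t size = tlas_size(4, &bvh_offset);
   printf("4-instance TLAS: bvh_offset=%" PRIu64 " size=%" PRIu64 "\n",
          bvh_offset, size);
   return 0;
}

With four instances and these placeholder sizes this prints a bvh_offset of 192 and a total size of 896 bytes; the real driver constants differ, but the ordering is the same as in the patch: header, per-instance leaf-address table, 64-byte-aligned internal nodes, then the leaves.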