anv: Implement acceleration structure API

Rework (Kevin):
- Properly set up bvh_layout (see the sketch after this list)
   Our bvh resides in contiguous memory and can be divided into two sections:
      1. anv_accel_struct_header, tightly followed by
      2. the actual bvh, which starts with the root node, followed by
         interleaved internal nodes and leaves.
- Update comments for some BVH and node fields.
- Properly populate the UUIDs in the serialization header
- Separate the header function into two completely independent paths based
  on the compaction bit
- Encode rt_uuid at the second VK_UUID_SIZE offset.
- Write the query result at the correct slot
- Add an assertion for 4-byte alignment
- Move bvh_layout to anv_bvh
- Use a meson option to decide which files to compile
- Drop the alignment of the serialization size; it is not needed
- Change static_assert to STATIC_ASSERT and move the asserts inside functions
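
A minimal sketch of the layout math described in the first bullet
(mirroring get_bvh_layout() in genX_acceleration_structure.c below; the
ANV_RT_* sizes are assumed to come from anv_bvh.h):

   /* Section 1: the header, plus per-instance leaf addresses for a TLAS.
    * Section 2: the BVH itself, 64-byte aligned, root node first. */
   uint64_t offset = ANV_RT_BVH_HEADER_SIZE;
   if (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR)
      offset += leaf_count * sizeof(uint64_t);
   layout->bvh_offset = ALIGN(offset, 64); /* root node is encoded here */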

Rework (Sagar):
- Use anv_cmd_buffer_update_buffer instead of MI commands to copy data

Rework (Lionel):
- Remove the flush after builds, and add a flush in copy before the dispatch
- Properly handle the flushes in CmdWriteAccelerationStructuresPropertiesKHR

Co-authored-by: Kevin Chuang <kaiwenjon23@gmail.com>
Co-authored-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31588>
Commit: 41baeb3810 (parent: 9002e52037)
Author: Sagar Ghuge, 2024-06-12 18:06:22 -07:00
Committed by: Marge Bot
9 changed files with 799 additions and 62 deletions


@@ -316,6 +316,10 @@ with_intel_vk_rt = get_option('intel-rt') \
.disable_if(host_machine.cpu_family() != 'x86_64', error_message : 'Intel Ray Tracing is only supported on x86_64') \
.allowed()
if with_intel_vk_rt
with_intel_bvh_grl = get_option('intel-bvh-grl')
endif
with_any_intel = [
with_gallium_crocus,
with_gallium_i915,


@@ -657,6 +657,13 @@ option(
description : 'Build the intel-clc compiler or use a system version.'
)
option(
'intel-bvh-grl',
type : 'boolean',
value : true,
description : 'Build the BVH structure using GRL.'
)
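# Configure with -Dintel-bvh-grl=false to select the new shader-based BVH
# builder instead of GRL.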
option(
'install-intel-clc',
type : 'boolean',


@@ -985,14 +985,22 @@ VkResult anv_CreateDevice(
anv_device_utrace_init(device);
result = anv_genX(device->info, init_device_state)(device);
result = vk_meta_device_init(&device->vk, &device->meta_device);
if (result != VK_SUCCESS)
goto fail_utrace;
result = anv_genX(device->info, init_device_state)(device);
if (result != VK_SUCCESS)
goto fail_meta_device;
simple_mtx_init(&device->accel_struct_build.mutex, mtx_plain);
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
fail_meta_device:
vk_meta_device_finish(&device->vk, &device->meta_device);
fail_utrace:
anv_device_utrace_finish(device);
fail_queues:
@@ -1118,6 +1126,12 @@ void anv_DestroyDevice(
/* Do TRTT batch garbage collection before destroying queues. */
anv_device_finish_trtt(device);
if (device->accel_struct_build.radix_sort) {
radix_sort_vk_destroy(device->accel_struct_build.radix_sort,
_device, &device->vk.alloc);
}
vk_meta_device_finish(&device->vk, &device->meta_device);
anv_device_utrace_finish(device);
for (uint32_t i = 0; i < device->queue_count; i++)
@@ -1218,6 +1232,8 @@ void anv_DestroyDevice(
pthread_cond_destroy(&device->queue_submit);
pthread_mutex_destroy(&device->mutex);
simple_mtx_destroy(&device->accel_struct_build.mutex);
ralloc_free(device->fp64_nir);
anv_device_destroy_context_or_vm(device);


@@ -102,6 +102,7 @@
#include "vk_log.h"
#include "vk_ycbcr_conversion.h"
#include "vk_video.h"
#include "vk_meta.h"
#ifdef __cplusplus
extern "C" {
@@ -1828,6 +1829,11 @@ enum anv_rt_bvh_build_method {
ANV_BVH_BUILD_METHOD_NEW_SAH,
};
/* If serialization-breaking or algorithm-breaking changes are made,
* increment the digits at the end
*/
#define ANV_RT_UUID_MACRO "ANV_RT_BVH_0001"
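/* 15 characters + NUL terminator == VK_UUID_SIZE (16); verified by a
 * STATIC_ASSERT in genX(init_physical_device_state).
 */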
struct anv_device_astc_emu {
struct vk_texcompress_astc_state *texcompress;
@@ -2102,6 +2108,14 @@ struct anv_device {
*/
struct util_dynarray prints;
} printf;
struct {
simple_mtx_t mutex;
struct radix_sort_vk *radix_sort;
struct vk_acceleration_structure_build_args build_args;
} accel_struct_build;
struct vk_meta_device meta_device;
};
static inline uint32_t


@@ -0,0 +1,50 @@
# Copyright © 2022 Konstantin Seurer
# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: MIT
# source file, output name, defines
bvh_shaders = [
[
'encode.comp',
'encode',
[],
],
[
'header.comp',
'header',
[],
],
[
'copy.comp',
'copy',
[]
],
]
anv_bvh_include_dir = dir_source_root + '/src/intel/vulkan/bvh'
anv_bvh_includes = files(
'anv_build_helpers.h',
'anv_build_interface.h',
'anv_bvh.h',
)
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + anv_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
]
command += glslang_quiet
foreach define : s[2]
command += '-D' + define
endforeach
bvh_spv += custom_target(
s[1] + '.spv.h',
input : s[0],
output : s[1] + '.spv.h',
command : command,
depend_files: [vk_bvh_includes, anv_bvh_includes],
)
endforeach


@@ -1,24 +1,5 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
/* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "anv_private.h"
@@ -35,8 +16,410 @@
#include "ds/intel_tracepoints.h"
#include "bvh/anv_build_interface.h"
#include "vk_acceleration_structure.h"
#include "radix_sort/radix_sort_u64.h"
#include "radix_sort/common/vk/barrier.h"
#include "vk_common_entrypoints.h"
#include "genX_mi_builder.h"
#if GFX_VERx10 >= 125
static const uint32_t encode_spv[] = {
#include "bvh/encode.spv.h"
};
static const uint32_t header_spv[] = {
#include "bvh/header.spv.h"
};
static const uint32_t copy_spv[] = {
#include "bvh/copy.spv.h"
};
static VkResult
get_pipeline_spv(struct anv_device *device,
const char *name, const uint32_t *spv, uint32_t spv_size,
unsigned push_constant_size, VkPipeline *pipeline,
VkPipelineLayout *layout)
{
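/* The shader name doubles as the vk_meta cache key for both the pipeline
 * layout and the pipeline itself.
 */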
size_t key_size = strlen(name);
const VkPushConstantRange pc_range = {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = push_constant_size,
};
VkResult result = vk_meta_get_pipeline_layout(&device->vk,
&device->meta_device, NULL,
&pc_range, name, key_size,
layout);
if (result != VK_SUCCESS)
return result;
VkPipeline pipeline_from_cache =
vk_meta_lookup_pipeline(&device->meta_device, name, key_size);
if (pipeline_from_cache != VK_NULL_HANDLE) {
*pipeline = pipeline_from_cache;
return VK_SUCCESS;
}
VkShaderModuleCreateInfo module_info = {
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
.pNext = NULL,
.flags = 0,
.codeSize = spv_size,
.pCode = spv,
};
VkPipelineShaderStageCreateInfo shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.pNext = &module_info,
.flags = 0,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.flags = 0,
.stage = shader_stage,
.layout = *layout,
};
return vk_meta_create_compute_pipeline(&device->vk, &device->meta_device,
&pipeline_info, name, key_size, pipeline);
}
static void
get_bvh_layout(VkGeometryTypeKHR geometry_type, uint32_t leaf_count,
struct bvh_layout *layout)
{
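/* A binary tree over leaf_count leaves has leaf_count - 1 internal nodes;
 * clamp so degenerate BVHs still get one root internal node.
 */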
uint32_t internal_count = MAX2(leaf_count, 2) - 1;
uint64_t offset = ANV_RT_BVH_HEADER_SIZE;
/* For a TLAS, we store the addresses of the anv_instance_leaf structures
 * right after the header. This is for quick access in copy.comp.
 */
if (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) {
offset += leaf_count * sizeof(uint64_t);
}
/* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */
offset = ALIGN(offset, 64);
/* This is where internal_nodes/leaves start to be encoded */
layout->bvh_offset = offset;
offset += internal_count * ANV_RT_INTERNAL_NODE_SIZE;
switch (geometry_type) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
/* Currently we encode one triangle within one quad leaf */
offset += leaf_count * ANV_RT_QUAD_LEAF_SIZE;
break;
case VK_GEOMETRY_TYPE_AABBS_KHR:
offset += leaf_count * ANV_RT_PROCEDURAL_LEAF_SIZE;
break;
case VK_GEOMETRY_TYPE_INSTANCES_KHR:
offset += leaf_count * ANV_RT_INSTANCE_LEAF_SIZE;
break;
default:
unreachable("Unknown VkGeometryTypeKHR");
}
layout->size = offset;
}
static VkDeviceSize
anv_get_as_size(VkDevice device,
const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
uint32_t leaf_count)
{
struct bvh_layout layout;
get_bvh_layout(vk_get_as_geometry_type(pBuildInfo), leaf_count, &layout);
return layout.size;
}
static uint32_t
anv_get_encode_key(VkAccelerationStructureTypeKHR type,
VkBuildAccelerationStructureFlagBitsKHR flags)
{
return 0;
}
static VkResult
anv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key)
{
VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_device *device = cmd_buffer->device;
VkPipeline pipeline;
VkPipelineLayout layout;
VkResult result = get_pipeline_spv(device, "encode", encode_spv,
sizeof(encode_spv),
sizeof(struct encode_args), &pipeline,
&layout);
if (result != VK_SUCCESS)
return result;
anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
return VK_SUCCESS;
}
static void
anv_encode_as(VkCommandBuffer commandBuffer,
const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos,
VkDeviceAddress intermediate_as_addr,
VkDeviceAddress intermediate_header_addr, uint32_t leaf_count,
uint32_t key,
struct vk_acceleration_structure *dst)
{
VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_device *device = cmd_buffer->device;
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info);
VkPipeline pipeline;
VkPipelineLayout layout;
get_pipeline_spv(device, "encode", encode_spv, sizeof(encode_spv),
sizeof(struct encode_args), &pipeline, &layout);
STATIC_ASSERT(sizeof(struct anv_accel_struct_header) == ANV_RT_BVH_HEADER_SIZE);
STATIC_ASSERT(sizeof(struct anv_instance_leaf) == ANV_RT_INSTANCE_LEAF_SIZE);
STATIC_ASSERT(sizeof(struct anv_quad_leaf_node) == ANV_RT_QUAD_LEAF_SIZE);
STATIC_ASSERT(sizeof(struct anv_procedural_leaf_node) == ANV_RT_PROCEDURAL_LEAF_SIZE);
STATIC_ASSERT(sizeof(struct anv_internal_node) == ANV_RT_INTERNAL_NODE_SIZE);
struct bvh_layout bvh_layout;
get_bvh_layout(geometry_type, leaf_count, &bvh_layout);
const struct encode_args args = {
.intermediate_bvh = intermediate_as_addr,
.output_bvh = vk_acceleration_structure_get_va(dst) +
bvh_layout.bvh_offset,
.header = intermediate_header_addr,
.output_bvh_offset = bvh_layout.bvh_offset,
.leaf_node_count = leaf_count,
.geometry_type = geometry_type,
};
VkPushConstantsInfoKHR push_info = {
.sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
.layout = layout,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(args),
.pValues = &args,
};
anv_CmdPushConstants2KHR(commandBuffer, &push_info);
struct anv_address indirect_addr =
anv_address_from_u64(intermediate_header_addr +
offsetof(struct vk_ir_header, ir_internal_node_count));
anv_genX(cmd_buffer->device->info, cmd_buffer_dispatch_indirect)
(cmd_buffer, indirect_addr, true /* is_unaligned_size_x */);
}
static uint32_t
anv_get_header_key(VkAccelerationStructureTypeKHR type,
VkBuildAccelerationStructureFlagBitsKHR flags)
{
return (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) ?
1 : 0;
}
static VkResult
anv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key)
{
VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
if (key == 1) {
VkPipeline pipeline;
VkPipelineLayout layout;
VkResult result = get_pipeline_spv(cmd_buffer->device, "header",
header_spv, sizeof(header_spv),
sizeof(struct header_args), &pipeline,
&layout);
if (result != VK_SUCCESS)
return result;
anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
}
return VK_SUCCESS;
}
static void
anv_init_header(VkCommandBuffer commandBuffer,
const VkAccelerationStructureBuildGeometryInfoKHR *build_info,
const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos,
VkDeviceAddress intermediate_as_addr,
VkDeviceAddress intermediate_header_addr, uint32_t leaf_count,
uint32_t key,
struct vk_acceleration_structure *dst)
{
VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_device *device = cmd_buffer->device;
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info);
struct bvh_layout bvh_layout;
get_bvh_layout(geometry_type, leaf_count, &bvh_layout);
VkDeviceAddress header_addr = vk_acceleration_structure_get_va(dst);
UNUSED size_t base = offsetof(struct anv_accel_struct_header,
copy_dispatch_size);
uint32_t instance_count = geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ?
leaf_count : 0;
if (key == 1) {
/* Add a barrier to ensure the writes from encode.comp are ready to be
 * read by header.comp.
 */
vk_barrier_compute_w_to_compute_r(commandBuffer);
/* VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR is set, so we
 * want to populate header.compacted_size with the compacted size, which
 * is calculated from ir_header.dst_node_offset, read in header.comp.
 */
base = offsetof(struct anv_accel_struct_header, instance_count);
VkPipeline pipeline;
VkPipelineLayout layout;
get_pipeline_spv(device, "header", header_spv, sizeof(header_spv),
sizeof(struct header_args), &pipeline, &layout);
struct header_args args = {
.src = intermediate_header_addr,
.dst = vk_acceleration_structure_get_va(dst),
.bvh_offset = bvh_layout.bvh_offset,
.instance_count = instance_count,
};
VkPushConstantsInfoKHR push_info = {
.sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
.layout = layout,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(args),
.pValues = &args,
};
anv_CmdPushConstants2KHR(commandBuffer, &push_info);
vk_common_CmdDispatch(commandBuffer, 1, 1, 1);
} else {
vk_barrier_compute_w_to_host_r(commandBuffer);
struct anv_accel_struct_header header = {};
header.instance_count = instance_count;
header.self_ptr = header_addr;
header.compacted_size = bvh_layout.size;
/* 128 is local_size_x in the copy.comp shader, 8 is the number of bytes
 * copied by each iteration of that shader's loop.
 */
header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size,
8 * 128);
header.copy_dispatch_size[1] = 1;
header.copy_dispatch_size[2] = 1;
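/* The serialized blob is the vk serialization header, one BLAS pointer
 * per instance, then the compacted BVH.
 */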
header.serialization_size =
header.compacted_size +
sizeof(struct vk_accel_struct_serialization_header) +
sizeof(uint64_t) * header.instance_count;
header.size = header.compacted_size;
size_t header_size = sizeof(struct anv_accel_struct_header) - base;
assert(base % sizeof(uint32_t) == 0);
assert(header_size % sizeof(uint32_t) == 0);
uint32_t *header_ptr = (uint32_t *)((char *)&header + base);
struct anv_address addr = anv_address_from_u64(header_addr + base);
anv_cmd_buffer_update_addr(cmd_buffer, addr, 0, header_size,
header_ptr, false);
}
}
static const struct vk_acceleration_structure_build_ops anv_build_ops = {
.get_as_size = anv_get_as_size,
.get_encode_key = { anv_get_encode_key, anv_get_header_key },
.encode_bind_pipeline = { anv_encode_bind_pipeline,
anv_init_header_bind_pipeline },
.encode_as = { anv_encode_as, anv_init_header },
};
static VkResult
anv_device_init_accel_struct_build_state(struct anv_device *device)
{
VkResult result = VK_SUCCESS;
simple_mtx_lock(&device->accel_struct_build.mutex);
if (device->accel_struct_build.radix_sort)
goto exit;
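/* Subgroup sizes: Xe2+ (ver >= 20) runs SIMD16 (log2 = 4), earlier gens
 * SIMD8 (log2 = 3); this matches build_args.subgroup_size below.
 */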
const struct radix_sort_vk_target_config radix_sort_config = {
.keyval_dwords = 2,
.init = { .workgroup_size_log2 = 8, },
.fill = { .workgroup_size_log2 = 8, .block_rows = 8 },
.histogram = {
.workgroup_size_log2 = 8,
.subgroup_size_log2 = device->info->ver >= 20 ? 4 : 3,
.block_rows = 14,
},
.prefix = {
.workgroup_size_log2 = 8,
.subgroup_size_log2 = device->info->ver >= 20 ? 4 : 3,
},
.scatter = {
.workgroup_size_log2 = 8,
.subgroup_size_log2 = device->info->ver >= 20 ? 4 : 3,
.block_rows = 14,
},
};
device->accel_struct_build.radix_sort =
vk_create_radix_sort_u64(anv_device_to_handle(device),
&device->vk.alloc,
VK_NULL_HANDLE, radix_sort_config);
device->vk.as_build_ops = &anv_build_ops;
device->vk.write_buffer_cp = anv_cmd_write_buffer_cp;
device->vk.flush_buffer_write_cp = anv_cmd_flush_buffer_write_cp;
device->vk.cmd_dispatch_unaligned = anv_cmd_dispatch_unaligned;
device->vk.cmd_fill_buffer_addr = anv_cmd_fill_buffer_addr;
device->accel_struct_build.build_args =
(struct vk_acceleration_structure_build_args) {
.subgroup_size = device->info->ver >= 20 ? 16 : 8,
.radix_sort = device->accel_struct_build.radix_sort,
/* See struct anv_accel_struct_header in anv_bvh.h.
 *
 * The root pointer starts at offset 0 and the bounding box starts at
 * offset 8.
 */
.bvh_bounds_offset = 8,
};
exit:
simple_mtx_unlock(&device->accel_struct_build.mutex);
return result;
}
void
genX(GetAccelerationStructureBuildSizesKHR)(
VkDevice _device,
@@ -45,6 +428,12 @@ genX(GetAccelerationStructureBuildSizesKHR)(
const uint32_t* pMaxPrimitiveCounts,
VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
{
ANV_FROM_HANDLE(anv_device, device, _device);
if (anv_device_init_accel_struct_build_state(device) != VK_SUCCESS)
return;
vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts,
pSizeInfo, &device->accel_struct_build.build_args);
}
void
@@ -53,6 +442,18 @@ genX(GetDeviceAccelerationStructureCompatibilityKHR)(
const VkAccelerationStructureVersionInfoKHR* pVersionInfo,
VkAccelerationStructureCompatibilityKHR* pCompatibility)
{
ANV_FROM_HANDLE(anv_device, device, _device);
struct vk_accel_struct_serialization_header* ser_header =
(struct vk_accel_struct_serialization_header*)(pVersionInfo->pVersionData);
if (memcmp(ser_header->accel_struct_compat,
device->physical->rt_uuid,
sizeof(device->physical->rt_uuid)) == 0) {
*pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR;
} else {
*pCompatibility =
VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR;
}
}
void
@@ -62,6 +463,27 @@ genX(CmdBuildAccelerationStructuresKHR)(
const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_device *device = cmd_buffer->device;
VkResult result = anv_device_init_accel_struct_build_state(device);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
return;
}
struct anv_cmd_saved_state saved;
anv_cmd_buffer_save_state(cmd_buffer,
ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL |
ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved);
vk_cmd_build_acceleration_structures(commandBuffer, &device->vk,
&device->meta_device, infoCount,
pInfos, ppBuildRangeInfos,
&device->accel_struct_build.build_args);
anv_cmd_buffer_restore_state(cmd_buffer, &saved);
}
void
@@ -81,6 +503,61 @@ genX(CmdCopyAccelerationStructureKHR)(
VkCommandBuffer commandBuffer,
const VkCopyAccelerationStructureInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src);
VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst);
VkPipeline pipeline;
VkPipelineLayout layout;
VkResult result = get_pipeline_spv(cmd_buffer->device, "copy", copy_spv,
sizeof(copy_spv), sizeof(struct copy_args),
&pipeline, &layout);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
return;
}
struct anv_cmd_saved_state saved;
anv_cmd_buffer_save_state(cmd_buffer,
ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL |
ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved);
anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
struct copy_args consts = {
.src_addr = vk_acceleration_structure_get_va(src),
.dst_addr = vk_acceleration_structure_get_va(dst),
.mode = ANV_COPY_MODE_COPY,
};
VkPushConstantsInfoKHR push_info = {
.sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
.layout = layout,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(consts),
.pValues = &consts,
};
anv_CmdPushConstants2KHR(commandBuffer, &push_info);
/* L1/L2 cache flushes should have been dealt with by pipeline barriers.
 * Unfortunately some platforms require an L3 flush because the CS (reading
 * the dispatch parameters) is not L3 coherent.
 */
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"bvh size read for dispatch");
}
anv_genX(cmd_buffer->device->info, CmdDispatchIndirect)(
commandBuffer, src->buffer,
src->offset + offsetof(struct anv_accel_struct_header,
copy_dispatch_size));
anv_cmd_buffer_restore_state(cmd_buffer, &saved);
}
void
@@ -88,6 +565,66 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)(
VkCommandBuffer commandBuffer,
const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src);
struct anv_device *device = cmd_buffer->device;
VkPipeline pipeline;
VkPipelineLayout layout;
VkResult result = get_pipeline_spv(device, "copy", copy_spv,
sizeof(copy_spv),
sizeof(struct copy_args), &pipeline,
&layout);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
return;
}
struct anv_cmd_saved_state saved;
anv_cmd_buffer_save_state(cmd_buffer,
ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL |
ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved);
anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
struct copy_args consts = {
.src_addr = vk_acceleration_structure_get_va(src),
.dst_addr = pInfo->dst.deviceAddress,
.mode = ANV_COPY_MODE_SERIALIZE,
};
memcpy(consts.driver_uuid, device->physical->driver_uuid, VK_UUID_SIZE);
memcpy(consts.accel_struct_compat, device->physical->rt_uuid, VK_UUID_SIZE);
VkPushConstantsInfoKHR push_info = {
.sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
.layout = layout,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(consts),
.pValues = &consts,
};
anv_CmdPushConstants2KHR(commandBuffer, &push_info);
/* L1/L2 cache flushes should have been dealt with by pipeline barriers.
 * Unfortunately some platforms require an L3 flush because the CS (reading
 * the dispatch parameters) is not L3 coherent.
 */
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"bvh size read for dispatch");
}
anv_genX(device->info, CmdDispatchIndirect)(
commandBuffer, src->buffer,
src->offset + offsetof(struct anv_accel_struct_header,
copy_dispatch_size));
anv_cmd_buffer_restore_state(cmd_buffer, &saved);
}
void
@@ -95,6 +632,48 @@ genX(CmdCopyMemoryToAccelerationStructureKHR)(
VkCommandBuffer commandBuffer,
const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst);
VkPipeline pipeline;
VkPipelineLayout layout;
VkResult result = get_pipeline_spv(cmd_buffer->device, "copy", copy_spv,
sizeof(copy_spv),
sizeof(struct copy_args), &pipeline,
&layout);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
return;
}
struct anv_cmd_saved_state saved;
anv_cmd_buffer_save_state(cmd_buffer,
ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_ALL |
ANV_CMD_SAVED_STATE_PUSH_CONSTANTS, &saved);
anv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
const struct copy_args consts = {
.src_addr = pInfo->src.deviceAddress,
.dst_addr = vk_acceleration_structure_get_va(dst),
.mode = ANV_COPY_MODE_DESERIALIZE,
};
VkPushConstantsInfoKHR push_info = {
.sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
.layout = layout,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(consts),
.pValues = &consts,
};
anv_CmdPushConstants2KHR(commandBuffer, &push_info);
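/* Unlike the copy/serialize paths, the source here is raw memory with no
 * anv_accel_struct_header to read copy_dispatch_size from, so a fixed-size
 * dispatch is used; copy.comp is expected to bound its own work from the
 * serialized header.
 */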
vk_common_CmdDispatch(commandBuffer, 512, 1, 1);
anv_cmd_buffer_restore_state(cmd_buffer, &saved);
}
/* TODO: Host commands */


@@ -31,7 +31,7 @@
#include "vk_standard_sample_locations.h"
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL
#include "grl/genX_grl.h"
#endif
@@ -863,9 +863,15 @@ void
genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
{
assert(pdevice->info.verx10 == GFX_VERx10);
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
#if ANV_SUPPORT_RT_GRL
genX(grl_load_rt_uuid)(pdevice->rt_uuid);
pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
#else
STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE);
memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE);
#endif
#endif
pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);


@@ -1052,19 +1052,19 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
}
}
static void
emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
struct anv_query_pool *pool,
const char *reason)
static bool
append_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
struct anv_query_pool *pool,
const char *reason)
{
if (cmd_buffer->state.queries.clear_bits == 0)
return;
return false;
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_QUERY_BITS(
cmd_buffer->state.queries.clear_bits),
reason);
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
return true;
}
@@ -1079,7 +1079,9 @@ void genX(CmdBeginQueryIndexedEXT)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears");
if (append_query_clear_flush(cmd_buffer, pool,
"CmdBeginQuery* flush query clears"))
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
@@ -1532,8 +1534,9 @@ void genX(CmdWriteTimestamp2)(
assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP);
emit_query_clear_flush(cmd_buffer, pool,
"CmdWriteTimestamp flush query clears");
if (append_query_clear_flush(cmd_buffer, pool,
"CmdWriteTimestamp flush query clears"))
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
@@ -2014,11 +2017,15 @@ void genX(CmdCopyQueryPoolResults)(
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
#if ANV_SUPPORT_RT_GRL
#include "grl/include/GRLRTASCommon.h"
#include "grl/grl_metakernel_postbuild_info.h"
#else
#include "bvh/anv_bvh.h"
#endif
static void
anv_write_acceleration_structure_properties_grl(
void
genX(CmdWriteAccelerationStructuresPropertiesKHR)(
VkCommandBuffer commandBuffer,
uint32_t accelerationStructureCount,
const VkAccelerationStructureKHR* pAccelerationStructures,
@@ -2026,15 +2033,31 @@ anv_write_acceleration_structure_properties_grl(
VkQueryPool queryPool,
uint32_t firstQuery)
{
assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR);
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
emit_query_clear_flush(cmd_buffer, pool,
"CmdWriteAccelerationStructuresPropertiesKHR flush query clears");
#if !ANV_SUPPORT_RT_GRL
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"read BVH data using CS");
#endif
if (append_query_clear_flush(
cmd_buffer, pool,
"CmdWriteAccelerationStructuresPropertiesKHR flush query clears") ||
!ANV_SUPPORT_RT_GRL)
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
#if ANV_SUPPORT_RT_GRL
for (uint32_t i = 0; i < accelerationStructureCount; i++) {
ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
struct anv_address query_addr =
@@ -2076,26 +2099,47 @@ anv_write_acceleration_structure_properties_grl(
for (uint32_t i = 0; i < accelerationStructureCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
}
void
genX(CmdWriteAccelerationStructuresPropertiesKHR)(
VkCommandBuffer commandBuffer,
uint32_t accelerationStructureCount,
const VkAccelerationStructureKHR* pAccelerationStructures,
VkQueryType queryType,
VkQueryPool queryPool,
uint32_t firstQuery)
{
assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR);
#else
for (uint32_t i = 0; i < accelerationStructureCount; i++) {
ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
struct anv_address query_addr =
anv_address_add(anv_query_address(pool, firstQuery + i), 8);
uint64_t va = vk_acceleration_structure_get_va(accel);
anv_write_acceleration_structure_properties_grl(commandBuffer,
accelerationStructureCount,
pAccelerationStructures,
queryType, queryPool,
firstQuery);
mi_builder_set_write_check(&b, (i == (accelerationStructureCount - 1)));
switch (queryType) {
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
va += offsetof(struct anv_accel_struct_header, compacted_size);
break;
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
va += offsetof(struct anv_accel_struct_header, size);
break;
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
va += offsetof(struct anv_accel_struct_header, serialization_size);
break;
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
va += offsetof(struct anv_accel_struct_header, instance_count);
/* To respect the current setup tailored for GRL, numBlasPtrs is stored
 * at the second slot (third slot, if you count availability).
 */
query_addr = anv_address_add(query_addr, 8);
break;
default:
unreachable("unhandled query type");
}
mi_store(&b, mi_mem64(query_addr), mi_mem64(anv_address_from_u64(va)));
}
struct mi_builder b1;
mi_builder_init(&b1, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < accelerationStructureCount; i++) {
mi_builder_set_write_check(&b1, (i == (accelerationStructureCount - 1)));
emit_query_mi_availability(&b1, anv_query_address(pool, firstQuery + i), true);
}
#endif /* ANV_SUPPORT_RT_GRL */
}
#endif
#endif /* GFX_VERx10 >= 125 && ANV_SUPPORT_RT */


@@ -38,13 +38,22 @@ idep_anv_headers = declare_dependency(
)
if with_intel_vk_rt
subdir('grl')
optional_libgrl = [libgrl]
if with_intel_bvh_grl
subdir('grl')
optional_libgrl = [libgrl]
anv_flags += '-DANV_SUPPORT_RT_GRL=1'
else
subdir('bvh')
idep_grl = null_dep
optional_libgrl = []
anv_flags += '-DANV_SUPPORT_RT_GRL=0'
endif
anv_flags += '-DANV_SUPPORT_RT=1'
else
idep_grl = null_dep
optional_libgrl = []
anv_flags += '-DANV_SUPPORT_RT=0'
anv_flags += '-DANV_SUPPORT_RT_GRL=0'
endif
intel_icd = custom_target(
@@ -101,9 +110,15 @@ anv_per_hw_ver_files = files(
'genX_simple_shader.c',
)
if with_intel_vk_rt
anv_per_hw_ver_files += files(
'genX_acceleration_structure_grl.c',
)
if with_intel_bvh_grl
anv_per_hw_ver_files += files(
'genX_acceleration_structure_grl.c',
)
else
anv_per_hw_ver_files += files(
'genX_acceleration_structure.c',
)
endif
endif
foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
@@ -194,6 +209,8 @@ libanv_files = files(
'anv_va.c',
'anv_video.c',
'anv_wsi.c',
'bvh/anv_bvh.h',
'bvh/anv_build_interface.h',
)
anv_deps = [
@@ -248,7 +265,7 @@ libanv_common = static_library(
libvulkan_intel = shared_library(
'vulkan_intel',
[files('anv_gem.c'), anv_entrypoints[0]],
[files('anv_gem.c'), anv_entrypoints[0], bvh_spv],
include_directories : [
inc_include, inc_src, inc_intel,
],