anv: Build BVHs on the GPU with GRL

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16970>
2021-01-21 02:18:32 -06:00
parent dc1aedef2b
commit f3ddfd81b4
12 changed files with 1934 additions and 19 deletions
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -191,6 +191,7 @@ get_device_extensions(const struct anv_physical_device *device,
   *ext = (struct vk_device_extension_table) {
      .KHR_8bit_storage                      = true,
      .KHR_16bit_storage                     = true,
+      .KHR_acceleration_structure            = device->info.has_ray_tracing,
      .KHR_bind_memory2                      = true,
      .KHR_buffer_device_address             = true,
      .KHR_copy_commands2                    = true,
@@ -1343,11 +1344,12 @@ void anv_GetPhysicalDeviceFeatures2(

      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: {
         VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext;
-         features->accelerationStructure = false;
-         features->accelerationStructureCaptureReplay = false;
-         features->accelerationStructureIndirectBuild = false;
+         features->accelerationStructure = pdevice->info.has_ray_tracing;
+         features->accelerationStructureCaptureReplay = false; /* TODO */
+         features->accelerationStructureIndirectBuild = false; /* TODO */
         features->accelerationStructureHostCommands = false;
-         features->descriptorBindingAccelerationStructureUpdateAfterBind = true;
+         features->descriptorBindingAccelerationStructureUpdateAfterBind =
+            pdevice->info.has_ray_tracing;
         break;
      }

@@ -3393,6 +3395,11 @@ VkResult anv_CreateDevice(
                                       "Anv") + 8, 8),
   };

+   device->rt_uuid_addr = anv_address_add(device->workaround_address, 8);
+   memcpy(device->rt_uuid_addr.bo->map + device->rt_uuid_addr.offset,
+          physical_device->rt_uuid,
+          sizeof(physical_device->rt_uuid));
+
   device->debug_frame_desc =
      intel_debug_get_identifier_block(device->workaround_bo->map,
                                       device->workaround_bo->size,
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -2866,6 +2866,8 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
 VkResult
 anv_device_init_rt_shaders(struct anv_device *device)
 {
+   device->bvh_build_method = ANV_BVH_BUILD_METHOD_NEW_SAH;
+
   if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
      return VK_SUCCESS;

--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -975,6 +975,7 @@ struct anv_physical_device {
    uint8_t                                     pipeline_cache_uuid[VK_UUID_SIZE];
    uint8_t                                     driver_uuid[VK_UUID_SIZE];
    uint8_t                                     device_uuid[VK_UUID_SIZE];
+    uint8_t                                     rt_uuid[VK_UUID_SIZE];

    struct vk_sync_type                         sync_syncobj_type;
    struct vk_sync_timeline_type                sync_timeline_type;
@@ -1076,6 +1077,11 @@ anv_device_upload_nir(struct anv_device *device,
                      const struct nir_shader *nir,
                      unsigned char sha1_key[20]);

+/* BVH build method stored in anv_device::bvh_build_method.
+ * anv_device_init_rt_shaders() currently assigns
+ * ANV_BVH_BUILD_METHOD_NEW_SAH unconditionally.
+ */
+enum anv_rt_bvh_build_method {
+   ANV_BVH_BUILD_METHOD_TRIVIAL,
+   ANV_BVH_BUILD_METHOD_NEW_SAH,
+};
+
 struct anv_device {
    struct vk_device                            vk;

@@ -1146,6 +1152,7 @@ struct anv_device {
    struct anv_scratch_pool                     scratch_pool;
    struct anv_bo                              *rt_scratch_bos[16];
    struct anv_bo                              *btd_fifo_bo;
+    struct anv_address                          rt_uuid_addr;

    /** Shadow ray query BO
     *
@@ -1165,6 +1172,8 @@ struct anv_device {
    struct anv_shader_bin                      *rt_trampoline;
    struct anv_shader_bin                      *rt_trivial_return;

+    enum anv_rt_bvh_build_method                bvh_build_method;
+
    pthread_mutex_t                             mutex;
    pthread_cond_t                              queue_submit;

@@ -2087,6 +2096,7 @@ anv_pipe_flush_bits_for_access_flags(struct anv_device *device,
      switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
      case VK_ACCESS_2_SHADER_WRITE_BIT:
      case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
+      case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
         /* We're transitioning a buffer that was previously used as write
          * destination through the data port. To make its content available
          * to future operations, flush the hdc pipeline.
--- a/src/intel/vulkan/genX_acceleration_structure.c
+++ b/src/intel/vulkan/genX_acceleration_structure.c
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -5345,7 +5345,7 @@ genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

      struct anv_address sysvals_addr = {
-         .bo = cmd_buffer->device->general_state_pool.block_pool.bo,
+         .bo = NULL, /* General state buffer is always 0. */
         .offset = indirect_data.offset,
      };

--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -159,6 +159,12 @@ VkResult genX(CreateQueryPool)(
      /* Query has two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
+#if GFX_VERx10 >= 125
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+      uint64s_per_slot = 1 + 1 /* availability + size */;
+      break;
+#endif
   default:
      assert(!"Invalid query type");
   }
@@ -435,13 +441,18 @@ VkResult genX(GetQueryPoolResults)(
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

-   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
-          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
-          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
-          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
-          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
-          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
-          pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
+   assert(
+#if GFX_VERx10 >= 125
+   pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+   pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+#endif
+   pool->type == VK_QUERY_TYPE_OCCLUSION ||
+   pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
+   pool->type == VK_QUERY_TYPE_TIMESTAMP ||
+   pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+   pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+   pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
+   pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);

   if (vk_device_is_lost(&device->vk))
      return VK_ERROR_DEVICE_LOST;
@@ -533,6 +544,10 @@ VkResult genX(GetQueryPoolResults)(
         break;
      }

+#if GFX_VERx10 >= 125
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+#endif
      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
@@ -716,6 +731,10 @@ void genX(CmdResetQueryPool)(

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
+#if GFX_VERx10 >= 125
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+#endif
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
@@ -1466,6 +1485,10 @@ void genX(CmdCopyQueryPoolResults)(
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
+#if GFX_VERx10 >= 125
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+#endif
         result = mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;
@@ -1486,3 +1509,51 @@ void genX(CmdCopyQueryPoolResults)(
      dest_addr = anv_address_add(dest_addr, destStride);
   }
 }
+
+#if GFX_VERx10 >= 125
+
+#include "grl/include/GRLRTASCommon.h"
+#include "grl/grl_metakernel_postbuild_info.h"
+
+/**
+ * Records GPU commands that write one property of each acceleration
+ * structure into the query slots firstQuery .. firstQuery +
+ * accelerationStructureCount - 1 of queryPool.
+ *
+ * Only the compacted-size and serialization-size query types are handled
+ * (asserted below).  For these types a slot holds an availability value
+ * followed by the size, so the GRL kernels are pointed at byte offset 8 of
+ * the slot.
+ */
+void
+genX(CmdWriteAccelerationStructuresPropertiesKHR)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    accelerationStructureCount,
+    const VkAccelerationStructureKHR*           pAccelerationStructures,
+    VkQueryType                                 queryType,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+
+   assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+          queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR);
+
+   /* The builder is only used by the availability writes at the end. */
+   struct mi_builder b;
+   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+   /* One GRL postbuild-info dispatch per acceleration structure. */
+   for (uint32_t i = 0; i < accelerationStructureCount; i++) {
+      ANV_FROM_HANDLE(anv_acceleration_structure, accel, pAccelerationStructures[i]);
+      /* Skip the first 8 bytes of the slot; the size is stored after them. */
+      struct anv_address query_addr =
+         anv_address_add(anv_query_address(pool, firstQuery + i), 8);
+
+      if (queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR) {
+         genX(grl_postbuild_info_compacted_size)(cmd_buffer,
+                                                 anv_address_physical(accel->address),
+                                                 anv_address_physical(query_addr));
+      } else {
+         assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR);
+         genX(grl_postbuild_info_serialized_size)(cmd_buffer,
+                                                  anv_address_physical(accel->address),
+                                                  anv_address_physical(query_addr));
+      }
+   }
+
+   /* Request an end-of-pipe sync and apply it before the availability
+    * writes.  NOTE(review): presumably this ensures the sizes written by the
+    * GRL kernels have landed first - confirm.
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   /* Emit the availability write (value true) for every slot used above. */
+   for (uint32_t i = 0; i < accelerationStructureCount; i++)
+      emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
+}
+#endif
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@@ -36,6 +36,11 @@
 #include "genxml/genX_pack.h"

 #include "vk_standard_sample_locations.h"
+
+#if GFX_VERx10 >= 125
+#include "grl/genX_grl.h"
+#endif
+
 #include "vk_util.h"

 static void
@@ -466,6 +471,9 @@ void
 genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
 {
   assert(pdevice->info.verx10 == GFX_VERx10);
+#if GFX_VERx10 >= 125
+   genX(grl_load_rt_uuid)(pdevice->rt_uuid);
+#endif
 }

 VkResult
--- a/src/intel/vulkan/grl/genX_grl.h
+++ b/src/intel/vulkan/grl/genX_grl.h
@@ -39,6 +39,8 @@ genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
                   uint32_t arg_count,
                   const struct anv_kernel_arg *args);

+void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid);

 #ifdef __cplusplus
 } /* extern "C" */
--- a/src/intel/vulkan/grl/genX_grl_uuid.cpp
+++ b/src/intel/vulkan/grl/genX_grl_uuid.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "include/GRLGen12.h"
+
+#include "vulkan/vulkan_core.h"
+
+extern "C" void
+gfx125_grl_load_rt_uuid(uint8_t *out_uuid);
+
+/* Copies the GRL BVH magic value into out_uuid.
+ *
+ * out_uuid must point to at least VK_UUID_SIZE writable bytes.
+ */
+extern "C" void
+gfx125_grl_load_rt_uuid(uint8_t *out_uuid)
+{
+   /* Checked at compile time: a runtime assert() expands to nothing in
+    * NDEBUG builds, which would let a size mismatch between BVH_MAGIC and
+    * VK_UUID_SIZE go unnoticed by the memcpy below.
+    */
+   static_assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE,
+                 "GRL BVH_MAGIC must be exactly VK_UUID_SIZE bytes");
+   memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE);
+}
--- a/src/intel/vulkan/grl/grl_structs.h
+++ b/src/intel/vulkan/grl/grl_structs.h
@@ -0,0 +1,479 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file contains a redefinition of structures defined in the GRL library.
+ * We need to have those structures defined to allocate & prepare data for
+ * the OpenCL kernels building acceleration structures. Unfortunately because
+ * of C++ & OpenCL assumptions in GRL, it's not possible to just include GRL
+ * header files directly so we have to redefine stuff here.
+ */
+
+#ifndef GRL_STRUCTS_H
+#define GRL_STRUCTS_H
+
+#include "GRLStructs.h"
+#include "GRLRTASCommon.h"
+
+/* Builder state for one BVH build: four qword buffer/global references plus
+ * the leaf type and size.  PREFIX_MK_STATE() copies every field of this
+ * struct into the prefix-specific variant.
+ * NOTE(review): the qword members look like GPU addresses - confirm against
+ * the GRL definition.
+ */
+struct MKBuilderState {
+   qword geomDesc_buffer;
+   qword build_primref_buffer;
+   qword build_globals;
+   qword bvh_buffer;
+   dword leaf_type;
+   dword leaf_size;
+};
+
+/* Expands to a compound literal of type struct <prefix>_MKBuilderState whose
+ * fields are copied one by one from obj.  obj is evaluated once per field, so
+ * it should be free of side effects.
+ */
+#define PREFIX_MK_STATE(prefix, obj) \
+   (struct prefix##_MKBuilderState) { \
+      .geomDesc_buffer = (obj).geomDesc_buffer, \
+      .build_primref_buffer = (obj).build_primref_buffer, \
+      .build_globals = (obj).build_globals, \
+      .bvh_buffer = (obj).bvh_buffer, \
+      .leaf_type = (obj).leaf_type, \
+      .leaf_size = (obj).leaf_size, \
+   }
+
+/* Size estimate for a BVH: primitive counts, the start of each data section
+ * and total sizes.  The dword members are the ones copied by
+ * PREFIX_MK_SIZE(); the trailing size_t members are not.
+ * NOTE(review): units of the *_start and size fields are not visible here -
+ * confirm against GRL.
+ */
+struct MKSizeEstimate {
+   dword numTriangles;
+   dword numProcedurals;
+   dword numPrimitives;
+   dword numMeshes;
+   dword numBuildPrimitives;
+   dword numPrimitivesToSplit;
+   dword instance_descs_start;
+   dword geo_meta_data_start;
+   dword node_data_start;
+   dword leaf_data_start;
+   dword procedural_data_start;
+   dword back_pointer_start;
+   dword sizeTotal;
+   dword updateScratchSizeTotal;
+   dword fatleaf_table_start;
+   dword innernode_table_start;
+   dword max_fatleaves;
+
+   /* Not part of the PREFIX_MK_SIZE() initializer. */
+   size_t max_instance_leafs;
+   size_t max_inner_nodes;
+   size_t leaf_data_size;
+   size_t min_primitives;
+   size_t max_primitives;
+};
+
+/* Expands to a compound literal of type struct <prefix>_MKSizeEstimate,
+ * copying the dword fields of obj (numTriangles .. max_fatleaves).  The
+ * size_t members of struct MKSizeEstimate are not listed here.  obj is
+ * evaluated once per field, so it should be free of side effects.
+ */
+#define PREFIX_MK_SIZE(prefix, obj) \
+   (struct prefix##_MKSizeEstimate) { \
+      .numTriangles = (obj).numTriangles, \
+      .numProcedurals = (obj).numProcedurals, \
+      .numPrimitives = (obj).numPrimitives, \
+      .numMeshes = (obj).numMeshes, \
+      .numBuildPrimitives = (obj).numBuildPrimitives, \
+      .numPrimitivesToSplit = (obj).numPrimitivesToSplit, \
+      .instance_descs_start = (obj).instance_descs_start, \
+      .geo_meta_data_start = (obj).geo_meta_data_start, \
+      .node_data_start = (obj).node_data_start, \
+      .leaf_data_start = (obj).leaf_data_start, \
+      .procedural_data_start = (obj).procedural_data_start, \
+      .back_pointer_start = (obj).back_pointer_start, \
+      .sizeTotal = (obj).sizeTotal, \
+      .updateScratchSizeTotal = (obj).updateScratchSizeTotal, \
+      .fatleaf_table_start = (obj).fatleaf_table_start, \
+      .innernode_table_start = (obj).innernode_table_start, \
+      .max_fatleaves = (obj).max_fatleaves, \
+   }
+
+/* Bounding box stored as a lower and an upper corner of four floats each.
+ * NOTE(review): the use of the fourth component is not visible here -
+ * confirm against GRL.
+ */
+typedef struct AABB {
+   float lower[4];
+   float upper[4];
+} AABB;
+
+/* Global state of one build.  It groups centroid bounds and primitive/build
+ * record counts with the state of the spatial-split pass, the binned-SAH BFS
+ * pass, a work-group sync variable and the Morton-code builder.
+ */
+struct Globals
+{
+   struct AABB centroidBounds;
+
+   unsigned int build_record_start;
+   unsigned int numPrimitives;
+   unsigned int leafPrimType;
+   unsigned int leafSize;
+
+   unsigned int numSplittedPrimitives;
+   unsigned int numBuildRecords;
+
+   // spatial split state
+   unsigned int numOriginalPrimitives;
+   float presplitPrioritySum;
+   float probThreshold;
+
+   // binned-sah bfs state
+   unsigned int counter;
+   unsigned int numBuildRecords_extended;
+
+   // sync variable used for global-sync on work groups
+   unsigned int sync;
+
+
+   /* morton code builder state */
+   unsigned int shift;      // used by adaptive mc-builder
+   unsigned int shift_mask; // used by adaptive mc-builder
+   unsigned int binary_hierarchy_root;
+   unsigned int p0_allocated_num;
+   unsigned int p0_created_num;
+   unsigned int morton_sort_in_flight;
+   unsigned int sort_iterations;
+
+   gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy.  Stashed here as a debug aid
+};
+
+/* BVH header: the root node offset followed by start/current markers for the
+ * node, quad-leaf, procedural, instance-leaf, back-pointer and refit data
+ * sections, refit treelet counters, fat-leaf/inner-node tables and the RTAS
+ * metadata.
+ * NOTE(review): the nodeDataCur comment suggests the section markers are
+ * expressed in 64-byte units - confirm against GRL.
+ */
+typedef struct BVHBase
+{
+   // TODO:  Implement the "copy-first-node" trick... duplicate root node here
+
+   uint64_t rootNodeOffset;
+
+   uint32_t reserved;
+
+   uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
+   uint32_t quadLeafStart;
+   uint32_t quadLeafCur;
+   uint32_t proceduralDataStart;
+   uint32_t proceduralDataCur;
+   uint32_t instanceLeafStart;
+   uint32_t instanceLeafEnd;
+   uint32_t backPointerDataStart;     //
+   uint32_t refitTreeletsDataStart;   // refit structs
+   uint32_t refitStartPointDataStart; //
+   uint32_t BVHDataEnd;
+
+   // number of bottom treelets
+   // if 1, then the bottom treelet is also tip treelet
+   uint32_t refitTreeletCnt;
+   uint32_t refitTreeletCnt2; // always 0, used for atomic updates
+   // data layout:
+   // @backPointerDataStart
+   //  'backpointer' - a dword per inner node.
+   //  The bits are used as follows:
+   //     2:0  --> Used as a refit counter during BVH refitting.  MBZ
+   //     5:3  --> Number of children
+   //     31:6 --> Index of the parent node in the internal node array
+   //    The root node has a parent index of all ones
+   // @refitTreeletsDataStart
+   //  RefitTreelet[], the last treelet is for top treelet all previous are for bottom
+   // @refitStartPointDataStart
+   //  for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
+   // @backPointerDataEnd
+
+   uint32_t fatLeafCount;  // number of internal nodes which are "fat-leaves"
+   uint32_t innerCount;    // number of internal nodes which are true inner nodes (all internalNode children)
+   uint32_t fatLeafTableStart;
+   uint32_t innerTableStart;
+
+   uint32_t _pad[12];
+
+   struct RTASMetaData Meta;
+} BVHBase;
+
+
+/* Per-build input record: references to the build globals and the BVH
+ * buffer, primitive/geometry/instance counts, and section starts and sizes
+ * using the same field names as struct MKSizeEstimate.
+ * NOTE(review): presumably consumed by a batched "init globals" kernel -
+ * confirm against GRL.
+ */
+struct BatchedInitGlobalsData
+{
+   qword p_build_globals;
+   qword p_bvh_buffer;
+   dword numPrimitives;
+   dword numGeometries;
+   dword numInstances;
+   dword instance_descs_start;
+   dword geo_meta_data_start;
+   dword node_data_start;
+   dword leaf_data_start;
+   dword procedural_data_start;
+   dword back_pointer_start;
+   dword sizeTotal;
+   dword leafType;
+   dword leafSize;
+   dword fatleaf_table_start;
+   dword innernode_table_start;
+};
+
+
+#define BFS_NUM_BINS        16
+#define BFS_NUM_VCONTEXTS   256
+#define BFS_MAX_DEPTH 32
+
+#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
+
+/* A split described by three values: sah, dim and pos.
+ * NOTE(review): presumably the SAH cost, the split axis and the bin position
+ * - confirm against GRL.
+ */
+struct BFS_Split
+{
+   float sah;
+   int dim;
+   int pos;
+};
+
+/* Binning data for three axes with BFS_NUM_BINS bins each: six bound values
+ * per bin (3 * 6 = 18 floats per bin index) and one counter per bin and axis.
+ */
+struct BFS_BinInfo
+{
+   float min_max[18 * BFS_NUM_BINS]; //  layout: bins[axis][num_bins][6]
+   //          The 6 are lower(xyz) and -upper(xyz)
+   // bins use negated-max so that we can use vectorized mins instead of min/max pairs
+   uint counts[3 * BFS_NUM_BINS];
+};
+
+/* Global state of one SAH build: references to the buffers it works on,
+ * build flags and leaf parameters, and three root-buffer counters that are
+ * each stored as a low dword followed by a "_hi" dword.
+ */
+struct SAHBuildGlobals
+{
+   qword   p_primref_index_buffers;
+   qword   p_primrefs_buffer;
+   qword   p_bvh2;
+   qword   p_globals;     // TODO: deprecate this
+   qword   p_bvh_base;
+   gpuva_t p_qnode_root_buffer;
+
+   dword flags; // bit 1 is 'alloc_backpointers'.  bit 2 is 'need_masks'
+   dword num_primrefs;
+   dword leaf_size;
+   dword leaf_type;
+
+   dword root_buffer_num_produced;
+   dword root_buffer_num_produced_hi;
+   dword root_buffer_num_consumed;
+   dword root_buffer_num_consumed_hi;
+   dword root_buffer_num_to_consume;
+   dword root_buffer_num_to_consume_hi;
+};
+
+/* Left/right centroid and geometry bounds, accessible either as four named
+ * AABB3f boxes or as a flat array of 24 floats.
+ * NOTE(review): the two views only overlap exactly if AABB3f is six floats -
+ * confirm against its definition.
+ */
+typedef union LRBounds
+{
+   struct
+   {
+      struct AABB3f left_centroid_bounds;
+      struct AABB3f left_geom_bounds;
+      struct AABB3f right_centroid_bounds;
+      struct AABB3f right_geom_bounds;
+   } boxes;
+   struct
+   {
+      float Array[24];
+   } scalars;
+} LRBounds;
+
+
+/* State of one BFS task: the primref range and BVH2 root it works on, its
+ * tree depth and batch index, the left/right counts and masks, and the
+ * working state/output of pass 1 (split and bin info) and pass 2 (left/right
+ * bounds).
+ */
+struct VContext
+{
+   uint dispatch_primref_begin;    // range of primrefs for this task
+   uint dispatch_primref_end;
+   uint bvh2_root;                 // BVH2 root node for this task
+   uint tree_depth;                // depth of this node in the tree
+   uint num_left;          // primref counts
+   uint num_right;
+   uint lr_mask;      // lower 8b : left mask.  upper 8b : right mask
+   uint batch_index;
+
+   // pass1 global working state and output
+   struct BFS_Split split;
+   struct BFS_BinInfo global_bin_info;
+
+   // pass2 global working state and output
+   LRBounds lr_bounds;
+};
+
+
+
+/* One BFS dispatch entry: a batch index paired with a context id.
+ * NOTE(review): context_id presumably indexes the VContext array of the
+ * scheduler - confirm against GRL.
+ */
+struct BFSDispatchRecord
+{
+   ushort batch_index;
+   ushort context_id;
+};
+
+
+/* Queue of BFS dispatches: a dispatch count plus BFS_NUM_VCONTEXTS work-group
+ * counts and BFS_NUM_VCONTEXTS dispatch records.
+ */
+struct BFSDispatchQueue
+{
+   uint num_dispatches;
+   uint wg_count[BFS_NUM_VCONTEXTS];
+   struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
+};
+
+/* One spill-stack entry: a primref range, a BVH2 root, a tree depth and a
+ * batch index (the same kind of information struct VContext keeps per task,
+ * with depth and batch index narrowed to ushort).
+ */
+struct BFS1SpillStackEntry
+{
+   uint primref_begin;
+   uint primref_end;
+   uint bvh2_root;
+   ushort tree_depth;
+   ushort batch_index;
+};
+
+/* Spill stack with room for BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH entries.
+ * NOTE(review): `size` is presumably the number of entries in use - confirm
+ * against GRL.
+ */
+struct BFS1SpillStack
+{
+   uint size;
+   struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
+};
+
+/* One entry of the QNode global root buffer: a BVH2 node, a qnode and a build
+ * index, padded with one extra uint.
+ */
+struct QNodeGlobalRootBufferEntry
+{
+   uint bvh2_node;
+   uint qnode;
+   uint build_idx;
+   uint _pad;
+};
+
+/* Global root buffer holding two halves of
+ * QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM entries each;
+ * curr_entries_offset selects which half is current.
+ */
+struct QNodeGlobalRootBuffer
+{
+   uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
+   struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
+};
+
+/* One DFS dispatch entry: primref and BVH2 base values, a batch index, a
+ * primref count and a tree depth.
+ */
+struct DFSDispatchRecord
+{
+   uint primref_base;
+   uint bvh2_base;
+   uint batch_index;
+   ushort num_primrefs;
+   ushort tree_depth;
+};
+
+
+/* Queue of DFS dispatches with room for BFS_NUM_VCONTEXTS * 2 records.  This
+ * struct carries no count of its own.
+ */
+struct DFSDispatchQueue
+{
+   struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
+};
+
+#define VCONTEXT_STATE_EXECUTING   0
+#define VCONTEXT_STATE_UNALLOCATED 1
+
+/* Scheduler state, viewed either as a VContextScheduler or as a
+ * QnodeScheduler.  Both views begin with eight dwords followed by
+ * batched_build_count and batched_build_offset; scheduler_postsync, _pad1,
+ * num_trivial_builds and num_single_builds sit at the same positions in both.
+ */
+union SchedulerUnion
+{
+   struct VContextScheduler
+   {
+      /////////////////////////////////////////////////////////////
+      //  State data used for communication with command streamer
+      //   NOTE: This part must match definition in 'new_sah_builder.grl'
+      /////////////////////////////////////////////////////////////
+
+      dword num_bfs_wgs;
+      dword num_dfs_wgs;
+
+      dword scheduler_postsync;
+      dword _pad1;
+
+      dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+      dword num_single_builds;  // number of single-wg builds (#primrefs < threshold)
+
+      dword batched_build_wg_count;  // number of wgs to dispatch for initial BFS pass
+      dword batched_build_loop_mask; // value is 0 if  #builds <= #contexts.  else 1  command streamer uses this as a loop condition
+
+      /////////////////////////////////////////////////////////////
+
+      dword batched_build_count;  // number of batched builds in the SAHBuildGlobals buffer
+      dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+      dword vcontext_state[BFS_NUM_VCONTEXTS];
+
+      struct BFSDispatchQueue bfs_queue;
+      struct DFSDispatchQueue dfs_queue;
+
+      struct VContext contexts[BFS_NUM_VCONTEXTS];
+
+      struct BFS1SpillStack bfs2_spill_stack;
+   } vContextScheduler;
+
+   struct QnodeScheduler
+   {
+      dword num_qnode_grb_curr_entries;
+      dword num_qnode_grb_new_entries;
+
+      dword scheduler_postsync;
+      dword _pad1;
+
+      dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+      dword num_single_builds;  // number of single-wg builds (#primrefs < threshold)
+
+      dword batched_builds_to_process;
+      dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
+
+      /////////////////////////////////////////////////////////////
+
+      dword batched_build_count;  // number of batched builds in the SAHBuildGlobals buffer
+      dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+      struct QNodeGlobalRootBuffer qnode_global_root_buffer;
+   } qnodeScheduler;
+};
+
+
+/* Node of the BVH2: a bounding box plus two packed metadata words.
+ * NOTE(review): the commented-out members below suggest meta_ss packs a
+ * ushort (meta_s) and two uchars (is_inner, mask) - confirm against GRL.
+ */
+struct BVH2Node
+{
+   struct AABB3f box;
+   uint  meta_u;   // leaf:  primref start.  inner: offset from node to its first child
+   uint  meta_ss;
+   //ushort meta_s;   // leaf: primref count.  inner: offset from first to second child, in nodes
+   //uchar is_inner; //  1 if inner, 0 if leaf
+   //uchar mask;
+};
+
+/* BVH2 header: the node count followed by seven padding uints, i.e. eight
+ * uints in total.
+ */
+struct BVH2
+{
+   uint num_nodes;
+   uint _pad[7];  // align to 32B
+};
+
+/* One dispatch entry: a data buffer reference and its element count.  The
+ * layout is shared with the command streamer (see the note inside).
+ */
+struct BatchedBLSDispatchEntry
+{
+   /////////////////////////////////////////////////////////////
+   //  State data used for communication with command streamer
+   //  NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+   /////////////////////////////////////////////////////////////
+   qword p_data_buffer;
+   qword num_elements; // number of elements in p_data_buffer
+};
+
+/* Arguments of a batchable SAH build: four qword references (globals
+ * pointers, scheduler, buffers info, SAH globals), the maximum number of
+ * QNode global root buffer entries and the number of builds.
+ * PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE() copies every field.
+ */
+struct SAHBuildArgsBatchable
+{
+   qword p_globals_ptrs;
+   qword p_scheduler;
+   qword p_buffers_info;
+   qword p_sah_globals;
+
+   dword num_max_qnode_global_root_buffer_entries;
+   dword num_builds;
+};
+
+/* Expands to a compound literal of type struct <prefix>_SAHBuildArgsBatchable
+ * whose fields are copied one by one from obj.  obj is evaluated once per
+ * field, so it should be free of side effects.
+ */
+#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \
+   (struct prefix##_SAHBuildArgsBatchable) { \
+      .p_globals_ptrs = (obj).p_globals_ptrs, \
+      .p_scheduler = (obj).p_scheduler, \
+      .p_buffers_info = (obj).p_buffers_info, \
+      .p_sah_globals = (obj).p_sah_globals, \
+      .num_max_qnode_global_root_buffer_entries = \
+      (obj).num_max_qnode_global_root_buffer_entries, \
+      .num_builds = (obj).num_builds, \
+   }
+
+
+/* Buffer references for one SAH build.  The six gpuva_t members use the same
+ * names as the pointer members of struct SAHBuildGlobals; they are followed
+ * by the SAH globals flags and explicit padding.
+ */
+struct SAHBuildBuffersInfo
+{
+   gpuva_t p_globals;
+   gpuva_t p_primref_index_buffers;
+   gpuva_t p_primrefs_buffer;
+   gpuva_t p_bvh2;
+   gpuva_t p_bvh_base;
+   gpuva_t p_qnode_root_buffer;
+   dword   sah_globals_flags;
+   dword   _pad;
+   gpuva_t _pad2;
+};
+
+#endif /* GRL_STRUCTS_H */
--- a/src/intel/vulkan/grl/meson.build
+++ b/src/intel/vulkan/grl/meson.build
@@ -49,6 +49,7 @@ endforeach

 grl_genX_files = [
  'genX_grl_dispatch.c',
+  'genX_grl_uuid.cpp',
 ]

 grl_lib_args = []
@@ -151,7 +152,7 @@ foreach t : [['125', 'gfx125', 'dg2']]
  grl_genX_libs += static_library(
    'grl_@0@'.format(genX_prefix),
    [grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
-     grl_genX_files, grl_metakernel_c],
+     grl_genX_files, grl_metakernel_c, grl_metakernel_h],
    include_directories : [
      inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler,
      inc_intel, inc_anv,
@@ -160,6 +161,10 @@ foreach t : [['125', 'gfx125', 'dg2']]
      no_override_init_args, c_sse2_args,
      '-DGFX_VERx10=@0@'.format(verX10),
    ],
+    cpp_args : [
+      no_override_init_args, c_sse2_args,
+      '-DGFX_VERx10=@0@'.format(verX10),
+    ],
    dependencies : [
      dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
      idep_vulkan_runtime_headers,
@@ -168,6 +173,13 @@ foreach t : [['125', 'gfx125', 'dg2']]
  )
 endforeach

+# Dependencies shared by the libgrl static library and by idep_grl, so both
+# stay in sync.
+libgrl_deps = [
+  dep_valgrind,
+  idep_nir_headers,
+  idep_vulkan_util_headers,
+  idep_vulkan_wsi_headers,
+]
+
 libgrl = static_library(
  'grl',
  [grl_cl_kernel_h],
@@ -176,8 +188,12 @@ libgrl = static_library(
    inc_compiler,
  ],
  link_whole : [grl_genX_libs],
-  dependencies : [
-    dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers
-  ],
+  dependencies : libgrl_deps,
  install : true,
 )
+# Dependency object for users of GRL (added to the per-hardware-version anv
+# libraries): links libgrl, forwards its dependencies, adds the 'include' and
+# 'gpu' directories to the include path and lists grl_metakernel_h as a source.
+# NOTE(review): grl_metakernel_h is presumably a generated header, listed so
+# it is built before consumers compile - confirm.
+idep_grl = declare_dependency(
+  link_with : libgrl,
+  dependencies : libgrl_deps,
+  sources : grl_metakernel_h,
+  include_directories : include_directories('include', 'gpu'),
+)
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -73,6 +73,7 @@ endif

 libanv_per_hw_ver_libs = []
 anv_per_hw_ver_files = files(
+  'genX_acceleration_structure.c',
  'genX_blorp_exec.c',
  'genX_cmd_buffer.c',
  'genX_gpu_memcpy.c',
@@ -100,12 +101,12 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']],
      dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
      idep_vulkan_util_headers, idep_vulkan_wsi_headers,
      idep_vulkan_runtime_headers, idep_intel_driver_ds_headers,
+      idep_grl,
    ],
  )
 endforeach

 libanv_files = files(
-  'anv_acceleration_structure.c',
  'anv_allocator.c',
  'anv_android.h',
  'anv_batch_chain.c',
@@ -194,7 +195,7 @@ libvulkan_intel = shared_library(
  include_directories : [
    inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
  ],
-  link_whole : [libanv_common, libanv_per_hw_ver_libs],
+  link_whole : [libanv_common, libanv_per_hw_ver_libs, libgrl],
  link_with : [
    libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf,
  ],
@@ -232,7 +233,7 @@ if with_tests
    ],
    link_whole : libanv_common,
    link_with : [
-      libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev,
+      libanv_per_hw_ver_libs, libgrl, libintel_compiler, libintel_common, libintel_dev,
      libisl, libblorp, libintel_perf,
    ],
    dependencies : [