anv: Build BVHs on the GPU with GRL
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Acked-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16970>
This commit is contained in:

committed by
Marge Bot

parent
dc1aedef2b
commit
f3ddfd81b4
@@ -191,6 +191,7 @@ get_device_extensions(const struct anv_physical_device *device,
|
||||
*ext = (struct vk_device_extension_table) {
|
||||
.KHR_8bit_storage = true,
|
||||
.KHR_16bit_storage = true,
|
||||
.KHR_acceleration_structure = device->info.has_ray_tracing,
|
||||
.KHR_bind_memory2 = true,
|
||||
.KHR_buffer_device_address = true,
|
||||
.KHR_copy_commands2 = true,
|
||||
@@ -1343,11 +1344,12 @@ void anv_GetPhysicalDeviceFeatures2(
|
||||
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: {
|
||||
VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext;
|
||||
features->accelerationStructure = false;
|
||||
features->accelerationStructureCaptureReplay = false;
|
||||
features->accelerationStructureIndirectBuild = false;
|
||||
features->accelerationStructure = pdevice->info.has_ray_tracing;
|
||||
features->accelerationStructureCaptureReplay = false; /* TODO */
|
||||
features->accelerationStructureIndirectBuild = false; /* TODO */
|
||||
features->accelerationStructureHostCommands = false;
|
||||
features->descriptorBindingAccelerationStructureUpdateAfterBind = true;
|
||||
features->descriptorBindingAccelerationStructureUpdateAfterBind =
|
||||
pdevice->info.has_ray_tracing;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -3393,6 +3395,11 @@ VkResult anv_CreateDevice(
|
||||
"Anv") + 8, 8),
|
||||
};
|
||||
|
||||
device->rt_uuid_addr = anv_address_add(device->workaround_address, 8);
|
||||
memcpy(device->rt_uuid_addr.bo->map + device->rt_uuid_addr.offset,
|
||||
physical_device->rt_uuid,
|
||||
sizeof(physical_device->rt_uuid));
|
||||
|
||||
device->debug_frame_desc =
|
||||
intel_debug_get_identifier_block(device->workaround_bo->map,
|
||||
device->workaround_bo->size,
|
||||
|
@@ -2866,6 +2866,8 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
|
||||
VkResult
|
||||
anv_device_init_rt_shaders(struct anv_device *device)
|
||||
{
|
||||
device->bvh_build_method = ANV_BVH_BUILD_METHOD_NEW_SAH;
|
||||
|
||||
if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
|
||||
return VK_SUCCESS;
|
||||
|
||||
|
@@ -975,6 +975,7 @@ struct anv_physical_device {
|
||||
uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
|
||||
uint8_t driver_uuid[VK_UUID_SIZE];
|
||||
uint8_t device_uuid[VK_UUID_SIZE];
|
||||
uint8_t rt_uuid[VK_UUID_SIZE];
|
||||
|
||||
struct vk_sync_type sync_syncobj_type;
|
||||
struct vk_sync_timeline_type sync_timeline_type;
|
||||
@@ -1076,6 +1077,11 @@ anv_device_upload_nir(struct anv_device *device,
|
||||
const struct nir_shader *nir,
|
||||
unsigned char sha1_key[20]);
|
||||
|
||||
/**
 * Identifies a BVH build method. A value of this type is stored in
 * anv_device::bvh_build_method.
 *
 * NOTE(review): the code that consumes this value is not visible here, so
 * what TRIVIAL and NEW_SAH actually select should be confirmed there.
 */
enum anv_rt_bvh_build_method {
|
||||
ANV_BVH_BUILD_METHOD_TRIVIAL,
|
||||
/* Value assigned by anv_device_init_rt_shaders(). */
ANV_BVH_BUILD_METHOD_NEW_SAH,
|
||||
};
|
||||
|
||||
struct anv_device {
|
||||
struct vk_device vk;
|
||||
|
||||
@@ -1146,6 +1152,7 @@ struct anv_device {
|
||||
struct anv_scratch_pool scratch_pool;
|
||||
struct anv_bo *rt_scratch_bos[16];
|
||||
struct anv_bo *btd_fifo_bo;
|
||||
struct anv_address rt_uuid_addr;
|
||||
|
||||
/** Shadow ray query BO
|
||||
*
|
||||
@@ -1165,6 +1172,8 @@ struct anv_device {
|
||||
struct anv_shader_bin *rt_trampoline;
|
||||
struct anv_shader_bin *rt_trivial_return;
|
||||
|
||||
enum anv_rt_bvh_build_method bvh_build_method;
|
||||
|
||||
pthread_mutex_t mutex;
|
||||
pthread_cond_t queue_submit;
|
||||
|
||||
@@ -2087,6 +2096,7 @@ anv_pipe_flush_bits_for_access_flags(struct anv_device *device,
|
||||
switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
|
||||
case VK_ACCESS_2_SHADER_WRITE_BIT:
|
||||
case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
|
||||
case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
|
||||
/* We're transitioning a buffer that was previously used as write
|
||||
* destination through the data port. To make its content available
|
||||
* to future operations, flush the hdc pipeline.
|
||||
|
1280
src/intel/vulkan/genX_acceleration_structure.c
Normal file
1280
src/intel/vulkan/genX_acceleration_structure.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5345,7 +5345,7 @@ genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
|
||||
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
||||
|
||||
struct anv_address sysvals_addr = {
|
||||
.bo = cmd_buffer->device->general_state_pool.block_pool.bo,
|
||||
.bo = NULL, /* General state buffer is always 0. */
|
||||
.offset = indirect_data.offset,
|
||||
};
|
||||
|
||||
|
@@ -159,6 +159,12 @@ VkResult genX(CreateQueryPool)(
|
||||
/* Query has two values: begin and end. */
|
||||
uint64s_per_slot = 1 + 2;
|
||||
break;
|
||||
#if GFX_VERx10 >= 125
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
|
||||
uint64s_per_slot = 1 + 1 /* availability + size */;
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
assert(!"Invalid query type");
|
||||
}
|
||||
@@ -435,13 +441,18 @@ VkResult genX(GetQueryPoolResults)(
|
||||
ANV_FROM_HANDLE(anv_device, device, _device);
|
||||
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
||||
|
||||
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
|
||||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
|
||||
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
|
||||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
|
||||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
|
||||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
|
||||
pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
|
||||
assert(
|
||||
#if GFX_VERx10 >= 125
|
||||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
|
||||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
|
||||
#endif
|
||||
pool->type == VK_QUERY_TYPE_OCCLUSION ||
|
||||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
|
||||
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
|
||||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
|
||||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
|
||||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
|
||||
pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
|
||||
|
||||
if (vk_device_is_lost(&device->vk))
|
||||
return VK_ERROR_DEVICE_LOST;
|
||||
@@ -533,6 +544,10 @@ VkResult genX(GetQueryPoolResults)(
|
||||
break;
|
||||
}
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
|
||||
#endif
|
||||
case VK_QUERY_TYPE_TIMESTAMP: {
|
||||
uint64_t *slot = query_slot(pool, firstQuery + i);
|
||||
if (write_results)
|
||||
@@ -716,6 +731,10 @@ void genX(CmdResetQueryPool)(
|
||||
|
||||
switch (pool->type) {
|
||||
case VK_QUERY_TYPE_OCCLUSION:
|
||||
#if GFX_VERx10 >= 125
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
|
||||
#endif
|
||||
for (uint32_t i = 0; i < queryCount; i++) {
|
||||
emit_query_pc_availability(cmd_buffer,
|
||||
anv_query_address(pool, firstQuery + i),
|
||||
@@ -1466,6 +1485,10 @@ void genX(CmdCopyQueryPoolResults)(
|
||||
break;
|
||||
|
||||
case VK_QUERY_TYPE_TIMESTAMP:
|
||||
#if GFX_VERx10 >= 125
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
|
||||
#endif
|
||||
result = mi_mem64(anv_address_add(query_addr, 8));
|
||||
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
|
||||
break;
|
||||
@@ -1486,3 +1509,51 @@ void genX(CmdCopyQueryPoolResults)(
|
||||
dest_addr = anv_address_add(dest_addr, destStride);
|
||||
}
|
||||
}
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
|
||||
#include "grl/include/GRLRTASCommon.h"
|
||||
#include "grl/grl_metakernel_postbuild_info.h"
|
||||
|
||||
void
|
||||
genX(CmdWriteAccelerationStructuresPropertiesKHR)(
|
||||
VkCommandBuffer commandBuffer,
|
||||
uint32_t accelerationStructureCount,
|
||||
const VkAccelerationStructureKHR* pAccelerationStructures,
|
||||
VkQueryType queryType,
|
||||
VkQueryPool queryPool,
|
||||
uint32_t firstQuery)
|
||||
{
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
||||
|
||||
assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
|
||||
queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR);
|
||||
|
||||
struct mi_builder b;
|
||||
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
||||
|
||||
for (uint32_t i = 0; i < accelerationStructureCount; i++) {
|
||||
ANV_FROM_HANDLE(anv_acceleration_structure, accel, pAccelerationStructures[i]);
|
||||
struct anv_address query_addr =
|
||||
anv_address_add(anv_query_address(pool, firstQuery + i), 8);
|
||||
|
||||
if (queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR) {
|
||||
genX(grl_postbuild_info_compacted_size)(cmd_buffer,
|
||||
anv_address_physical(accel->address),
|
||||
anv_address_physical(query_addr));
|
||||
} else {
|
||||
assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR);
|
||||
genX(grl_postbuild_info_serialized_size)(cmd_buffer,
|
||||
anv_address_physical(accel->address),
|
||||
anv_address_physical(query_addr));
|
||||
}
|
||||
}
|
||||
|
||||
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
|
||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||
|
||||
for (uint32_t i = 0; i < accelerationStructureCount; i++)
|
||||
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
|
||||
}
|
||||
#endif
|
||||
|
@@ -36,6 +36,11 @@
|
||||
#include "genxml/genX_pack.h"
|
||||
|
||||
#include "vk_standard_sample_locations.h"
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
#include "grl/genX_grl.h"
|
||||
#endif
|
||||
|
||||
#include "vk_util.h"
|
||||
|
||||
static void
|
||||
@@ -466,6 +471,9 @@ void
|
||||
genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
|
||||
{
|
||||
assert(pdevice->info.verx10 == GFX_VERx10);
|
||||
#if GFX_VERx10 >= 125
|
||||
genX(grl_load_rt_uuid)(pdevice->rt_uuid);
|
||||
#endif
|
||||
}
|
||||
|
||||
VkResult
|
||||
|
@@ -39,6 +39,8 @@ genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
|
||||
uint32_t arg_count,
|
||||
const struct anv_kernel_arg *args);
|
||||
|
||||
void
|
||||
genX(grl_load_rt_uuid)(uint8_t *out_uuid);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
|
39
src/intel/vulkan/grl/genX_grl_uuid.cpp
Normal file
39
src/intel/vulkan/grl/genX_grl_uuid.cpp
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "include/GRLGen12.h"
|
||||
|
||||
#include "vulkan/vulkan_core.h"
|
||||
|
||||
extern "C" void
|
||||
gfx125_grl_load_rt_uuid(uint8_t *out_uuid);
|
||||
|
||||
extern "C" void
|
||||
gfx125_grl_load_rt_uuid(uint8_t *out_uuid)
|
||||
{
|
||||
assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE);
|
||||
memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE);
|
||||
}
|
479
src/intel/vulkan/grl/grl_structs.h
Normal file
479
src/intel/vulkan/grl/grl_structs.h
Normal file
@@ -0,0 +1,479 @@
|
||||
/*
|
||||
* Copyright © 2022 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* This file contains a redefinition of structures defined in the GRL library.
|
||||
* We need to have those structures defined to allocate & prepare data for
|
||||
* the OpenCL kernels building acceleration structures. Unfortunately because
|
||||
* of C++ & OpenCL assumptions in GRL, it's not possible to just include GRL
|
||||
* header files directly so we have to redefine stuff here.
|
||||
*/
|
||||
|
||||
#ifndef GRL_STRUCTS_H
|
||||
#define GRL_STRUCTS_H
|
||||
|
||||
#include "GRLStructs.h"
|
||||
#include "GRLRTASCommon.h"
|
||||
|
||||
struct MKBuilderState {
|
||||
qword geomDesc_buffer;
|
||||
qword build_primref_buffer;
|
||||
qword build_globals;
|
||||
qword bvh_buffer;
|
||||
dword leaf_type;
|
||||
dword leaf_size;
|
||||
};
|
||||
|
||||
#define PREFIX_MK_STATE(prefix, obj) \
|
||||
(struct prefix##_MKBuilderState) { \
|
||||
.geomDesc_buffer = (obj).geomDesc_buffer, \
|
||||
.build_primref_buffer = (obj).build_primref_buffer, \
|
||||
.build_globals = (obj).build_globals, \
|
||||
.bvh_buffer = (obj).bvh_buffer, \
|
||||
.leaf_type = (obj).leaf_type, \
|
||||
.leaf_size = (obj).leaf_size, \
|
||||
}
|
||||
|
||||
/*
 * Size estimate for a build: primitive counts, a set of *_start fields,
 * total sizes and table starts.
 *
 * The dword fields (numTriangles through max_fatleaves) are exactly the
 * ones PREFIX_MK_SIZE() copies into the prefixed variant of this struct.
 */
struct MKSizeEstimate {
|
||||
dword numTriangles;
|
||||
dword numProcedurals;
|
||||
dword numPrimitives;
|
||||
dword numMeshes;
|
||||
dword numBuildPrimitives;
|
||||
dword numPrimitivesToSplit;
|
||||
dword instance_descs_start;
|
||||
dword geo_meta_data_start;
|
||||
dword node_data_start;
|
||||
dword leaf_data_start;
|
||||
dword procedural_data_start;
|
||||
dword back_pointer_start;
|
||||
dword sizeTotal;
|
||||
dword updateScratchSizeTotal;
|
||||
dword fatleaf_table_start;
|
||||
dword innernode_table_start;
|
||||
dword max_fatleaves;
|
||||
|
||||
/* The size_t fields below are not copied by PREFIX_MK_SIZE().
 * NOTE(review): presumably host-side bookkeeping only -- confirm.
 */
size_t max_instance_leafs;
|
||||
size_t max_inner_nodes;
|
||||
size_t leaf_data_size;
|
||||
size_t min_primitives;
|
||||
size_t max_primitives;
|
||||
};
|
||||
|
||||
#define PREFIX_MK_SIZE(prefix, obj) \
|
||||
(struct prefix##_MKSizeEstimate) { \
|
||||
.numTriangles = (obj).numTriangles, \
|
||||
.numProcedurals = (obj).numProcedurals, \
|
||||
.numPrimitives = (obj).numPrimitives, \
|
||||
.numMeshes = (obj).numMeshes, \
|
||||
.numBuildPrimitives = (obj).numBuildPrimitives, \
|
||||
.numPrimitivesToSplit = (obj).numPrimitivesToSplit, \
|
||||
.instance_descs_start = (obj).instance_descs_start, \
|
||||
.geo_meta_data_start = (obj).geo_meta_data_start, \
|
||||
.node_data_start = (obj).node_data_start, \
|
||||
.leaf_data_start = (obj).leaf_data_start, \
|
||||
.procedural_data_start = (obj).procedural_data_start, \
|
||||
.back_pointer_start = (obj).back_pointer_start, \
|
||||
.sizeTotal = (obj).sizeTotal, \
|
||||
.updateScratchSizeTotal = (obj).updateScratchSizeTotal, \
|
||||
.fatleaf_table_start = (obj).fatleaf_table_start, \
|
||||
.innernode_table_start = (obj).innernode_table_start, \
|
||||
.max_fatleaves = (obj).max_fatleaves, \
|
||||
}
|
||||
|
||||
typedef struct AABB {
|
||||
float lower[4];
|
||||
float upper[4];
|
||||
} AABB;
|
||||
|
||||
/*
 * Global build state, grouped below into general counters, spatial split
 * state, binned-SAH BFS state, a work-group sync variable and morton code
 * builder state.
 *
 * NOTE(review): this is one of the GRL structure redefinitions this header
 * exists for, so the field order presumably has to stay in sync with the
 * GRL definition -- confirm before reordering anything.
 */
struct Globals
|
||||
{
|
||||
struct AABB centroidBounds;
|
||||
|
||||
unsigned int build_record_start;
|
||||
unsigned int numPrimitives;
|
||||
unsigned int leafPrimType;
|
||||
unsigned int leafSize;
|
||||
|
||||
unsigned int numSplittedPrimitives;
|
||||
unsigned int numBuildRecords;
|
||||
|
||||
// spatial split state
|
||||
unsigned int numOriginalPrimitives;
|
||||
float presplitPrioritySum;
|
||||
float probThreshold;
|
||||
|
||||
// binned-sah bfs state
|
||||
unsigned int counter;
|
||||
unsigned int numBuildRecords_extended;
|
||||
|
||||
// sync variable used for global-sync on work groups
|
||||
unsigned int sync;
|
||||
|
||||
|
||||
/* morton code builder state */
|
||||
unsigned int shift; // used by adaptive mc-builder
|
||||
unsigned int shift_mask; // used by adaptive mc-builder
|
||||
unsigned int binary_hierarchy_root;
|
||||
unsigned int p0_allocated_num;
|
||||
unsigned int p0_created_num;
|
||||
unsigned int morton_sort_in_flight;
|
||||
unsigned int sort_iterations;
|
||||
|
||||
gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
|
||||
};
|
||||
|
||||
typedef struct BVHBase
|
||||
{
|
||||
// TODO: Implement the "copy-first-node" trick... duplicate root node here
|
||||
|
||||
uint64_t rootNodeOffset;
|
||||
|
||||
uint32_t reserved;
|
||||
|
||||
uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
|
||||
uint32_t quadLeafStart;
|
||||
uint32_t quadLeafCur;
|
||||
uint32_t proceduralDataStart;
|
||||
uint32_t proceduralDataCur;
|
||||
uint32_t instanceLeafStart;
|
||||
uint32_t instanceLeafEnd;
|
||||
uint32_t backPointerDataStart; //
|
||||
uint32_t refitTreeletsDataStart; // refit structs
|
||||
uint32_t refitStartPointDataStart; //
|
||||
uint32_t BVHDataEnd;
|
||||
|
||||
// number of bottom treelets
|
||||
// if 1, then the bottom treelet is also tip treelet
|
||||
uint32_t refitTreeletCnt;
|
||||
uint32_t refitTreeletCnt2; // always 0, used for atomic updates
|
||||
// data layout:
|
||||
// @backPointerDataStart
|
||||
// 'backpointer' - a dword per inner node.
|
||||
// The bits are used as follows:
|
||||
// 2:0 --> Used as a refit counter during BVH refitting. MBZ
|
||||
// 5:3 --> Number of children
|
||||
// 31:6 --> Index of the parent node in the internal node array
|
||||
// The root node has a parent index of all ones
|
||||
// @refitTreeletsDataStart
|
||||
// RefitTreelet[], the last treelet is for top treelet all previous are for bottom
|
||||
// @refitStartPointDataStart
|
||||
// for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
|
||||
// @backPointerDataEnd
|
||||
|
||||
uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
|
||||
uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
|
||||
uint32_t fatLeafTableStart;
|
||||
uint32_t innerTableStart;
|
||||
|
||||
uint32_t _pad[12];
|
||||
|
||||
struct RTASMetaData Meta;
|
||||
} BVHBase;
|
||||
|
||||
|
||||
struct BatchedInitGlobalsData
|
||||
{
|
||||
qword p_build_globals;
|
||||
qword p_bvh_buffer;
|
||||
dword numPrimitives;
|
||||
dword numGeometries;
|
||||
dword numInstances;
|
||||
dword instance_descs_start;
|
||||
dword geo_meta_data_start;
|
||||
dword node_data_start;
|
||||
dword leaf_data_start;
|
||||
dword procedural_data_start;
|
||||
dword back_pointer_start;
|
||||
dword sizeTotal;
|
||||
dword leafType;
|
||||
dword leafSize;
|
||||
dword fatleaf_table_start;
|
||||
dword innernode_table_start;
|
||||
};
|
||||
|
||||
|
||||
#define BFS_NUM_BINS 16
|
||||
#define BFS_NUM_VCONTEXTS 256
|
||||
#define BFS_MAX_DEPTH 32
|
||||
|
||||
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
|
||||
|
||||
struct BFS_Split
|
||||
{
|
||||
float sah;
|
||||
int dim;
|
||||
int pos;
|
||||
};
|
||||
|
||||
struct BFS_BinInfo
|
||||
{
|
||||
float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
|
||||
// The 6 are lower(xyz) and -upper(xyz)
|
||||
// bins use negated-max so that we can use vectorized mins instead of min/max pairs
|
||||
uint counts[3 * BFS_NUM_BINS];
|
||||
};
|
||||
|
||||
struct SAHBuildGlobals
|
||||
{
|
||||
qword p_primref_index_buffers;
|
||||
qword p_primrefs_buffer;
|
||||
qword p_bvh2;
|
||||
qword p_globals; // TODO: deprecate this
|
||||
qword p_bvh_base;
|
||||
gpuva_t p_qnode_root_buffer;
|
||||
|
||||
dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
|
||||
dword num_primrefs;
|
||||
dword leaf_size;
|
||||
dword leaf_type;
|
||||
|
||||
dword root_buffer_num_produced;
|
||||
dword root_buffer_num_produced_hi;
|
||||
dword root_buffer_num_consumed;
|
||||
dword root_buffer_num_consumed_hi;
|
||||
dword root_buffer_num_to_consume;
|
||||
dword root_buffer_num_to_consume_hi;
|
||||
};
|
||||
|
||||
typedef union LRBounds
|
||||
{
|
||||
struct
|
||||
{
|
||||
struct AABB3f left_centroid_bounds;
|
||||
struct AABB3f left_geom_bounds;
|
||||
struct AABB3f right_centroid_bounds;
|
||||
struct AABB3f right_geom_bounds;
|
||||
} boxes;
|
||||
struct
|
||||
{
|
||||
float Array[24];
|
||||
} scalars;
|
||||
} LRBounds;
|
||||
|
||||
|
||||
struct VContext
|
||||
{
|
||||
uint dispatch_primref_begin; // range of primrefs for this task
|
||||
uint dispatch_primref_end;
|
||||
uint bvh2_root; // BVH2 root node for this task
|
||||
uint tree_depth; // depth of this node in the tree
|
||||
uint num_left; // primref counts
|
||||
uint num_right;
|
||||
uint lr_mask; // lower 8b : left mask. upper 8b : right mask
|
||||
uint batch_index;
|
||||
|
||||
// pass1 global working state and output
|
||||
struct BFS_Split split;
|
||||
struct BFS_BinInfo global_bin_info;
|
||||
|
||||
// pass2 global working state and output
|
||||
LRBounds lr_bounds;
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct BFSDispatchRecord
|
||||
{
|
||||
ushort batch_index;
|
||||
ushort context_id;
|
||||
};
|
||||
|
||||
|
||||
struct BFSDispatchQueue
|
||||
{
|
||||
uint num_dispatches;
|
||||
uint wg_count[BFS_NUM_VCONTEXTS];
|
||||
struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
|
||||
};
|
||||
|
||||
struct BFS1SpillStackEntry
|
||||
{
|
||||
uint primref_begin;
|
||||
uint primref_end;
|
||||
uint bvh2_root;
|
||||
ushort tree_depth;
|
||||
ushort batch_index;
|
||||
};
|
||||
|
||||
struct BFS1SpillStack
|
||||
{
|
||||
uint size;
|
||||
struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
|
||||
};
|
||||
|
||||
struct QNodeGlobalRootBufferEntry
|
||||
{
|
||||
uint bvh2_node;
|
||||
uint qnode;
|
||||
uint build_idx;
|
||||
uint _pad;
|
||||
};
|
||||
|
||||
struct QNodeGlobalRootBuffer
|
||||
{
|
||||
uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
|
||||
struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
|
||||
};
|
||||
|
||||
struct DFSDispatchRecord
|
||||
{
|
||||
uint primref_base;
|
||||
uint bvh2_base;
|
||||
uint batch_index;
|
||||
ushort num_primrefs;
|
||||
ushort tree_depth;
|
||||
};
|
||||
|
||||
|
||||
struct DFSDispatchQueue
|
||||
{
|
||||
struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
|
||||
};
|
||||
|
||||
#define VCONTEXT_STATE_EXECUTING 0
|
||||
#define VCONTEXT_STATE_UNALLOCATED 1
|
||||
|
||||
union SchedulerUnion
|
||||
{
|
||||
struct VContextScheduler
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'new_sah_builder.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword num_bfs_wgs;
|
||||
dword num_dfs_wgs;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
|
||||
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
|
||||
|
||||
dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
|
||||
dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
|
||||
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
|
||||
|
||||
dword vcontext_state[BFS_NUM_VCONTEXTS];
|
||||
|
||||
struct BFSDispatchQueue bfs_queue;
|
||||
struct DFSDispatchQueue dfs_queue;
|
||||
|
||||
struct VContext contexts[BFS_NUM_VCONTEXTS];
|
||||
|
||||
struct BFS1SpillStack bfs2_spill_stack;
|
||||
} vContextScheduler;
|
||||
|
||||
struct QnodeScheduler
|
||||
{
|
||||
dword num_qnode_grb_curr_entries;
|
||||
dword num_qnode_grb_new_entries;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
|
||||
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
|
||||
|
||||
dword batched_builds_to_process;
|
||||
dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
|
||||
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
|
||||
|
||||
struct QNodeGlobalRootBuffer qnode_global_root_buffer;
|
||||
} qnodeScheduler;
|
||||
};
|
||||
|
||||
|
||||
struct BVH2Node
|
||||
{
|
||||
struct AABB3f box;
|
||||
uint meta_u; // leaf: primref start. inner: offset from node to its first child
|
||||
uint meta_ss;
|
||||
//ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
|
||||
//uchar is_inner; // 1 if inner, 0 if leaf
|
||||
//uchar mask;
|
||||
};
|
||||
|
||||
struct BVH2
|
||||
{
|
||||
uint num_nodes;
|
||||
uint _pad[7]; // align to 32B
|
||||
};
|
||||
|
||||
struct BatchedBLSDispatchEntry
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
qword p_data_buffer;
|
||||
qword num_elements; // number of elements in p_data_buffer
|
||||
};
|
||||
|
||||
struct SAHBuildArgsBatchable
|
||||
{
|
||||
qword p_globals_ptrs;
|
||||
qword p_scheduler;
|
||||
qword p_buffers_info;
|
||||
qword p_sah_globals;
|
||||
|
||||
dword num_max_qnode_global_root_buffer_entries;
|
||||
dword num_builds;
|
||||
};
|
||||
|
||||
#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \
|
||||
(struct prefix##_SAHBuildArgsBatchable) { \
|
||||
.p_globals_ptrs = (obj).p_globals_ptrs, \
|
||||
.p_scheduler = (obj).p_scheduler, \
|
||||
.p_buffers_info = (obj).p_buffers_info, \
|
||||
.p_sah_globals = (obj).p_sah_globals, \
|
||||
.num_max_qnode_global_root_buffer_entries = \
|
||||
(obj).num_max_qnode_global_root_buffer_entries, \
|
||||
.num_builds = (obj).num_builds, \
|
||||
}
|
||||
|
||||
|
||||
struct SAHBuildBuffersInfo
|
||||
{
|
||||
gpuva_t p_globals;
|
||||
gpuva_t p_primref_index_buffers;
|
||||
gpuva_t p_primrefs_buffer;
|
||||
gpuva_t p_bvh2;
|
||||
gpuva_t p_bvh_base;
|
||||
gpuva_t p_qnode_root_buffer;
|
||||
dword sah_globals_flags;
|
||||
dword _pad;
|
||||
gpuva_t _pad2;
|
||||
};
|
||||
|
||||
#endif /* GRL_STRUCTS_H */
|
@@ -49,6 +49,7 @@ endforeach
|
||||
|
||||
grl_genX_files = [
|
||||
'genX_grl_dispatch.c',
|
||||
'genX_grl_uuid.cpp',
|
||||
]
|
||||
|
||||
grl_lib_args = []
|
||||
@@ -151,7 +152,7 @@ foreach t : [['125', 'gfx125', 'dg2']]
|
||||
grl_genX_libs += static_library(
|
||||
'grl_@0@'.format(genX_prefix),
|
||||
[grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
|
||||
grl_genX_files, grl_metakernel_c],
|
||||
grl_genX_files, grl_metakernel_c, grl_metakernel_h],
|
||||
include_directories : [
|
||||
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler,
|
||||
inc_intel, inc_anv,
|
||||
@@ -160,6 +161,10 @@ foreach t : [['125', 'gfx125', 'dg2']]
|
||||
no_override_init_args, c_sse2_args,
|
||||
'-DGFX_VERx10=@0@'.format(verX10),
|
||||
],
|
||||
cpp_args : [
|
||||
no_override_init_args, c_sse2_args,
|
||||
'-DGFX_VERx10=@0@'.format(verX10),
|
||||
],
|
||||
dependencies : [
|
||||
dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
|
||||
idep_vulkan_runtime_headers,
|
||||
@@ -168,6 +173,13 @@ foreach t : [['125', 'gfx125', 'dg2']]
|
||||
)
|
||||
endforeach
|
||||
|
||||
libgrl_deps = [
|
||||
dep_valgrind,
|
||||
idep_nir_headers,
|
||||
idep_vulkan_util_headers,
|
||||
idep_vulkan_wsi_headers,
|
||||
]
|
||||
|
||||
libgrl = static_library(
|
||||
'grl',
|
||||
[grl_cl_kernel_h],
|
||||
@@ -176,8 +188,12 @@ libgrl = static_library(
|
||||
inc_compiler,
|
||||
],
|
||||
link_whole : [grl_genX_libs],
|
||||
dependencies : [
|
||||
dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers
|
||||
],
|
||||
dependencies : libgrl_deps,
|
||||
install : true,
|
||||
)
|
||||
idep_grl = declare_dependency(
|
||||
link_with : libgrl,
|
||||
dependencies : libgrl_deps,
|
||||
sources : grl_metakernel_h,
|
||||
include_directories : include_directories('include', 'gpu'),
|
||||
)
|
||||
|
@@ -73,6 +73,7 @@ endif
|
||||
|
||||
libanv_per_hw_ver_libs = []
|
||||
anv_per_hw_ver_files = files(
|
||||
'genX_acceleration_structure.c',
|
||||
'genX_blorp_exec.c',
|
||||
'genX_cmd_buffer.c',
|
||||
'genX_gpu_memcpy.c',
|
||||
@@ -100,12 +101,12 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']],
|
||||
dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
|
||||
idep_vulkan_util_headers, idep_vulkan_wsi_headers,
|
||||
idep_vulkan_runtime_headers, idep_intel_driver_ds_headers,
|
||||
idep_grl,
|
||||
],
|
||||
)
|
||||
endforeach
|
||||
|
||||
libanv_files = files(
|
||||
'anv_acceleration_structure.c',
|
||||
'anv_allocator.c',
|
||||
'anv_android.h',
|
||||
'anv_batch_chain.c',
|
||||
@@ -194,7 +195,7 @@ libvulkan_intel = shared_library(
|
||||
include_directories : [
|
||||
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
|
||||
],
|
||||
link_whole : [libanv_common, libanv_per_hw_ver_libs],
|
||||
link_whole : [libanv_common, libanv_per_hw_ver_libs, libgrl],
|
||||
link_with : [
|
||||
libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf,
|
||||
],
|
||||
@@ -232,7 +233,7 @@ if with_tests
|
||||
],
|
||||
link_whole : libanv_common,
|
||||
link_with : [
|
||||
libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev,
|
||||
libanv_per_hw_ver_libs, libgrl, libintel_compiler, libintel_common, libintel_dev,
|
||||
libisl, libblorp, libintel_perf,
|
||||
],
|
||||
dependencies : [
|
||||
|
Reference in New Issue
Block a user