/*
 * Copyright © 2022 Konstantin Seurer
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BVH_BUILD_HELPERS_H
#define BVH_BUILD_HELPERS_H

#include "bvh.h"

#define VK_FORMAT_UNDEFINED 0
#define VK_FORMAT_R4G4_UNORM_PACK8 1
#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2
#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3
#define VK_FORMAT_R5G6B5_UNORM_PACK16 4
#define VK_FORMAT_B5G6R5_UNORM_PACK16 5
#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6
#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7
#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8
#define VK_FORMAT_R8_UNORM 9
#define VK_FORMAT_R8_SNORM 10
#define VK_FORMAT_R8_USCALED 11
#define VK_FORMAT_R8_SSCALED 12
#define VK_FORMAT_R8_UINT 13
#define VK_FORMAT_R8_SINT 14
#define VK_FORMAT_R8_SRGB 15
#define VK_FORMAT_R8G8_UNORM 16
#define VK_FORMAT_R8G8_SNORM 17
#define VK_FORMAT_R8G8_USCALED 18
#define VK_FORMAT_R8G8_SSCALED 19
#define VK_FORMAT_R8G8_UINT 20
#define VK_FORMAT_R8G8_SINT 21
#define VK_FORMAT_R8G8_SRGB 22
#define VK_FORMAT_R8G8B8_UNORM 23
#define VK_FORMAT_R8G8B8_SNORM 24
#define VK_FORMAT_R8G8B8_USCALED 25
#define VK_FORMAT_R8G8B8_SSCALED 26
#define VK_FORMAT_R8G8B8_UINT 27
#define VK_FORMAT_R8G8B8_SINT 28
#define VK_FORMAT_R8G8B8_SRGB 29
#define VK_FORMAT_B8G8R8_UNORM 30
#define VK_FORMAT_B8G8R8_SNORM 31
#define VK_FORMAT_B8G8R8_USCALED 32
#define VK_FORMAT_B8G8R8_SSCALED 33
#define VK_FORMAT_B8G8R8_UINT 34
#define VK_FORMAT_B8G8R8_SINT 35
#define VK_FORMAT_B8G8R8_SRGB 36
#define VK_FORMAT_R8G8B8A8_UNORM 37
#define VK_FORMAT_R8G8B8A8_SNORM 38
#define VK_FORMAT_R8G8B8A8_USCALED 39
#define VK_FORMAT_R8G8B8A8_SSCALED 40
#define VK_FORMAT_R8G8B8A8_UINT 41
#define VK_FORMAT_R8G8B8A8_SINT 42
#define VK_FORMAT_R8G8B8A8_SRGB 43
#define VK_FORMAT_B8G8R8A8_UNORM 44
#define VK_FORMAT_B8G8R8A8_SNORM 45
#define VK_FORMAT_B8G8R8A8_USCALED 46
#define VK_FORMAT_B8G8R8A8_SSCALED 47
#define VK_FORMAT_B8G8R8A8_UINT 48
#define VK_FORMAT_B8G8R8A8_SINT 49
#define VK_FORMAT_B8G8R8A8_SRGB 50
#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51
#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52
#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53
#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54
#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55
#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56
#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57
#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58
#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59
#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60
#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61
#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62
#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63
#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64
#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65
#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66
#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67
#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68
#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69
#define VK_FORMAT_R16_UNORM 70
#define VK_FORMAT_R16_SNORM 71
#define VK_FORMAT_R16_USCALED 72
#define VK_FORMAT_R16_SSCALED 73
#define VK_FORMAT_R16_UINT 74
#define VK_FORMAT_R16_SINT 75
#define VK_FORMAT_R16_SFLOAT 76
#define VK_FORMAT_R16G16_UNORM 77
#define VK_FORMAT_R16G16_SNORM 78
#define VK_FORMAT_R16G16_USCALED 79
#define VK_FORMAT_R16G16_SSCALED 80
#define VK_FORMAT_R16G16_UINT 81
#define VK_FORMAT_R16G16_SINT 82
#define VK_FORMAT_R16G16_SFLOAT 83
#define VK_FORMAT_R16G16B16_UNORM 84
#define VK_FORMAT_R16G16B16_SNORM 85
#define VK_FORMAT_R16G16B16_USCALED 86
#define VK_FORMAT_R16G16B16_SSCALED 87
#define VK_FORMAT_R16G16B16_UINT 88
#define VK_FORMAT_R16G16B16_SINT 89
#define VK_FORMAT_R16G16B16_SFLOAT 90
#define VK_FORMAT_R16G16B16A16_UNORM 91
#define VK_FORMAT_R16G16B16A16_SNORM 92
#define VK_FORMAT_R16G16B16A16_USCALED 93
#define VK_FORMAT_R16G16B16A16_SSCALED 94
#define VK_FORMAT_R16G16B16A16_UINT 95
#define VK_FORMAT_R16G16B16A16_SINT 96
#define VK_FORMAT_R16G16B16A16_SFLOAT 97
#define VK_FORMAT_R32_UINT 98
#define VK_FORMAT_R32_SINT 99
#define VK_FORMAT_R32_SFLOAT 100
#define VK_FORMAT_R32G32_UINT 101
#define VK_FORMAT_R32G32_SINT 102
#define VK_FORMAT_R32G32_SFLOAT 103
#define VK_FORMAT_R32G32B32_UINT 104
#define VK_FORMAT_R32G32B32_SINT 105
#define VK_FORMAT_R32G32B32_SFLOAT 106
#define VK_FORMAT_R32G32B32A32_UINT 107
#define VK_FORMAT_R32G32B32A32_SINT 108
#define VK_FORMAT_R32G32B32A32_SFLOAT 109
#define VK_FORMAT_R64_UINT 110
#define VK_FORMAT_R64_SINT 111
#define VK_FORMAT_R64_SFLOAT 112
#define VK_FORMAT_R64G64_UINT 113
#define VK_FORMAT_R64G64_SINT 114
#define VK_FORMAT_R64G64_SFLOAT 115
#define VK_FORMAT_R64G64B64_UINT 116
#define VK_FORMAT_R64G64B64_SINT 117
#define VK_FORMAT_R64G64B64_SFLOAT 118
#define VK_FORMAT_R64G64B64A64_UINT 119
#define VK_FORMAT_R64G64B64A64_SINT 120
#define VK_FORMAT_R64G64B64A64_SFLOAT 121
#define VK_INDEX_TYPE_UINT16 0
#define VK_INDEX_TYPE_UINT32 1
#define VK_INDEX_TYPE_NONE_KHR 1000165000
#define VK_INDEX_TYPE_UINT8_EXT 1000265000
#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0
#define VK_GEOMETRY_TYPE_AABBS_KHR 1
#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2
#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1
#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2
#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4
#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8

#define TYPE(type, align)                                                                          \
   layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref              \
   {                                                                                               \
      type value;                                                                                  \
   };

#define REF(type)  type##_ref
#define VOID_REF   uint64_t
#define NULL       0
#define DEREF(var) var.value

#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1))

#define OFFSET(ptr, offset) (uint64_t(ptr) + offset)

#define INFINITY (1.0 / 0.0)
#define NAN      (0.0 / 0.0)

#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))

TYPE(int8_t, 1);
TYPE(uint8_t, 1);
TYPE(int16_t, 2);
TYPE(uint16_t, 2);
TYPE(int32_t, 4);
TYPE(uint32_t, 4);
TYPE(int64_t, 8);
TYPE(uint64_t, 8);
TYPE(float, 4);
TYPE(vec2, 4);
TYPE(vec3, 4);
TYPE(vec4, 4);
TYPE(uvec4, 16);
TYPE(VOID_REF, 8);
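
/* Illustrative sketch (a hypothetical helper, not used by the build shaders): the macros above
 * emulate typed pointers on top of raw uint64_t device addresses. Summing a uint32_t array could
 * look like this:
 *
 *    uint32_t
 *    sum_u32(VOID_REF base, uint32_t count)
 *    {
 *       uint32_t sum = 0;
 *       for (uint32_t i = 0; i < count; i++)
 *          sum += DEREF(INDEX(uint32_t, base, i));
 *       return sum;
 *    }
 *
 * INDEX(uint32_t, base, i) casts base + i * SIZEOF(uint32_t) to a uint32_t_ref, and DEREF reads
 * its single "value" member. SIZEOF works by incrementing a null reference by one element and
 * converting the resulting address back to an integer.
 */
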
/* copied from u_math.h */
uint32_t
align(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

int32_t
to_emulated_float(float f)
{
   int32_t bits = floatBitsToInt(f);
   return f < 0 ? -2147483648 - bits : bits;
}

float
from_emulated_float(int32_t bits)
{
   return intBitsToFloat(bits < 0 ? -2147483648 - bits : bits);
}
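
/* Worked example of the emulated-float encoding: reinterpreting an IEEE-754 float as a signed
 * integer is already order-preserving for non-negative values, but reverses the order of negative
 * values. Mapping negative floats through -2147483648 - bits flips them back, so plain int32_t
 * comparisons (and integer atomics such as atomicMin/atomicMax) sort like floats:
 *
 *    to_emulated_float( 1.0) = floatBitsToInt( 1.0)         =  1065353216
 *    to_emulated_float(-1.0) = -2147483648 - (-1082130432)  = -1065353216
 *    to_emulated_float(-2.0) = -2147483648 - (-1073741824)  = -1073741824
 *
 * giving -2.0 < -1.0 < 1.0 in integer order. from_emulated_float applies the same self-inverse
 * mapping to decode.
 */
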
TYPE(radv_aabb, 4);

struct key_id_pair {
   uint32_t id;
   uint32_t key;
};
TYPE(key_id_pair, 4);
TYPE(radv_accel_struct_serialization_header, 8);
TYPE(radv_accel_struct_header, 8);
TYPE(radv_bvh_triangle_node, 4);
TYPE(radv_bvh_aabb_node, 4);
TYPE(radv_bvh_instance_node, 8);
TYPE(radv_bvh_box16_node, 4);
TYPE(radv_bvh_box32_node, 4);
TYPE(radv_ir_header, 4);
TYPE(radv_ir_node, 4);
TYPE(radv_ir_box_node, 4);
TYPE(radv_global_sync_data, 4);

uint32_t
id_to_offset(uint32_t id)
{
   return (id & (~7u)) << 3;
}

uint32_t
id_to_type(uint32_t id)
{
   return id & 7u;
}

uint32_t
pack_node_id(uint32_t offset, uint32_t type)
{
   return (offset >> 3) | type;
}

uint64_t
node_to_addr(uint64_t node)
{
   node &= ~7ul;
   node <<= 19;
   return int64_t(node) >> 16;
}

uint64_t
addr_to_node(uint64_t addr)
{
   return (addr >> 3) & ((1ul << 45) - 1);
}
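
/* Hardware node ids pack a byte offset and a 3-bit node type into a single uint32_t: the low
 * 3 bits hold the type and the remaining bits hold the offset divided by 8. A quick worked
 * example for a node of type 5 at byte offset 0x80:
 *
 *    pack_node_id(0x80, 5) = (0x80 >> 3) | 5   = 0x15
 *    id_to_offset(0x15)    = (0x15 & ~7u) << 3 = 0x80
 *    id_to_type(0x15)      = 0x15 & 7u         = 5
 *
 * node_to_addr/addr_to_node apply the same idea to 64-bit pointers: addr_to_node keeps bits 3..47
 * of the address, and node_to_addr clears the low 3 type bits and uses the shift pair so that the
 * arithmetic right shift sign-extends bit 47, recovering a canonical 48-bit virtual address.
 */
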
uint32_t
ir_id_to_offset(uint32_t id)
{
   return id & (~3u);
}

uint32_t
ir_id_to_type(uint32_t id)
{
   return id & 3u;
}

uint32_t
pack_ir_node_id(uint32_t offset, uint32_t type)
{
   return offset | type;
}
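
/* IR node ids use the same scheme with a 2-bit type field and byte offsets stored directly:
 * offsets are at least 4-byte aligned, so pack_ir_node_id can simply OR the type into the low
 * bits, e.g. pack_ir_node_id(0x100, 2) = 0x102, ir_id_to_offset(0x102) = 0x100 and
 * ir_id_to_type(0x102) = 2.
 */
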
uint32_t
ir_type_to_bvh_type(uint32_t type)
{
   switch (type) {
   case radv_ir_node_triangle:
      return radv_bvh_node_triangle;
   case radv_ir_node_internal:
      return radv_bvh_node_box32;
   case radv_ir_node_instance:
      return radv_bvh_node_instance;
   case radv_ir_node_aabb:
      return radv_bvh_node_aabb;
   }
   /* unreachable in valid nodes */
   return RADV_BVH_INVALID_NODE;
}

float
aabb_surface_area(radv_aabb aabb)
{
   vec3 diagonal = aabb.max - aabb.min;
   return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z;
}
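
/* This is the quantity used by surface area heuristic (SAH) style decisions: the probability that
 * a random ray hits a convex bounding box is proportional to its surface area. For the unit cube
 * (min = vec3(0), max = vec3(1)) the diagonal is (1, 1, 1) and the result is 2 + 2 + 2 = 6, as
 * expected.
 */
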
/* Just a wrapper for 3 uints. */
struct triangle_indices {
   uint32_t index[3];
};

triangle_indices
load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id)
{
   triangle_indices result;

   uint32_t index_base = global_id * 3;

   switch (index_format) {
   case VK_INDEX_TYPE_UINT16: {
      result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_UINT32: {
      result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_NONE_KHR: {
      result.index[0] = index_base + 0;
      result.index[1] = index_base + 1;
      result.index[2] = index_base + 2;
      break;
   }
   case VK_INDEX_TYPE_UINT8_EXT: {
      result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2));
      break;
   }
   }

   return result;
}
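
/* For example, with global_id = 5 and VK_INDEX_TYPE_UINT16, the triangle's indices are read from
 * elements 15, 16 and 17 of a uint16_t array, while VK_INDEX_TYPE_NONE_KHR makes the triangle
 * reference vertices 15..17 directly.
 */
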
/* Just a wrapper for 3 vec4s. */
struct triangle_vertices {
   vec4 vertex[3];
};

TYPE(float16_t, 2);

triangle_vertices
load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride)
{
   triangle_vertices result;

   for (uint32_t i = 0; i < 3; i++) {
      VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride);
      /* Components that a format does not provide keep these defaults (z = 0, w = 1). */
      vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0);

      switch (vertex_format) {
      case VK_FORMAT_R32G32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         break;
      case VK_FORMAT_R32G32B32_SFLOAT:
      case VK_FORMAT_R32G32B32A32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         break;
      case VK_FORMAT_R16G16B16_SFLOAT:
      case VK_FORMAT_R16G16B16A16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         break;
      case VK_FORMAT_R16G16B16A16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF));
         break;
      case VK_FORMAT_R8G8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         break;
      case VK_FORMAT_R8G8B8A8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F));
         break;
      case VK_FORMAT_R16G16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         break;
      case VK_FORMAT_R16G16B16A16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF);
         break;
      case VK_FORMAT_R8G8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         break;
      case VK_FORMAT_R8G8B8A8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF);
         break;
      case VK_FORMAT_A2B10G10R10_UNORM_PACK32: {
         uint32_t data = DEREF(REF(uint32_t)(vertex_ptr));
         vertex.x = float(data & 0x3FF) / 0x3FF;
         vertex.y = float((data >> 10) & 0x3FF) / 0x3FF;
         vertex.z = float((data >> 20) & 0x3FF) / 0x3FF;
         break;
      }
      }

      result.vertex[i] = vertex;
   }

   return result;
}

/* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. */
struct AccelerationStructureInstance {
   mat3x4 transform;
   uint32_t custom_instance_and_mask;
   uint32_t sbt_offset_and_flags;
   uint64_t accelerationStructureReference;
};
TYPE(AccelerationStructureInstance, 8);

bool
build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data geom_data, uint32_t global_id)
{
   triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id);

   triangle_vertices vertices = load_vertices(geom_data.data, indices, geom_data.vertex_format, geom_data.stride);

   /* An inactive triangle is one for which the first (X) component of any vertex is NaN. If any
    * other vertex component is NaN, and the first is not, the behavior is undefined. If the vertex
    * format does not have a NaN representation, then all triangles are considered active.
    */
   if (isnan(vertices.vertex[0].x) || isnan(vertices.vertex[1].x) || isnan(vertices.vertex[2].x))
      return false;

   if (geom_data.transform != NULL) {
      mat4 transform = mat4(1.0);

      for (uint32_t col = 0; col < 4; col++)
         for (uint32_t row = 0; row < 3; row++)
            transform[col][row] = DEREF(INDEX(float, geom_data.transform, col + row * 4));

      for (uint32_t i = 0; i < 3; i++)
         vertices.vertex[i] = transform * vertices.vertex[i];
   }

   REF(radv_bvh_triangle_node) node = REF(radv_bvh_triangle_node)(dst_ptr);

   bounds.min = vec3(INFINITY);
   bounds.max = vec3(-INFINITY);

   for (uint32_t coord = 0; coord < 3; coord++)
      for (uint32_t comp = 0; comp < 3; comp++) {
         DEREF(node).coords[coord][comp] = vertices.vertex[coord][comp];
         bounds.min[comp] = min(bounds.min[comp], vertices.vertex[coord][comp]);
         bounds.max[comp] = max(bounds.max[comp], vertices.vertex[coord][comp]);
      }

   DEREF(node).triangle_id = global_id;
   DEREF(node).geometry_id_and_flags = geom_data.geometry_id;
   DEREF(node).id = 9;

   return true;
}

bool
build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id)
{
   REF(radv_bvh_aabb_node) node = REF(radv_bvh_aabb_node)(dst_ptr);

   for (uint32_t vec = 0; vec < 2; vec++)
      for (uint32_t comp = 0; comp < 3; comp++) {
         float coord = DEREF(INDEX(float, src_ptr, comp + vec * 3));

         if (vec == 0)
            bounds.min[comp] = coord;
         else
            bounds.max[comp] = coord;
      }

   /* An inactive AABB is one for which the minimum X coordinate is NaN. If any other component is
    * NaN, and the first is not, the behavior is undefined.
    */
   if (isnan(bounds.min.x))
      return false;

   DEREF(node).primitive_id = global_id;
   DEREF(node).geometry_id_and_flags = geometry_id;

   return true;
}

radv_aabb
calculate_instance_node_bounds(radv_accel_struct_header header, mat3x4 otw_matrix)
{
   radv_aabb aabb;
   for (uint32_t comp = 0; comp < 3; ++comp) {
      aabb.min[comp] = otw_matrix[comp][3];
      aabb.max[comp] = otw_matrix[comp][3];

      for (uint32_t col = 0; col < 3; ++col) {
         aabb.min[comp] +=
            min(otw_matrix[comp][col] * header.aabb.min[col], otw_matrix[comp][col] * header.aabb.max[col]);
         aabb.max[comp] +=
            max(otw_matrix[comp][col] * header.aabb.min[col], otw_matrix[comp][col] * header.aabb.max[col]);
      }
   }
   return aabb;
}
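
/* This is the standard transformed-AABB computation (sometimes attributed to Arvo): each output
 * component is an affine function of the input box, so its extrema are found by picking, per
 * matrix element, whichever of min/max yields the smaller (resp. larger) term. In 1D, for
 * y = a * x + t with x in [x0, x1], the bounds are [t + min(a*x0, a*x1), t + max(a*x0, a*x1)];
 * the loop above sums this per column, starting from the translation.
 */
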
uint32_t
encode_sbt_offset_and_flags(uint32_t src)
{
   uint32_t flags = src >> 24;
   uint32_t ret = src & 0xffffffu;
   if ((flags & VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR) != 0)
      ret |= RADV_INSTANCE_FORCE_OPAQUE;
   if ((flags & VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR) == 0)
      ret |= RADV_INSTANCE_NO_FORCE_NOT_OPAQUE;
   if ((flags & VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR) != 0)
      ret |= RADV_INSTANCE_TRIANGLE_FACING_CULL_DISABLE;
   if ((flags & VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR) != 0)
      ret |= RADV_INSTANCE_TRIANGLE_FLIP_FACING;
   return ret;
}
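
/* Worked example: src = 0x05000010 encodes an SBT offset of 0x10 in the low 24 bits and
 * flags = 0x05 (FORCE_OPAQUE | TRIANGLE_FACING_CULL_DISABLE) in the high 8 bits. The offset is
 * kept as-is and each Vulkan flag is translated into the corresponding RADV_INSTANCE_* bit; note
 * that RADV_INSTANCE_NO_FORCE_NOT_OPAQUE is set whenever FORCE_NO_OPAQUE is absent.
 */
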
bool
build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id)
{
   REF(radv_bvh_instance_node) node = REF(radv_bvh_instance_node)(dst_ptr);

   AccelerationStructureInstance instance = DEREF(REF(AccelerationStructureInstance)(src_ptr));

   /* An inactive instance is one whose acceleration structure handle is VK_NULL_HANDLE. Since
    * being active is only relevant for BVH updates, which we do not implement, we can also skip
    * instances with mask == 0.
    */
   if (instance.accelerationStructureReference == 0 || instance.custom_instance_and_mask < (1u << 24u))
      return false;

   radv_accel_struct_header instance_header =
      DEREF(REF(radv_accel_struct_header)(instance.accelerationStructureReference));

   DEREF(node).bvh_ptr = addr_to_node(instance.accelerationStructureReference + instance_header.bvh_offset);
   DEREF(node).bvh_offset = instance_header.bvh_offset;

   /* VkTransformMatrixKHR is row-major, hence the transposes around inverse(). */
   mat4 transform = mat4(instance.transform);
   mat4 inv_transform = transpose(inverse(transpose(transform)));
   DEREF(node).wto_matrix = mat3x4(inv_transform);
   DEREF(node).otw_matrix = mat3x4(transform);

   bounds = calculate_instance_node_bounds(instance_header, mat3x4(transform));

   DEREF(node).custom_instance_and_mask = instance.custom_instance_and_mask;
   DEREF(node).sbt_offset_and_flags = encode_sbt_offset_and_flags(instance.sbt_offset_and_flags);
   DEREF(node).instance_id = global_id;

   return true;
}

/** Compute ceiling of integer quotient of A divided by B. From macros.h */
#define DIV_ROUND_UP(A, B) (((A) + (B) - 1) / (B))
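
/* e.g. DIV_ROUND_UP(65, 64) = 2 while DIV_ROUND_UP(64, 64) = 1; used below to turn a task count
 * into a workgroup count.
 */
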
#ifdef USE_GLOBAL_SYNC

/* There might be more invocations available than tasks to do. In that case, the fetched task
 * index is greater than the counter offset for the next phase. To avoid out-of-bounds accesses,
 * phases will be skipped until the task index is in-bounds again. */
uint32_t num_tasks_to_skip = 0;
uint32_t phase_index = 0;

bool should_skip = false;

shared uint32_t global_task_index;
shared uint32_t shared_phase_index;

uint32_t
task_count(REF(radv_ir_header) header)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   return DEREF(header).sync_data.task_counts[phase_index & 1];
}

/* Sets the task count for the next phase. */
void
set_next_task_count(REF(radv_ir_header) header, uint32_t new_count)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
}

/*
 * This function has two main objectives:
 * Firstly, it partitions pending work among free invocations.
 * Secondly, it guarantees global synchronization between different phases.
 *
 * After every call to fetch_task, a new task index is returned.
 * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
 * to determine whether the current phase should be executed or skipped.
 *
 * Since tasks are assigned per-workgroup, the task index can be greater than
 * the total task count.
 */
uint32_t
fetch_task(REF(radv_ir_header) header, bool did_work)
{
   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished
    * and their results are written to memory. */
   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

   if (gl_LocalInvocationIndex == 0) {
      if (did_work)
         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);

      do {
         /* Perform a memory barrier to refresh the current phase's end counter, in case
          * another workgroup changed it. */
         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

         /* The first invocation of the first workgroup in a new phase is responsible for
          * initiating the switch to a new phase. It is only possible to switch to a new phase if
          * all tasks of the previous phase have been completed. Switching to a new phase and
          * incrementing the phase end counter in turn notifies all invocations for that phase
          * that it is safe to execute.
          */
         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
             DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) {
            if (DEREF(header).sync_data.next_phase_exit_flag != 0) {
               DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID;
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
            } else {
               atomicAdd(DEREF(header).sync_data.phase_index, 1);

               DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter;
               /* Ensure the changes to the phase index and start/end counter are visible to other
                * workgroups waiting in the loop. */
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

               atomicAdd(DEREF(header).sync_data.current_phase_end_counter,
                         DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x));
            }
            break;
         }

         /* If other invocations have finished all nodes, break out; there is no work to do */
         if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) {
            break;
         }
      } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter);

      shared_phase_index = DEREF(header).sync_data.phase_index;
   }

   barrier();
   if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID)
      return TASK_INDEX_INVALID;

   num_tasks_to_skip = shared_phase_index - phase_index;

   uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter;
   return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
}

bool
should_execute_phase()
{
   if (num_tasks_to_skip > 0) {
      /* Skip to next phase. */
      ++phase_index;
      --num_tasks_to_skip;
      return false;
   }

   return true;
}

#define PHASE(header)                                                                              \
   for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true))
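
/* Illustrative usage (a sketch, not part of this header; do_phase_work is hypothetical): a build
 * shader drives the phase machinery roughly like
 *
 *    uint32_t task_index = fetch_task(header, false);
 *    PHASE(header) {
 *       if (task_index < task_count(header))
 *          do_phase_work(task_index);
 *    }
 *
 * Each iteration either executes one task or skips a phase via should_execute_phase, then calls
 * fetch_task again, which both fetches the next task index and performs the global
 * synchronization between phases.
 */
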
#endif
#endif